xref: /openbmc/linux/kernel/sched/fair.c (revision 9d749629)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32 
33 #include <trace/events/sched.h>
34 
35 #include "sched.h"
36 
37 /*
38  * Targeted preemption latency for CPU-bound tasks:
39  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40  *
41  * NOTE: this latency value is not the same as the concept of
42  * 'timeslice length' - timeslices in CFS are of variable length
43  * and there is no persistent per-task timeslice as in traditional,
44  * time-slice based scheduling concepts.
45  *
46  * (to see the precise effective timeslice length of your workload,
47  *  run vmstat and monitor the context-switches (cs) field)
48  */
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51 
52 /*
53  * The initial- and re-scaling of tunables is configurable
54  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
55  *
56  * Options are:
57  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60  */
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 	= SCHED_TUNABLESCALING_LOG;
63 
64 /*
65  * Minimal preemption granularity for CPU-bound tasks:
66  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67  */
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70 
71 /*
72  * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73  */
74 static unsigned int sched_nr_latency = 8;
75 
76 /*
77  * If set to 1, the child runs first after fork. If set to 0 (default)
78  * the parent will (try to) run first.
79  */
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
81 
82 /*
83  * SCHED_OTHER wake-up granularity.
84  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85  *
86  * This option delays the preemption effects of decoupled workloads
87  * and reduces their over-scheduling. Synchronous workloads will still
88  * have immediate wakeup/sleep latencies.
89  */
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92 
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94 
95 /*
96  * The exponential sliding window over which load is averaged for shares
97  * distribution.
98  * (default: 10msec)
99  */
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101 
102 #ifdef CONFIG_CFS_BANDWIDTH
103 /*
104  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105  * each time a cfs_rq requests quota.
106  *
107  * Note: in the case that the slice exceeds the runtime remaining (either due
108  * to consumption or the quota being specified to be smaller than the slice)
109  * we will always only issue the remaining available time.
110  *
111  * default: 5 msec, units: microseconds
112  */
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114 #endif
115 
116 /*
117  * Increase the granularity value when there are more CPUs,
118  * because with more CPUs the 'effective latency' as visible
119  * to users decreases. But the relationship is not linear,
120  * so pick a second-best guess by going with the log2 of the
121  * number of CPUs.
122  *
123  * This idea comes from the SD scheduler of Con Kolivas:
124  */
125 static int get_update_sysctl_factor(void)
126 {
127 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
128 	unsigned int factor;
129 
130 	switch (sysctl_sched_tunable_scaling) {
131 	case SCHED_TUNABLESCALING_NONE:
132 		factor = 1;
133 		break;
134 	case SCHED_TUNABLESCALING_LINEAR:
135 		factor = cpus;
136 		break;
137 	case SCHED_TUNABLESCALING_LOG:
138 	default:
139 		factor = 1 + ilog2(cpus);
140 		break;
141 	}
142 
143 	return factor;
144 }
145 
146 static void update_sysctl(void)
147 {
148 	unsigned int factor = get_update_sysctl_factor();
149 
150 #define SET_SYSCTL(name) \
151 	(sysctl_##name = (factor) * normalized_sysctl_##name)
152 	SET_SYSCTL(sched_min_granularity);
153 	SET_SYSCTL(sched_latency);
154 	SET_SYSCTL(sched_wakeup_granularity);
155 #undef SET_SYSCTL
156 }
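
/*
 * Worked example (a sketch, assuming the default tunables above and
 * SCHED_TUNABLESCALING_LOG): num_online_cpus() is clamped to 8, so on a
 * machine with 8 or more CPUs factor = 1 + ilog2(8) = 4 and
 *
 *   sysctl_sched_latency            = 4 *    6ms = 24ms
 *   sysctl_sched_min_granularity    = 4 * 0.75ms =  3ms
 *   sysctl_sched_wakeup_granularity = 4 *    1ms =  4ms
 *
 * On a 2-CPU machine factor = 1 + ilog2(2) = 2, i.e. 12ms/1.5ms/2ms.
 */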
157 
158 void sched_init_granularity(void)
159 {
160 	update_sysctl();
161 }
162 
163 #if BITS_PER_LONG == 32
164 # define WMULT_CONST	(~0UL)
165 #else
166 # define WMULT_CONST	(1UL << 32)
167 #endif
168 
169 #define WMULT_SHIFT	32
170 
171 /*
172  * Shift right and round:
173  */
174 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
175 
176 /*
177  * delta *= weight / lw
178  */
179 static unsigned long
180 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
181 		struct load_weight *lw)
182 {
183 	u64 tmp;
184 
185 	/*
186 	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
187 	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
188 	 * 2^SCHED_LOAD_RESOLUTION.
189 	 */
190 	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
191 		tmp = (u64)delta_exec * scale_load_down(weight);
192 	else
193 		tmp = (u64)delta_exec;
194 
195 	if (!lw->inv_weight) {
196 		unsigned long w = scale_load_down(lw->weight);
197 
198 		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 			lw->inv_weight = 1;
200 		else if (unlikely(!w))
201 			lw->inv_weight = WMULT_CONST;
202 		else
203 			lw->inv_weight = WMULT_CONST / w;
204 	}
205 
206 	/*
207 	 * Check whether we'd overflow the 64-bit multiplication:
208 	 */
209 	if (unlikely(tmp > WMULT_CONST))
210 		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
211 			WMULT_SHIFT/2);
212 	else
213 		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
214 
215 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
216 }
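
/*
 * Worked example (a sketch assuming SCHED_LOAD_RESOLUTION == 0, so that
 * scale_load_down() is an identity, and a 64-bit WMULT_CONST of 2^32):
 * delta_exec = 1000000ns for a nice-0 entity (weight 1024) on a queue
 * whose total load is 2048 gives
 *
 *   tmp        = 1000000 * 1024
 *   inv_weight = 2^32 / 2048 = 2097152
 *   result     = SRR(tmp * inv_weight, 32) = 500000ns
 *
 * i.e. delta_exec * 1024/2048, computed with a multiply and a shift
 * instead of a divide on the fast path.
 */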
217 
218 
219 const struct sched_class fair_sched_class;
220 
221 /**************************************************************
222  * CFS operations on generic schedulable entities:
223  */
224 
225 #ifdef CONFIG_FAIR_GROUP_SCHED
226 
227 /* cpu runqueue to which this cfs_rq is attached */
228 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
229 {
230 	return cfs_rq->rq;
231 }
232 
233 /* An entity is a task if it doesn't "own" a runqueue */
234 #define entity_is_task(se)	(!se->my_q)
235 
236 static inline struct task_struct *task_of(struct sched_entity *se)
237 {
238 #ifdef CONFIG_SCHED_DEBUG
239 	WARN_ON_ONCE(!entity_is_task(se));
240 #endif
241 	return container_of(se, struct task_struct, se);
242 }
243 
244 /* Walk up scheduling entities hierarchy */
245 #define for_each_sched_entity(se) \
246 		for (; se; se = se->parent)
247 
248 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
249 {
250 	return p->se.cfs_rq;
251 }
252 
253 /* runqueue on which this entity is (to be) queued */
254 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
255 {
256 	return se->cfs_rq;
257 }
258 
259 /* runqueue "owned" by this group */
260 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
261 {
262 	return grp->my_q;
263 }
264 
265 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 				       int force_update);
267 
268 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
269 {
270 	if (!cfs_rq->on_list) {
271 		/*
272 		 * Ensure we either appear before our parent (if already
273 		 * enqueued) or force our parent to appear after us when it is
274 		 * enqueued.  The fact that we always enqueue bottom-up
275 		 * reduces this to two cases.
276 		 */
277 		if (cfs_rq->tg->parent &&
278 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
279 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
280 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
281 		} else {
282 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
283 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
284 		}
285 
286 		cfs_rq->on_list = 1;
287 		/* We should have no load, but we need to update last_decay. */
288 		update_cfs_rq_blocked_load(cfs_rq, 0);
289 	}
290 }
291 
292 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293 {
294 	if (cfs_rq->on_list) {
295 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
296 		cfs_rq->on_list = 0;
297 	}
298 }
299 
300 /* Iterate through all leaf cfs_rq's on a runqueue */
301 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
303 
304 /* Do the two (enqueued) entities belong to the same group? */
305 static inline int
306 is_same_group(struct sched_entity *se, struct sched_entity *pse)
307 {
308 	if (se->cfs_rq == pse->cfs_rq)
309 		return 1;
310 
311 	return 0;
312 }
313 
314 static inline struct sched_entity *parent_entity(struct sched_entity *se)
315 {
316 	return se->parent;
317 }
318 
319 /* return depth at which a sched entity is present in the hierarchy */
320 static inline int depth_se(struct sched_entity *se)
321 {
322 	int depth = 0;
323 
324 	for_each_sched_entity(se)
325 		depth++;
326 
327 	return depth;
328 }
329 
330 static void
331 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
332 {
333 	int se_depth, pse_depth;
334 
335 	/*
336 	 * A preemption test can only be made between sibling entities that are
337 	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
338 	 * hierarchy of both tasks until we find ancestors that are siblings
339 	 * under a common parent.
340 	 */
341 
342 	/* First walk up until both entities are at same depth */
343 	se_depth = depth_se(*se);
344 	pse_depth = depth_se(*pse);
345 
346 	while (se_depth > pse_depth) {
347 		se_depth--;
348 		*se = parent_entity(*se);
349 	}
350 
351 	while (pse_depth > se_depth) {
352 		pse_depth--;
353 		*pse = parent_entity(*pse);
354 	}
355 
356 	while (!is_same_group(*se, *pse)) {
357 		*se = parent_entity(*se);
358 		*pse = parent_entity(*pse);
359 	}
360 }
361 
362 #else	/* !CONFIG_FAIR_GROUP_SCHED */
363 
364 static inline struct task_struct *task_of(struct sched_entity *se)
365 {
366 	return container_of(se, struct task_struct, se);
367 }
368 
369 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
370 {
371 	return container_of(cfs_rq, struct rq, cfs);
372 }
373 
374 #define entity_is_task(se)	1
375 
376 #define for_each_sched_entity(se) \
377 		for (; se; se = NULL)
378 
379 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
380 {
381 	return &task_rq(p)->cfs;
382 }
383 
384 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385 {
386 	struct task_struct *p = task_of(se);
387 	struct rq *rq = task_rq(p);
388 
389 	return &rq->cfs;
390 }
391 
392 /* runqueue "owned" by this group */
393 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
394 {
395 	return NULL;
396 }
397 
398 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
399 {
400 }
401 
402 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403 {
404 }
405 
406 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
408 
409 static inline int
410 is_same_group(struct sched_entity *se, struct sched_entity *pse)
411 {
412 	return 1;
413 }
414 
415 static inline struct sched_entity *parent_entity(struct sched_entity *se)
416 {
417 	return NULL;
418 }
419 
420 static inline void
421 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
422 {
423 }
424 
425 #endif	/* CONFIG_FAIR_GROUP_SCHED */
426 
427 static __always_inline
428 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
429 
430 /**************************************************************
431  * Scheduling class tree data structure manipulation methods:
432  */
433 
434 static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
435 {
436 	s64 delta = (s64)(vruntime - min_vruntime);
437 	if (delta > 0)
438 		min_vruntime = vruntime;
439 
440 	return min_vruntime;
441 }
442 
443 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
444 {
445 	s64 delta = (s64)(vruntime - min_vruntime);
446 	if (delta < 0)
447 		min_vruntime = vruntime;
448 
449 	return min_vruntime;
450 }
451 
452 static inline int entity_before(struct sched_entity *a,
453 				struct sched_entity *b)
454 {
455 	return (s64)(a->vruntime - b->vruntime) < 0;
456 }
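
/*
 * The (s64) casts above make the comparisons safe against u64 wrap-around
 * as long as the two vruntimes are within 2^63 of each other. For example
 * (hypothetical values), with min_vruntime = ULLONG_MAX - 100 and a newly
 * computed vruntime = 50 just past the wrap, vruntime - min_vruntime = 151,
 * which is positive as an s64, so max_vruntime() correctly advances to 50
 * even though 50 is the smaller value as a plain u64.
 */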
457 
458 static void update_min_vruntime(struct cfs_rq *cfs_rq)
459 {
460 	u64 vruntime = cfs_rq->min_vruntime;
461 
462 	if (cfs_rq->curr)
463 		vruntime = cfs_rq->curr->vruntime;
464 
465 	if (cfs_rq->rb_leftmost) {
466 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
467 						   struct sched_entity,
468 						   run_node);
469 
470 		if (!cfs_rq->curr)
471 			vruntime = se->vruntime;
472 		else
473 			vruntime = min_vruntime(vruntime, se->vruntime);
474 	}
475 
476 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477 #ifndef CONFIG_64BIT
478 	smp_wmb();
479 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
480 #endif
481 }
482 
483 /*
484  * Enqueue an entity into the rb-tree:
485  */
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
487 {
488 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 	struct rb_node *parent = NULL;
490 	struct sched_entity *entry;
491 	int leftmost = 1;
492 
493 	/*
494 	 * Find the right place in the rbtree:
495 	 */
496 	while (*link) {
497 		parent = *link;
498 		entry = rb_entry(parent, struct sched_entity, run_node);
499 		/*
500 		 * We don't care about collisions. Nodes with
501 		 * the same key stay together.
502 		 */
503 		if (entity_before(se, entry)) {
504 			link = &parent->rb_left;
505 		} else {
506 			link = &parent->rb_right;
507 			leftmost = 0;
508 		}
509 	}
510 
511 	/*
512 	 * Maintain a cache of leftmost tree entries (it is frequently
513 	 * used):
514 	 */
515 	if (leftmost)
516 		cfs_rq->rb_leftmost = &se->run_node;
517 
518 	rb_link_node(&se->run_node, parent, link);
519 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
520 }
521 
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523 {
524 	if (cfs_rq->rb_leftmost == &se->run_node) {
525 		struct rb_node *next_node;
526 
527 		next_node = rb_next(&se->run_node);
528 		cfs_rq->rb_leftmost = next_node;
529 	}
530 
531 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532 }
533 
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535 {
536 	struct rb_node *left = cfs_rq->rb_leftmost;
537 
538 	if (!left)
539 		return NULL;
540 
541 	return rb_entry(left, struct sched_entity, run_node);
542 }
543 
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545 {
546 	struct rb_node *next = rb_next(&se->run_node);
547 
548 	if (!next)
549 		return NULL;
550 
551 	return rb_entry(next, struct sched_entity, run_node);
552 }
553 
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556 {
557 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558 
559 	if (!last)
560 		return NULL;
561 
562 	return rb_entry(last, struct sched_entity, run_node);
563 }
564 
565 /**************************************************************
566  * Scheduling class statistics methods:
567  */
568 
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570 		void __user *buffer, size_t *lenp,
571 		loff_t *ppos)
572 {
573 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 	int factor = get_update_sysctl_factor();
575 
576 	if (ret || !write)
577 		return ret;
578 
579 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 					sysctl_sched_min_granularity);
581 
582 #define WRT_SYSCTL(name) \
583 	(normalized_sysctl_##name = sysctl_##name / (factor))
584 	WRT_SYSCTL(sched_min_granularity);
585 	WRT_SYSCTL(sched_latency);
586 	WRT_SYSCTL(sched_wakeup_granularity);
587 #undef WRT_SYSCTL
588 
589 	return 0;
590 }
591 #endif
592 
593 /*
594  * delta /= w
595  */
596 static inline unsigned long
597 calc_delta_fair(unsigned long delta, struct sched_entity *se)
598 {
599 	if (unlikely(se->load.weight != NICE_0_LOAD))
600 		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
601 
602 	return delta;
603 	return delta;
604 
605 /*
606  * The idea is to set a period in which each task runs once.
607  *
608  * When there are too many tasks (sched_nr_latency) we have to stretch
609  * this period because otherwise the slices get too small.
610  *
611  * p = (nr <= nl) ? l : l*nr/nl
612  */
613 static u64 __sched_period(unsigned long nr_running)
614 {
615 	u64 period = sysctl_sched_latency;
616 	unsigned long nr_latency = sched_nr_latency;
617 
618 	if (unlikely(nr_running > nr_latency)) {
619 		period = sysctl_sched_min_granularity;
620 		period *= nr_running;
621 	}
622 
623 	return period;
624 }
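
/*
 * Worked example with the unscaled defaults (latency = 6ms,
 * min_granularity = 0.75ms, sched_nr_latency = 8): 4 runnable tasks fit
 * in one 6ms period, so p = 6ms; with 16 runnable tasks the period is
 * stretched to p = 16 * 0.75ms = 12ms so that no slice drops below the
 * minimum granularity.
 */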
625 
626 /*
627  * We calculate the wall-time slice from the period by taking a part
628  * proportional to the weight.
629  *
630  * s = p*P[w/rw]
631  */
632 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 {
634 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
635 
636 	for_each_sched_entity(se) {
637 		struct load_weight *load;
638 		struct load_weight lw;
639 
640 		cfs_rq = cfs_rq_of(se);
641 		load = &cfs_rq->load;
642 
643 		if (unlikely(!se->on_rq)) {
644 			lw = cfs_rq->load;
645 
646 			update_load_add(&lw, se->load.weight);
647 			load = &lw;
648 		}
649 		slice = calc_delta_mine(slice, se->load.weight, load);
650 	}
651 	return slice;
652 }
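
/*
 * Worked example (a sketch using the unscaled 6ms default period, no
 * group scheduling, and the prio_to_weight[] values nice 0 = 1024 and
 * nice 5 = 335): with two runnable nice-0 tasks each gets
 * 6ms * 1024/2048 = 3ms; with one nice-0 and one nice-5 task the split
 * is roughly 4.5ms and 1.5ms respectively.
 */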
653 
654 /*
655  * We calculate the vruntime slice of a to be inserted task
656  *
657  * vs = s/w
658  */
659 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
660 {
661 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
662 }
663 
664 /*
665  * Update the current task's runtime statistics. Skip current tasks that
666  * are not in our scheduling class.
667  */
668 static inline void
669 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
670 	      unsigned long delta_exec)
671 {
672 	unsigned long delta_exec_weighted;
673 
674 	schedstat_set(curr->statistics.exec_max,
675 		      max((u64)delta_exec, curr->statistics.exec_max));
676 
677 	curr->sum_exec_runtime += delta_exec;
678 	schedstat_add(cfs_rq, exec_clock, delta_exec);
679 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
680 
681 	curr->vruntime += delta_exec_weighted;
682 	update_min_vruntime(cfs_rq);
683 }
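
/*
 * Example of the weighting (prio_to_weight[] values): a nice-0 task that
 * runs for 3ms advances its vruntime by 3ms; a nice -5 task (weight 3121)
 * advances by only 3ms * 1024/3121 ~= 1ms, while a nice 5 task
 * (weight 335) advances by 3ms * 1024/335 ~= 9.2ms. Heavier entities
 * therefore stay leftmost in the timeline longer and receive
 * proportionally more CPU time.
 */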
684 
685 static void update_curr(struct cfs_rq *cfs_rq)
686 {
687 	struct sched_entity *curr = cfs_rq->curr;
688 	u64 now = rq_of(cfs_rq)->clock_task;
689 	unsigned long delta_exec;
690 
691 	if (unlikely(!curr))
692 		return;
693 
694 	/*
695 	 * Get the amount of time the current task was running
696 	 * since the last time we changed load (this cannot
697 	 * overflow on 32 bits):
698 	 */
699 	delta_exec = (unsigned long)(now - curr->exec_start);
700 	if (!delta_exec)
701 		return;
702 
703 	__update_curr(cfs_rq, curr, delta_exec);
704 	curr->exec_start = now;
705 
706 	if (entity_is_task(curr)) {
707 		struct task_struct *curtask = task_of(curr);
708 
709 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
710 		cpuacct_charge(curtask, delta_exec);
711 		account_group_exec_runtime(curtask, delta_exec);
712 	}
713 
714 	account_cfs_rq_runtime(cfs_rq, delta_exec);
715 }
716 
717 static inline void
718 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
719 {
720 	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
721 }
722 
723 /*
724  * Task is being enqueued - update stats:
725  */
726 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
727 {
728 	/*
729 	 * Are we enqueueing a waiting task? (for current tasks
730 	 * a dequeue/enqueue event is a NOP)
731 	 */
732 	if (se != cfs_rq->curr)
733 		update_stats_wait_start(cfs_rq, se);
734 }
735 
736 static void
737 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
738 {
739 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
740 			rq_of(cfs_rq)->clock - se->statistics.wait_start));
741 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
742 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
743 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
744 #ifdef CONFIG_SCHEDSTATS
745 	if (entity_is_task(se)) {
746 		trace_sched_stat_wait(task_of(se),
747 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
748 	}
749 #endif
750 	schedstat_set(se->statistics.wait_start, 0);
751 }
752 
753 static inline void
754 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
755 {
756 	/*
757 	 * Mark the end of the wait period if dequeueing a
758 	 * waiting task:
759 	 */
760 	if (se != cfs_rq->curr)
761 		update_stats_wait_end(cfs_rq, se);
762 }
763 
764 /*
765  * We are picking a new current task - update its stats:
766  */
767 static inline void
768 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
769 {
770 	/*
771 	 * We are starting a new run period:
772 	 */
773 	se->exec_start = rq_of(cfs_rq)->clock_task;
774 }
775 
776 /**************************************************
777  * Scheduling class queueing methods:
778  */
779 
780 #ifdef CONFIG_NUMA_BALANCING
781 /*
782  * numa task sample period in ms
783  */
784 unsigned int sysctl_numa_balancing_scan_period_min = 100;
785 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787 
788 /* Portion of address space to scan in MB */
789 unsigned int sysctl_numa_balancing_scan_size = 256;
790 
791 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792 unsigned int sysctl_numa_balancing_scan_delay = 1000;
793 
794 static void task_numa_placement(struct task_struct *p)
795 {
796 	int seq;
797 
798 	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
799 		return;
800 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 	if (p->numa_scan_seq == seq)
802 		return;
803 	p->numa_scan_seq = seq;
804 
805 	/* FIXME: Scheduling placement policy hints go here */
806 }
807 
808 /*
809  * Got a PROT_NONE fault for a page on @node.
810  */
811 void task_numa_fault(int node, int pages, bool migrated)
812 {
813 	struct task_struct *p = current;
814 
815 	if (!sched_feat_numa(NUMA))
816 		return;
817 
818 	/* FIXME: Allocate task-specific structure for placement policy here */
819 
820 	/*
821 	 * If pages are properly placed (did not migrate) then scan slower.
822 	 * This is reset periodically in case of phase changes.
823 	 */
824 	if (!migrated)
825 		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 			p->numa_scan_period + jiffies_to_msecs(10));
827 
828 	task_numa_placement(p);
829 }
830 
831 static void reset_ptenuma_scan(struct task_struct *p)
832 {
833 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 	p->mm->numa_scan_offset = 0;
835 }
836 
837 /*
838  * The expensive part of numa migration is done from task_work context.
839  * Triggered from task_tick_numa().
840  */
841 void task_numa_work(struct callback_head *work)
842 {
843 	unsigned long migrate, next_scan, now = jiffies;
844 	struct task_struct *p = current;
845 	struct mm_struct *mm = p->mm;
846 	struct vm_area_struct *vma;
847 	unsigned long start, end;
848 	long pages;
849 
850 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851 
852 	work->next = work; /* protect against double add */
853 	/*
854 	 * Who cares about NUMA placement when they're dying.
855 	 *
856 	 * NOTE: make sure not to dereference p->mm before this check,
857 	 * exit_task_work() happens _after_ exit_mm() so we could be called
858 	 * without p->mm even though we still had it when we enqueued this
859 	 * work.
860 	 */
861 	if (p->flags & PF_EXITING)
862 		return;
863 
864 	/*
865 	 * We do not care about task placement until a task runs on a node
866 	 * other than the first one used by the address space. This is
867 	 * largely because migrations are driven by what CPU the task
868 	 * is running on. If it's never scheduled on another node, it'll
869 	 * not migrate so why bother trapping the fault.
870 	 */
871 	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 		mm->first_nid = numa_node_id();
873 	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 		/* Are we running on a new node yet? */
875 		if (numa_node_id() == mm->first_nid &&
876 		    !sched_feat_numa(NUMA_FORCE))
877 			return;
878 
879 		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 	}
881 
882 	/*
883 	 * Reset the scan period if enough time has gone by. Objective is that
884 	 * scanning will be reduced if pages are properly placed. As tasks
885 	 * can enter different phases this needs to be re-examined. Lacking
886 	 * proper tracking of reference behaviour, this blunt hammer is used.
887 	 */
888 	migrate = mm->numa_next_reset;
889 	if (time_after(now, migrate)) {
890 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 		xchg(&mm->numa_next_reset, next_scan);
893 	}
894 
895 	/*
896 	 * Enforce maximal scan/migration frequency..
897 	 */
898 	migrate = mm->numa_next_scan;
899 	if (time_before(now, migrate))
900 		return;
901 
902 	if (p->numa_scan_period == 0)
903 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904 
905 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 		return;
908 
909 	/*
910 	 * Do not set pte_numa if the current running node is rate-limited.
911 	 * This loses statistics on the fault but if we are unwilling to
912 	 * migrate to this node, it is less likely we can do useful work
913 	 */
914 	if (migrate_ratelimited(numa_node_id()))
915 		return;
916 
917 	start = mm->numa_scan_offset;
918 	pages = sysctl_numa_balancing_scan_size;
919 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 	if (!pages)
921 		return;
922 
923 	down_read(&mm->mmap_sem);
924 	vma = find_vma(mm, start);
925 	if (!vma) {
926 		reset_ptenuma_scan(p);
927 		start = 0;
928 		vma = mm->mmap;
929 	}
930 	for (; vma; vma = vma->vm_next) {
931 		if (!vma_migratable(vma))
932 			continue;
933 
934 		/* Skip small VMAs. They are not likely to be of relevance */
935 		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 			continue;
937 
938 		do {
939 			start = max(start, vma->vm_start);
940 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 			end = min(end, vma->vm_end);
942 			pages -= change_prot_numa(vma, start, end);
943 
944 			start = end;
945 			if (pages <= 0)
946 				goto out;
947 		} while (end != vma->vm_end);
948 	}
949 
950 out:
951 	/*
952 	 * It is possible to reach the end of the VMA list but the last few VMAs are
953 	 * not guaranteed to be migratable. If they are not, we would find the
954 	 * !migratable VMA on the next scan but not reset the scanner to the start
955 	 * so check it now.
956 	 */
957 	if (vma)
958 		mm->numa_scan_offset = start;
959 	else
960 		reset_ptenuma_scan(p);
961 	up_read(&mm->mmap_sem);
962 }
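
/*
 * Rough numbers with the defaults above (a sketch assuming 4K pages):
 * each pass marks sysctl_numa_balancing_scan_size = 256MB, i.e.
 * 256 << (20 - 12) = 65536 pages, as pte_numa. Scanning starts at the
 * 100ms minimum period, backs off towards the 5000ms maximum while
 * faults keep hitting properly placed pages, and the period is reset
 * to the minimum every 60s (scan_period_reset) to catch phase changes.
 */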
963 
964 /*
965  * Drive the periodic memory faults.
966  */
967 void task_tick_numa(struct rq *rq, struct task_struct *curr)
968 {
969 	struct callback_head *work = &curr->numa_work;
970 	u64 period, now;
971 
972 	/*
973 	 * We don't care about NUMA placement if we don't have memory.
974 	 */
975 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 		return;
977 
978 	/*
979 	 * Using runtime rather than walltime has the dual advantage that
980 	 * we (mostly) drive the selection from busy threads and that the
981 	 * task needs to have done some actual work before we bother with
982 	 * NUMA placement.
983 	 */
984 	now = curr->se.sum_exec_runtime;
985 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986 
987 	if (now - curr->node_stamp > period) {
988 		if (!curr->node_stamp)
989 			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 		curr->node_stamp = now;
991 
992 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 			task_work_add(curr, work, true);
995 		}
996 	}
997 }
998 #else
999 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000 {
1001 }
1002 #endif /* CONFIG_NUMA_BALANCING */
1003 
1004 static void
1005 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1006 {
1007 	update_load_add(&cfs_rq->load, se->load.weight);
1008 	if (!parent_entity(se))
1009 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1010 #ifdef CONFIG_SMP
1011 	if (entity_is_task(se))
1012 		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1013 #endif
1014 	cfs_rq->nr_running++;
1015 }
1016 
1017 static void
1018 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019 {
1020 	update_load_sub(&cfs_rq->load, se->load.weight);
1021 	if (!parent_entity(se))
1022 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1023 	if (entity_is_task(se))
1024 		list_del_init(&se->group_node);
1025 	cfs_rq->nr_running--;
1026 }
1027 
1028 #ifdef CONFIG_FAIR_GROUP_SCHED
1029 # ifdef CONFIG_SMP
1030 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1031 {
1032 	long tg_weight;
1033 
1034 	/*
1035 	 * Use this CPU's actual weight instead of the last load_contribution
1036 	 * to gain a more accurate current total weight. See
1037 	 * update_cfs_rq_load_contribution().
1038 	 */
1039 	tg_weight = atomic64_read(&tg->load_avg);
1040 	tg_weight -= cfs_rq->tg_load_contrib;
1041 	tg_weight += cfs_rq->load.weight;
1042 
1043 	return tg_weight;
1044 }
1045 
1046 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1047 {
1048 	long tg_weight, load, shares;
1049 
1050 	tg_weight = calc_tg_weight(tg, cfs_rq);
1051 	load = cfs_rq->load.weight;
1052 
1053 	shares = (tg->shares * load);
1054 	if (tg_weight)
1055 		shares /= tg_weight;
1056 
1057 	if (shares < MIN_SHARES)
1058 		shares = MIN_SHARES;
1059 	if (shares > tg->shares)
1060 		shares = tg->shares;
1061 
1062 	return shares;
1063 }
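
/*
 * Worked example (approximate, since tg_weight is based on the averaged
 * load contributed by the other cpus): a group with tg->shares = 1024 has
 * one nice-0 task (load 1024) on this cpu's cfs_rq and three nice-0 tasks
 * (load 3072) on another cpu, so tg_weight ~= 4096 and this cpu's group
 * entity gets shares = 1024 * 1024 / 4096 = 256, a quarter of the group's
 * weight, clamped to the [MIN_SHARES, tg->shares] range.
 */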
1064 # else /* CONFIG_SMP */
1065 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1066 {
1067 	return tg->shares;
1068 }
1069 # endif /* CONFIG_SMP */
1070 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1071 			    unsigned long weight)
1072 {
1073 	if (se->on_rq) {
1074 		/* commit outstanding execution time */
1075 		if (cfs_rq->curr == se)
1076 			update_curr(cfs_rq);
1077 		account_entity_dequeue(cfs_rq, se);
1078 	}
1079 
1080 	update_load_set(&se->load, weight);
1081 
1082 	if (se->on_rq)
1083 		account_entity_enqueue(cfs_rq, se);
1084 }
1085 
1086 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087 
1088 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1089 {
1090 	struct task_group *tg;
1091 	struct sched_entity *se;
1092 	long shares;
1093 
1094 	tg = cfs_rq->tg;
1095 	se = tg->se[cpu_of(rq_of(cfs_rq))];
1096 	if (!se || throttled_hierarchy(cfs_rq))
1097 		return;
1098 #ifndef CONFIG_SMP
1099 	if (likely(se->load.weight == tg->shares))
1100 		return;
1101 #endif
1102 	shares = calc_cfs_shares(cfs_rq, tg);
1103 
1104 	reweight_entity(cfs_rq_of(se), se, shares);
1105 }
1106 #else /* CONFIG_FAIR_GROUP_SCHED */
1107 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1108 {
1109 }
1110 #endif /* CONFIG_FAIR_GROUP_SCHED */
1111 
1112 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114 /*
1115  * We choose a half-life close to 1 scheduling period.
1116  * Note: The tables below are dependent on this value.
1117  */
1118 #define LOAD_AVG_PERIOD 32
1119 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
1121 
1122 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1123 static const u32 runnable_avg_yN_inv[] = {
1124 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 	0x85aac367, 0x82cd8698,
1130 };
1131 
1132 /*
1133  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1134  * over-estimates when re-combining.
1135  */
1136 static const u32 runnable_avg_yN_sum[] = {
1137 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140 };
1141 
1142 /*
1143  * Approximate:
1144  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1145  */
1146 static __always_inline u64 decay_load(u64 val, u64 n)
1147 {
1148 	unsigned int local_n;
1149 
1150 	if (!n)
1151 		return val;
1152 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 		return 0;
1154 
1155 	/* after bounds checking we can collapse to 32-bit */
1156 	local_n = n;
1157 
1158 	/*
1159 	 * As y^PERIOD = 1/2, we can combine
1160 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
1161 	 * With a look-up table which covers y^n (n<PERIOD)
1162 	 *
1163 	 * To achieve constant time decay_load.
1164 	 */
1165 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 		val >>= local_n / LOAD_AVG_PERIOD;
1167 		local_n %= LOAD_AVG_PERIOD;
1168 	}
1169 
1170 	val *= runnable_avg_yN_inv[local_n];
1171 	/* We don't use SRR here since we always want to round down. */
1172 	return val >> 32;
1173 }
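
/*
 * Example: runnable_avg_yN_inv[n] is y^n in 32.32 fixed point, with y
 * chosen so that y^32 = 0.5. decay_load(1024, 16) = 1024 * 0xb504f333 >> 32
 * ~= 724 (i.e. 1024/sqrt(2)), while decay_load(1024, 32) first shifts by
 * one full period to 512 and then multiplies by runnable_avg_yN_inv[0],
 * giving 511 after fixed-point truncation.
 */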
1174 
1175 /*
1176  * For updates fully spanning n periods, the contribution to runnable
1177  * average will be: \Sum 1024*y^n
1178  *
1179  * We can compute this reasonably efficiently by combining:
1180  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1181  */
1182 static u32 __compute_runnable_contrib(u64 n)
1183 {
1184 	u32 contrib = 0;
1185 
1186 	if (likely(n <= LOAD_AVG_PERIOD))
1187 		return runnable_avg_yN_sum[n];
1188 	else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 		return LOAD_AVG_MAX;
1190 
1191 	/* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
1192 	do {
1193 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195 
1196 		n -= LOAD_AVG_PERIOD;
1197 	} while (n > LOAD_AVG_PERIOD);
1198 
1199 	contrib = decay_load(contrib, n);
1200 	return contrib + runnable_avg_yN_sum[n];
1201 }
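
/*
 * Example: for n = 32 this is a direct table lookup,
 * runnable_avg_yN_sum[32] = 23371 ~= 1024 * \Sum 1<=k<=32 y^k. For n = 64
 * the result works out to roughly 23371 + 23371/2 ~= 35056: the older
 * 32-period window contributes half as much as the most recent one. As n
 * grows the value converges on LOAD_AVG_MAX = 47742, reached (to integer
 * precision) after LOAD_AVG_MAX_N = 345 periods.
 */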
1202 
1203 /*
1204  * We can represent the historical contribution to runnable average as the
1205  * coefficients of a geometric series.  To do this we sub-divide our runnable
1206  * history into segments of approximately 1ms (1024us); label the segment that
1207  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208  *
1209  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210  *      p0            p1           p2
1211  *     (now)       (~1ms ago)  (~2ms ago)
1212  *
1213  * Let u_i denote the fraction of p_i that the entity was runnable.
1214  *
1215  * We then designate the fractions u_i as our co-efficients, yielding the
1216  * following representation of historical load:
1217  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218  *
1219  * We choose y based on the width of a reasonable scheduling period, fixing:
1220  *   y^32 = 0.5
1221  *
1222  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223  * approximately half as much as the contribution to load within the last ms
1224  * (u_0).
1225  *
1226  * When a period "rolls over" and we have new u_0`, multiplying the previous
1227  * sum again by y is sufficient to update:
1228  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230  */
1231 static __always_inline int __update_entity_runnable_avg(u64 now,
1232 							struct sched_avg *sa,
1233 							int runnable)
1234 {
1235 	u64 delta, periods;
1236 	u32 runnable_contrib;
1237 	int delta_w, decayed = 0;
1238 
1239 	delta = now - sa->last_runnable_update;
1240 	/*
1241 	 * This should only happen when time goes backwards, which it
1242 	 * unfortunately does during sched clock init when we swap over to TSC.
1243 	 */
1244 	if ((s64)delta < 0) {
1245 		sa->last_runnable_update = now;
1246 		return 0;
1247 	}
1248 
1249 	/*
1250 	 * Use 1024ns as the unit of measurement since it's a reasonable
1251 	 * approximation of 1us and fast to compute.
1252 	 */
1253 	delta >>= 10;
1254 	if (!delta)
1255 		return 0;
1256 	sa->last_runnable_update = now;
1257 
1258 	/* delta_w is the amount already accumulated against our next period */
1259 	delta_w = sa->runnable_avg_period % 1024;
1260 	if (delta + delta_w >= 1024) {
1261 		/* period roll-over */
1262 		decayed = 1;
1263 
1264 		/*
1265 		 * Now that we know we're crossing a period boundary, figure
1266 		 * out how much from delta we need to complete the current
1267 		 * period and accrue it.
1268 		 */
1269 		delta_w = 1024 - delta_w;
1270 		if (runnable)
1271 			sa->runnable_avg_sum += delta_w;
1272 		sa->runnable_avg_period += delta_w;
1273 
1274 		delta -= delta_w;
1275 
1276 		/* Figure out how many additional periods this update spans */
1277 		periods = delta / 1024;
1278 		delta %= 1024;
1279 
1280 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 						  periods + 1);
1282 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 						     periods + 1);
1284 
1285 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 		runnable_contrib = __compute_runnable_contrib(periods);
1287 		if (runnable)
1288 			sa->runnable_avg_sum += runnable_contrib;
1289 		sa->runnable_avg_period += runnable_contrib;
1290 	}
1291 
1292 	/* Remainder of delta accrued against u_0` */
1293 	if (runnable)
1294 		sa->runnable_avg_sum += delta;
1295 	sa->runnable_avg_period += delta;
1296 
1297 	return decayed;
1298 }
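
/*
 * Rollover example (hypothetical numbers): suppose 700us of the current
 * 1024us period has already been accumulated and a runnable entity is
 * updated after delta = 2500 (~2.5ms). delta_w = 324 closes the current
 * period, periods = 2 full periods follow, and 128 remains. Both sums are
 * decayed by y^3 (periods + 1), the two full periods add
 * __compute_runnable_contrib(2) = 1982, and the trailing 128 accrues
 * against the new current period.
 */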
1299 
1300 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1301 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302 {
1303 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305 
1306 	decays -= se->avg.decay_count;
1307 	if (!decays)
1308 		return 0;
1309 
1310 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 	se->avg.decay_count = 0;
1312 
1313 	return decays;
1314 }
1315 
1316 #ifdef CONFIG_FAIR_GROUP_SCHED
1317 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 						 int force_update)
1319 {
1320 	struct task_group *tg = cfs_rq->tg;
1321 	s64 tg_contrib;
1322 
1323 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 	tg_contrib -= cfs_rq->tg_load_contrib;
1325 
1326 	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 		atomic64_add(tg_contrib, &tg->load_avg);
1328 		cfs_rq->tg_load_contrib += tg_contrib;
1329 	}
1330 }
1331 
1332 /*
1333  * Aggregate cfs_rq runnable averages into an equivalent task_group
1334  * representation for computing load contributions.
1335  */
1336 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 						  struct cfs_rq *cfs_rq)
1338 {
1339 	struct task_group *tg = cfs_rq->tg;
1340 	long contrib;
1341 
1342 	/* The fraction of a cpu used by this cfs_rq */
1343 	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 			  sa->runnable_avg_period + 1);
1345 	contrib -= cfs_rq->tg_runnable_contrib;
1346 
1347 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 		atomic_add(contrib, &tg->runnable_avg);
1349 		cfs_rq->tg_runnable_contrib += contrib;
1350 	}
1351 }
1352 
1353 static inline void __update_group_entity_contrib(struct sched_entity *se)
1354 {
1355 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 	struct task_group *tg = cfs_rq->tg;
1357 	int runnable_avg;
1358 
1359 	u64 contrib;
1360 
1361 	contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 	se->avg.load_avg_contrib = div64_u64(contrib,
1363 					     atomic64_read(&tg->load_avg) + 1);
1364 
1365 	/*
1366 	 * For group entities we need to compute a correction term in the case
1367 	 * that they are consuming <1 cpu so that we would contribute the same
1368 	 * load as a task of equal weight.
1369 	 *
1370 	 * Explicitly co-ordinating this measurement would be expensive, but
1371 	 * fortunately the sum of each cpus contribution forms a usable
1372 	 * lower-bound on the true value.
1373 	 *
1374 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
1375 	 * (and the sum represents the true value) or they overlap and we are
1376 	 * understating by the aggregate of their overlap.
1377 	 *
1378 	 * Extending this to N cpus, for a given overlap, the maximum amount we
1379 	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 	 * cpus that overlap for this interval and w_i is the interval width.
1381 	 *
1382 	 * On a small machine the first term is well-bounded, which bounds the
1383 	 * total error since w_i is a subset of the period.  Whereas on a
1384 	 * larger machine, while this first term can be larger, if w_i is of
1385 	 * consequential size it is guaranteed to see n_i*w_i quickly converge
1386 	 * to our upper bound of 1-cpu.
1387 	 */
1388 	runnable_avg = atomic_read(&tg->runnable_avg);
1389 	if (runnable_avg < NICE_0_LOAD) {
1390 		se->avg.load_avg_contrib *= runnable_avg;
1391 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 	}
1393 }
1394 #else
1395 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 						 int force_update) {}
1397 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 						  struct cfs_rq *cfs_rq) {}
1399 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400 #endif
1401 
1402 static inline void __update_task_entity_contrib(struct sched_entity *se)
1403 {
1404 	u32 contrib;
1405 
1406 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 	contrib /= (se->avg.runnable_avg_period + 1);
1409 	se->avg.load_avg_contrib = scale_load(contrib);
1410 }
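
/*
 * Example (approximate): a nice-0 task (weight 1024) that has been
 * runnable for about half of its recent history - say
 * runnable_avg_sum ~= 23871 against a fully built-up
 * runnable_avg_period ~= 47742 - contributes
 * 23871 * 1024 / 47743 ~= 512, i.e. roughly half its weight, to the
 * cfs_rq load average.
 */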
1411 
1412 /* Compute the current contribution to load_avg by se, return any delta */
1413 static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414 {
1415 	long old_contrib = se->avg.load_avg_contrib;
1416 
1417 	if (entity_is_task(se)) {
1418 		__update_task_entity_contrib(se);
1419 	} else {
1420 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 		__update_group_entity_contrib(se);
1422 	}
1423 
1424 	return se->avg.load_avg_contrib - old_contrib;
1425 }
1426 
1427 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 						 long load_contrib)
1429 {
1430 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 		cfs_rq->blocked_load_avg -= load_contrib;
1432 	else
1433 		cfs_rq->blocked_load_avg = 0;
1434 }
1435 
1436 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437 
1438 /* Update a sched_entity's runnable average */
1439 static inline void update_entity_load_avg(struct sched_entity *se,
1440 					  int update_cfs_rq)
1441 {
1442 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 	long contrib_delta;
1444 	u64 now;
1445 
1446 	/*
1447 	 * For a group entity we need to use its owned cfs_rq_clock_task() in
1448 	 * case it is the parent of a throttled hierarchy.
1449 	 */
1450 	if (entity_is_task(se))
1451 		now = cfs_rq_clock_task(cfs_rq);
1452 	else
1453 		now = cfs_rq_clock_task(group_cfs_rq(se));
1454 
1455 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 		return;
1457 
1458 	contrib_delta = __update_entity_load_avg_contrib(se);
1459 
1460 	if (!update_cfs_rq)
1461 		return;
1462 
1463 	if (se->on_rq)
1464 		cfs_rq->runnable_load_avg += contrib_delta;
1465 	else
1466 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467 }
1468 
1469 /*
1470  * Decay the load contributed by all blocked children and account this so that
1471  * their contribution may be appropriately discounted when they wake up.
1472  */
1473 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474 {
1475 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 	u64 decays;
1477 
1478 	decays = now - cfs_rq->last_decay;
1479 	if (!decays && !force_update)
1480 		return;
1481 
1482 	if (atomic64_read(&cfs_rq->removed_load)) {
1483 		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 		subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 	}
1486 
1487 	if (decays) {
1488 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 						      decays);
1490 		atomic64_add(decays, &cfs_rq->decay_counter);
1491 		cfs_rq->last_decay = now;
1492 	}
1493 
1494 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495 }
1496 
1497 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498 {
1499 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501 }
1502 
1503 /* Add the load generated by se into cfs_rq's child load-average */
1504 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 						  struct sched_entity *se,
1506 						  int wakeup)
1507 {
1508 	/*
1509 	 * We track migrations using entity decay_count <= 0; on a wake-up
1510 	 * migration we use a negative decay count to track the remote decays
1511 	 * accumulated while sleeping.
1512 	 */
1513 	if (unlikely(se->avg.decay_count <= 0)) {
1514 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 		if (se->avg.decay_count) {
1516 			/*
1517 			 * In a wake-up migration we have to approximate the
1518 			 * time sleeping.  This is because we can't synchronize
1519 			 * clock_task between the two cpus, and it is not
1520 			 * guaranteed to be read-safe.  Instead, we can
1521 			 * approximate this using our carried decays, which are
1522 			 * explicitly atomically readable.
1523 			 */
1524 			se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 							<< 20;
1526 			update_entity_load_avg(se, 0);
1527 			/* Indicate that we're now synchronized and on-rq */
1528 			se->avg.decay_count = 0;
1529 		}
1530 		wakeup = 0;
1531 	} else {
1532 		__synchronize_entity_decay(se);
1533 	}
1534 
1535 	/* migrated tasks did not contribute to our blocked load */
1536 	if (wakeup) {
1537 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 		update_entity_load_avg(se, 0);
1539 	}
1540 
1541 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 	/* we force update consideration on load-balancer moves */
1543 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544 }
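
/*
 * Example of the sleep-time approximation above: decay_count is kept in
 * ~1ms units (the cfs_rq decay counter advances once per 2^20 ns), so an
 * entity arriving with decay_count == -5 had its load contribution decayed
 * 5 periods on the remote cpu while it slept; rolling last_runnable_update
 * back by 5 << 20 ns lets __update_entity_runnable_avg() account those
 * ~5ms as non-runnable time against this cpu's clock_task.
 */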
1545 
1546 /*
1547  * Remove se's load from this cfs_rq child load-average; if the entity is
1548  * transitioning to a blocked state we track its projected decay using
1549  * blocked_load_avg.
1550  */
1551 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 						  struct sched_entity *se,
1553 						  int sleep)
1554 {
1555 	update_entity_load_avg(se, 1);
1556 	/* we force update consideration on load-balancer moves */
1557 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558 
1559 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 	if (sleep) {
1561 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564 }
1565 #else
1566 static inline void update_entity_load_avg(struct sched_entity *se,
1567 					  int update_cfs_rq) {}
1568 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 					   struct sched_entity *se,
1571 					   int wakeup) {}
1572 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 					   struct sched_entity *se,
1574 					   int sleep) {}
1575 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 					      int force_update) {}
1577 #endif
1578 
1579 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1580 {
1581 #ifdef CONFIG_SCHEDSTATS
1582 	struct task_struct *tsk = NULL;
1583 
1584 	if (entity_is_task(se))
1585 		tsk = task_of(se);
1586 
1587 	if (se->statistics.sleep_start) {
1588 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1589 
1590 		if ((s64)delta < 0)
1591 			delta = 0;
1592 
1593 		if (unlikely(delta > se->statistics.sleep_max))
1594 			se->statistics.sleep_max = delta;
1595 
1596 		se->statistics.sleep_start = 0;
1597 		se->statistics.sum_sleep_runtime += delta;
1598 
1599 		if (tsk) {
1600 			account_scheduler_latency(tsk, delta >> 10, 1);
1601 			trace_sched_stat_sleep(tsk, delta);
1602 		}
1603 	}
1604 	if (se->statistics.block_start) {
1605 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1606 
1607 		if ((s64)delta < 0)
1608 			delta = 0;
1609 
1610 		if (unlikely(delta > se->statistics.block_max))
1611 			se->statistics.block_max = delta;
1612 
1613 		se->statistics.block_start = 0;
1614 		se->statistics.sum_sleep_runtime += delta;
1615 
1616 		if (tsk) {
1617 			if (tsk->in_iowait) {
1618 				se->statistics.iowait_sum += delta;
1619 				se->statistics.iowait_count++;
1620 				trace_sched_stat_iowait(tsk, delta);
1621 			}
1622 
1623 			trace_sched_stat_blocked(tsk, delta);
1624 
1625 			/*
1626 			 * Blocking time is in units of nanosecs, so shift by
1627 			 * 20 to get a milliseconds-range estimation of the
1628 			 * amount of time that the task spent sleeping:
1629 			 */
1630 			if (unlikely(prof_on == SLEEP_PROFILING)) {
1631 				profile_hits(SLEEP_PROFILING,
1632 						(void *)get_wchan(tsk),
1633 						delta >> 20);
1634 			}
1635 			account_scheduler_latency(tsk, delta >> 10, 0);
1636 		}
1637 	}
1638 #endif
1639 }
1640 
1641 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1642 {
1643 #ifdef CONFIG_SCHED_DEBUG
1644 	s64 d = se->vruntime - cfs_rq->min_vruntime;
1645 
1646 	if (d < 0)
1647 		d = -d;
1648 
1649 	if (d > 3*sysctl_sched_latency)
1650 		schedstat_inc(cfs_rq, nr_spread_over);
1651 #endif
1652 }
1653 
1654 static void
1655 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1656 {
1657 	u64 vruntime = cfs_rq->min_vruntime;
1658 
1659 	/*
1660 	 * The 'current' period is already promised to the current tasks;
1661 	 * however, the extra weight of the new task will slow them down a
1662 	 * little. Place the new task so that it fits in the slot that
1663 	 * stays open at the end.
1664 	 */
1665 	if (initial && sched_feat(START_DEBIT))
1666 		vruntime += sched_vslice(cfs_rq, se);
1667 
1668 	/* sleeps up to a single latency don't count. */
1669 	if (!initial) {
1670 		unsigned long thresh = sysctl_sched_latency;
1671 
1672 		/*
1673 		 * Halve their sleep time's effect, to allow
1674 		 * for a gentler effect of sleepers:
1675 		 */
1676 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1677 			thresh >>= 1;
1678 
1679 		vruntime -= thresh;
1680 	}
1681 
1682 	/* ensure we never gain time by being placed backwards. */
1683 	se->vruntime = max_vruntime(se->vruntime, vruntime);
1684 }
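
/*
 * Placement example (unscaled defaults, START_DEBIT and
 * GENTLE_FAIR_SLEEPERS enabled): with two nice-0 tasks already queued, a
 * newly forked nice-0 task is placed at min_vruntime + sched_vslice() =
 * min_vruntime + 6ms * 1024/3072 = min_vruntime + 2ms, so it cannot
 * immediately preempt the existing tasks. A task waking from sleep is
 * instead placed at min_vruntime - 3ms (half the 6ms latency), but the
 * final max_vruntime() ensures it never ends up behind its own pre-sleep
 * vruntime, so it cannot gain time by sleeping.
 */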
1685 
1686 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1687 
1688 static void
1689 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1690 {
1691 	/*
1692 	 * Update the normalized vruntime before updating min_vruntime
1693 	 * through calling update_curr().
1694 	 */
1695 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1696 		se->vruntime += cfs_rq->min_vruntime;
1697 
1698 	/*
1699 	 * Update run-time statistics of the 'current'.
1700 	 */
1701 	update_curr(cfs_rq);
1702 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1703 	account_entity_enqueue(cfs_rq, se);
1704 	update_cfs_shares(cfs_rq);
1705 
1706 	if (flags & ENQUEUE_WAKEUP) {
1707 		place_entity(cfs_rq, se, 0);
1708 		enqueue_sleeper(cfs_rq, se);
1709 	}
1710 
1711 	update_stats_enqueue(cfs_rq, se);
1712 	check_spread(cfs_rq, se);
1713 	if (se != cfs_rq->curr)
1714 		__enqueue_entity(cfs_rq, se);
1715 	se->on_rq = 1;
1716 
1717 	if (cfs_rq->nr_running == 1) {
1718 		list_add_leaf_cfs_rq(cfs_rq);
1719 		check_enqueue_throttle(cfs_rq);
1720 	}
1721 }
1722 
1723 static void __clear_buddies_last(struct sched_entity *se)
1724 {
1725 	for_each_sched_entity(se) {
1726 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1727 		if (cfs_rq->last == se)
1728 			cfs_rq->last = NULL;
1729 		else
1730 			break;
1731 	}
1732 }
1733 
1734 static void __clear_buddies_next(struct sched_entity *se)
1735 {
1736 	for_each_sched_entity(se) {
1737 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1738 		if (cfs_rq->next == se)
1739 			cfs_rq->next = NULL;
1740 		else
1741 			break;
1742 	}
1743 }
1744 
1745 static void __clear_buddies_skip(struct sched_entity *se)
1746 {
1747 	for_each_sched_entity(se) {
1748 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1749 		if (cfs_rq->skip == se)
1750 			cfs_rq->skip = NULL;
1751 		else
1752 			break;
1753 	}
1754 }
1755 
1756 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1757 {
1758 	if (cfs_rq->last == se)
1759 		__clear_buddies_last(se);
1760 
1761 	if (cfs_rq->next == se)
1762 		__clear_buddies_next(se);
1763 
1764 	if (cfs_rq->skip == se)
1765 		__clear_buddies_skip(se);
1766 }
1767 
1768 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1769 
1770 static void
1771 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1772 {
1773 	/*
1774 	 * Update run-time statistics of the 'current'.
1775 	 */
1776 	update_curr(cfs_rq);
1777 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1778 
1779 	update_stats_dequeue(cfs_rq, se);
1780 	if (flags & DEQUEUE_SLEEP) {
1781 #ifdef CONFIG_SCHEDSTATS
1782 		if (entity_is_task(se)) {
1783 			struct task_struct *tsk = task_of(se);
1784 
1785 			if (tsk->state & TASK_INTERRUPTIBLE)
1786 				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1787 			if (tsk->state & TASK_UNINTERRUPTIBLE)
1788 				se->statistics.block_start = rq_of(cfs_rq)->clock;
1789 		}
1790 #endif
1791 	}
1792 
1793 	clear_buddies(cfs_rq, se);
1794 
1795 	if (se != cfs_rq->curr)
1796 		__dequeue_entity(cfs_rq, se);
1797 	se->on_rq = 0;
1798 	account_entity_dequeue(cfs_rq, se);
1799 
1800 	/*
1801 	 * Normalize the entity after updating the min_vruntime because the
1802 	 * update can refer to the ->curr item and we need to reflect this
1803 	 * movement in our normalized position.
1804 	 */
1805 	if (!(flags & DEQUEUE_SLEEP))
1806 		se->vruntime -= cfs_rq->min_vruntime;
1807 
1808 	/* return excess runtime on last dequeue */
1809 	return_cfs_rq_runtime(cfs_rq);
1810 
1811 	update_min_vruntime(cfs_rq);
1812 	update_cfs_shares(cfs_rq);
1813 }
1814 
1815 /*
1816  * Preempt the current task with a newly woken task if needed:
1817  */
1818 static void
1819 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1820 {
1821 	unsigned long ideal_runtime, delta_exec;
1822 	struct sched_entity *se;
1823 	s64 delta;
1824 
1825 	ideal_runtime = sched_slice(cfs_rq, curr);
1826 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1827 	if (delta_exec > ideal_runtime) {
1828 		resched_task(rq_of(cfs_rq)->curr);
1829 		/*
1830 		 * The current task ran long enough, ensure it doesn't get
1831 		 * re-elected due to buddy favours.
1832 		 */
1833 		clear_buddies(cfs_rq, curr);
1834 		return;
1835 	}
1836 
1837 	/*
1838 	 * Ensure that a task that missed wakeup preemption by a
1839 	 * narrow margin doesn't have to wait for a full slice.
1840 	 * This also mitigates buddy induced latencies under load.
1841 	 */
1842 	if (delta_exec < sysctl_sched_min_granularity)
1843 		return;
1844 
1845 	se = __pick_first_entity(cfs_rq);
1846 	delta = curr->vruntime - se->vruntime;
1847 
1848 	if (delta < 0)
1849 		return;
1850 
1851 	if (delta > ideal_runtime)
1852 		resched_task(rq_of(cfs_rq)->curr);
1853 }
1854 
1855 static void
1856 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1857 {
1858 	/* 'current' is not kept within the tree. */
1859 	if (se->on_rq) {
1860 		/*
1861 		 * Any task has to be enqueued before it gets to execute on
1862 		 * a CPU. So account for the time it spent waiting on the
1863 		 * runqueue.
1864 		 */
1865 		update_stats_wait_end(cfs_rq, se);
1866 		__dequeue_entity(cfs_rq, se);
1867 	}
1868 
1869 	update_stats_curr_start(cfs_rq, se);
1870 	cfs_rq->curr = se;
1871 #ifdef CONFIG_SCHEDSTATS
1872 	/*
1873 	 * Track our maximum slice length, if the CPU's load is at
1874 	 * least twice that of our own weight (i.e. don't track it
1875 	 * when there are only lesser-weight tasks around):
1876 	 */
1877 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1878 		se->statistics.slice_max = max(se->statistics.slice_max,
1879 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
1880 	}
1881 #endif
1882 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1883 }
1884 
1885 static int
1886 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1887 
1888 /*
1889  * Pick the next process, keeping these things in mind, in this order:
1890  * 1) keep things fair between processes/task groups
1891  * 2) pick the "next" process, since someone really wants that to run
1892  * 3) pick the "last" process, for cache locality
1893  * 4) do not run the "skip" process, if something else is available
1894  */
1895 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1896 {
1897 	struct sched_entity *se = __pick_first_entity(cfs_rq);
1898 	struct sched_entity *left = se;
1899 
1900 	/*
1901 	 * Avoid running the skip buddy, if running something else can
1902 	 * be done without getting too unfair.
1903 	 */
1904 	if (cfs_rq->skip == se) {
1905 		struct sched_entity *second = __pick_next_entity(se);
1906 		if (second && wakeup_preempt_entity(second, left) < 1)
1907 			se = second;
1908 	}
1909 
1910 	/*
1911 	 * Prefer last buddy, try to return the CPU to a preempted task.
1912 	 */
1913 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1914 		se = cfs_rq->last;
1915 
1916 	/*
1917 	 * Someone really wants this to run. If it's not unfair, run it.
1918 	 */
1919 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1920 		se = cfs_rq->next;
1921 
1922 	clear_buddies(cfs_rq, se);
1923 
1924 	return se;
1925 }
1926 
1927 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1928 
1929 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1930 {
1931 	/*
1932 	 * If still on the runqueue then deactivate_task()
1933 	 * was not called and update_curr() has to be done:
1934 	 */
1935 	if (prev->on_rq)
1936 		update_curr(cfs_rq);
1937 
1938 	/* throttle cfs_rqs exceeding runtime */
1939 	check_cfs_rq_runtime(cfs_rq);
1940 
1941 	check_spread(cfs_rq, prev);
1942 	if (prev->on_rq) {
1943 		update_stats_wait_start(cfs_rq, prev);
1944 		/* Put 'current' back into the tree. */
1945 		__enqueue_entity(cfs_rq, prev);
1946 		/* in !on_rq case, update occurred at dequeue */
1947 		update_entity_load_avg(prev, 1);
1948 	}
1949 	cfs_rq->curr = NULL;
1950 }
1951 
1952 static void
1953 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1954 {
1955 	/*
1956 	 * Update run-time statistics of the 'current'.
1957 	 */
1958 	update_curr(cfs_rq);
1959 
1960 	/*
1961 	 * Ensure that runnable average is periodically updated.
1962 	 */
1963 	update_entity_load_avg(curr, 1);
1964 	update_cfs_rq_blocked_load(cfs_rq, 1);
1965 
1966 #ifdef CONFIG_SCHED_HRTICK
1967 	/*
1968 	 * queued ticks are scheduled to match the slice, so don't bother
1969 	 * validating it and just reschedule.
1970 	 */
1971 	if (queued) {
1972 		resched_task(rq_of(cfs_rq)->curr);
1973 		return;
1974 	}
1975 	/*
1976 	 * don't let the period tick interfere with the hrtick preemption
1977 	 */
1978 	if (!sched_feat(DOUBLE_TICK) &&
1979 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1980 		return;
1981 #endif
1982 
1983 	if (cfs_rq->nr_running > 1)
1984 		check_preempt_tick(cfs_rq, curr);
1985 }
1986 
1987 
1988 /**************************************************
1989  * CFS bandwidth control machinery
1990  */
1991 
1992 #ifdef CONFIG_CFS_BANDWIDTH
1993 
1994 #ifdef HAVE_JUMP_LABEL
1995 static struct static_key __cfs_bandwidth_used;
1996 
1997 static inline bool cfs_bandwidth_used(void)
1998 {
1999 	return static_key_false(&__cfs_bandwidth_used);
2000 }
2001 
2002 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2003 {
2004 	/* only need to count groups transitioning between enabled/!enabled */
2005 	if (enabled && !was_enabled)
2006 		static_key_slow_inc(&__cfs_bandwidth_used);
2007 	else if (!enabled && was_enabled)
2008 		static_key_slow_dec(&__cfs_bandwidth_used);
2009 }
2010 #else /* HAVE_JUMP_LABEL */
2011 static bool cfs_bandwidth_used(void)
2012 {
2013 	return true;
2014 }
2015 
2016 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2017 #endif /* HAVE_JUMP_LABEL */
2018 
2019 /*
2020  * default period for cfs group bandwidth.
2021  * default: 0.1s, units: nanoseconds
2022  */
2023 static inline u64 default_cfs_period(void)
2024 {
2025 	return 100000000ULL;
2026 }
2027 
2028 static inline u64 sched_cfs_bandwidth_slice(void)
2029 {
2030 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2031 }
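
/*
 * Illustrative numbers (assuming the 100ms default period above and the
 * usual 5ms sysctl_sched_cfs_bandwidth_slice): a group with a 50ms quota
 * has cfs_b->runtime refilled to 50ms each period, and every per-cpu
 * cfs_rq pulls runtime from that pool roughly one slice at a time via
 * assign_cfs_rq_runtime().  A cfs_rq whose runtime_remaining has dropped
 * to -2ms will thus request 7ms (one slice plus the 2ms deficit) on its
 * next refresh.
 */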
2032 
2033 /*
2034  * Replenish runtime according to assigned quota and update expiration time.
2035  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2036  * additional synchronization around rq->lock.
2037  *
2038  * requires cfs_b->lock
2039  */
2040 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2041 {
2042 	u64 now;
2043 
2044 	if (cfs_b->quota == RUNTIME_INF)
2045 		return;
2046 
2047 	now = sched_clock_cpu(smp_processor_id());
2048 	cfs_b->runtime = cfs_b->quota;
2049 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2050 }
2051 
2052 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2053 {
2054 	return &tg->cfs_bandwidth;
2055 }
2056 
2057 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2058 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2059 {
2060 	if (unlikely(cfs_rq->throttle_count))
2061 		return cfs_rq->throttled_clock_task;
2062 
2063 	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2064 }
2065 
2066 /* returns 0 on failure to allocate runtime */
2067 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2068 {
2069 	struct task_group *tg = cfs_rq->tg;
2070 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2071 	u64 amount = 0, min_amount, expires;
2072 
2073 	/* note: this is a positive sum as runtime_remaining <= 0 */
2074 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2075 
2076 	raw_spin_lock(&cfs_b->lock);
2077 	if (cfs_b->quota == RUNTIME_INF)
2078 		amount = min_amount;
2079 	else {
2080 		/*
2081 		 * If the bandwidth pool has become inactive, then at least one
2082 		 * period must have elapsed since the last consumption.
2083 		 * Refresh the global state and ensure the bandwidth timer becomes
2084 		 * active.
2085 		 */
2086 		if (!cfs_b->timer_active) {
2087 			__refill_cfs_bandwidth_runtime(cfs_b);
2088 			__start_cfs_bandwidth(cfs_b);
2089 		}
2090 
2091 		if (cfs_b->runtime > 0) {
2092 			amount = min(cfs_b->runtime, min_amount);
2093 			cfs_b->runtime -= amount;
2094 			cfs_b->idle = 0;
2095 		}
2096 	}
2097 	expires = cfs_b->runtime_expires;
2098 	raw_spin_unlock(&cfs_b->lock);
2099 
2100 	cfs_rq->runtime_remaining += amount;
2101 	/*
2102 	 * we may have advanced our local expiration to account for allowed
2103 	 * spread between our sched_clock and the one on which runtime was
2104 	 * issued.
2105 	 */
2106 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2107 		cfs_rq->runtime_expires = expires;
2108 
2109 	return cfs_rq->runtime_remaining > 0;
2110 }
2111 
2112 /*
2113  * Note: This depends on the synchronization provided by sched_clock and the
2114  * fact that rq->clock snapshots this value.
2115  */
2116 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2117 {
2118 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2119 	struct rq *rq = rq_of(cfs_rq);
2120 
2121 	/* if the deadline is ahead of our clock, nothing to do */
2122 	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2123 		return;
2124 
2125 	if (cfs_rq->runtime_remaining < 0)
2126 		return;
2127 
2128 	/*
2129 	 * If the local deadline has passed we have to consider the
2130 	 * possibility that our sched_clock is 'fast' and the global deadline
2131 	 * has not truly expired.
2132 	 *
2133 	 * Fortunately we can determine whether this is the case by checking
2134 	 * whether the global deadline has advanced.
2135 	 */
2136 
2137 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2138 		/* extend local deadline, drift is bounded above by 2 ticks */
2139 		cfs_rq->runtime_expires += TICK_NSEC;
2140 	} else {
2141 		/* global deadline is ahead, expiration has passed */
2142 		cfs_rq->runtime_remaining = 0;
2143 	}
2144 }
2145 
2146 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2147 				     unsigned long delta_exec)
2148 {
2149 	/* dock delta_exec before expiring quota (as it could span periods) */
2150 	cfs_rq->runtime_remaining -= delta_exec;
2151 	expire_cfs_rq_runtime(cfs_rq);
2152 
2153 	if (likely(cfs_rq->runtime_remaining > 0))
2154 		return;
2155 
2156 	/*
2157 	 * if we're unable to extend our runtime we resched so that the active
2158 	 * hierarchy can be throttled
2159 	 */
2160 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2161 		resched_task(rq_of(cfs_rq)->curr);
2162 }
2163 
2164 static __always_inline
2165 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2166 {
2167 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2168 		return;
2169 
2170 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
2171 }
2172 
2173 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2174 {
2175 	return cfs_bandwidth_used() && cfs_rq->throttled;
2176 }
2177 
2178 /* check whether cfs_rq, or any parent, is throttled */
2179 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2180 {
2181 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
2182 }
2183 
2184 /*
2185  * Ensure that neither of the group entities corresponding to src_cpu or
2186  * dest_cpu are members of a throttled hierarchy when performing group
2187  * load-balance operations.
2188  */
2189 static inline int throttled_lb_pair(struct task_group *tg,
2190 				    int src_cpu, int dest_cpu)
2191 {
2192 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2193 
2194 	src_cfs_rq = tg->cfs_rq[src_cpu];
2195 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
2196 
2197 	return throttled_hierarchy(src_cfs_rq) ||
2198 	       throttled_hierarchy(dest_cfs_rq);
2199 }
2200 
2201 /* updated child weight may affect parent so we have to do this bottom up */
2202 static int tg_unthrottle_up(struct task_group *tg, void *data)
2203 {
2204 	struct rq *rq = data;
2205 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2206 
2207 	cfs_rq->throttle_count--;
2208 #ifdef CONFIG_SMP
2209 	if (!cfs_rq->throttle_count) {
2210 		/* adjust cfs_rq_clock_task() */
2211 		cfs_rq->throttled_clock_task_time += rq->clock_task -
2212 					     cfs_rq->throttled_clock_task;
2213 	}
2214 #endif
2215 
2216 	return 0;
2217 }
2218 
2219 static int tg_throttle_down(struct task_group *tg, void *data)
2220 {
2221 	struct rq *rq = data;
2222 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2223 
2224 	/* group is entering throttled state, stop time */
2225 	if (!cfs_rq->throttle_count)
2226 		cfs_rq->throttled_clock_task = rq->clock_task;
2227 	cfs_rq->throttle_count++;
2228 
2229 	return 0;
2230 }
2231 
2232 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2233 {
2234 	struct rq *rq = rq_of(cfs_rq);
2235 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2236 	struct sched_entity *se;
2237 	long task_delta, dequeue = 1;
2238 
2239 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2240 
2241 	/* freeze hierarchy runnable averages while throttled */
2242 	rcu_read_lock();
2243 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2244 	rcu_read_unlock();
2245 
2246 	task_delta = cfs_rq->h_nr_running;
2247 	for_each_sched_entity(se) {
2248 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2249 		/* throttled entity or throttle-on-deactivate */
2250 		if (!se->on_rq)
2251 			break;
2252 
2253 		if (dequeue)
2254 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2255 		qcfs_rq->h_nr_running -= task_delta;
2256 
2257 		if (qcfs_rq->load.weight)
2258 			dequeue = 0;
2259 	}
2260 
2261 	if (!se)
2262 		rq->nr_running -= task_delta;
2263 
2264 	cfs_rq->throttled = 1;
2265 	cfs_rq->throttled_clock = rq->clock;
2266 	raw_spin_lock(&cfs_b->lock);
2267 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2268 	raw_spin_unlock(&cfs_b->lock);
2269 }
2270 
2271 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2272 {
2273 	struct rq *rq = rq_of(cfs_rq);
2274 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2275 	struct sched_entity *se;
2276 	int enqueue = 1;
2277 	long task_delta;
2278 
2279 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2280 
2281 	cfs_rq->throttled = 0;
2282 	raw_spin_lock(&cfs_b->lock);
2283 	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2284 	list_del_rcu(&cfs_rq->throttled_list);
2285 	raw_spin_unlock(&cfs_b->lock);
2286 
2287 	update_rq_clock(rq);
2288 	/* update hierarchical throttle state */
2289 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2290 
2291 	if (!cfs_rq->load.weight)
2292 		return;
2293 
2294 	task_delta = cfs_rq->h_nr_running;
2295 	for_each_sched_entity(se) {
2296 		if (se->on_rq)
2297 			enqueue = 0;
2298 
2299 		cfs_rq = cfs_rq_of(se);
2300 		if (enqueue)
2301 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2302 		cfs_rq->h_nr_running += task_delta;
2303 
2304 		if (cfs_rq_throttled(cfs_rq))
2305 			break;
2306 	}
2307 
2308 	if (!se)
2309 		rq->nr_running += task_delta;
2310 
2311 	/* determine whether we need to wake up potentially idle cpu */
2312 	if (rq->curr == rq->idle && rq->cfs.nr_running)
2313 		resched_task(rq->curr);
2314 }
2315 
2316 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2317 		u64 remaining, u64 expires)
2318 {
2319 	struct cfs_rq *cfs_rq;
2320 	u64 runtime = remaining;
2321 
2322 	rcu_read_lock();
2323 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2324 				throttled_list) {
2325 		struct rq *rq = rq_of(cfs_rq);
2326 
2327 		raw_spin_lock(&rq->lock);
2328 		if (!cfs_rq_throttled(cfs_rq))
2329 			goto next;
2330 
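		/* hand out just enough to clear the deficit (leaves 1ns) */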
2331 		runtime = -cfs_rq->runtime_remaining + 1;
2332 		if (runtime > remaining)
2333 			runtime = remaining;
2334 		remaining -= runtime;
2335 
2336 		cfs_rq->runtime_remaining += runtime;
2337 		cfs_rq->runtime_expires = expires;
2338 
2339 		/* we check whether we're throttled above */
2340 		if (cfs_rq->runtime_remaining > 0)
2341 			unthrottle_cfs_rq(cfs_rq);
2342 
2343 next:
2344 		raw_spin_unlock(&rq->lock);
2345 
2346 		if (!remaining)
2347 			break;
2348 	}
2349 	rcu_read_unlock();
2350 
2351 	return remaining;
2352 }
2353 
2354 /*
2355  * Responsible for refilling a task_group's bandwidth and unthrottling its
2356  * cfs_rqs as appropriate. If there has been no activity within the last
2357  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2358  * used to track this state.
2359  */
2360 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2361 {
2362 	u64 runtime, runtime_expires;
2363 	int idle = 1, throttled;
2364 
2365 	raw_spin_lock(&cfs_b->lock);
2366 	/* no need to continue the timer with no bandwidth constraint */
2367 	if (cfs_b->quota == RUNTIME_INF)
2368 		goto out_unlock;
2369 
2370 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2371 	/* idle depends on !throttled (for the case of a large deficit) */
2372 	idle = cfs_b->idle && !throttled;
2373 	cfs_b->nr_periods += overrun;
2374 
2375 	/* if we're going inactive then everything else can be deferred */
2376 	if (idle)
2377 		goto out_unlock;
2378 
2379 	__refill_cfs_bandwidth_runtime(cfs_b);
2380 
2381 	if (!throttled) {
2382 		/* mark as potentially idle for the upcoming period */
2383 		cfs_b->idle = 1;
2384 		goto out_unlock;
2385 	}
2386 
2387 	/* account preceding periods in which throttling occurred */
2388 	cfs_b->nr_throttled += overrun;
2389 
2390 	/*
2391 	 * There are throttled entities so we must first use the new bandwidth
2392 	 * to unthrottle them before making it generally available.  This
2393 	 * ensures that all existing debts will be paid before a new cfs_rq is
2394 	 * allowed to run.
2395 	 */
2396 	runtime = cfs_b->runtime;
2397 	runtime_expires = cfs_b->runtime_expires;
2398 	cfs_b->runtime = 0;
2399 
2400 	/*
2401 	 * This check is repeated as we are holding onto the new bandwidth
2402 	 * while we unthrottle.  This can potentially race with an unthrottled
2403 	 * group trying to acquire new bandwidth from the global pool.
2404 	 */
2405 	while (throttled && runtime > 0) {
2406 		raw_spin_unlock(&cfs_b->lock);
2407 		/* we can't nest cfs_b->lock while distributing bandwidth */
2408 		runtime = distribute_cfs_runtime(cfs_b, runtime,
2409 						 runtime_expires);
2410 		raw_spin_lock(&cfs_b->lock);
2411 
2412 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2413 	}
2414 
2415 	/* return (any) remaining runtime */
2416 	cfs_b->runtime = runtime;
2417 	/*
2418 	 * While we are ensured activity in the period following an
2419 	 * unthrottle, this also covers the case in which the new bandwidth is
2420 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
2421 	 * timer to remain active while there are any throttled entities.)
2422 	 */
2423 	cfs_b->idle = 0;
2424 out_unlock:
2425 	if (idle)
2426 		cfs_b->timer_active = 0;
2427 	raw_spin_unlock(&cfs_b->lock);
2428 
2429 	return idle;
2430 }
2431 
2432 /* a cfs_rq won't donate quota below this amount */
2433 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2434 /* minimum remaining period time to redistribute slack quota */
2435 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2436 /* how long we wait to gather additional slack before distributing */
2437 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2438 
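
/*
 * Example of how these interact (values above, plus a 5ms bandwidth slice):
 * a cfs_rq dequeueing its last task while holding 3ms of local runtime keeps
 * min_cfs_rq_runtime (1ms) and returns the remaining 2ms to the global pool.
 * If the pool then holds more than one slice while other cfs_rqs sit
 * throttled, the slack timer is armed for cfs_bandwidth_slack_period (5ms),
 * unless the period timer would refresh quota within 7ms (slack period plus
 * min_bandwidth_expiration) anyway, in which case we just wait for it.
 */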
2439 /* are we near the end of the current quota period? */
2440 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2441 {
2442 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
2443 	u64 remaining;
2444 
2445 	/* if the call-back is running, a quota refresh is already occurring */
2446 	if (hrtimer_callback_running(refresh_timer))
2447 		return 1;
2448 
2449 	/* is a quota refresh about to occur? */
2450 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2451 	if (remaining < min_expire)
2452 		return 1;
2453 
2454 	return 0;
2455 }
2456 
2457 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2458 {
2459 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2460 
2461 	/* if there's a quota refresh soon don't bother with slack */
2462 	if (runtime_refresh_within(cfs_b, min_left))
2463 		return;
2464 
2465 	start_bandwidth_timer(&cfs_b->slack_timer,
2466 				ns_to_ktime(cfs_bandwidth_slack_period));
2467 }
2468 
2469 /* we know any runtime found here is valid as update_curr() precedes return */
2470 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2471 {
2472 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2473 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2474 
2475 	if (slack_runtime <= 0)
2476 		return;
2477 
2478 	raw_spin_lock(&cfs_b->lock);
2479 	if (cfs_b->quota != RUNTIME_INF &&
2480 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2481 		cfs_b->runtime += slack_runtime;
2482 
2483 		/* we are under rq->lock, defer unthrottling using a timer */
2484 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2485 		    !list_empty(&cfs_b->throttled_cfs_rq))
2486 			start_cfs_slack_bandwidth(cfs_b);
2487 	}
2488 	raw_spin_unlock(&cfs_b->lock);
2489 
2490 	/* even if it's not valid for return we don't want to try again */
2491 	cfs_rq->runtime_remaining -= slack_runtime;
2492 }
2493 
2494 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2495 {
2496 	if (!cfs_bandwidth_used())
2497 		return;
2498 
2499 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2500 		return;
2501 
2502 	__return_cfs_rq_runtime(cfs_rq);
2503 }
2504 
2505 /*
2506  * This is done with a timer (instead of inline with bandwidth return) since
2507  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2508  */
2509 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2510 {
2511 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2512 	u64 expires;
2513 
2514 	/* confirm we're still not at a refresh boundary */
2515 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2516 		return;
2517 
2518 	raw_spin_lock(&cfs_b->lock);
2519 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2520 		runtime = cfs_b->runtime;
2521 		cfs_b->runtime = 0;
2522 	}
2523 	expires = cfs_b->runtime_expires;
2524 	raw_spin_unlock(&cfs_b->lock);
2525 
2526 	if (!runtime)
2527 		return;
2528 
2529 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2530 
2531 	raw_spin_lock(&cfs_b->lock);
2532 	if (expires == cfs_b->runtime_expires)
2533 		cfs_b->runtime = runtime;
2534 	raw_spin_unlock(&cfs_b->lock);
2535 }
2536 
2537 /*
2538  * When a group wakes up we want to make sure that its quota is not already
2539  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2540  * runtime, as update_curr() throttling cannot trigger until it's on-rq.
2541  */
2542 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2543 {
2544 	if (!cfs_bandwidth_used())
2545 		return;
2546 
2547 	/* an active group must be handled by the update_curr()->put() path */
2548 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2549 		return;
2550 
2551 	/* ensure the group is not already throttled */
2552 	if (cfs_rq_throttled(cfs_rq))
2553 		return;
2554 
2555 	/* update runtime allocation */
2556 	account_cfs_rq_runtime(cfs_rq, 0);
2557 	if (cfs_rq->runtime_remaining <= 0)
2558 		throttle_cfs_rq(cfs_rq);
2559 }
2560 
2561 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2562 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2563 {
2564 	if (!cfs_bandwidth_used())
2565 		return;
2566 
2567 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2568 		return;
2569 
2570 	/*
2571 	 * it's possible for a throttled entity to be forced into a running
2572 	 * state (e.g. set_curr_task); in this case we're finished.
2573 	 */
2574 	if (cfs_rq_throttled(cfs_rq))
2575 		return;
2576 
2577 	throttle_cfs_rq(cfs_rq);
2578 }
2579 
2580 static inline u64 default_cfs_period(void);
2581 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2582 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2583 
2584 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2585 {
2586 	struct cfs_bandwidth *cfs_b =
2587 		container_of(timer, struct cfs_bandwidth, slack_timer);
2588 	do_sched_cfs_slack_timer(cfs_b);
2589 
2590 	return HRTIMER_NORESTART;
2591 }
2592 
2593 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2594 {
2595 	struct cfs_bandwidth *cfs_b =
2596 		container_of(timer, struct cfs_bandwidth, period_timer);
2597 	ktime_t now;
2598 	int overrun;
2599 	int idle = 0;
2600 
2601 	for (;;) {
2602 		now = hrtimer_cb_get_time(timer);
2603 		overrun = hrtimer_forward(timer, now, cfs_b->period);
2604 
2605 		if (!overrun)
2606 			break;
2607 
2608 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
2609 	}
2610 
2611 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2612 }
2613 
2614 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2615 {
2616 	raw_spin_lock_init(&cfs_b->lock);
2617 	cfs_b->runtime = 0;
2618 	cfs_b->quota = RUNTIME_INF;
2619 	cfs_b->period = ns_to_ktime(default_cfs_period());
2620 
2621 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2622 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2623 	cfs_b->period_timer.function = sched_cfs_period_timer;
2624 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2625 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
2626 }
2627 
2628 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2629 {
2630 	cfs_rq->runtime_enabled = 0;
2631 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2632 }
2633 
2634 /* requires cfs_b->lock, may release to reprogram timer */
2635 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2636 {
2637 	/*
2638 	 * The timer may be active because we're trying to set a new bandwidth
2639 	 * period or because we're racing with the tear-down path
2640 	 * (timer_active==0 becomes visible before the hrtimer call-back
2641 	 * terminates).  In either case we ensure that it's re-programmed.
2642 	 */
2643 	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2644 		raw_spin_unlock(&cfs_b->lock);
2645 		/* ensure cfs_b->lock is available while we wait */
2646 		hrtimer_cancel(&cfs_b->period_timer);
2647 
2648 		raw_spin_lock(&cfs_b->lock);
2649 		/* if someone else restarted the timer then we're done */
2650 		if (cfs_b->timer_active)
2651 			return;
2652 	}
2653 
2654 	cfs_b->timer_active = 1;
2655 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2656 }
2657 
2658 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2659 {
2660 	hrtimer_cancel(&cfs_b->period_timer);
2661 	hrtimer_cancel(&cfs_b->slack_timer);
2662 }
2663 
2664 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2665 {
2666 	struct cfs_rq *cfs_rq;
2667 
2668 	for_each_leaf_cfs_rq(rq, cfs_rq) {
2669 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2670 
2671 		if (!cfs_rq->runtime_enabled)
2672 			continue;
2673 
2674 		/*
2675 		 * clock_task is not advancing so we just need to make sure
2676 		 * there's some valid quota amount
2677 		 */
2678 		cfs_rq->runtime_remaining = cfs_b->quota;
2679 		if (cfs_rq_throttled(cfs_rq))
2680 			unthrottle_cfs_rq(cfs_rq);
2681 	}
2682 }
2683 
2684 #else /* CONFIG_CFS_BANDWIDTH */
2685 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2686 {
2687 	return rq_of(cfs_rq)->clock_task;
2688 }
2689 
2690 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2691 				     unsigned long delta_exec) {}
2692 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2693 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2694 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2695 
2696 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2697 {
2698 	return 0;
2699 }
2700 
2701 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2702 {
2703 	return 0;
2704 }
2705 
2706 static inline int throttled_lb_pair(struct task_group *tg,
2707 				    int src_cpu, int dest_cpu)
2708 {
2709 	return 0;
2710 }
2711 
2712 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2713 
2714 #ifdef CONFIG_FAIR_GROUP_SCHED
2715 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2716 #endif
2717 
2718 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2719 {
2720 	return NULL;
2721 }
2722 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2723 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2724 
2725 #endif /* CONFIG_CFS_BANDWIDTH */
2726 
2727 /**************************************************
2728  * CFS operations on tasks:
2729  */
2730 
2731 #ifdef CONFIG_SCHED_HRTICK
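/*
 * (Re)program the hrtick timer to fire when p has consumed its
 * sched_slice(); if the slice is already used up and p is running,
 * reschedule immediately instead.  Only done when more than one task
 * is queued on the cfs_rq.
 */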
2732 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2733 {
2734 	struct sched_entity *se = &p->se;
2735 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2736 
2737 	WARN_ON(task_rq(p) != rq);
2738 
2739 	if (cfs_rq->nr_running > 1) {
2740 		u64 slice = sched_slice(cfs_rq, se);
2741 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2742 		s64 delta = slice - ran;
2743 
2744 		if (delta < 0) {
2745 			if (rq->curr == p)
2746 				resched_task(p);
2747 			return;
2748 		}
2749 
2750 		/*
2751 		 * Don't schedule slices shorter than 10000ns, that just
2752 		 * doesn't make sense. Rely on vruntime for fairness.
2753 		 */
2754 		if (rq->curr != p)
2755 			delta = max_t(s64, 10000LL, delta);
2756 
2757 		hrtick_start(rq, delta);
2758 	}
2759 }
2760 
2761 /*
2762  * called from enqueue/dequeue and updates the hrtick when the
2763  * current task is from our class and nr_running is low enough
2764  * to matter.
2765  */
2766 static void hrtick_update(struct rq *rq)
2767 {
2768 	struct task_struct *curr = rq->curr;
2769 
2770 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2771 		return;
2772 
2773 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2774 		hrtick_start_fair(rq, curr);
2775 }
2776 #else /* !CONFIG_SCHED_HRTICK */
2777 static inline void
2778 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2779 {
2780 }
2781 
2782 static inline void hrtick_update(struct rq *rq)
2783 {
2784 }
2785 #endif
2786 
2787 /*
2788  * The enqueue_task method is called before nr_running is
2789  * increased. Here we update the fair scheduling stats and
2790  * then put the task into the rbtree:
2791  */
2792 static void
2793 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2794 {
2795 	struct cfs_rq *cfs_rq;
2796 	struct sched_entity *se = &p->se;
2797 
2798 	for_each_sched_entity(se) {
2799 		if (se->on_rq)
2800 			break;
2801 		cfs_rq = cfs_rq_of(se);
2802 		enqueue_entity(cfs_rq, se, flags);
2803 
2804 		/*
2805 		 * end evaluation on encountering a throttled cfs_rq
2806 		 *
2807 		 * note: in the case of encountering a throttled cfs_rq we will
2808 		 * post the final h_nr_running increment below.
2809 		 */
2810 		if (cfs_rq_throttled(cfs_rq))
2811 			break;
2812 		cfs_rq->h_nr_running++;
2813 
2814 		flags = ENQUEUE_WAKEUP;
2815 	}
2816 
2817 	for_each_sched_entity(se) {
2818 		cfs_rq = cfs_rq_of(se);
2819 		cfs_rq->h_nr_running++;
2820 
2821 		if (cfs_rq_throttled(cfs_rq))
2822 			break;
2823 
2824 		update_cfs_shares(cfs_rq);
2825 		update_entity_load_avg(se, 1);
2826 	}
2827 
2828 	if (!se) {
2829 		update_rq_runnable_avg(rq, rq->nr_running);
2830 		inc_nr_running(rq);
2831 	}
2832 	hrtick_update(rq);
2833 }
2834 
2835 static void set_next_buddy(struct sched_entity *se);
2836 
2837 /*
2838  * The dequeue_task method is called before nr_running is
2839  * decreased. We remove the task from the rbtree and
2840  * update the fair scheduling stats:
2841  */
2842 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2843 {
2844 	struct cfs_rq *cfs_rq;
2845 	struct sched_entity *se = &p->se;
2846 	int task_sleep = flags & DEQUEUE_SLEEP;
2847 
2848 	for_each_sched_entity(se) {
2849 		cfs_rq = cfs_rq_of(se);
2850 		dequeue_entity(cfs_rq, se, flags);
2851 
2852 		/*
2853 		 * end evaluation on encountering a throttled cfs_rq
2854 		 *
2855 		 * note: in the case of encountering a throttled cfs_rq we will
2856 		 * post the final h_nr_running decrement below.
2857 		*/
2858 		 */
2859 			break;
2860 		cfs_rq->h_nr_running--;
2861 
2862 		/* Don't dequeue parent if it has other entities besides us */
2863 		if (cfs_rq->load.weight) {
2864 			/*
2865 			 * Bias pick_next to pick a task from this cfs_rq, as
2866 			 * p is sleeping when it is within its sched_slice.
2867 			 */
2868 			if (task_sleep && parent_entity(se))
2869 				set_next_buddy(parent_entity(se));
2870 
2871 			/* avoid re-evaluating load for this entity */
2872 			se = parent_entity(se);
2873 			break;
2874 		}
2875 		flags |= DEQUEUE_SLEEP;
2876 	}
2877 
2878 	for_each_sched_entity(se) {
2879 		cfs_rq = cfs_rq_of(se);
2880 		cfs_rq->h_nr_running--;
2881 
2882 		if (cfs_rq_throttled(cfs_rq))
2883 			break;
2884 
2885 		update_cfs_shares(cfs_rq);
2886 		update_entity_load_avg(se, 1);
2887 	}
2888 
2889 	if (!se) {
2890 		dec_nr_running(rq);
2891 		update_rq_runnable_avg(rq, 1);
2892 	}
2893 	hrtick_update(rq);
2894 }
2895 
2896 #ifdef CONFIG_SMP
2897 /* Used instead of source_load when we know the type == 0 */
2898 static unsigned long weighted_cpuload(const int cpu)
2899 {
2900 	return cpu_rq(cpu)->load.weight;
2901 }
2902 
2903 /*
2904  * Return a low guess at the load of a migration-source cpu weighted
2905  * according to the scheduling class and "nice" value.
2906  *
2907  * We want to under-estimate the load of migration sources, to
2908  * balance conservatively.
2909  */
2910 static unsigned long source_load(int cpu, int type)
2911 {
2912 	struct rq *rq = cpu_rq(cpu);
2913 	unsigned long total = weighted_cpuload(cpu);
2914 
2915 	if (type == 0 || !sched_feat(LB_BIAS))
2916 		return total;
2917 
2918 	return min(rq->cpu_load[type-1], total);
2919 }
2920 
2921 /*
2922  * Return a high guess at the load of a migration-target cpu weighted
2923  * according to the scheduling class and "nice" value.
2924  */
2925 static unsigned long target_load(int cpu, int type)
2926 {
2927 	struct rq *rq = cpu_rq(cpu);
2928 	unsigned long total = weighted_cpuload(cpu);
2929 
2930 	if (type == 0 || !sched_feat(LB_BIAS))
2931 		return total;
2932 
2933 	return max(rq->cpu_load[type-1], total);
2934 }
2935 
2936 static unsigned long power_of(int cpu)
2937 {
2938 	return cpu_rq(cpu)->cpu_power;
2939 }
2940 
2941 static unsigned long cpu_avg_load_per_task(int cpu)
2942 {
2943 	struct rq *rq = cpu_rq(cpu);
2944 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2945 
2946 	if (nr_running)
2947 		return rq->load.weight / nr_running;
2948 
2949 	return 0;
2950 }
2951 
2952 
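/*
 * A waking task may be placed on another CPU without the old rq->lock held,
 * so strip the old cfs_rq's min_vruntime here; enqueue_entity() adds the
 * destination cfs_rq's min_vruntime back.  On 32-bit, min_vruntime cannot
 * be read atomically, hence the copy/smp_rmb() retry loop below.
 */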
2953 static void task_waking_fair(struct task_struct *p)
2954 {
2955 	struct sched_entity *se = &p->se;
2956 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2957 	u64 min_vruntime;
2958 
2959 #ifndef CONFIG_64BIT
2960 	u64 min_vruntime_copy;
2961 
2962 	do {
2963 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
2964 		smp_rmb();
2965 		min_vruntime = cfs_rq->min_vruntime;
2966 	} while (min_vruntime != min_vruntime_copy);
2967 #else
2968 	min_vruntime = cfs_rq->min_vruntime;
2969 #endif
2970 
2971 	se->vruntime -= min_vruntime;
2972 }
2973 
2974 #ifdef CONFIG_FAIR_GROUP_SCHED
2975 /*
2976  * effective_load() calculates the load change as seen from the root_task_group
2977  *
2978  * Adding load to a group doesn't make a group heavier, but can cause movement
2979  * of group shares between cpus. Assuming the shares were perfectly aligned one
2980  * can calculate the shift in shares.
2981  *
2982  * Calculate the effective load difference if @wl is added (subtracted) to @tg
2983  * on this @cpu and results in a total addition (subtraction) of @wg to the
2984  * total group weight.
2985  *
2986  * Given a runqueue weight distribution (rw_i) we can compute a shares
2987  * distribution (s_i) using:
2988  *
2989  *   s_i = rw_i / \Sum rw_j						(1)
2990  *
2991  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2992  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2993  * shares distribution (s_i):
2994  *
2995  *   rw_i = {   2,   4,   1,   0 }
2996  *   s_i  = { 2/7, 4/7, 1/7,   0 }
2997  *
2998  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2999  * task used to run on and the CPU the waker is running on), we need to
3000  * compute the effect of waking a task on either CPU and, in case of a sync
3001  * wakeup, compute the effect of the current task going to sleep.
3002  *
3003  * So for a change of @wl to the local @cpu with an overall group weight change
3004  * of @wl we can compute the new shares distribution (s'_i) using:
3005  *
3006  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
3007  *
3008  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3009  * differences in waking a task to CPU 0. The additional task changes the
3010  * weight and shares distributions like:
3011  *
3012  *   rw'_i = {   3,   4,   1,   0 }
3013  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3014  *
3015  * We can then compute the difference in effective weight by using:
3016  *
3017  *   dw_i = S * (s'_i - s_i)						(3)
3018  *
3019  * Where 'S' is the group weight as seen by its parent.
3020  *
3021  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3022  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3023  * 4/7) times the weight of the group.
3024  */
3025 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3026 {
3027 	struct sched_entity *se = tg->se[cpu];
3028 
3029 	if (!tg->parent)	/* the trivial, non-cgroup case */
3030 		return wl;
3031 
3032 	for_each_sched_entity(se) {
3033 		long w, W;
3034 
3035 		tg = se->my_q->tg;
3036 
3037 		/*
3038 		 * W = @wg + \Sum rw_j
3039 		 */
3040 		W = wg + calc_tg_weight(tg, se->my_q);
3041 
3042 		/*
3043 		 * w = rw_i + @wl
3044 		 */
3045 		w = se->my_q->load.weight + wl;
3046 
3047 		/*
3048 		 * wl = S * s'_i; see (2)
3049 		 */
3050 		if (W > 0 && w < W)
3051 			wl = (w * tg->shares) / W;
3052 		else
3053 			wl = tg->shares;
3054 
3055 		/*
3056 		 * Per the above, wl is the new se->load.weight value; since
3057 		 * those are clipped to [MIN_SHARES, ...) do so now. See
3058 		 * calc_cfs_shares().
3059 		 */
3060 		if (wl < MIN_SHARES)
3061 			wl = MIN_SHARES;
3062 
3063 		/*
3064 		 * wl = dw_i = S * (s'_i - s_i); see (3)
3065 		 */
3066 		wl -= se->load.weight;
3067 
3068 		/*
3069 		 * Recursively apply this logic to all parent groups to compute
3070 		 * the final effective load change on the root group. Since
3071 		 * only the @tg group gets extra weight, all parent groups can
3072 		 * only redistribute existing shares. @wl is the shift in shares
3073 		 * resulting from this level per the above.
3074 		 */
3075 		wg = 0;
3076 	}
3077 
3078 	return wl;
3079 }
3080 #else
3081 
3082 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3083 		unsigned long wl, unsigned long wg)
3084 {
3085 	return wl;
3086 }
3087 
3088 #endif
3089 
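/*
 * Decide whether a waking task should be pulled to the waking CPU (return 1)
 * or stay near its previous CPU.  Loads are cross-multiplied by the other
 * CPU's power so that no division is needed; with a typical imbalance_pct of
 * 125 the prev side gets a 112/100 factor, i.e. the pull is considered
 * balanced as long as this_cpu's projected load does not exceed prev_cpu's
 * by more than roughly 12% (after the effective_load() adjustments for group
 * shares and, for sync wakeups, the waker going to sleep).
 */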
3090 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3091 {
3092 	s64 this_load, load;
3093 	int idx, this_cpu, prev_cpu;
3094 	unsigned long tl_per_task;
3095 	struct task_group *tg;
3096 	unsigned long weight;
3097 	int balanced;
3098 
3099 	idx	  = sd->wake_idx;
3100 	this_cpu  = smp_processor_id();
3101 	prev_cpu  = task_cpu(p);
3102 	load	  = source_load(prev_cpu, idx);
3103 	this_load = target_load(this_cpu, idx);
3104 
3105 	/*
3106 	 * If sync wakeup then subtract the (maximum possible)
3107 	 * effect of the currently running task from the load
3108 	 * of the current CPU:
3109 	 */
3110 	if (sync) {
3111 		tg = task_group(current);
3112 		weight = current->se.load.weight;
3113 
3114 		this_load += effective_load(tg, this_cpu, -weight, -weight);
3115 		load += effective_load(tg, prev_cpu, 0, -weight);
3116 	}
3117 
3118 	tg = task_group(p);
3119 	weight = p->se.load.weight;
3120 
3121 	/*
3122 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3123 	 * due to the sync cause above having dropped this_load to 0, we'll
3124 	 * always have an imbalance, but there's really nothing you can do
3125 	 * about that, so that's good too.
3126 	 *
3127 	 * Otherwise check if either cpus are near enough in load to allow this
3128 	 * task to be woken on this_cpu.
3129 	 */
3130 	if (this_load > 0) {
3131 		s64 this_eff_load, prev_eff_load;
3132 
3133 		this_eff_load = 100;
3134 		this_eff_load *= power_of(prev_cpu);
3135 		this_eff_load *= this_load +
3136 			effective_load(tg, this_cpu, weight, weight);
3137 
3138 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3139 		prev_eff_load *= power_of(this_cpu);
3140 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3141 
3142 		balanced = this_eff_load <= prev_eff_load;
3143 	} else
3144 		balanced = true;
3145 
3146 	/*
3147 	 * If the currently running task will sleep within
3148 	 * a reasonable amount of time then attract this newly
3149 	 * woken task:
3150 	 */
3151 	if (sync && balanced)
3152 		return 1;
3153 
3154 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3155 	tl_per_task = cpu_avg_load_per_task(this_cpu);
3156 
3157 	if (balanced ||
3158 	    (this_load <= load &&
3159 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3160 		/*
3161 		 * This domain has SD_WAKE_AFFINE and
3162 		 * p is cache cold in this domain, and
3163 		 * there is no bad imbalance.
3164 		 */
3165 		schedstat_inc(sd, ttwu_move_affine);
3166 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
3167 
3168 		return 1;
3169 	}
3170 	return 0;
3171 }
3172 
3173 /*
3174  * find_idlest_group finds and returns the least busy CPU group within the
3175  * domain.
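 *
 * The local group is preferred unless a remote group is clearly less loaded:
 * with imbalance_pct == 125 the check below amounts to requiring
 * 100 * this_load >= 112 * min_load, i.e. the local group must carry at
 * least ~12% more (power-scaled) load than the idlest remote group before
 * we move away from it.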
3176  */
3177 static struct sched_group *
3178 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3179 		  int this_cpu, int load_idx)
3180 {
3181 	struct sched_group *idlest = NULL, *group = sd->groups;
3182 	unsigned long min_load = ULONG_MAX, this_load = 0;
3183 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
3184 
3185 	do {
3186 		unsigned long load, avg_load;
3187 		int local_group;
3188 		int i;
3189 
3190 		/* Skip over this group if it has no CPUs allowed */
3191 		if (!cpumask_intersects(sched_group_cpus(group),
3192 					tsk_cpus_allowed(p)))
3193 			continue;
3194 
3195 		local_group = cpumask_test_cpu(this_cpu,
3196 					       sched_group_cpus(group));
3197 
3198 		/* Tally up the load of all CPUs in the group */
3199 		avg_load = 0;
3200 
3201 		for_each_cpu(i, sched_group_cpus(group)) {
3202 			/* Bias balancing toward cpus of our domain */
3203 			if (local_group)
3204 				load = source_load(i, load_idx);
3205 			else
3206 				load = target_load(i, load_idx);
3207 
3208 			avg_load += load;
3209 		}
3210 
3211 		/* Adjust by relative CPU power of the group */
3212 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3213 
3214 		if (local_group) {
3215 			this_load = avg_load;
3216 		} else if (avg_load < min_load) {
3217 			min_load = avg_load;
3218 			idlest = group;
3219 		}
3220 	} while (group = group->next, group != sd->groups);
3221 
3222 	if (!idlest || 100*this_load < imbalance*min_load)
3223 		return NULL;
3224 	return idlest;
3225 }
3226 
3227 /*
3228  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3229  */
3230 static int
3231 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3232 {
3233 	unsigned long load, min_load = ULONG_MAX;
3234 	int idlest = -1;
3235 	int i;
3236 
3237 	/* Traverse only the allowed CPUs */
3238 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3239 		load = weighted_cpuload(i);
3240 
3241 		if (load < min_load || (load == min_load && i == this_cpu)) {
3242 			min_load = load;
3243 			idlest = i;
3244 		}
3245 	}
3246 
3247 	return idlest;
3248 }
3249 
3250 /*
3251  * Try and locate an idle CPU in the sched_domain.
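 *
 * Preference order: the suggested target if it is already idle, then the
 * task's previous CPU if it shares a cache with the target and is idle,
 * and finally the first allowed CPU of a fully idle group (not containing
 * the target) found while walking down from the LLC domain.  If nothing
 * idle turns up we fall back to the original target.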
3252  */
3253 static int select_idle_sibling(struct task_struct *p, int target)
3254 {
3255 	struct sched_domain *sd;
3256 	struct sched_group *sg;
3257 	int i = task_cpu(p);
3258 
3259 	if (idle_cpu(target))
3260 		return target;
3261 
3262 	/*
3263 	 * If the previous cpu is cache affine and idle, don't be stupid.
3264 	 */
3265 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3266 		return i;
3267 
3268 	/*
3269 	 * Otherwise, iterate the domains and find an eligible idle cpu.
3270 	 */
3271 	sd = rcu_dereference(per_cpu(sd_llc, target));
3272 	for_each_lower_domain(sd) {
3273 		sg = sd->groups;
3274 		do {
3275 			if (!cpumask_intersects(sched_group_cpus(sg),
3276 						tsk_cpus_allowed(p)))
3277 				goto next;
3278 
3279 			for_each_cpu(i, sched_group_cpus(sg)) {
3280 				if (i == target || !idle_cpu(i))
3281 					goto next;
3282 			}
3283 
3284 			target = cpumask_first_and(sched_group_cpus(sg),
3285 					tsk_cpus_allowed(p));
3286 			goto done;
3287 next:
3288 			sg = sg->next;
3289 		} while (sg != sd->groups);
3290 	}
3291 done:
3292 	return target;
3293 }
3294 
3295 /*
3296  * select_task_rq_fair: balance the current task (running on cpu) in domains
3297  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3298  * SD_BALANCE_EXEC.
3299  *
3300  * Balance, ie. select the least loaded group.
3301  *
3302  * Returns the target CPU number, or the same CPU if no balancing is needed.
3303  *
3304  * preempt must be disabled.
3305  */
3306 static int
3307 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3308 {
3309 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3310 	int cpu = smp_processor_id();
3311 	int prev_cpu = task_cpu(p);
3312 	int new_cpu = cpu;
3313 	int want_affine = 0;
3314 	int sync = wake_flags & WF_SYNC;
3315 
3316 	if (p->nr_cpus_allowed == 1)
3317 		return prev_cpu;
3318 
3319 	if (sd_flag & SD_BALANCE_WAKE) {
3320 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3321 			want_affine = 1;
3322 		new_cpu = prev_cpu;
3323 	}
3324 
3325 	rcu_read_lock();
3326 	for_each_domain(cpu, tmp) {
3327 		if (!(tmp->flags & SD_LOAD_BALANCE))
3328 			continue;
3329 
3330 		/*
3331 		 * If both cpu and prev_cpu are part of this domain,
3332 		 * cpu is a valid SD_WAKE_AFFINE target.
3333 		 */
3334 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3335 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3336 			affine_sd = tmp;
3337 			break;
3338 		}
3339 
3340 		if (tmp->flags & sd_flag)
3341 			sd = tmp;
3342 	}
3343 
3344 	if (affine_sd) {
3345 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3346 			prev_cpu = cpu;
3347 
3348 		new_cpu = select_idle_sibling(p, prev_cpu);
3349 		goto unlock;
3350 	}
3351 
3352 	while (sd) {
3353 		int load_idx = sd->forkexec_idx;
3354 		struct sched_group *group;
3355 		int weight;
3356 
3357 		if (!(sd->flags & sd_flag)) {
3358 			sd = sd->child;
3359 			continue;
3360 		}
3361 
3362 		if (sd_flag & SD_BALANCE_WAKE)
3363 			load_idx = sd->wake_idx;
3364 
3365 		group = find_idlest_group(sd, p, cpu, load_idx);
3366 		if (!group) {
3367 			sd = sd->child;
3368 			continue;
3369 		}
3370 
3371 		new_cpu = find_idlest_cpu(group, p, cpu);
3372 		if (new_cpu == -1 || new_cpu == cpu) {
3373 			/* Now try balancing at a lower domain level of cpu */
3374 			sd = sd->child;
3375 			continue;
3376 		}
3377 
3378 		/* Now try balancing at a lower domain level of new_cpu */
3379 		cpu = new_cpu;
3380 		weight = sd->span_weight;
3381 		sd = NULL;
3382 		for_each_domain(cpu, tmp) {
3383 			if (weight <= tmp->span_weight)
3384 				break;
3385 			if (tmp->flags & sd_flag)
3386 				sd = tmp;
3387 		}
3388 		/* while loop will break here if sd == NULL */
3389 	}
3390 unlock:
3391 	rcu_read_unlock();
3392 
3393 	return new_cpu;
3394 }
3395 
3396 /*
3397  * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
3398  * may be removed once load tracking is useful for purposes beyond shares
3399  * distribution (e.g. load-balance).
3400  */
3401 #ifdef CONFIG_FAIR_GROUP_SCHED
3402 /*
3403  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3404  * cfs_rq_of(p) references at time of call are still valid and identify the
3405  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
3406  * other assumptions, including the state of rq->lock, should be made.
3407  */
3408 static void
3409 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3410 {
3411 	struct sched_entity *se = &p->se;
3412 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3413 
3414 	/*
3415 	 * Load tracking: accumulate removed load so that it can be processed
3416 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
3417 	 * to blocked load iff they have a positive decay-count.  It can never
3418 	 * be negative here since on-rq tasks have decay-count == 0.
3419 	 */
3420 	if (se->avg.decay_count) {
3421 		se->avg.decay_count = -__synchronize_entity_decay(se);
3422 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3423 	}
3424 }
3425 #endif
3426 #endif /* CONFIG_SMP */
3427 
3428 static unsigned long
3429 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3430 {
3431 	unsigned long gran = sysctl_sched_wakeup_granularity;
3432 
3433 	/*
3434 	 * Since it is curr that is running now, convert the gran from
3435 	 * real-time to virtual-time in se's units.
3436 	 *
3437 	 * By using 'se' instead of 'curr' we penalize light tasks, so
3438 	 * they get preempted more easily. That is, if 'se' < 'curr' then
3439 	 * the resulting gran will be larger, therefore penalizing the
3440 	 * lighter task; if, on the other hand, 'se' > 'curr' then the
3441 	 * resulting gran will be smaller, again penalizing the lighter task.
3442 	 *
3443 	 * This is especially important for buddies when the leftmost
3444 	 * task is higher priority than the buddy.
3445 	 */
3446 	return calc_delta_fair(gran, se);
3447 }
3448 
3449 /*
3450  * Should 'se' preempt 'curr'?
3451  *
3452  *             |s1
3453  *        |s2
3454  *   |s3
3455  *         g
3456  *      |<--->|c
3457  *
3458  *  w(c, s1) = -1
3459  *  w(c, s2) =  0
3460  *  w(c, s3) =  1
3461  *
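 * That is: return -1 when curr's vruntime does not lead se's at all, 0 when
 * it leads by no more than the gran g computed by wakeup_gran() (roughly
 * the 1ms base sysctl_sched_wakeup_granularity of wall time for a nice-0
 * se), and 1, i.e. preempt, when the lead exceeds g.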
3462  */
3463 static int
3464 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3465 {
3466 	s64 gran, vdiff = curr->vruntime - se->vruntime;
3467 
3468 	if (vdiff <= 0)
3469 		return -1;
3470 
3471 	gran = wakeup_gran(curr, se);
3472 	if (vdiff > gran)
3473 		return 1;
3474 
3475 	return 0;
3476 }
3477 
3478 static void set_last_buddy(struct sched_entity *se)
3479 {
3480 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3481 		return;
3482 
3483 	for_each_sched_entity(se)
3484 		cfs_rq_of(se)->last = se;
3485 }
3486 
3487 static void set_next_buddy(struct sched_entity *se)
3488 {
3489 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3490 		return;
3491 
3492 	for_each_sched_entity(se)
3493 		cfs_rq_of(se)->next = se;
3494 }
3495 
3496 static void set_skip_buddy(struct sched_entity *se)
3497 {
3498 	for_each_sched_entity(se)
3499 		cfs_rq_of(se)->skip = se;
3500 }
3501 
3502 /*
3503  * Preempt the current task with a newly woken task if needed:
3504  */
3505 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3506 {
3507 	struct task_struct *curr = rq->curr;
3508 	struct sched_entity *se = &curr->se, *pse = &p->se;
3509 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3510 	int scale = cfs_rq->nr_running >= sched_nr_latency;
3511 	int next_buddy_marked = 0;
3512 
3513 	if (unlikely(se == pse))
3514 		return;
3515 
3516 	/*
3517 	 * This is possible from callers such as move_task(), in which we
3518 	 * unconditionally check_preempt_curr() after an enqueue (which may have
3519 	 * led to a throttle).  This both saves work and prevents false
3520 	 * next-buddy nomination below.
3521 	 */
3522 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3523 		return;
3524 
3525 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3526 		set_next_buddy(pse);
3527 		next_buddy_marked = 1;
3528 	}
3529 
3530 	/*
3531 	 * We can come here with TIF_NEED_RESCHED already set from new task
3532 	 * wake up path.
3533 	 *
3534 	 * Note: this also catches the edge-case of curr being in a throttled
3535 	 * group (e.g. via set_curr_task), since update_curr() (in the
3536 	 * enqueue of curr) will have resulted in resched being set.  This
3537 	 * prevents us from potentially nominating it as a false LAST_BUDDY
3538 	 * below.
3539 	 */
3540 	if (test_tsk_need_resched(curr))
3541 		return;
3542 
3543 	/* Idle tasks are by definition preempted by non-idle tasks. */
3544 	if (unlikely(curr->policy == SCHED_IDLE) &&
3545 	    likely(p->policy != SCHED_IDLE))
3546 		goto preempt;
3547 
3548 	/*
3549 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3550 	 * is driven by the tick):
3551 	 */
3552 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3553 		return;
3554 
3555 	find_matching_se(&se, &pse);
3556 	update_curr(cfs_rq_of(se));
3557 	BUG_ON(!pse);
3558 	if (wakeup_preempt_entity(se, pse) == 1) {
3559 		/*
3560 		 * Bias pick_next to pick the sched entity that is
3561 		 * triggering this preemption.
3562 		 */
3563 		if (!next_buddy_marked)
3564 			set_next_buddy(pse);
3565 		goto preempt;
3566 	}
3567 
3568 	return;
3569 
3570 preempt:
3571 	resched_task(curr);
3572 	/*
3573 	 * Only set the backward buddy when the current task is still
3574 	 * on the rq. This can happen when a wakeup gets interleaved
3575 	 * with schedule on the ->pre_schedule() or idle_balance()
3576 	 * point, either of which can drop the rq lock.
3577 	 *
3578 	 * Also, during early boot the idle thread is in the fair class,
3579 	 * for obvious reasons it's a bad idea to schedule back to it.
3580 	 */
3581 	if (unlikely(!se->on_rq || curr == rq->idle))
3582 		return;
3583 
3584 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3585 		set_last_buddy(se);
3586 }
3587 
3588 static struct task_struct *pick_next_task_fair(struct rq *rq)
3589 {
3590 	struct task_struct *p;
3591 	struct cfs_rq *cfs_rq = &rq->cfs;
3592 	struct sched_entity *se;
3593 
3594 	if (!cfs_rq->nr_running)
3595 		return NULL;
3596 
3597 	do {
3598 		se = pick_next_entity(cfs_rq);
3599 		set_next_entity(cfs_rq, se);
3600 		cfs_rq = group_cfs_rq(se);
3601 	} while (cfs_rq);
3602 
3603 	p = task_of(se);
3604 	if (hrtick_enabled(rq))
3605 		hrtick_start_fair(rq, p);
3606 
3607 	return p;
3608 }
3609 
3610 /*
3611  * Account for a descheduled task:
3612  */
3613 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3614 {
3615 	struct sched_entity *se = &prev->se;
3616 	struct cfs_rq *cfs_rq;
3617 
3618 	for_each_sched_entity(se) {
3619 		cfs_rq = cfs_rq_of(se);
3620 		put_prev_entity(cfs_rq, se);
3621 	}
3622 }
3623 
3624 /*
3625  * sched_yield() is very simple
3626  *
3627  * The magic of dealing with the ->skip buddy is in pick_next_entity.
3628  */
3629 static void yield_task_fair(struct rq *rq)
3630 {
3631 	struct task_struct *curr = rq->curr;
3632 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3633 	struct sched_entity *se = &curr->se;
3634 
3635 	/*
3636 	 * Are we the only task in the tree?
3637 	 */
3638 	if (unlikely(rq->nr_running == 1))
3639 		return;
3640 
3641 	clear_buddies(cfs_rq, se);
3642 
3643 	if (curr->policy != SCHED_BATCH) {
3644 		update_rq_clock(rq);
3645 		/*
3646 		 * Update run-time statistics of the 'current'.
3647 		 */
3648 		update_curr(cfs_rq);
3649 		/*
3650 		 * Tell update_rq_clock() that we've just updated,
3651 		 * so we don't do microscopic update in schedule()
3652 		 * and double the fastpath cost.
3653 		 */
3654 		rq->skip_clock_update = 1;
3655 	}
3656 
3657 	set_skip_buddy(se);
3658 }
3659 
3660 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3661 {
3662 	struct sched_entity *se = &p->se;
3663 
3664 	/* throttled hierarchies are not runnable */
3665 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3666 		return false;
3667 
3668 	/* Tell the scheduler that we'd really like pse to run next. */
3669 	set_next_buddy(se);
3670 
3671 	yield_task_fair(rq);
3672 
3673 	return true;
3674 }
3675 
3676 #ifdef CONFIG_SMP
3677 /**************************************************
3678  * Fair scheduling class load-balancing methods.
3679  *
3680  * BASICS
3681  *
3682  * The purpose of load-balancing is to achieve the same basic fairness the
3683  * per-cpu scheduler provides, namely provide a proportional amount of compute
3684  * time to each task. This is expressed in the following equation:
3685  *
3686  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
3687  *
3688  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3689  * W_i,0 is defined as:
3690  *
3691  *   W_i,0 = \Sum_j w_i,j                                             (2)
3692  *
3693  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3694  * is derived from the nice value as per prio_to_weight[].
3695  *
3696  * The weight average is an exponential decay average of the instantaneous
3697  * weight:
3698  *
3699  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
3700  *
3701  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3702  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3703  * can also include other factors [XXX].
3704  *
3705  * To achieve this balance we define a measure of imbalance which follows
3706  * directly from (1):
3707  *
3708  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
3709  *
3710  * We then move tasks around to minimize the imbalance. In the continuous
3711  * function space it is obvious this converges, in the discrete case we get
3712  * a few fun cases generally called infeasible weight scenarios.
3713  *
3714  * [XXX expand on:
3715  *     - infeasible weights;
3716  *     - local vs global optima in the discrete case. ]
3717  *
3718  *
3719  * SCHED DOMAINS
3720  *
3721  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3722  * for all i,j solution, we create a tree of cpus that follows the hardware
3723  * topology where each level pairs two lower groups (or better). This results
3724  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3725  * tree to only the first of the previous level and we decrease the frequency
3726  * of load-balance at each level inversely proportional to the number of cpus in
3727  * the groups.
3728  *
3729  * This yields:
3730  *
3731  *     log_2 n     1     n
3732  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
3733  *     i = 0      2^i   2^i
3734  *                               `- size of each group
3735  *         |         |     `- number of cpus doing load-balance
3736  *         |         `- freq
3737  *         `- sum over all levels
3738  *
3739  * Coupled with a limit on how many tasks we can migrate every balance pass,
3740  * this makes (5) the runtime complexity of the balancer.
3741  *
3742  * An important property here is that each CPU is still (indirectly) connected
3743  * to every other cpu in at most O(log n) steps:
3744  *
3745  * The adjacency matrix of the resulting graph is given by:
3746  *
3747  *             log_2 n
3748  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
3749  *             k = 0
3750  *
3751  * And you'll find that:
3752  *
3753  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
3754  *
3755  * Showing there's indeed a path between every cpu in at most O(log n) steps.
3756  * The task movement gives a factor of O(m), giving a convergence complexity
3757  * of:
3758  *
3759  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
3760  *
3761  *
3762  * WORK CONSERVING
3763  *
3764  * In order to avoid CPUs going idle while there's still work to do, new idle
3765  * balancing is more aggressive and has the newly idle cpu iterate up the domain
3766  * tree itself instead of relying on other CPUs to bring it work.
3767  *
3768  * This adds some complexity to both (5) and (8) but it reduces the total idle
3769  * time.
3770  *
3771  * [XXX more?]
3772  *
3773  *
3774  * CGROUPS
3775  *
3776  * Cgroups make a horror show out of (2): instead of a simple sum we get:
3777  *
3778  *                                s_k,i
3779  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
3780  *                                 S_k
3781  *
3782  * Where
3783  *
3784  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
3785  *
3786  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3787  *
3788  * The big problem is S_k: it's a global sum needed to compute a local (W_i)
3789  * property.
3790  *
3791  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3792  *      rewrite all of this once again.]
3793  */
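/*
 * Worked example for (5), with illustrative numbers (n = 8 cpus, not tied to
 * any particular machine): the per-level cost terms are
 *
 *   i = 0:  1/1 * 8/1 * 1 = 8
 *   i = 1:  1/2 * 8/2 * 2 = 4
 *   i = 2:  1/4 * 8/4 * 4 = 2
 *   i = 3:  1/8 * 8/8 * 8 = 1
 *
 * which sum to 15, i.e. roughly 2n, matching the O(n) bound claimed above.
 */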
3794 
3795 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3796 
3797 #define LBF_ALL_PINNED	0x01
3798 #define LBF_NEED_BREAK	0x02
3799 #define LBF_SOME_PINNED 0x04
3800 
3801 struct lb_env {
3802 	struct sched_domain	*sd;
3803 
3804 	struct rq		*src_rq;
3805 	int			src_cpu;
3806 
3807 	int			dst_cpu;
3808 	struct rq		*dst_rq;
3809 
3810 	struct cpumask		*dst_grpmask;
3811 	int			new_dst_cpu;
3812 	enum cpu_idle_type	idle;
3813 	long			imbalance;
3814 	/* The set of CPUs under consideration for load-balancing */
3815 	struct cpumask		*cpus;
3816 
3817 	unsigned int		flags;
3818 
3819 	unsigned int		loop;
3820 	unsigned int		loop_break;
3821 	unsigned int		loop_max;
3822 };
3823 
3824 /*
3825  * move_task - move a task from one runqueue to another runqueue.
3826  * Both runqueues must be locked.
3827  */
3828 static void move_task(struct task_struct *p, struct lb_env *env)
3829 {
3830 	deactivate_task(env->src_rq, p, 0);
3831 	set_task_cpu(p, env->dst_cpu);
3832 	activate_task(env->dst_rq, p, 0);
3833 	check_preempt_curr(env->dst_rq, p, 0);
3834 }
3835 
3836 /*
3837  * Is this task likely cache-hot:
3838  */
3839 static int
3840 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3841 {
3842 	s64 delta;
3843 
3844 	if (p->sched_class != &fair_sched_class)
3845 		return 0;
3846 
3847 	if (unlikely(p->policy == SCHED_IDLE))
3848 		return 0;
3849 
3850 	/*
3851 	 * Buddy candidates are cache hot:
3852 	 */
3853 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3854 			(&p->se == cfs_rq_of(&p->se)->next ||
3855 			 &p->se == cfs_rq_of(&p->se)->last))
3856 		return 1;
3857 
3858 	if (sysctl_sched_migration_cost == -1)
3859 		return 1;
3860 	if (sysctl_sched_migration_cost == 0)
3861 		return 0;
3862 
3863 	delta = now - p->se.exec_start;
3864 
3865 	return delta < (s64)sysctl_sched_migration_cost;
3866 }
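/*
 * Illustrative numbers only: with sysctl_sched_migration_cost at 500000
 * (0.5 msec) and a task whose se.exec_start lies 200000 ns in the past,
 * delta (200000) is below the cost so the task is treated as cache hot;
 * had it last run a full millisecond ago it would be treated as cold and
 * be a candidate for migration.
 */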
3867 
3868 /*
3869  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3870  */
3871 static
3872 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3873 {
3874 	int tsk_cache_hot = 0;
3875 	/*
3876 	 * We do not migrate tasks that:
3877 	 * 1) are running (obviously), or
3878 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 	 * 3) are cache-hot on their current CPU.
3880 	 */
3881 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 		int new_dst_cpu;
3883 
3884 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 
3886 		/*
3887 		 * Remember if this task can be migrated to any other cpu in
3888 		 * our sched_group. We may want to revisit it if we couldn't
3889 		 * meet load balance goals by pulling other tasks on src_cpu.
3890 		 *
3891 		 * Also avoid computing new_dst_cpu if we have already computed
3892 		 * one in current iteration.
3893 		 */
3894 		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 			return 0;
3896 
3897 		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3898 						tsk_cpus_allowed(p));
3899 		if (new_dst_cpu < nr_cpu_ids) {
3900 			env->flags |= LBF_SOME_PINNED;
3901 			env->new_dst_cpu = new_dst_cpu;
3902 		}
3903 		return 0;
3904 	}
3905 
3906 	/* Record that we found at least one task that could run on dst_cpu */
3907 	env->flags &= ~LBF_ALL_PINNED;
3908 
3909 	if (task_running(env->src_rq, p)) {
3910 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3911 		return 0;
3912 	}
3913 
3914 	/*
3915 	 * Aggressive migration if:
3916 	 * 1) task is cache cold, or
3917 	 * 2) too many balance attempts have failed.
3918 	 */
3919 
3920 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 	if (!tsk_cache_hot ||
3922 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923 #ifdef CONFIG_SCHEDSTATS
3924 		if (tsk_cache_hot) {
3925 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 			schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 		}
3928 #endif
3929 		return 1;
3930 	}
3931 
3932 	if (tsk_cache_hot) {
3933 		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3934 		return 0;
3935 	}
3936 	return 1;
3937 }
3938 
3939 /*
3940  * move_one_task tries to move exactly one task from busiest to this_rq, as
3941  * part of active balancing operations within "domain".
3942  * Returns 1 if successful and 0 otherwise.
3943  *
3944  * Called with both runqueues locked.
3945  */
3946 static int move_one_task(struct lb_env *env)
3947 {
3948 	struct task_struct *p, *n;
3949 
3950 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 			continue;
3953 
3954 		if (!can_migrate_task(p, env))
3955 			continue;
3956 
3957 		move_task(p, env);
3958 		/*
3959 		 * Right now, this is only the second place move_task()
3960 		 * is called, so we can safely collect move_task()
3961 		 * stats here rather than inside move_task().
3962 		 */
3963 		schedstat_inc(env->sd, lb_gained[env->idle]);
3964 		return 1;
3965 	}
3966 	return 0;
3967 }
3968 
3969 static unsigned long task_h_load(struct task_struct *p);
3970 
3971 static const unsigned int sched_nr_migrate_break = 32;
3972 
3973 /*
3974  * move_tasks tries to move up to imbalance weighted load from busiest to
3975  * this_rq, as part of a balancing operation within domain "sd".
3976  * Returns 1 if successful and 0 otherwise.
3977  *
3978  * Called with both runqueues locked.
3979  */
3980 static int move_tasks(struct lb_env *env)
3981 {
3982 	struct list_head *tasks = &env->src_rq->cfs_tasks;
3983 	struct task_struct *p;
3984 	unsigned long load;
3985 	int pulled = 0;
3986 
3987 	if (env->imbalance <= 0)
3988 		return 0;
3989 
3990 	while (!list_empty(tasks)) {
3991 		p = list_first_entry(tasks, struct task_struct, se.group_node);
3992 
3993 		env->loop++;
3994 		/* We've more or less seen every task there is, call it quits */
3995 		if (env->loop > env->loop_max)
3996 			break;
3997 
3998 		/* take a breather every nr_migrate tasks */
3999 		if (env->loop > env->loop_break) {
4000 			env->loop_break += sched_nr_migrate_break;
4001 			env->flags |= LBF_NEED_BREAK;
4002 			break;
4003 		}
4004 
4005 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4006 			goto next;
4007 
4008 		load = task_h_load(p);
4009 
4010 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4011 			goto next;
4012 
4013 		if ((load / 2) > env->imbalance)
4014 			goto next;
4015 
4016 		if (!can_migrate_task(p, env))
4017 			goto next;
4018 
4019 		move_task(p, env);
4020 		pulled++;
4021 		env->imbalance -= load;
4022 
4023 #ifdef CONFIG_PREEMPT
4024 		/*
4025 		 * NEWIDLE balancing is a source of latency, so preemptible
4026 		 * kernels will stop after the first task is pulled to minimize
4027 		 * the critical section.
4028 		 */
4029 		if (env->idle == CPU_NEWLY_IDLE)
4030 			break;
4031 #endif
4032 
4033 		/*
4034 		 * We only want to steal up to the prescribed amount of
4035 		 * weighted load.
4036 		 */
4037 		if (env->imbalance <= 0)
4038 			break;
4039 
4040 		continue;
4041 next:
4042 		list_move_tail(&p->se.group_node, tasks);
4043 	}
4044 
4045 	/*
4046 	 * Right now, this is one of only two places move_task() is called,
4047 	 * so we can safely collect move_task() stats here rather than
4048 	 * inside move_task().
4049 	 */
4050 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
4051 
4052 	return pulled;
4053 }
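/*
 * Example of the pull loop above, with made-up loads: if env->imbalance is
 * 2048 and the first task has a h_load of 1024, it passes the (load / 2) >
 * imbalance check, is moved, and the imbalance drops to 1024; a following
 * task with a h_load of 3072 is then skipped because 3072 / 2 exceeds the
 * remaining imbalance.
 */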
4054 
4055 #ifdef CONFIG_FAIR_GROUP_SCHED
4056 /*
4057  * update tg->load_weight by folding this cpu's load_avg
4058  */
4059 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4060 {
4061 	struct sched_entity *se = tg->se[cpu];
4062 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4063 
4064 	/* throttled entities do not contribute to load */
4065 	if (throttled_hierarchy(cfs_rq))
4066 		return;
4067 
4068 	update_cfs_rq_blocked_load(cfs_rq, 1);
4069 
4070 	if (se) {
4071 		update_entity_load_avg(se, 1);
4072 		/*
4073 		 * We pivot on our runnable average having decayed to zero for
4074 		 * list removal.  This generally implies that all our children
4075 		 * have also been removed (modulo rounding error or bandwidth
4076 		 * control); however, such cases are rare and we can fix these
4077 		 * at enqueue.
4078 		 *
4079 		 * TODO: fix up out-of-order children on enqueue.
4080 		 */
4081 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4082 			list_del_leaf_cfs_rq(cfs_rq);
4083 	} else {
4084 		struct rq *rq = rq_of(cfs_rq);
4085 		update_rq_runnable_avg(rq, rq->nr_running);
4086 	}
4087 }
4088 
4089 static void update_blocked_averages(int cpu)
4090 {
4091 	struct rq *rq = cpu_rq(cpu);
4092 	struct cfs_rq *cfs_rq;
4093 	unsigned long flags;
4094 
4095 	raw_spin_lock_irqsave(&rq->lock, flags);
4096 	update_rq_clock(rq);
4097 	/*
4098 	 * Iterates the task_group tree in a bottom up fashion, see
4099 	 * list_add_leaf_cfs_rq() for details.
4100 	 */
4101 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4102 		/*
4103 		 * Note: We may want to consider periodically releasing
4104 		 * rq->lock around these updates so that creating many task
4105 		 * groups does not result in continually extending hold time.
4106 		 */
4107 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4108 	}
4109 
4110 	raw_spin_unlock_irqrestore(&rq->lock, flags);
4111 }
4112 
4113 /*
4114  * Compute the cpu's hierarchical load factor for each task group.
4115  * This needs to be done in a top-down fashion because the load of a child
4116  * group is a fraction of its parents load.
4117  */
4118 static int tg_load_down(struct task_group *tg, void *data)
4119 {
4120 	unsigned long load;
4121 	long cpu = (long)data;
4122 
4123 	if (!tg->parent) {
4124 		load = cpu_rq(cpu)->load.weight;
4125 	} else {
4126 		load = tg->parent->cfs_rq[cpu]->h_load;
4127 		load *= tg->se[cpu]->load.weight;
4128 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4129 	}
4130 
4131 	tg->cfs_rq[cpu]->h_load = load;
4132 
4133 	return 0;
4134 }
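/*
 * Sketch of the h_load computation above with hypothetical weights: if the
 * parent cfs_rq has h_load 2048 and total load.weight 2048, and the group's
 * se has load.weight 512, the group's h_load becomes
 * 2048 * 512 / (2048 + 1) ~= 511, i.e. the parent's h_load scaled by the
 * group's share of the parent's weight.
 */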
4135 
4136 static void update_h_load(long cpu)
4137 {
4138 	struct rq *rq = cpu_rq(cpu);
4139 	unsigned long now = jiffies;
4140 
4141 	if (rq->h_load_throttle == now)
4142 		return;
4143 
4144 	rq->h_load_throttle = now;
4145 
4146 	rcu_read_lock();
4147 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4148 	rcu_read_unlock();
4149 }
4150 
4151 static unsigned long task_h_load(struct task_struct *p)
4152 {
4153 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
4154 	unsigned long load;
4155 
4156 	load = p->se.load.weight;
4157 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4158 
4159 	return load;
4160 }
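/*
 * With hypothetical numbers: a task of weight 1024 on a cfs_rq whose h_load
 * is 512 and whose load.weight is 2048 contributes a task_h_load of
 * 1024 * 512 / (2048 + 1) ~= 255 to the load balancer's view of that cpu.
 */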
4161 #else
4162 static inline void update_blocked_averages(int cpu)
4163 {
4164 }
4165 
4166 static inline void update_h_load(long cpu)
4167 {
4168 }
4169 
4170 static unsigned long task_h_load(struct task_struct *p)
4171 {
4172 	return p->se.load.weight;
4173 }
4174 #endif
4175 
4176 /********** Helpers for find_busiest_group ************************/
4177 /*
4178  * sd_lb_stats - Structure to store the statistics of a sched_domain
4179  * 		during load balancing.
4180  */
4181 struct sd_lb_stats {
4182 	struct sched_group *busiest; /* Busiest group in this sd */
4183 	struct sched_group *this;  /* Local group in this sd */
4184 	unsigned long total_load;  /* Total load of all groups in sd */
4185 	unsigned long total_pwr;   /* Total power of all groups in sd */
4186 	unsigned long avg_load;	   /* Average load across all groups in sd */
4187 
4188 	/** Statistics of this group */
4189 	unsigned long this_load;
4190 	unsigned long this_load_per_task;
4191 	unsigned long this_nr_running;
4192 	unsigned long this_has_capacity;
4193 	unsigned int  this_idle_cpus;
4194 
4195 	/* Statistics of the busiest group */
4196 	unsigned int  busiest_idle_cpus;
4197 	unsigned long max_load;
4198 	unsigned long busiest_load_per_task;
4199 	unsigned long busiest_nr_running;
4200 	unsigned long busiest_group_capacity;
4201 	unsigned long busiest_has_capacity;
4202 	unsigned int  busiest_group_weight;
4203 
4204 	int group_imb; /* Is there imbalance in this sd */
4205 };
4206 
4207 /*
4208  * sg_lb_stats - stats of a sched_group required for load_balancing
4209  */
4210 struct sg_lb_stats {
4211 	unsigned long avg_load; /* Avg load across the CPUs of the group */
4212 	unsigned long group_load; /* Total load over the CPUs of the group */
4213 	unsigned long sum_nr_running; /* Nr tasks running in the group */
4214 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4215 	unsigned long group_capacity;
4216 	unsigned long idle_cpus;
4217 	unsigned long group_weight;
4218 	int group_imb; /* Is there an imbalance in the group ? */
4219 	int group_has_capacity; /* Is there extra capacity in the group? */
4220 };
4221 
4222 /**
4223  * get_sd_load_idx - Obtain the load index for a given sched domain.
4224  * @sd: The sched_domain whose load_idx is to be obtained.
4225  * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
4226  */
4227 static inline int get_sd_load_idx(struct sched_domain *sd,
4228 					enum cpu_idle_type idle)
4229 {
4230 	int load_idx;
4231 
4232 	switch (idle) {
4233 	case CPU_NOT_IDLE:
4234 		load_idx = sd->busy_idx;
4235 		break;
4236 
4237 	case CPU_NEWLY_IDLE:
4238 		load_idx = sd->newidle_idx;
4239 		break;
4240 	default:
4241 		load_idx = sd->idle_idx;
4242 		break;
4243 	}
4244 
4245 	return load_idx;
4246 }
4247 
4248 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249 {
4250 	return SCHED_POWER_SCALE;
4251 }
4252 
4253 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4254 {
4255 	return default_scale_freq_power(sd, cpu);
4256 }
4257 
4258 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259 {
4260 	unsigned long weight = sd->span_weight;
4261 	unsigned long smt_gain = sd->smt_gain;
4262 
4263 	smt_gain /= weight;
4264 
4265 	return smt_gain;
4266 }
4267 
4268 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4269 {
4270 	return default_scale_smt_power(sd, cpu);
4271 }
4272 
4273 unsigned long scale_rt_power(int cpu)
4274 {
4275 	struct rq *rq = cpu_rq(cpu);
4276 	u64 total, available, age_stamp, avg;
4277 
4278 	/*
4279 	 * Since we're reading these variables without serialization make sure
4280 	 * we read them once before doing sanity checks on them.
4281 	 */
4282 	age_stamp = ACCESS_ONCE(rq->age_stamp);
4283 	avg = ACCESS_ONCE(rq->rt_avg);
4284 
4285 	total = sched_avg_period() + (rq->clock - age_stamp);
4286 
4287 	if (unlikely(total < avg)) {
4288 		/* Ensures that power won't end up being negative */
4289 		available = 0;
4290 	} else {
4291 		available = total - avg;
4292 	}
4293 
4294 	if (unlikely((s64)total < SCHED_POWER_SCALE))
4295 		total = SCHED_POWER_SCALE;
4296 
4297 	total >>= SCHED_POWER_SHIFT;
4298 
4299 	return div_u64(available, total);
4300 }
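/*
 * Illustrative case for the scaling above: if the rt (and irq) time
 * accumulated in rq->rt_avg amounts to ~25% of the period covered by
 * 'total', then available/total ~= 3/4 and the function returns roughly
 * 768 out of the usual 1024 (SCHED_POWER_SCALE) units.
 */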
4301 
4302 static void update_cpu_power(struct sched_domain *sd, int cpu)
4303 {
4304 	unsigned long weight = sd->span_weight;
4305 	unsigned long power = SCHED_POWER_SCALE;
4306 	struct sched_group *sdg = sd->groups;
4307 
4308 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4309 		if (sched_feat(ARCH_POWER))
4310 			power *= arch_scale_smt_power(sd, cpu);
4311 		else
4312 			power *= default_scale_smt_power(sd, cpu);
4313 
4314 		power >>= SCHED_POWER_SHIFT;
4315 	}
4316 
4317 	sdg->sgp->power_orig = power;
4318 
4319 	if (sched_feat(ARCH_POWER))
4320 		power *= arch_scale_freq_power(sd, cpu);
4321 	else
4322 		power *= default_scale_freq_power(sd, cpu);
4323 
4324 	power >>= SCHED_POWER_SHIFT;
4325 
4326 	power *= scale_rt_power(cpu);
4327 	power >>= SCHED_POWER_SHIFT;
4328 
4329 	if (!power)
4330 		power = 1;
4331 
4332 	cpu_rq(cpu)->cpu_power = power;
4333 	sdg->sgp->power = power;
4334 }
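/*
 * Putting the pieces together with illustrative values (the arch hooks may
 * override these): an SMT sibling in a 2-thread domain with smt_gain 1178
 * gets 1178 / 2 = 589 from default_scale_smt_power(), so power_orig becomes
 * 1024 * 589 >> 10 = 589; with a neutral freq scale and scale_rt_power()
 * returning 768, the final cpu_power is 589 * 768 >> 10 = 441.
 */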
4335 
4336 void update_group_power(struct sched_domain *sd, int cpu)
4337 {
4338 	struct sched_domain *child = sd->child;
4339 	struct sched_group *group, *sdg = sd->groups;
4340 	unsigned long power;
4341 	unsigned long interval;
4342 
4343 	interval = msecs_to_jiffies(sd->balance_interval);
4344 	interval = clamp(interval, 1UL, max_load_balance_interval);
4345 	sdg->sgp->next_update = jiffies + interval;
4346 
4347 	if (!child) {
4348 		update_cpu_power(sd, cpu);
4349 		return;
4350 	}
4351 
4352 	power = 0;
4353 
4354 	if (child->flags & SD_OVERLAP) {
4355 		/*
4356 		 * SD_OVERLAP domains cannot assume that child groups
4357 		 * span the current group.
4358 		 */
4359 
4360 		for_each_cpu(cpu, sched_group_cpus(sdg))
4361 			power += power_of(cpu);
4362 	} else  {
4363 		/*
4364 		 * !SD_OVERLAP domains can assume that child groups
4365 		 * span the current group.
4366 		 */
4367 
4368 		group = child->groups;
4369 		do {
4370 			power += group->sgp->power;
4371 			group = group->next;
4372 		} while (group != child->groups);
4373 	}
4374 
4375 	sdg->sgp->power_orig = sdg->sgp->power = power;
4376 }
4377 
4378 /*
4379  * Try and fix up capacity for tiny siblings, this is needed when
4380  * things like SD_ASYM_PACKING need f_b_g to select another sibling
4381  * which on its own isn't powerful enough.
4382  *
4383  * See update_sd_pick_busiest() and check_asym_packing().
4384  */
4385 static inline int
4386 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4387 {
4388 	/*
4389 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
4390 	 */
4391 	if (!(sd->flags & SD_SHARE_CPUPOWER))
4392 		return 0;
4393 
4394 	/*
4395 	 * If ~90% of the cpu_power is still there, we're good.
4396 	 */
4397 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4398 		return 1;
4399 
4400 	return 0;
4401 }
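/*
 * The 32/29 test above corresponds to a 29/32 ~= 90.6% threshold: e.g. with
 * power_orig 589, a current power of 540 still yields capacity 1
 * (540 * 32 = 17280 > 589 * 29 = 17081), while a power of 530 does not.
 */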
4402 
4403 /**
4404  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4405  * @env: The load balancing environment.
4406  * @group: sched_group whose statistics are to be updated.
4407  * @load_idx: Load index of sched_domain of this_cpu for load calc.
4408  * @local_group: Does group contain this_cpu.
4409  * @balance: Should we balance.
4410  * @sgs: variable to hold the statistics for this group.
4411  */
4412 static inline void update_sg_lb_stats(struct lb_env *env,
4413 			struct sched_group *group, int load_idx,
4414 			int local_group, int *balance, struct sg_lb_stats *sgs)
4415 {
4416 	unsigned long nr_running, max_nr_running, min_nr_running;
4417 	unsigned long load, max_cpu_load, min_cpu_load;
4418 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
4419 	unsigned long avg_load_per_task = 0;
4420 	int i;
4421 
4422 	if (local_group)
4423 		balance_cpu = group_balance_cpu(group);
4424 
4425 	/* Tally up the load of all CPUs in the group */
4426 	max_cpu_load = 0;
4427 	min_cpu_load = ~0UL;
4428 	max_nr_running = 0;
4429 	min_nr_running = ~0UL;
4430 
4431 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4432 		struct rq *rq = cpu_rq(i);
4433 
4434 		nr_running = rq->nr_running;
4435 
4436 		/* Bias balancing toward cpus of our domain */
4437 		if (local_group) {
4438 			if (idle_cpu(i) && !first_idle_cpu &&
4439 					cpumask_test_cpu(i, sched_group_mask(group))) {
4440 				first_idle_cpu = 1;
4441 				balance_cpu = i;
4442 			}
4443 
4444 			load = target_load(i, load_idx);
4445 		} else {
4446 			load = source_load(i, load_idx);
4447 			if (load > max_cpu_load)
4448 				max_cpu_load = load;
4449 			if (min_cpu_load > load)
4450 				min_cpu_load = load;
4451 
4452 			if (nr_running > max_nr_running)
4453 				max_nr_running = nr_running;
4454 			if (min_nr_running > nr_running)
4455 				min_nr_running = nr_running;
4456 		}
4457 
4458 		sgs->group_load += load;
4459 		sgs->sum_nr_running += nr_running;
4460 		sgs->sum_weighted_load += weighted_cpuload(i);
4461 		if (idle_cpu(i))
4462 			sgs->idle_cpus++;
4463 	}
4464 
4465 	/*
4466 	 * The first idle cpu or the first cpu (busiest) in this sched group
4467 	 * is eligible for doing load balancing at this and above
4468 	 * domains. In the newly idle case, we allow all the cpus
4469 	 * to do the newly idle load balance.
4470 	 */
4471 	if (local_group) {
4472 		if (env->idle != CPU_NEWLY_IDLE) {
4473 			if (balance_cpu != env->dst_cpu) {
4474 				*balance = 0;
4475 				return;
4476 			}
4477 			update_group_power(env->sd, env->dst_cpu);
4478 		} else if (time_after_eq(jiffies, group->sgp->next_update))
4479 			update_group_power(env->sd, env->dst_cpu);
4480 	}
4481 
4482 	/* Adjust by relative CPU power of the group */
4483 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4484 
4485 	/*
4486 	 * Consider the group unbalanced when the imbalance is larger
4487 	 * than the average weight of a task.
4488 	 *
4489 	 * APZ: with cgroup the avg task weight can vary wildly and
4490 	 *      might not be a suitable number - should we keep a
4491 	 *      normalized nr_running number somewhere that negates
4492 	 *      the hierarchy?
4493 	 */
4494 	if (sgs->sum_nr_running)
4495 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4496 
4497 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4498 	    (max_nr_running - min_nr_running) > 1)
4499 		sgs->group_imb = 1;
4500 
4501 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4502 						SCHED_POWER_SCALE);
4503 	if (!sgs->group_capacity)
4504 		sgs->group_capacity = fix_small_capacity(env->sd, group);
4505 	sgs->group_weight = group->group_weight;
4506 
4507 	if (sgs->group_capacity > sgs->sum_nr_running)
4508 		sgs->group_has_capacity = 1;
4509 }
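/*
 * Example of the group_imb test with made-up numbers (assuming
 * weighted_cpuload() matches the per-cpu loads used above): a remote group
 * of two cpus with loads 3072 and 1024 and nr_running 3 and 1 has
 * sum_weighted_load 4096 over 4 tasks, so avg_load_per_task is 1024; since
 * 3072 - 1024 >= 1024 and 3 - 1 > 1, the group is flagged imbalanced.
 */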
4510 
4511 /**
4512  * update_sd_pick_busiest - return 1 on busiest group
4513  * @env: The load balancing environment.
4514  * @sds: sched_domain statistics
4515  * @sg: sched_group candidate to be checked for being the busiest
4516  * @sgs: sched_group statistics
4517  *
4518  * Determine if @sg is a busier group than the previously selected
4519  * busiest group.
4520  */
4521 static bool update_sd_pick_busiest(struct lb_env *env,
4522 				   struct sd_lb_stats *sds,
4523 				   struct sched_group *sg,
4524 				   struct sg_lb_stats *sgs)
4525 {
4526 	if (sgs->avg_load <= sds->max_load)
4527 		return false;
4528 
4529 	if (sgs->sum_nr_running > sgs->group_capacity)
4530 		return true;
4531 
4532 	if (sgs->group_imb)
4533 		return true;
4534 
4535 	/*
4536 	 * ASYM_PACKING needs to move all the work to the lowest
4537 	 * numbered CPUs in the group, therefore mark all groups
4538 	 * higher than ourselves as busy.
4539 	 */
4540 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4541 	    env->dst_cpu < group_first_cpu(sg)) {
4542 		if (!sds->busiest)
4543 			return true;
4544 
4545 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4546 			return true;
4547 	}
4548 
4549 	return false;
4550 }
4551 
4552 /**
4553  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4554  * @env: The load balancing environment.
4555  * @balance: Should we balance.
4556  * @sds: variable to hold the statistics for this sched_domain.
4557  */
4558 static inline void update_sd_lb_stats(struct lb_env *env,
4559 					int *balance, struct sd_lb_stats *sds)
4560 {
4561 	struct sched_domain *child = env->sd->child;
4562 	struct sched_group *sg = env->sd->groups;
4563 	struct sg_lb_stats sgs;
4564 	int load_idx, prefer_sibling = 0;
4565 
4566 	if (child && child->flags & SD_PREFER_SIBLING)
4567 		prefer_sibling = 1;
4568 
4569 	load_idx = get_sd_load_idx(env->sd, env->idle);
4570 
4571 	do {
4572 		int local_group;
4573 
4574 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4575 		memset(&sgs, 0, sizeof(sgs));
4576 		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4577 
4578 		if (local_group && !(*balance))
4579 			return;
4580 
4581 		sds->total_load += sgs.group_load;
4582 		sds->total_pwr += sg->sgp->power;
4583 
4584 		/*
4585 		 * In case the child domain prefers tasks go to siblings
4586 		 * first, lower the sg capacity to one so that we'll try
4587 		 * and move all the excess tasks away. We lower the capacity
4588 		 * of a group only if the local group has the capacity to fit
4589 		 * these excess tasks, i.e. nr_running < group_capacity. The
4590 		 * extra check prevents the case where you always pull from the
4591 		 * heaviest group when it is already under-utilized (possible
4592 		 * when a large-weight task outweighs the tasks on the system).
4593 		 */
4594 		if (prefer_sibling && !local_group && sds->this_has_capacity)
4595 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
4596 
4597 		if (local_group) {
4598 			sds->this_load = sgs.avg_load;
4599 			sds->this = sg;
4600 			sds->this_nr_running = sgs.sum_nr_running;
4601 			sds->this_load_per_task = sgs.sum_weighted_load;
4602 			sds->this_has_capacity = sgs.group_has_capacity;
4603 			sds->this_idle_cpus = sgs.idle_cpus;
4604 		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4605 			sds->max_load = sgs.avg_load;
4606 			sds->busiest = sg;
4607 			sds->busiest_nr_running = sgs.sum_nr_running;
4608 			sds->busiest_idle_cpus = sgs.idle_cpus;
4609 			sds->busiest_group_capacity = sgs.group_capacity;
4610 			sds->busiest_load_per_task = sgs.sum_weighted_load;
4611 			sds->busiest_has_capacity = sgs.group_has_capacity;
4612 			sds->busiest_group_weight = sgs.group_weight;
4613 			sds->group_imb = sgs.group_imb;
4614 		}
4615 
4616 		sg = sg->next;
4617 	} while (sg != env->sd->groups);
4618 }
4619 
4620 /**
4621  * check_asym_packing - Check to see if the group is packed into the
4622  *			sched domain.
4623  *
4624  * This is primarily intended to be used at the sibling level.  Some
4625  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
4626  * case of POWER7, it can move to lower SMT modes only when higher
4627  * threads are idle.  When in lower SMT modes, the threads will
4628  * perform better since they share less core resources.  Hence when we
4629  * have idle threads, we want them to be the higher ones.
4630  *
4631  * This packing function is run on idle threads.  It checks to see if
4632  * the busiest CPU in this domain (core in the P7 case) has a higher
4633  * CPU number than the packing function is being run on.  Here we are
4634  * assuming a lower CPU number will be equivalent to a lower SMT thread
4635  * number.
4636  *
4637  * Returns 1 when packing is required and a task should be moved to
4638  * this CPU.  The amount of the imbalance is returned in env->imbalance.
4639  *
4640  * @env: The load balancing environment.
4641  * @sds: Statistics of the sched_domain which is to be packed
4642  */
4643 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4644 {
4645 	int busiest_cpu;
4646 
4647 	if (!(env->sd->flags & SD_ASYM_PACKING))
4648 		return 0;
4649 
4650 	if (!sds->busiest)
4651 		return 0;
4652 
4653 	busiest_cpu = group_first_cpu(sds->busiest);
4654 	if (env->dst_cpu > busiest_cpu)
4655 		return 0;
4656 
4657 	env->imbalance = DIV_ROUND_CLOSEST(
4658 		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4659 
4660 	return 1;
4661 }
4662 
4663 /**
4664  * fix_small_imbalance - Calculate the minor imbalance that exists
4665  *			amongst the groups of a sched_domain, during
4666  *			load balancing.
4667  * @env: The load balancing environment.
4668  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4669  */
4670 static inline
4671 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4672 {
4673 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
4674 	unsigned int imbn = 2;
4675 	unsigned long scaled_busy_load_per_task;
4676 
4677 	if (sds->this_nr_running) {
4678 		sds->this_load_per_task /= sds->this_nr_running;
4679 		if (sds->busiest_load_per_task >
4680 				sds->this_load_per_task)
4681 			imbn = 1;
4682 	} else {
4683 		sds->this_load_per_task =
4684 			cpu_avg_load_per_task(env->dst_cpu);
4685 	}
4686 
4687 	scaled_busy_load_per_task = sds->busiest_load_per_task
4688 					 * SCHED_POWER_SCALE;
4689 	scaled_busy_load_per_task /= sds->busiest->sgp->power;
4690 
4691 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4692 			(scaled_busy_load_per_task * imbn)) {
4693 		env->imbalance = sds->busiest_load_per_task;
4694 		return;
4695 	}
4696 
4697 	/*
4698 	 * OK, we don't have enough imbalance to justify moving tasks,
4699 	 * however we may be able to increase total CPU power used by
4700 	 * moving them.
4701 	 */
4702 
4703 	pwr_now += sds->busiest->sgp->power *
4704 			min(sds->busiest_load_per_task, sds->max_load);
4705 	pwr_now += sds->this->sgp->power *
4706 			min(sds->this_load_per_task, sds->this_load);
4707 	pwr_now /= SCHED_POWER_SCALE;
4708 
4709 	/* Amount of load we'd subtract */
4710 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4711 		sds->busiest->sgp->power;
4712 	if (sds->max_load > tmp)
4713 		pwr_move += sds->busiest->sgp->power *
4714 			min(sds->busiest_load_per_task, sds->max_load - tmp);
4715 
4716 	/* Amount of load we'd add */
4717 	if (sds->max_load * sds->busiest->sgp->power <
4718 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
4719 		tmp = (sds->max_load * sds->busiest->sgp->power) /
4720 			sds->this->sgp->power;
4721 	else
4722 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4723 			sds->this->sgp->power;
4724 	pwr_move += sds->this->sgp->power *
4725 			min(sds->this_load_per_task, sds->this_load + tmp);
4726 	pwr_move /= SCHED_POWER_SCALE;
4727 
4728 	/* Move if we gain throughput */
4729 	if (pwr_move > pwr_now)
4730 		env->imbalance = sds->busiest_load_per_task;
4731 }
4732 
4733 /**
4734  * calculate_imbalance - Calculate the amount of imbalance present within the
4735  *			 groups of a given sched_domain during load balance.
4736  * @env: load balance environment
4737  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4738  */
4739 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4740 {
4741 	unsigned long max_pull, load_above_capacity = ~0UL;
4742 
4743 	sds->busiest_load_per_task /= sds->busiest_nr_running;
4744 	if (sds->group_imb) {
4745 		sds->busiest_load_per_task =
4746 			min(sds->busiest_load_per_task, sds->avg_load);
4747 	}
4748 
4749 	/*
4750 	 * In the presence of smp nice balancing, certain scenarios can have
4751 	 * max load less than avg load (as we skip the groups at or below
4752 	 * their cpu_power while calculating max_load).
4753 	 */
4754 	if (sds->max_load < sds->avg_load) {
4755 		env->imbalance = 0;
4756 		return fix_small_imbalance(env, sds);
4757 	}
4758 
4759 	if (!sds->group_imb) {
4760 		/*
4761 		 * Don't want to pull so many tasks that a group would go idle.
4762 		 */
4763 		load_above_capacity = (sds->busiest_nr_running -
4764 						sds->busiest_group_capacity);
4765 
4766 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4767 
4768 		load_above_capacity /= sds->busiest->sgp->power;
4769 	}
4770 
4771 	/*
4772 	 * We're trying to get all the cpus to the average_load, so we don't
4773 	 * want to push ourselves above the average load, nor do we wish to
4774 	 * reduce the max loaded cpu below the average load. At the same time,
4775 	 * we also don't want to reduce the group load below the group capacity
4776 	 * (so that we can implement power-savings policies etc). Thus we look
4777 	 * for the minimum possible imbalance.
4778 	 * Be careful of negative numbers as they'll appear as very large values
4779 	 * with unsigned longs.
4780 	 */
4781 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4782 
4783 	/* How much load to actually move to equalise the imbalance */
4784 	env->imbalance = min(max_pull * sds->busiest->sgp->power,
4785 		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
4786 			/ SCHED_POWER_SCALE;
4787 
4788 	/*
4789 	 * If env->imbalance is less than the average load per runnable task,
4790 	 * there is no guarantee that any tasks will be moved, so we have
4791 	 * a think about bumping its value to force at least one task to
4792 	 * be moved.
4793 	 */
4794 	if (env->imbalance < sds->busiest_load_per_task)
4795 		return fix_small_imbalance(env, sds);
4796 
4797 }
4798 
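/*
 * A small worked case (hypothetical numbers, both groups at power 1024):
 * with max_load 2048, avg_load 1024, this_load 512 and load_above_capacity
 * larger than their difference, max_pull = 2048 - 1024 = 1024 and
 * env->imbalance = min(1024 * 1024, (1024 - 512) * 1024) / 1024 = 512.
 */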
4799 /******* find_busiest_group() helpers end here *********************/
4800 
4801 /**
4802  * find_busiest_group - Returns the busiest group within the sched_domain
4803  * if there is an imbalance. If there isn't an imbalance, and
4804  * the user has opted for power-savings, it returns a group whose
4805  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4806  * such a group exists.
4807  *
4808  * Also calculates the amount of weighted load which should be moved
4809  * to restore balance.
4810  *
4811  * @env: The load balancing environment.
4812  * @balance: Pointer to a variable indicating if this_cpu
4813  *	is the appropriate cpu to perform load balancing at this level.
4814  *
4815  * Returns:	- the busiest group if imbalance exists.
4816  *		- If no imbalance and user has opted for power-savings balance,
4817  *		   return the least loaded group whose CPUs can be
4818  *		   put to idle by rebalancing its tasks onto our group.
4819  */
4820 static struct sched_group *
4821 find_busiest_group(struct lb_env *env, int *balance)
4822 {
4823 	struct sd_lb_stats sds;
4824 
4825 	memset(&sds, 0, sizeof(sds));
4826 
4827 	/*
4828 	 * Compute the various statistics relevant for load balancing at
4829 	 * this level.
4830 	 */
4831 	update_sd_lb_stats(env, balance, &sds);
4832 
4833 	/*
4834 	 * this_cpu is not the appropriate cpu to perform load balancing at
4835 	 * this level.
4836 	 */
4837 	if (!(*balance))
4838 		goto ret;
4839 
4840 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4841 	    check_asym_packing(env, &sds))
4842 		return sds.busiest;
4843 
4844 	/* There is no busy sibling group to pull tasks from */
4845 	if (!sds.busiest || sds.busiest_nr_running == 0)
4846 		goto out_balanced;
4847 
4848 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4849 
4850 	/*
4851 	 * If the busiest group is imbalanced, the below checks don't
4852 	 * work because they assume all things are equal, which typically
4853 	 * isn't true due to cpus_allowed constraints and the like.
4854 	 */
4855 	if (sds.group_imb)
4856 		goto force_balance;
4857 
4858 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4859 	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4860 			!sds.busiest_has_capacity)
4861 		goto force_balance;
4862 
4863 	/*
4864 	 * If the local group is more busy than the selected busiest group
4865 	 * don't try and pull any tasks.
4866 	 */
4867 	if (sds.this_load >= sds.max_load)
4868 		goto out_balanced;
4869 
4870 	/*
4871 	 * Don't pull any tasks if this group is already above the domain
4872 	 * average load.
4873 	 */
4874 	if (sds.this_load >= sds.avg_load)
4875 		goto out_balanced;
4876 
4877 	if (env->idle == CPU_IDLE) {
4878 		/*
4879 		 * This cpu is idle. If the busiest group doesn't have
4880 		 * more tasks than the number of available cpus and
4881 		 * there is no imbalance between this and the busiest group
4882 		 * wrt idle cpus, it is balanced.
4883 		 */
4884 		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4885 		    sds.busiest_nr_running <= sds.busiest_group_weight)
4886 			goto out_balanced;
4887 	} else {
4888 		/*
4889 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4890 		 * imbalance_pct to be conservative.
4891 		 */
4892 		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4893 			goto out_balanced;
4894 	}
4895 
4896 force_balance:
4897 	/* Looks like there is an imbalance. Compute it */
4898 	calculate_imbalance(env, &sds);
4899 	return sds.busiest;
4900 
4901 out_balanced:
4902 ret:
4903 	env->imbalance = 0;
4904 	return NULL;
4905 }
4906 
4907 /*
4908  * find_busiest_queue - find the busiest runqueue among the cpus in group.
4909  */
4910 static struct rq *find_busiest_queue(struct lb_env *env,
4911 				     struct sched_group *group)
4912 {
4913 	struct rq *busiest = NULL, *rq;
4914 	unsigned long max_load = 0;
4915 	int i;
4916 
4917 	for_each_cpu(i, sched_group_cpus(group)) {
4918 		unsigned long power = power_of(i);
4919 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
4920 							   SCHED_POWER_SCALE);
4921 		unsigned long wl;
4922 
4923 		if (!capacity)
4924 			capacity = fix_small_capacity(env->sd, group);
4925 
4926 		if (!cpumask_test_cpu(i, env->cpus))
4927 			continue;
4928 
4929 		rq = cpu_rq(i);
4930 		wl = weighted_cpuload(i);
4931 
4932 		/*
4933 		 * When comparing with imbalance, use weighted_cpuload()
4934 		 * which is not scaled with the cpu power.
4935 		 */
4936 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4937 			continue;
4938 
4939 		/*
4940 		 * For the load comparisons with the other cpu's, consider
4941 		 * the weighted_cpuload() scaled with the cpu power, so that
4942 		 * the load can be moved away from the cpu that is potentially
4943 		 * running at a lower capacity.
4944 		 */
4945 		wl = (wl * SCHED_POWER_SCALE) / power;
4946 
4947 		if (wl > max_load) {
4948 			max_load = wl;
4949 			busiest = rq;
4950 		}
4951 	}
4952 
4953 	return busiest;
4954 }
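/*
 * The power scaling above means that, for example, a cpu at power 512 with a
 * weighted_cpuload() of 1024 scores wl = 1024 * 1024 / 512 = 2048 and is
 * preferred as busiest over a full-power (1024) cpu carrying the same raw
 * load, so work is pulled away from the weaker cpu first.
 */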
4955 
4956 /*
4957  * Max backoff if we encounter pinned tasks. Pretty arbitrary value; any
4958  * value works so long as it is large enough.
4959  */
4960 #define MAX_PINNED_INTERVAL	512
4961 
4962 /* Working cpumask for load_balance and load_balance_newidle. */
4963 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4964 
4965 static int need_active_balance(struct lb_env *env)
4966 {
4967 	struct sched_domain *sd = env->sd;
4968 
4969 	if (env->idle == CPU_NEWLY_IDLE) {
4970 
4971 		/*
4972 		 * ASYM_PACKING needs to force migrate tasks from busy but
4973 		 * higher numbered CPUs in order to pack all tasks in the
4974 		 * lowest numbered CPUs.
4975 		 */
4976 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4977 			return 1;
4978 	}
4979 
4980 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4981 }
4982 
4983 static int active_load_balance_cpu_stop(void *data);
4984 
4985 /*
4986  * Check this_cpu to ensure it is balanced within domain. Attempt to move
4987  * tasks if there is an imbalance.
4988  */
4989 static int load_balance(int this_cpu, struct rq *this_rq,
4990 			struct sched_domain *sd, enum cpu_idle_type idle,
4991 			int *balance)
4992 {
4993 	int ld_moved, cur_ld_moved, active_balance = 0;
4994 	int lb_iterations, max_lb_iterations;
4995 	struct sched_group *group;
4996 	struct rq *busiest;
4997 	unsigned long flags;
4998 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4999 
5000 	struct lb_env env = {
5001 		.sd		= sd,
5002 		.dst_cpu	= this_cpu,
5003 		.dst_rq		= this_rq,
5004 		.dst_grpmask    = sched_group_cpus(sd->groups),
5005 		.idle		= idle,
5006 		.loop_break	= sched_nr_migrate_break,
5007 		.cpus		= cpus,
5008 	};
5009 
5010 	cpumask_copy(cpus, cpu_active_mask);
5011 	max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 
5013 	schedstat_inc(sd, lb_count[idle]);
5014 
5015 redo:
5016 	group = find_busiest_group(&env, balance);
5017 
5018 	if (*balance == 0)
5019 		goto out_balanced;
5020 
5021 	if (!group) {
5022 		schedstat_inc(sd, lb_nobusyg[idle]);
5023 		goto out_balanced;
5024 	}
5025 
5026 	busiest = find_busiest_queue(&env, group);
5027 	if (!busiest) {
5028 		schedstat_inc(sd, lb_nobusyq[idle]);
5029 		goto out_balanced;
5030 	}
5031 
5032 	BUG_ON(busiest == env.dst_rq);
5033 
5034 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 
5036 	ld_moved = 0;
5037 	lb_iterations = 1;
5038 	if (busiest->nr_running > 1) {
5039 		/*
5040 		 * Attempt to move tasks. If find_busiest_group has found
5041 		 * an imbalance but busiest->nr_running <= 1, the group is
5042 		 * still unbalanced. ld_moved simply stays zero, so it is
5043 		 * correctly treated as an imbalance.
5044 		 */
5045 		env.flags |= LBF_ALL_PINNED;
5046 		env.src_cpu   = busiest->cpu;
5047 		env.src_rq    = busiest;
5048 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
5049 
5050 		update_h_load(env.src_cpu);
5051 more_balance:
5052 		local_irq_save(flags);
5053 		double_rq_lock(env.dst_rq, busiest);
5054 
5055 		/*
5056 		 * cur_ld_moved - load moved in current iteration
5057 		 * ld_moved     - cumulative load moved across iterations
5058 		 */
5059 		cur_ld_moved = move_tasks(&env);
5060 		ld_moved += cur_ld_moved;
5061 		double_rq_unlock(env.dst_rq, busiest);
5062 		local_irq_restore(flags);
5063 
5064 		if (env.flags & LBF_NEED_BREAK) {
5065 			env.flags &= ~LBF_NEED_BREAK;
5066 			goto more_balance;
5067 		}
5068 
5069 		/*
5070 		 * some other cpu did the load balance for us.
5071 		 */
5072 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 			resched_cpu(env.dst_cpu);
5074 
5075 		/*
5076 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 		 * us and move them to an alternate dst_cpu in our sched_group
5078 		 * where they can run. The upper limit on how many times we
5079 		 * iterate on same src_cpu is dependent on number of cpus in our
5080 		 * sched_group.
5081 		 *
5082 		 * This changes load balance semantics a bit on who can move
5083 		 * load to a given_cpu. In addition to the given_cpu itself
5084 		 * (or an ilb_cpu acting on its behalf where given_cpu is
5085 		 * nohz-idle), we now have balance_cpu in a position to move
5086 		 * load to given_cpu. In rare situations, this may cause
5087 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5088 		 * _independently_ and at _same_ time to move some load to
5089 		 * given_cpu) causing excess load to be moved to given_cpu.
5090 		 * This however should not happen so much in practice and
5091 		 * moreover subsequent load balance cycles should correct the
5092 		 * excess load moved.
5093 		 */
5094 		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5095 				lb_iterations++ < max_lb_iterations) {
5096 
5097 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
5098 			env.dst_cpu	 = env.new_dst_cpu;
5099 			env.flags	&= ~LBF_SOME_PINNED;
5100 			env.loop	 = 0;
5101 			env.loop_break	 = sched_nr_migrate_break;
5102 			/*
5103 			 * Go back to "more_balance" rather than "redo" since we
5104 			 * need to continue with same src_cpu.
5105 			 */
5106 			goto more_balance;
5107 		}
5108 
5109 		/* All tasks on this runqueue were pinned by CPU affinity */
5110 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
5111 			cpumask_clear_cpu(cpu_of(busiest), cpus);
5112 			if (!cpumask_empty(cpus)) {
5113 				env.loop = 0;
5114 				env.loop_break = sched_nr_migrate_break;
5115 				goto redo;
5116 			}
5117 			goto out_balanced;
5118 		}
5119 	}
5120 
5121 	if (!ld_moved) {
5122 		schedstat_inc(sd, lb_failed[idle]);
5123 		/*
5124 		 * Increment the failure counter only on periodic balance.
5125 		 * We do not want newidle balance, which can be very
5126 		 * frequent, to pollute the failure counter, causing
5127 		 * excessive cache_hot migrations and active balances.
5128 		 */
5129 		if (idle != CPU_NEWLY_IDLE)
5130 			sd->nr_balance_failed++;
5131 
5132 		if (need_active_balance(&env)) {
5133 			raw_spin_lock_irqsave(&busiest->lock, flags);
5134 
5135 			/* don't kick the active_load_balance_cpu_stop,
5136 			 * if the curr task on busiest cpu can't be
5137 			 * moved to this_cpu
5138 			 */
5139 			if (!cpumask_test_cpu(this_cpu,
5140 					tsk_cpus_allowed(busiest->curr))) {
5141 				raw_spin_unlock_irqrestore(&busiest->lock,
5142 							    flags);
5143 				env.flags |= LBF_ALL_PINNED;
5144 				goto out_one_pinned;
5145 			}
5146 
5147 			/*
5148 			 * ->active_balance synchronizes accesses to
5149 			 * ->active_balance_work.  Once set, it's cleared
5150 			 * only after active load balance is finished.
5151 			 */
5152 			if (!busiest->active_balance) {
5153 				busiest->active_balance = 1;
5154 				busiest->push_cpu = this_cpu;
5155 				active_balance = 1;
5156 			}
5157 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
5158 
5159 			if (active_balance) {
5160 				stop_one_cpu_nowait(cpu_of(busiest),
5161 					active_load_balance_cpu_stop, busiest,
5162 					&busiest->active_balance_work);
5163 			}
5164 
5165 			/*
5166 			 * We've kicked active balancing, reset the failure
5167 			 * counter.
5168 			 */
5169 			sd->nr_balance_failed = sd->cache_nice_tries+1;
5170 		}
5171 	} else
5172 		sd->nr_balance_failed = 0;
5173 
5174 	if (likely(!active_balance)) {
5175 		/* We were unbalanced, so reset the balancing interval */
5176 		sd->balance_interval = sd->min_interval;
5177 	} else {
5178 		/*
5179 		 * If we've begun active balancing, start to back off. This
5180 		 * case may not be covered by the all_pinned logic if there
5181 		 * is only 1 task on the busy runqueue (because we don't call
5182 		 * move_tasks).
5183 		 */
5184 		if (sd->balance_interval < sd->max_interval)
5185 			sd->balance_interval *= 2;
5186 	}
5187 
5188 	goto out;
5189 
5190 out_balanced:
5191 	schedstat_inc(sd, lb_balanced[idle]);
5192 
5193 	sd->nr_balance_failed = 0;
5194 
5195 out_one_pinned:
5196 	/* tune up the balancing interval */
5197 	if (((env.flags & LBF_ALL_PINNED) &&
5198 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
5199 			(sd->balance_interval < sd->max_interval))
5200 		sd->balance_interval *= 2;
5201 
5202 	ld_moved = 0;
5203 out:
5204 	return ld_moved;
5205 }
5206 
5207 /*
5208  * idle_balance is called by schedule() if this_cpu is about to become
5209  * idle. Attempts to pull tasks from other CPUs.
5210  */
5211 void idle_balance(int this_cpu, struct rq *this_rq)
5212 {
5213 	struct sched_domain *sd;
5214 	int pulled_task = 0;
5215 	unsigned long next_balance = jiffies + HZ;
5216 
5217 	this_rq->idle_stamp = this_rq->clock;
5218 
5219 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 		return;
5221 
5222 	update_rq_runnable_avg(this_rq, 1);
5223 
5224 	/*
5225 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 	 */
5227 	raw_spin_unlock(&this_rq->lock);
5228 
5229 	update_blocked_averages(this_cpu);
5230 	rcu_read_lock();
5231 	for_each_domain(this_cpu, sd) {
5232 		unsigned long interval;
5233 		int balance = 1;
5234 
5235 		if (!(sd->flags & SD_LOAD_BALANCE))
5236 			continue;
5237 
5238 		if (sd->flags & SD_BALANCE_NEWIDLE) {
5239 			/* If we've pulled tasks over stop searching: */
5240 			pulled_task = load_balance(this_cpu, this_rq,
5241 						   sd, CPU_NEWLY_IDLE, &balance);
5242 		}
5243 
5244 		interval = msecs_to_jiffies(sd->balance_interval);
5245 		if (time_after(next_balance, sd->last_balance + interval))
5246 			next_balance = sd->last_balance + interval;
5247 		if (pulled_task) {
5248 			this_rq->idle_stamp = 0;
5249 			break;
5250 		}
5251 	}
5252 	rcu_read_unlock();
5253 
5254 	raw_spin_lock(&this_rq->lock);
5255 
5256 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5257 		/*
5258 		 * We are going idle. next_balance may be set based on
5259 		 * a busy processor. So reset next_balance.
5260 		 */
5261 		this_rq->next_balance = next_balance;
5262 	}
5263 }
5264 
5265 /*
5266  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5267  * running tasks off the busiest CPU onto idle CPUs. It requires at
5268  * least 1 task to be running on each physical CPU where possible, and
5269  * avoids physical / logical imbalances.
5270  */
5271 static int active_load_balance_cpu_stop(void *data)
5272 {
5273 	struct rq *busiest_rq = data;
5274 	int busiest_cpu = cpu_of(busiest_rq);
5275 	int target_cpu = busiest_rq->push_cpu;
5276 	struct rq *target_rq = cpu_rq(target_cpu);
5277 	struct sched_domain *sd;
5278 
5279 	raw_spin_lock_irq(&busiest_rq->lock);
5280 
5281 	/* make sure the requested cpu hasn't gone down in the meantime */
5282 	if (unlikely(busiest_cpu != smp_processor_id() ||
5283 		     !busiest_rq->active_balance))
5284 		goto out_unlock;
5285 
5286 	/* Is there any task to move? */
5287 	if (busiest_rq->nr_running <= 1)
5288 		goto out_unlock;
5289 
5290 	/*
5291 	 * This condition is "impossible", if it occurs
5292 	 * we need to fix it. Originally reported by
5293 	 * Bjorn Helgaas on a 128-cpu setup.
5294 	 */
5295 	BUG_ON(busiest_rq == target_rq);
5296 
5297 	/* move a task from busiest_rq to target_rq */
5298 	double_lock_balance(busiest_rq, target_rq);
5299 
5300 	/* Search for an sd spanning us and the target CPU. */
5301 	rcu_read_lock();
5302 	for_each_domain(target_cpu, sd) {
5303 		if ((sd->flags & SD_LOAD_BALANCE) &&
5304 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5305 				break;
5306 	}
5307 
5308 	if (likely(sd)) {
5309 		struct lb_env env = {
5310 			.sd		= sd,
5311 			.dst_cpu	= target_cpu,
5312 			.dst_rq		= target_rq,
5313 			.src_cpu	= busiest_rq->cpu,
5314 			.src_rq		= busiest_rq,
5315 			.idle		= CPU_IDLE,
5316 		};
5317 
5318 		schedstat_inc(sd, alb_count);
5319 
5320 		if (move_one_task(&env))
5321 			schedstat_inc(sd, alb_pushed);
5322 		else
5323 			schedstat_inc(sd, alb_failed);
5324 	}
5325 	rcu_read_unlock();
5326 	double_unlock_balance(busiest_rq, target_rq);
5327 out_unlock:
5328 	busiest_rq->active_balance = 0;
5329 	raw_spin_unlock_irq(&busiest_rq->lock);
5330 	return 0;
5331 }
5332 
5333 #ifdef CONFIG_NO_HZ
5334 /*
5335  * idle load balancing details
5336  * - When one of the busy CPUs notices that idle rebalancing may be
5337  *   needed, it kicks the idle load balancer, which then does idle
5338  *   load balancing for all the idle CPUs.
5339  */
5340 static struct {
5341 	cpumask_var_t idle_cpus_mask;
5342 	atomic_t nr_cpus;
5343 	unsigned long next_balance;     /* in jiffy units */
5344 } nohz ____cacheline_aligned;
5345 
5346 static inline int find_new_ilb(int call_cpu)
5347 {
5348 	int ilb = cpumask_first(nohz.idle_cpus_mask);
5349 
5350 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
5351 		return ilb;
5352 
5353 	return nr_cpu_ids;
5354 }
5355 
5356 /*
5357  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5358  * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
5359  * CPU (if there is one).
5360  */
5361 static void nohz_balancer_kick(int cpu)
5362 {
5363 	int ilb_cpu;
5364 
5365 	nohz.next_balance++;
5366 
5367 	ilb_cpu = find_new_ilb(cpu);
5368 
5369 	if (ilb_cpu >= nr_cpu_ids)
5370 		return;
5371 
5372 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5373 		return;
5374 	/*
5375 	 * Use smp_send_reschedule() instead of resched_cpu().
5376 	 * This way we generate a sched IPI on the target cpu which
5377 	 * is idle. And the softirq performing nohz idle load balance
5378 	 * will be run before returning from the IPI.
5379 	 */
5380 	smp_send_reschedule(ilb_cpu);
5381 	return;
5382 }
5383 
5384 static inline void nohz_balance_exit_idle(int cpu)
5385 {
5386 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5387 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5388 		atomic_dec(&nohz.nr_cpus);
5389 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5390 	}
5391 }
5392 
5393 static inline void set_cpu_sd_state_busy(void)
5394 {
5395 	struct sched_domain *sd;
5396 	int cpu = smp_processor_id();
5397 
5398 	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 		return;
5400 	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401 
5402 	rcu_read_lock();
5403 	for_each_domain(cpu, sd)
5404 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5405 	rcu_read_unlock();
5406 }
5407 
5408 void set_cpu_sd_state_idle(void)
5409 {
5410 	struct sched_domain *sd;
5411 	int cpu = smp_processor_id();
5412 
5413 	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 		return;
5415 	set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416 
5417 	rcu_read_lock();
5418 	for_each_domain(cpu, sd)
5419 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5420 	rcu_read_unlock();
5421 }
5422 
5423 /*
5424  * This routine will record that the cpu is going idle with tick stopped.
5425  * This info will be used in performing idle load balancing in the future.
5426  */
5427 void nohz_balance_enter_idle(int cpu)
5428 {
5429 	/*
5430 	 * If this cpu is going down, then nothing needs to be done.
5431 	 */
5432 	if (!cpu_active(cpu))
5433 		return;
5434 
5435 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5436 		return;
5437 
5438 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5439 	atomic_inc(&nohz.nr_cpus);
5440 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5441 }
5442 
5443 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5444 					unsigned long action, void *hcpu)
5445 {
5446 	switch (action & ~CPU_TASKS_FROZEN) {
5447 	case CPU_DYING:
5448 		nohz_balance_exit_idle(smp_processor_id());
5449 		return NOTIFY_OK;
5450 	default:
5451 		return NOTIFY_DONE;
5452 	}
5453 }
5454 #endif
5455 
5456 static DEFINE_SPINLOCK(balancing);
5457 
5458 /*
5459  * Scale the max load_balance interval with the number of CPUs in the system.
5460  * This trades load-balance latency on larger machines for less cross talk.
5461  */
5462 void update_max_interval(void)
5463 {
5464 	max_load_balance_interval = HZ*num_online_cpus()/10;
5465 }
5466 
5467 /*
5468  * It checks each scheduling domain to see if it is due to be balanced,
5469  * and initiates a balancing operation if so.
5470  *
5471  * Balancing parameters are set up when the sched domains are built.
5472  */
5473 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474 {
5475 	int balance = 1;
5476 	struct rq *rq = cpu_rq(cpu);
5477 	unsigned long interval;
5478 	struct sched_domain *sd;
5479 	/* Earliest time when we have to do rebalance again */
5480 	unsigned long next_balance = jiffies + 60*HZ;
5481 	int update_next_balance = 0;
5482 	int need_serialize;
5483 
5484 	update_blocked_averages(cpu);
5485 
5486 	rcu_read_lock();
5487 	for_each_domain(cpu, sd) {
5488 		if (!(sd->flags & SD_LOAD_BALANCE))
5489 			continue;
5490 
5491 		interval = sd->balance_interval;
5492 		if (idle != CPU_IDLE)
5493 			interval *= sd->busy_factor;
5494 
5495 		/* scale ms to jiffies */
5496 		interval = msecs_to_jiffies(interval);
5497 		interval = clamp(interval, 1UL, max_load_balance_interval);
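		/*
		 * Illustrative numbers only: a balance_interval of 64ms with
		 * a busy_factor of 32 gives 2048ms, i.e. 512 jiffies at
		 * HZ=250, which the clamp above then bounds by
		 * max_load_balance_interval.
		 */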
5498 
5499 		need_serialize = sd->flags & SD_SERIALIZE;
5500 
5501 		if (need_serialize) {
5502 			if (!spin_trylock(&balancing))
5503 				goto out;
5504 		}
5505 
5506 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 			if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 				/*
5509 				 * We've pulled tasks over so we're no
5510 				 * longer idle.
5511 				 */
5512 				idle = CPU_NOT_IDLE;
5513 			}
5514 			sd->last_balance = jiffies;
5515 		}
5516 		if (need_serialize)
5517 			spin_unlock(&balancing);
5518 out:
5519 		if (time_after(next_balance, sd->last_balance + interval)) {
5520 			next_balance = sd->last_balance + interval;
5521 			update_next_balance = 1;
5522 		}
5523 
5524 		/*
5525 		 * Stop the load balance at this level. There is another
5526 		 * CPU in our sched group which is doing load balancing more
5527 		 * actively.
5528 		 */
5529 		if (!balance)
5530 			break;
5531 	}
5532 	rcu_read_unlock();
5533 
5534 	/*
5535 	 * next_balance will be updated only when there is a need.
5536 	 * When the cpu is attached to a null domain, for example, it will
5537 	 * not be updated.
5538 	 */
5539 	if (likely(update_next_balance))
5540 		rq->next_balance = next_balance;
5541 }
5542 
5543 #ifdef CONFIG_NO_HZ
5544 /*
5545  * In the CONFIG_NO_HZ case, the idle balance kickee will do the
5546  * rebalancing for all the cpus for which the scheduler tick is stopped.
5547  */
5548 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5549 {
5550 	struct rq *this_rq = cpu_rq(this_cpu);
5551 	struct rq *rq;
5552 	int balance_cpu;
5553 
5554 	if (idle != CPU_IDLE ||
5555 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5556 		goto end;
5557 
5558 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5559 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5560 			continue;
5561 
5562 		/*
5563 		 * If this cpu gets work to do, stop the load balancing
5564 		 * work being done for other cpus. The next load
5565 		 * balancing owner will pick it up.
5566 		 */
5567 		if (need_resched())
5568 			break;
5569 
5570 		rq = cpu_rq(balance_cpu);
5571 
5572 		raw_spin_lock_irq(&rq->lock);
5573 		update_rq_clock(rq);
5574 		update_idle_cpu_load(rq);
5575 		raw_spin_unlock_irq(&rq->lock);
5576 
5577 		rebalance_domains(balance_cpu, CPU_IDLE);
5578 
5579 		if (time_after(this_rq->next_balance, rq->next_balance))
5580 			this_rq->next_balance = rq->next_balance;
5581 	}
5582 	nohz.next_balance = this_rq->next_balance;
5583 end:
5584 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5585 }
5586 
5587 /*
5588  * Current heuristic for kicking the idle load balancer in the presence
5589  * of an idle cpu in the system:
5590  *   - This rq has more than one task.
5591  *   - At any scheduler domain level, this cpu's scheduler group has multiple
5592  *     busy cpus exceeding the group's power.
5593  *   - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5594  *     domain span are idle.
5595  */
5596 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5597 {
5598 	unsigned long now = jiffies;
5599 	struct sched_domain *sd;
5600 
5601 	if (unlikely(idle_cpu(cpu)))
5602 		return 0;
5603 
5604 	/*
5605 	 * We may have recently been in ticked or tickless idle mode. At the first
5606 	 * busy tick after returning from idle, we will update the busy stats.
5607 	 */
5608 	set_cpu_sd_state_busy();
5609 	nohz_balance_exit_idle(cpu);
5610 
5611 	/*
5612 	 * None are in tickless mode and hence no need for NOHZ idle load
5613 	 * balancing.
5614 	 */
5615 	if (likely(!atomic_read(&nohz.nr_cpus)))
5616 		return 0;
5617 
5618 	if (time_before(now, nohz.next_balance))
5619 		return 0;
5620 
5621 	if (rq->nr_running >= 2)
5622 		goto need_kick;
5623 
5624 	rcu_read_lock();
5625 	for_each_domain(cpu, sd) {
5626 		struct sched_group *sg = sd->groups;
5627 		struct sched_group_power *sgp = sg->sgp;
5628 		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5629 
5630 		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5631 			goto need_kick_unlock;
5632 
5633 		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5634 		    && (cpumask_first_and(nohz.idle_cpus_mask,
5635 					  sched_domain_span(sd)) < cpu))
5636 			goto need_kick_unlock;
5637 
5638 		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5639 			break;
5640 	}
5641 	rcu_read_unlock();
5642 	return 0;
5643 
5644 need_kick_unlock:
5645 	rcu_read_unlock();
5646 need_kick:
5647 	return 1;
5648 }
5649 #else
5650 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5651 #endif
5652 
5653 /*
5654  * run_rebalance_domains is triggered when needed from the scheduler tick.
5655  * Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set).
5656  */
5657 static void run_rebalance_domains(struct softirq_action *h)
5658 {
5659 	int this_cpu = smp_processor_id();
5660 	struct rq *this_rq = cpu_rq(this_cpu);
5661 	enum cpu_idle_type idle = this_rq->idle_balance ?
5662 						CPU_IDLE : CPU_NOT_IDLE;
5663 
5664 	rebalance_domains(this_cpu, idle);
5665 
5666 	/*
5667 	 * If this cpu has a pending nohz_balance_kick, then do the
5668 	 * balancing on behalf of the other idle cpus whose ticks are
5669 	 * stopped.
5670 	 */
5671 	nohz_idle_balance(this_cpu, idle);
5672 }
5673 
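/* True if the cpu is currently attached to no sched domain (rq->sd == NULL). */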
5674 static inline int on_null_domain(int cpu)
5675 {
5676 	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5677 }
5678 
5679 /*
5680  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5681  */
5682 void trigger_load_balance(struct rq *rq, int cpu)
5683 {
5684 	/* Don't need to rebalance while attached to NULL domain */
5685 	if (time_after_eq(jiffies, rq->next_balance) &&
5686 	    likely(!on_null_domain(cpu)))
5687 		raise_softirq(SCHED_SOFTIRQ);
5688 #ifdef CONFIG_NO_HZ
5689 	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5690 		nohz_balancer_kick(cpu);
5691 #endif
5692 }
5693 
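/*
 * rq_online_fair()/rq_offline_fair(): re-derive the scaled tunables via
 * update_sysctl() as cpus come and go; the offline path additionally
 * unthrottles any throttled groups (see below).
 */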
5694 static void rq_online_fair(struct rq *rq)
5695 {
5696 	update_sysctl();
5697 }
5698 
5699 static void rq_offline_fair(struct rq *rq)
5700 {
5701 	update_sysctl();
5702 
5703 	/* Ensure any throttled groups are reachable by pick_next_task */
5704 	unthrottle_offline_cfs_rqs(rq);
5705 }
5706 
5707 #endif /* CONFIG_SMP */
5708 
5709 /*
5710  * scheduler tick hitting a task of our scheduling class:
5711  */
5712 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5713 {
5714 	struct cfs_rq *cfs_rq;
5715 	struct sched_entity *se = &curr->se;
5716 
5717 	for_each_sched_entity(se) {
5718 		cfs_rq = cfs_rq_of(se);
5719 		entity_tick(cfs_rq, se, queued);
5720 	}
5721 
5722 	if (sched_feat_numa(NUMA))
5723 		task_tick_numa(rq, curr);
5724 
5725 	update_rq_runnable_avg(rq, 1);
5726 }
5727 
5728 /*
5729  * called on fork with the child task as argument from the parent's context
5730  *  - child not yet on the tasklist
5731  *  - preemption disabled
5732  */
5733 static void task_fork_fair(struct task_struct *p)
5734 {
5735 	struct cfs_rq *cfs_rq;
5736 	struct sched_entity *se = &p->se, *curr;
5737 	int this_cpu = smp_processor_id();
5738 	struct rq *rq = this_rq();
5739 	unsigned long flags;
5740 
5741 	raw_spin_lock_irqsave(&rq->lock, flags);
5742 
5743 	update_rq_clock(rq);
5744 
5745 	cfs_rq = task_cfs_rq(current);
5746 	curr = cfs_rq->curr;
5747 
5748 	if (unlikely(task_cpu(p) != this_cpu)) {
5749 		rcu_read_lock();
5750 		__set_task_cpu(p, this_cpu);
5751 		rcu_read_unlock();
5752 	}
5753 
5754 	update_curr(cfs_rq);
5755 
5756 	if (curr)
5757 		se->vruntime = curr->vruntime;
5758 	place_entity(cfs_rq, se, 1);
5759 
5760 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5761 		/*
5762 		 * Upon rescheduling, sched_class::put_prev_task() will place
5763 		 * 'current' within the tree based on its new key value.
5764 		 */
5765 		swap(curr->vruntime, se->vruntime);
5766 		resched_task(rq->curr);
5767 	}
5768 
5769 	se->vruntime -= cfs_rq->min_vruntime;
5770 
5771 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5772 }
5773 
5774 /*
5775  * Priority of the task has changed. Check to see if we preempt
5776  * the current task.
5777  */
5778 static void
5779 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5780 {
5781 	if (!p->se.on_rq)
5782 		return;
5783 
5784 	/*
5785 	 * Reschedule if we are currently running on this runqueue and
5786 	 * our priority decreased, or if we are not currently running on
5787 	 * this runqueue and our priority is higher than the current task's.
5788 	 */
5789 	if (rq->curr == p) {
5790 		if (p->prio > oldprio)
5791 			resched_task(rq->curr);
5792 	} else
5793 		check_preempt_curr(rq, p, 0);
5794 }
5795 
5796 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5797 {
5798 	struct sched_entity *se = &p->se;
5799 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5800 
5801 	/*
5802 	 * Ensure the task's vruntime is normalized, so that when it's
5803 	 * switched back to the fair class the enqueue_entity(.flags=0) will
5804 	 * do the right thing.
5805 	 *
5806 	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5807 	 * have normalized the vruntime; if it was !on_rq, then only when
5808 	 * the task is sleeping will it still have non-normalized vruntime.
5809 	 */
5810 	if (!se->on_rq && p->state != TASK_RUNNING) {
5811 		/*
5812 		 * Fix up our vruntime so that the current sleep doesn't
5813 		 * cause 'unlimited' sleep bonus.
5814 		 */
5815 		place_entity(cfs_rq, se, 0);
5816 		se->vruntime -= cfs_rq->min_vruntime;
5817 	}
5818 
5819 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5820 	/*
5821 	 * Remove our load contribution when we leave sched_fair
5822 	 * and ensure we don't carry in an old decay_count if we
5823 	 * switch back.
5824 	*/
5825 	if (p->se.avg.decay_count) {
5826 		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5827 		__synchronize_entity_decay(&p->se);
5828 		subtract_blocked_load_contrib(cfs_rq,
5829 				p->se.avg.load_avg_contrib);
5830 	}
5831 #endif
5832 }
5833 
5834 /*
5835  * We switched to the sched_fair class.
5836  */
5837 static void switched_to_fair(struct rq *rq, struct task_struct *p)
5838 {
5839 	if (!p->se.on_rq)
5840 		return;
5841 
5842 	/*
5843 	 * We were most likely switched from sched_rt, so
5844 	 * kick off the schedule if running, otherwise just see
5845 	 * if we can still preempt the current task.
5846 	 */
5847 	if (rq->curr == p)
5848 		resched_task(rq->curr);
5849 	else
5850 		check_preempt_curr(rq, p, 0);
5851 }
5852 
5853 /* Account for a task changing its policy or group.
5854  *
5855  * This routine is mostly called to set cfs_rq->curr field when a task
5856  * migrates between groups/classes.
5857  */
5858 static void set_curr_task_fair(struct rq *rq)
5859 {
5860 	struct sched_entity *se = &rq->curr->se;
5861 
5862 	for_each_sched_entity(se) {
5863 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5864 
5865 		set_next_entity(cfs_rq, se);
5866 		/* ensure bandwidth has been allocated on our new cfs_rq */
5867 		account_cfs_rq_runtime(cfs_rq, 0);
5868 	}
5869 }
5870 
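/*
 * min_vruntime starts just below zero when cast to u64, i.e. close to the
 * wrap-around point (presumably so that vruntime wrap-around handling gets
 * exercised early rather than only after a very long uptime).
 */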
5871 void init_cfs_rq(struct cfs_rq *cfs_rq)
5872 {
5873 	cfs_rq->tasks_timeline = RB_ROOT;
5874 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5875 #ifndef CONFIG_64BIT
5876 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5877 #endif
5878 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5879 	atomic64_set(&cfs_rq->decay_counter, 1);
5880 	atomic64_set(&cfs_rq->removed_load, 0);
5881 #endif
5882 }
5883 
5884 #ifdef CONFIG_FAIR_GROUP_SCHED
5885 static void task_move_group_fair(struct task_struct *p, int on_rq)
5886 {
5887 	struct cfs_rq *cfs_rq;
5888 	/*
5889 	 * If the task was not on the rq at the time of this cgroup movement
5890 	 * it must have been asleep; sleeping tasks keep their ->vruntime
5891 	 * absolute on their old rq until wakeup (needed for the fair sleeper
5892 	 * bonus in place_entity()).
5893 	 *
5894 	 * If it was on the rq, we've just 'preempted' it, which does convert
5895 	 * ->vruntime to a relative base.
5896 	 *
5897 	 * Make sure both cases convert their relative position when migrating
5898 	 * to another cgroup's rq. This does somewhat interfere with the
5899 	 * fair sleeper stuff for the first placement, but who cares.
5900 	 */
5901 	/*
5902 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
5903 	 * But there are some cases where it has already been normalized:
5904 	 *
5905 	 * - Moving a forked child which is waiting to be woken up by
5906 	 *   wake_up_new_task().
5907 	 * - Moving a task which has been woken up by try_to_wake_up() and
5908 	 *   is waiting to actually be woken up by sched_ttwu_pending().
5909 	 *
5910 	 * To prevent a boost or penalty in the new cfs_rq caused by the delta
5911 	 * in min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5912 	 */
5913 	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5914 		on_rq = 1;
5915 
5916 	if (!on_rq)
5917 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5918 	set_task_rq(p, task_cpu(p));
5919 	if (!on_rq) {
5920 		cfs_rq = cfs_rq_of(&p->se);
5921 		p->se.vruntime += cfs_rq->min_vruntime;
5922 #ifdef CONFIG_SMP
5923 		/*
5924 		 * migrate_task_rq_fair() will have removed our previous
5925 		 * contribution, but we must synchronize for ongoing future
5926 		 * decay.
5927 		 */
5928 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5929 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5930 #endif
5931 	}
5932 }
5933 
5934 void free_fair_sched_group(struct task_group *tg)
5935 {
5936 	int i;
5937 
5938 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5939 
5940 	for_each_possible_cpu(i) {
5941 		if (tg->cfs_rq)
5942 			kfree(tg->cfs_rq[i]);
5943 		if (tg->se)
5944 			kfree(tg->se[i]);
5945 	}
5946 
5947 	kfree(tg->cfs_rq);
5948 	kfree(tg->se);
5949 }
5950 
5951 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5952 {
5953 	struct cfs_rq *cfs_rq;
5954 	struct sched_entity *se;
5955 	int i;
5956 
5957 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5958 	if (!tg->cfs_rq)
5959 		goto err;
5960 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5961 	if (!tg->se)
5962 		goto err;
5963 
5964 	tg->shares = NICE_0_LOAD;
5965 
5966 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5967 
5968 	for_each_possible_cpu(i) {
5969 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5970 				      GFP_KERNEL, cpu_to_node(i));
5971 		if (!cfs_rq)
5972 			goto err;
5973 
5974 		se = kzalloc_node(sizeof(struct sched_entity),
5975 				  GFP_KERNEL, cpu_to_node(i));
5976 		if (!se)
5977 			goto err_free_rq;
5978 
5979 		init_cfs_rq(cfs_rq);
5980 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5981 	}
5982 
5983 	return 1;
5984 
5985 err_free_rq:
5986 	kfree(cfs_rq);
5987 err:
5988 	return 0;
5989 }
5990 
5991 void unregister_fair_sched_group(struct task_group *tg, int cpu)
5992 {
5993 	struct rq *rq = cpu_rq(cpu);
5994 	unsigned long flags;
5995 
5996 	/*
5997 	 * Only empty task groups can be destroyed; so we can speculatively
5998 	 * check on_list without danger of it being re-added.
5999 	*/
6000 	if (!tg->cfs_rq[cpu]->on_list)
6001 		return;
6002 
6003 	raw_spin_lock_irqsave(&rq->lock, flags);
6004 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6005 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6006 }
6007 
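/*
 * Wire up one per-cpu cfs_rq and its group sched_entity for a task group.
 * 'se' is NULL for the root task group, whose runqueue is rq->cfs itself;
 * otherwise the entity is enqueued on the parent group's cfs_rq.
 */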
6008 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6009 			struct sched_entity *se, int cpu,
6010 			struct sched_entity *parent)
6011 {
6012 	struct rq *rq = cpu_rq(cpu);
6013 
6014 	cfs_rq->tg = tg;
6015 	cfs_rq->rq = rq;
6016 	init_cfs_rq_runtime(cfs_rq);
6017 
6018 	tg->cfs_rq[cpu] = cfs_rq;
6019 	tg->se[cpu] = se;
6020 
6021 	/* se could be NULL for root_task_group */
6022 	if (!se)
6023 		return;
6024 
6025 	if (!parent)
6026 		se->cfs_rq = &rq->cfs;
6027 	else
6028 		se->cfs_rq = parent->my_q;
6029 
6030 	se->my_q = cfs_rq;
6031 	update_load_set(&se->load, 0);
6032 	se->parent = parent;
6033 }
6034 
6035 static DEFINE_MUTEX(shares_mutex);
6036 
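/*
 * Backend for the cpu cgroup's "shares" knob (the cgroup glue lives in
 * core.c).  Illustrative usage from userspace, path depending on where the
 * cpu controller is mounted:
 *
 *   # echo 2048 > /sys/fs/cgroup/cpu/mygroup/cpu.shares
 */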
6037 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6038 {
6039 	int i;
6040 	unsigned long flags;
6041 
6042 	/*
6043 	 * We can't change the weight of the root cgroup.
6044 	 */
6045 	if (!tg->se[0])
6046 		return -EINVAL;
6047 
6048 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6049 
6050 	mutex_lock(&shares_mutex);
6051 	if (tg->shares == shares)
6052 		goto done;
6053 
6054 	tg->shares = shares;
6055 	for_each_possible_cpu(i) {
6056 		struct rq *rq = cpu_rq(i);
6057 		struct sched_entity *se;
6058 
6059 		se = tg->se[i];
6060 		/* Propagate contribution to hierarchy */
6061 		raw_spin_lock_irqsave(&rq->lock, flags);
6062 		for_each_sched_entity(se)
6063 			update_cfs_shares(group_cfs_rq(se));
6064 		raw_spin_unlock_irqrestore(&rq->lock, flags);
6065 	}
6066 
6067 done:
6068 	mutex_unlock(&shares_mutex);
6069 	return 0;
6070 }
6071 #else /* CONFIG_FAIR_GROUP_SCHED */
6072 
6073 void free_fair_sched_group(struct task_group *tg) { }
6074 
6075 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6076 {
6077 	return 1;
6078 }
6079 
6080 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6081 
6082 #endif /* CONFIG_FAIR_GROUP_SCHED */
6083 
6084 
6085 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6086 {
6087 	struct sched_entity *se = &task->se;
6088 	unsigned int rr_interval = 0;
6089 
6090 	/*
6091 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6092 	 * idle runqueue:
6093 	 */
6094 	if (rq->cfs.load.weight)
6095 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6096 
6097 	return rr_interval;
6098 }
6099 
6100 /*
6101  * All the scheduling class methods:
6102  */
6103 const struct sched_class fair_sched_class = {
6104 	.next			= &idle_sched_class,
6105 	.enqueue_task		= enqueue_task_fair,
6106 	.dequeue_task		= dequeue_task_fair,
6107 	.yield_task		= yield_task_fair,
6108 	.yield_to_task		= yield_to_task_fair,
6109 
6110 	.check_preempt_curr	= check_preempt_wakeup,
6111 
6112 	.pick_next_task		= pick_next_task_fair,
6113 	.put_prev_task		= put_prev_task_fair,
6114 
6115 #ifdef CONFIG_SMP
6116 	.select_task_rq		= select_task_rq_fair,
6117 #ifdef CONFIG_FAIR_GROUP_SCHED
6118 	.migrate_task_rq	= migrate_task_rq_fair,
6119 #endif
6120 	.rq_online		= rq_online_fair,
6121 	.rq_offline		= rq_offline_fair,
6122 
6123 	.task_waking		= task_waking_fair,
6124 #endif
6125 
6126 	.set_curr_task          = set_curr_task_fair,
6127 	.task_tick		= task_tick_fair,
6128 	.task_fork		= task_fork_fair,
6129 
6130 	.prio_changed		= prio_changed_fair,
6131 	.switched_from		= switched_from_fair,
6132 	.switched_to		= switched_to_fair,
6133 
6134 	.get_rr_interval	= get_rr_interval_fair,
6135 
6136 #ifdef CONFIG_FAIR_GROUP_SCHED
6137 	.task_move_group	= task_move_group_fair,
6138 #endif
6139 };
6140 
6141 #ifdef CONFIG_SCHED_DEBUG
6142 void print_cfs_stats(struct seq_file *m, int cpu)
6143 {
6144 	struct cfs_rq *cfs_rq;
6145 
6146 	rcu_read_lock();
6147 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6148 		print_cfs_rq(m, cpu, cfs_rq);
6149 	rcu_read_unlock();
6150 }
6151 #endif
6152 
6153 __init void init_sched_fair_class(void)
6154 {
6155 #ifdef CONFIG_SMP
6156 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6157 
6158 #ifdef CONFIG_NO_HZ
6159 	nohz.next_balance = jiffies;
6160 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6161 	cpu_notifier(sched_ilb_notifier, 0);
6162 #endif
6163 #endif /* SMP */
6164 
6165 }
6166