xref: /openbmc/linux/kernel/sched/fair.c (revision ee89bd6b)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32 
33 #include <trace/events/sched.h>
34 
35 #include "sched.h"
36 
37 /*
38  * Targeted preemption latency for CPU-bound tasks:
39  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40  *
41  * NOTE: this latency value is not the same as the concept of
42  * 'timeslice length' - timeslices in CFS are of variable length
43  * and have no persistent notion like in traditional, time-slice
44  * based scheduling concepts.
45  *
46  * (to see the precise effective timeslice length of your workload,
47  *  run vmstat and monitor the context-switches (cs) field)
48  */
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51 
52 /*
53  * The initial- and re-scaling of tunables is configurable
54  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55  *
56  * Options are:
57  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
59  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60  */
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 	= SCHED_TUNABLESCALING_LOG;
63 
64 /*
65  * Minimal preemption granularity for CPU-bound tasks:
66  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67  */
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70 
71 /*
72  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73  */
74 static unsigned int sched_nr_latency = 8;
75 
76 /*
77  * After fork, child runs first. If set to 0 (default) then
78  * parent will (try to) run first.
79  */
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
81 
82 /*
83  * SCHED_OTHER wake-up granularity.
84  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85  *
86  * This option delays the preemption effects of decoupled workloads
87  * and reduces their over-scheduling. Synchronous workloads will still
88  * have immediate wakeup/sleep latencies.
89  */
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92 
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94 
95 /*
96  * The exponential sliding  window over which load is averaged for shares
97  * distribution.
98  * (default: 10msec)
99  */
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101 
102 #ifdef CONFIG_CFS_BANDWIDTH
103 /*
104  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105  * each time a cfs_rq requests quota.
106  *
107  * Note: in the case that the slice exceeds the runtime remaining (either due
108  * to consumption or the quota being specified to be smaller than the slice)
109  * we will always only issue the remaining available time.
110  *
111  * default: 5 msec, units: microseconds
112   */
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114 #endif
115 
116 /*
117  * Increase the granularity value when there are more CPUs,
118  * because with more CPUs the 'effective latency' as visible
119  * to users decreases. But the relationship is not linear,
120  * so pick a second-best guess by going with the log2 of the
121  * number of CPUs.
122  *
123  * This idea comes from the SD scheduler of Con Kolivas:
124  */
125 static int get_update_sysctl_factor(void)
126 {
127 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
128 	unsigned int factor;
129 
130 	switch (sysctl_sched_tunable_scaling) {
131 	case SCHED_TUNABLESCALING_NONE:
132 		factor = 1;
133 		break;
134 	case SCHED_TUNABLESCALING_LINEAR:
135 		factor = cpus;
136 		break;
137 	case SCHED_TUNABLESCALING_LOG:
138 	default:
139 		factor = 1 + ilog2(cpus);
140 		break;
141 	}
142 
143 	return factor;
144 }
145 
146 static void update_sysctl(void)
147 {
148 	unsigned int factor = get_update_sysctl_factor();
149 
150 #define SET_SYSCTL(name) \
151 	(sysctl_##name = (factor) * normalized_sysctl_##name)
152 	SET_SYSCTL(sched_min_granularity);
153 	SET_SYSCTL(sched_latency);
154 	SET_SYSCTL(sched_wakeup_granularity);
155 #undef SET_SYSCTL
156 }
157 
158 void sched_init_granularity(void)
159 {
160 	update_sysctl();
161 }
162 
163 #if BITS_PER_LONG == 32
164 # define WMULT_CONST	(~0UL)
165 #else
166 # define WMULT_CONST	(1UL << 32)
167 #endif
168 
169 #define WMULT_SHIFT	32
170 
171 /*
172  * Shift right and round:
173  */
174 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
175 
176 /*
177  * delta *= weight / lw
178  */
179 static unsigned long
180 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
181 		struct load_weight *lw)
182 {
183 	u64 tmp;
184 
185 	/*
186 	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
187 	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
188 	 * 2^SCHED_LOAD_RESOLUTION.
189 	 */
190 	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
191 		tmp = (u64)delta_exec * scale_load_down(weight);
192 	else
193 		tmp = (u64)delta_exec;
194 
195 	if (!lw->inv_weight) {
196 		unsigned long w = scale_load_down(lw->weight);
197 
198 		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 			lw->inv_weight = 1;
200 		else if (unlikely(!w))
201 			lw->inv_weight = WMULT_CONST;
202 		else
203 			lw->inv_weight = WMULT_CONST / w;
204 	}
205 
206 	/*
207 	 * Check whether we'd overflow the 64-bit multiplication:
208 	 */
209 	if (unlikely(tmp > WMULT_CONST))
210 		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
211 			WMULT_SHIFT/2);
212 	else
213 		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
214 
215 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
216 }
217 
218 
219 const struct sched_class fair_sched_class;
220 
221 /**************************************************************
222  * CFS operations on generic schedulable entities:
223  */
224 
225 #ifdef CONFIG_FAIR_GROUP_SCHED
226 
227 /* cpu runqueue to which this cfs_rq is attached */
228 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
229 {
230 	return cfs_rq->rq;
231 }
232 
233 /* An entity is a task if it doesn't "own" a runqueue */
234 #define entity_is_task(se)	(!se->my_q)
235 
236 static inline struct task_struct *task_of(struct sched_entity *se)
237 {
238 #ifdef CONFIG_SCHED_DEBUG
239 	WARN_ON_ONCE(!entity_is_task(se));
240 #endif
241 	return container_of(se, struct task_struct, se);
242 }
243 
244 /* Walk up scheduling entities hierarchy */
245 #define for_each_sched_entity(se) \
246 		for (; se; se = se->parent)
247 
248 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
249 {
250 	return p->se.cfs_rq;
251 }
252 
253 /* runqueue on which this entity is (to be) queued */
254 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
255 {
256 	return se->cfs_rq;
257 }
258 
259 /* runqueue "owned" by this group */
260 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
261 {
262 	return grp->my_q;
263 }
264 
265 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 				       int force_update);
267 
268 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
269 {
270 	if (!cfs_rq->on_list) {
271 		/*
272 		 * Ensure we either appear before our parent (if already
273 		 * enqueued) or force our parent to appear after us when it is
274 		 * enqueued.  The fact that we always enqueue bottom-up
275 		 * reduces this to two cases.
276 		 */
277 		if (cfs_rq->tg->parent &&
278 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
279 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
280 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
281 		} else {
282 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
283 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
284 		}
285 
286 		cfs_rq->on_list = 1;
287 		/* We should have no load, but we need to update last_decay. */
288 		update_cfs_rq_blocked_load(cfs_rq, 0);
289 	}
290 }
291 
292 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293 {
294 	if (cfs_rq->on_list) {
295 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
296 		cfs_rq->on_list = 0;
297 	}
298 }
299 
300 /* Iterate thr' all leaf cfs_rq's on a runqueue */
301 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
303 
304 /* Do the two (enqueued) entities belong to the same group ? */
305 static inline int
306 is_same_group(struct sched_entity *se, struct sched_entity *pse)
307 {
308 	if (se->cfs_rq == pse->cfs_rq)
309 		return 1;
310 
311 	return 0;
312 }
313 
314 static inline struct sched_entity *parent_entity(struct sched_entity *se)
315 {
316 	return se->parent;
317 }
318 
319 /* return depth at which a sched entity is present in the hierarchy */
320 static inline int depth_se(struct sched_entity *se)
321 {
322 	int depth = 0;
323 
324 	for_each_sched_entity(se)
325 		depth++;
326 
327 	return depth;
328 }
329 
330 static void
331 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
332 {
333 	int se_depth, pse_depth;
334 
335 	/*
336 	 * preemption test can be made between sibling entities who are in the
337 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
338 	 * both tasks until we find their ancestors who are siblings of common
339 	 * parent.
340 	 */
341 
342 	/* First walk up until both entities are at same depth */
343 	se_depth = depth_se(*se);
344 	pse_depth = depth_se(*pse);
345 
346 	while (se_depth > pse_depth) {
347 		se_depth--;
348 		*se = parent_entity(*se);
349 	}
350 
351 	while (pse_depth > se_depth) {
352 		pse_depth--;
353 		*pse = parent_entity(*pse);
354 	}
355 
356 	while (!is_same_group(*se, *pse)) {
357 		*se = parent_entity(*se);
358 		*pse = parent_entity(*pse);
359 	}
360 }
361 
362 #else	/* !CONFIG_FAIR_GROUP_SCHED */
363 
364 static inline struct task_struct *task_of(struct sched_entity *se)
365 {
366 	return container_of(se, struct task_struct, se);
367 }
368 
369 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
370 {
371 	return container_of(cfs_rq, struct rq, cfs);
372 }
373 
374 #define entity_is_task(se)	1
375 
376 #define for_each_sched_entity(se) \
377 		for (; se; se = NULL)
378 
379 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
380 {
381 	return &task_rq(p)->cfs;
382 }
383 
384 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385 {
386 	struct task_struct *p = task_of(se);
387 	struct rq *rq = task_rq(p);
388 
389 	return &rq->cfs;
390 }
391 
392 /* runqueue "owned" by this group */
393 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
394 {
395 	return NULL;
396 }
397 
398 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
399 {
400 }
401 
402 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403 {
404 }
405 
406 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
408 
409 static inline int
410 is_same_group(struct sched_entity *se, struct sched_entity *pse)
411 {
412 	return 1;
413 }
414 
415 static inline struct sched_entity *parent_entity(struct sched_entity *se)
416 {
417 	return NULL;
418 }
419 
420 static inline void
421 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
422 {
423 }
424 
425 #endif	/* CONFIG_FAIR_GROUP_SCHED */
426 
427 static __always_inline
428 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
429 
430 /**************************************************************
431  * Scheduling class tree data structure manipulation methods:
432  */
433 
434 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435 {
436 	s64 delta = (s64)(vruntime - max_vruntime);
437 	if (delta > 0)
438 		max_vruntime = vruntime;
439 
440 	return max_vruntime;
441 }
442 
443 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
444 {
445 	s64 delta = (s64)(vruntime - min_vruntime);
446 	if (delta < 0)
447 		min_vruntime = vruntime;
448 
449 	return min_vruntime;
450 }
451 
452 static inline int entity_before(struct sched_entity *a,
453 				struct sched_entity *b)
454 {
455 	return (s64)(a->vruntime - b->vruntime) < 0;
456 }
457 
458 static void update_min_vruntime(struct cfs_rq *cfs_rq)
459 {
460 	u64 vruntime = cfs_rq->min_vruntime;
461 
462 	if (cfs_rq->curr)
463 		vruntime = cfs_rq->curr->vruntime;
464 
465 	if (cfs_rq->rb_leftmost) {
466 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
467 						   struct sched_entity,
468 						   run_node);
469 
470 		if (!cfs_rq->curr)
471 			vruntime = se->vruntime;
472 		else
473 			vruntime = min_vruntime(vruntime, se->vruntime);
474 	}
475 
476 	/* ensure we never gain time by being placed backwards. */
477 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
478 #ifndef CONFIG_64BIT
479 	smp_wmb();
480 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
481 #endif
482 }
483 
484 /*
485  * Enqueue an entity into the rb-tree:
486  */
487 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
488 {
489 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
490 	struct rb_node *parent = NULL;
491 	struct sched_entity *entry;
492 	int leftmost = 1;
493 
494 	/*
495 	 * Find the right place in the rbtree:
496 	 */
497 	while (*link) {
498 		parent = *link;
499 		entry = rb_entry(parent, struct sched_entity, run_node);
500 		/*
501 		 * We dont care about collisions. Nodes with
502 		 * the same key stay together.
503 		 */
504 		if (entity_before(se, entry)) {
505 			link = &parent->rb_left;
506 		} else {
507 			link = &parent->rb_right;
508 			leftmost = 0;
509 		}
510 	}
511 
512 	/*
513 	 * Maintain a cache of leftmost tree entries (it is frequently
514 	 * used):
515 	 */
516 	if (leftmost)
517 		cfs_rq->rb_leftmost = &se->run_node;
518 
519 	rb_link_node(&se->run_node, parent, link);
520 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
521 }
522 
523 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
524 {
525 	if (cfs_rq->rb_leftmost == &se->run_node) {
526 		struct rb_node *next_node;
527 
528 		next_node = rb_next(&se->run_node);
529 		cfs_rq->rb_leftmost = next_node;
530 	}
531 
532 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
533 }
534 
535 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
536 {
537 	struct rb_node *left = cfs_rq->rb_leftmost;
538 
539 	if (!left)
540 		return NULL;
541 
542 	return rb_entry(left, struct sched_entity, run_node);
543 }
544 
545 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
546 {
547 	struct rb_node *next = rb_next(&se->run_node);
548 
549 	if (!next)
550 		return NULL;
551 
552 	return rb_entry(next, struct sched_entity, run_node);
553 }
554 
555 #ifdef CONFIG_SCHED_DEBUG
556 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
557 {
558 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
559 
560 	if (!last)
561 		return NULL;
562 
563 	return rb_entry(last, struct sched_entity, run_node);
564 }
565 
566 /**************************************************************
567  * Scheduling class statistics methods:
568  */
569 
570 int sched_proc_update_handler(struct ctl_table *table, int write,
571 		void __user *buffer, size_t *lenp,
572 		loff_t *ppos)
573 {
574 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
575 	int factor = get_update_sysctl_factor();
576 
577 	if (ret || !write)
578 		return ret;
579 
580 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
581 					sysctl_sched_min_granularity);
582 
583 #define WRT_SYSCTL(name) \
584 	(normalized_sysctl_##name = sysctl_##name / (factor))
585 	WRT_SYSCTL(sched_min_granularity);
586 	WRT_SYSCTL(sched_latency);
587 	WRT_SYSCTL(sched_wakeup_granularity);
588 #undef WRT_SYSCTL
589 
590 	return 0;
591 }
592 #endif
593 
594 /*
595  * delta /= w
596  */
597 static inline unsigned long
598 calc_delta_fair(unsigned long delta, struct sched_entity *se)
599 {
600 	if (unlikely(se->load.weight != NICE_0_LOAD))
601 		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
602 
603 	return delta;
604 }
605 
606 /*
607  * The idea is to set a period in which each task runs once.
608  *
609  * When there are too many tasks (sched_nr_latency) we have to stretch
610  * this period because otherwise the slices get too small.
611  *
612  * p = (nr <= nl) ? l : l*nr/nl
613  */
614 static u64 __sched_period(unsigned long nr_running)
615 {
616 	u64 period = sysctl_sched_latency;
617 	unsigned long nr_latency = sched_nr_latency;
618 
619 	if (unlikely(nr_running > nr_latency)) {
620 		period = sysctl_sched_min_granularity;
621 		period *= nr_running;
622 	}
623 
624 	return period;
625 }
626 
627 /*
628  * We calculate the wall-time slice from the period by taking a part
629  * proportional to the weight.
630  *
631  * s = p*P[w/rw]
632  */
633 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
634 {
635 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
636 
637 	for_each_sched_entity(se) {
638 		struct load_weight *load;
639 		struct load_weight lw;
640 
641 		cfs_rq = cfs_rq_of(se);
642 		load = &cfs_rq->load;
643 
644 		if (unlikely(!se->on_rq)) {
645 			lw = cfs_rq->load;
646 
647 			update_load_add(&lw, se->load.weight);
648 			load = &lw;
649 		}
650 		slice = calc_delta_mine(slice, se->load.weight, load);
651 	}
652 	return slice;
653 }
654 
655 /*
656  * We calculate the vruntime slice of a to-be-inserted task.
657  *
658  * vs = s/w
659  */
660 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
661 {
662 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
663 }
664 
665 /*
666  * Update the current task's runtime statistics. Skip current tasks that
667  * are not in our scheduling class.
668  */
669 static inline void
670 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
671 	      unsigned long delta_exec)
672 {
673 	unsigned long delta_exec_weighted;
674 
675 	schedstat_set(curr->statistics.exec_max,
676 		      max((u64)delta_exec, curr->statistics.exec_max));
677 
678 	curr->sum_exec_runtime += delta_exec;
679 	schedstat_add(cfs_rq, exec_clock, delta_exec);
680 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
681 
682 	curr->vruntime += delta_exec_weighted;
683 	update_min_vruntime(cfs_rq);
684 }
685 
686 static void update_curr(struct cfs_rq *cfs_rq)
687 {
688 	struct sched_entity *curr = cfs_rq->curr;
689 	u64 now = rq_of(cfs_rq)->clock_task;
690 	unsigned long delta_exec;
691 
692 	if (unlikely(!curr))
693 		return;
694 
695 	/*
696 	 * Get the amount of time the current task was running
697 	 * since the last time we changed load (this cannot
698 	 * overflow on 32 bits):
699 	 */
700 	delta_exec = (unsigned long)(now - curr->exec_start);
701 	if (!delta_exec)
702 		return;
703 
704 	__update_curr(cfs_rq, curr, delta_exec);
705 	curr->exec_start = now;
706 
707 	if (entity_is_task(curr)) {
708 		struct task_struct *curtask = task_of(curr);
709 
710 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
711 		cpuacct_charge(curtask, delta_exec);
712 		account_group_exec_runtime(curtask, delta_exec);
713 	}
714 
715 	account_cfs_rq_runtime(cfs_rq, delta_exec);
716 }
717 
718 static inline void
719 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720 {
721 	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
722 }
723 
724 /*
725  * Task is being enqueued - update stats:
726  */
727 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
728 {
729 	/*
730 	 * Are we enqueueing a waiting task? (for current tasks
731 	 * a dequeue/enqueue event is a NOP)
732 	 */
733 	if (se != cfs_rq->curr)
734 		update_stats_wait_start(cfs_rq, se);
735 }
736 
737 static void
738 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739 {
740 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
741 			rq_of(cfs_rq)->clock - se->statistics.wait_start));
742 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
743 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
744 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
745 #ifdef CONFIG_SCHEDSTATS
746 	if (entity_is_task(se)) {
747 		trace_sched_stat_wait(task_of(se),
748 			rq_of(cfs_rq)->clock - se->statistics.wait_start);
749 	}
750 #endif
751 	schedstat_set(se->statistics.wait_start, 0);
752 }
753 
754 static inline void
755 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
756 {
757 	/*
758 	 * Mark the end of the wait period if dequeueing a
759 	 * waiting task:
760 	 */
761 	if (se != cfs_rq->curr)
762 		update_stats_wait_end(cfs_rq, se);
763 }
764 
765 /*
766  * We are picking a new current task - update its stats:
767  */
768 static inline void
769 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
770 {
771 	/*
772 	 * We are starting a new run period:
773 	 */
774 	se->exec_start = rq_of(cfs_rq)->clock_task;
775 }
776 
777 /**************************************************
778  * Scheduling class queueing methods:
779  */
780 
781 #ifdef CONFIG_NUMA_BALANCING
782 /*
783  * numa task sample period in ms
784  */
785 unsigned int sysctl_numa_balancing_scan_period_min = 100;
786 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
787 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
788 
789 /* Portion of address space to scan in MB */
790 unsigned int sysctl_numa_balancing_scan_size = 256;
791 
792 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
793 unsigned int sysctl_numa_balancing_scan_delay = 1000;
794 
795 static void task_numa_placement(struct task_struct *p)
796 {
797 	int seq;
798 
799 	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
800 		return;
801 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
802 	if (p->numa_scan_seq == seq)
803 		return;
804 	p->numa_scan_seq = seq;
805 
806 	/* FIXME: Scheduling placement policy hints go here */
807 }
808 
809 /*
810  * Got a PROT_NONE fault for a page on @node.
811  */
812 void task_numa_fault(int node, int pages, bool migrated)
813 {
814 	struct task_struct *p = current;
815 
816 	if (!sched_feat_numa(NUMA))
817 		return;
818 
819 	/* FIXME: Allocate task-specific structure for placement policy here */
820 
821 	/*
822 	 * If pages are properly placed (did not migrate) then scan slower.
823 	 * This is reset periodically in case of phase changes
824 	 */
825         if (!migrated)
826 		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
827 			p->numa_scan_period + jiffies_to_msecs(10));
828 
829 	task_numa_placement(p);
830 }
831 
832 static void reset_ptenuma_scan(struct task_struct *p)
833 {
834 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
835 	p->mm->numa_scan_offset = 0;
836 }
837 
838 /*
839  * The expensive part of numa migration is done from task_work context.
840  * Triggered from task_tick_numa().
841  */
842 void task_numa_work(struct callback_head *work)
843 {
844 	unsigned long migrate, next_scan, now = jiffies;
845 	struct task_struct *p = current;
846 	struct mm_struct *mm = p->mm;
847 	struct vm_area_struct *vma;
848 	unsigned long start, end;
849 	long pages;
850 
851 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
852 
853 	work->next = work; /* protect against double add */
854 	/*
855 	 * Who cares about NUMA placement when they're dying.
856 	 *
857 	 * NOTE: make sure not to dereference p->mm before this check,
858 	 * exit_task_work() happens _after_ exit_mm() so we could be called
859 	 * without p->mm even though we still had it when we enqueued this
860 	 * work.
861 	 */
862 	if (p->flags & PF_EXITING)
863 		return;
864 
865 	/*
866 	 * We do not care about task placement until a task runs on a node
867 	 * other than the first one used by the address space. This is
868 	 * largely because migrations are driven by what CPU the task
869 	 * is running on. If it's never scheduled on another node, it'll
870 	 * not migrate so why bother trapping the fault.
871 	 */
872 	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
873 		mm->first_nid = numa_node_id();
874 	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
875 		/* Are we running on a new node yet? */
876 		if (numa_node_id() == mm->first_nid &&
877 		    !sched_feat_numa(NUMA_FORCE))
878 			return;
879 
880 		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
881 	}
882 
883 	/*
884 	 * Reset the scan period if enough time has gone by. Objective is that
885 	 * scanning will be reduced if pages are properly placed. As tasks
886 	 * can enter different phases this needs to be re-examined. Lacking
887 	 * proper tracking of reference behaviour, this blunt hammer is used.
888 	 */
889 	migrate = mm->numa_next_reset;
890 	if (time_after(now, migrate)) {
891 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
892 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
893 		xchg(&mm->numa_next_reset, next_scan);
894 	}
895 
896 	/*
897 	 * Enforce maximal scan/migration frequency..
898 	 */
899 	migrate = mm->numa_next_scan;
900 	if (time_before(now, migrate))
901 		return;
902 
903 	if (p->numa_scan_period == 0)
904 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
905 
906 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
907 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
908 		return;
909 
910 	/*
911 	 * Do not set pte_numa if the current running node is rate-limited.
912 	 * This loses statistics on the fault but if we are unwilling to
913 	 * migrate to this node, it is less likely we can do useful work
914 	 */
915 	if (migrate_ratelimited(numa_node_id()))
916 		return;
917 
918 	start = mm->numa_scan_offset;
919 	pages = sysctl_numa_balancing_scan_size;
920 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
921 	if (!pages)
922 		return;
923 
924 	down_read(&mm->mmap_sem);
925 	vma = find_vma(mm, start);
926 	if (!vma) {
927 		reset_ptenuma_scan(p);
928 		start = 0;
929 		vma = mm->mmap;
930 	}
931 	for (; vma; vma = vma->vm_next) {
932 		if (!vma_migratable(vma))
933 			continue;
934 
935 		/* Skip small VMAs. They are not likely to be of relevance */
936 		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
937 			continue;
938 
939 		do {
940 			start = max(start, vma->vm_start);
941 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
942 			end = min(end, vma->vm_end);
943 			pages -= change_prot_numa(vma, start, end);
944 
945 			start = end;
946 			if (pages <= 0)
947 				goto out;
948 		} while (end != vma->vm_end);
949 	}
950 
951 out:
952 	/*
953 	 * It is possible to reach the end of the VMA list but the last few VMAs are
954 	 * not guaranteed to the vma_migratable. If they are not, we would find the
955 	 * !migratable VMA on the next scan but not reset the scanner to the start
956 	 * so check it now.
957 	 */
958 	if (vma)
959 		mm->numa_scan_offset = start;
960 	else
961 		reset_ptenuma_scan(p);
962 	up_read(&mm->mmap_sem);
963 }
964 
965 /*
966  * Drive the periodic memory faults..
967  */
968 void task_tick_numa(struct rq *rq, struct task_struct *curr)
969 {
970 	struct callback_head *work = &curr->numa_work;
971 	u64 period, now;
972 
973 	/*
974 	 * We don't care about NUMA placement if we don't have memory.
975 	 */
976 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
977 		return;
978 
979 	/*
980 	 * Using runtime rather than walltime has the dual advantage that
981 	 * we (mostly) drive the selection from busy threads and that the
982 	 * task needs to have done some actual work before we bother with
983 	 * NUMA placement.
984 	 */
985 	now = curr->se.sum_exec_runtime;
986 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
987 
988 	if (now - curr->node_stamp > period) {
989 		if (!curr->node_stamp)
990 			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
991 		curr->node_stamp = now;
992 
993 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
994 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
995 			task_work_add(curr, work, true);
996 		}
997 	}
998 }
999 #else
1000 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1001 {
1002 }
1003 #endif /* CONFIG_NUMA_BALANCING */
1004 
1005 static void
1006 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1007 {
1008 	update_load_add(&cfs_rq->load, se->load.weight);
1009 	if (!parent_entity(se))
1010 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1011 #ifdef CONFIG_SMP
1012 	if (entity_is_task(se))
1013 		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1014 #endif
1015 	cfs_rq->nr_running++;
1016 }
1017 
1018 static void
1019 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1020 {
1021 	update_load_sub(&cfs_rq->load, se->load.weight);
1022 	if (!parent_entity(se))
1023 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1024 	if (entity_is_task(se))
1025 		list_del_init(&se->group_node);
1026 	cfs_rq->nr_running--;
1027 }
1028 
1029 #ifdef CONFIG_FAIR_GROUP_SCHED
1030 # ifdef CONFIG_SMP
1031 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1032 {
1033 	long tg_weight;
1034 
1035 	/*
1036 	 * Use this CPU's actual weight instead of the last load_contribution
1037 	 * to gain a more accurate current total weight. See
1038 	 * update_cfs_rq_load_contribution().
1039 	 */
1040 	tg_weight = atomic64_read(&tg->load_avg);
1041 	tg_weight -= cfs_rq->tg_load_contrib;
1042 	tg_weight += cfs_rq->load.weight;
1043 
1044 	return tg_weight;
1045 }
1046 
1047 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1048 {
1049 	long tg_weight, load, shares;
1050 
1051 	tg_weight = calc_tg_weight(tg, cfs_rq);
1052 	load = cfs_rq->load.weight;
1053 
1054 	shares = (tg->shares * load);
1055 	if (tg_weight)
1056 		shares /= tg_weight;
1057 
1058 	if (shares < MIN_SHARES)
1059 		shares = MIN_SHARES;
1060 	if (shares > tg->shares)
1061 		shares = tg->shares;
1062 
1063 	return shares;
1064 }
1065 # else /* CONFIG_SMP */
1066 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1067 {
1068 	return tg->shares;
1069 }
1070 # endif /* CONFIG_SMP */
1071 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1072 			    unsigned long weight)
1073 {
1074 	if (se->on_rq) {
1075 		/* commit outstanding execution time */
1076 		if (cfs_rq->curr == se)
1077 			update_curr(cfs_rq);
1078 		account_entity_dequeue(cfs_rq, se);
1079 	}
1080 
1081 	update_load_set(&se->load, weight);
1082 
1083 	if (se->on_rq)
1084 		account_entity_enqueue(cfs_rq, se);
1085 }
1086 
1087 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1088 
1089 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1090 {
1091 	struct task_group *tg;
1092 	struct sched_entity *se;
1093 	long shares;
1094 
1095 	tg = cfs_rq->tg;
1096 	se = tg->se[cpu_of(rq_of(cfs_rq))];
1097 	if (!se || throttled_hierarchy(cfs_rq))
1098 		return;
1099 #ifndef CONFIG_SMP
1100 	if (likely(se->load.weight == tg->shares))
1101 		return;
1102 #endif
1103 	shares = calc_cfs_shares(cfs_rq, tg);
1104 
1105 	reweight_entity(cfs_rq_of(se), se, shares);
1106 }
1107 #else /* CONFIG_FAIR_GROUP_SCHED */
1108 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1109 {
1110 }
1111 #endif /* CONFIG_FAIR_GROUP_SCHED */
1112 
1113 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1114 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1115 /*
1116  * We choose a half-life close to 1 scheduling period.
1117  * Note: The tables below are dependent on this value.
1118  */
1119 #define LOAD_AVG_PERIOD 32
1120 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1121 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1122 
1123 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1124 static const u32 runnable_avg_yN_inv[] = {
1125 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1126 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1127 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1128 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1129 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1130 	0x85aac367, 0x82cd8698,
1131 };
1132 
1133 /*
1134  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1135  * over-estimates when re-combining.
1136  */
1137 static const u32 runnable_avg_yN_sum[] = {
1138 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1139 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1140 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1141 };
1142 
1143 /*
1144  * Approximate:
1145  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1146  */
1147 static __always_inline u64 decay_load(u64 val, u64 n)
1148 {
1149 	unsigned int local_n;
1150 
1151 	if (!n)
1152 		return val;
1153 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1154 		return 0;
1155 
1156 	/* after bounds checking we can collapse to 32-bit */
1157 	local_n = n;
1158 
1159 	/*
1160 	 * As y^PERIOD = 1/2, we can combine
1161 	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1162 	 * With a look-up table which covers k^n (n<PERIOD)
1163 	 *
1164 	 * To achieve constant time decay_load.
1165 	 */
1166 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1167 		val >>= local_n / LOAD_AVG_PERIOD;
1168 		local_n %= LOAD_AVG_PERIOD;
1169 	}
1170 
1171 	val *= runnable_avg_yN_inv[local_n];
1172 	/* We don't use SRR here since we always want to round down. */
1173 	return val >> 32;
1174 }
1175 
1176 /*
1177  * For updates fully spanning n periods, the contribution to runnable
1178  * average will be: \Sum 1024*y^n
1179  *
1180  * We can compute this reasonably efficiently by combining:
1181  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1182  */
1183 static u32 __compute_runnable_contrib(u64 n)
1184 {
1185 	u32 contrib = 0;
1186 
1187 	if (likely(n <= LOAD_AVG_PERIOD))
1188 		return runnable_avg_yN_sum[n];
1189 	else if (unlikely(n >= LOAD_AVG_MAX_N))
1190 		return LOAD_AVG_MAX;
1191 
1192 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1193 	do {
1194 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1195 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1196 
1197 		n -= LOAD_AVG_PERIOD;
1198 	} while (n > LOAD_AVG_PERIOD);
1199 
1200 	contrib = decay_load(contrib, n);
1201 	return contrib + runnable_avg_yN_sum[n];
1202 }
1203 
1204 /*
1205  * We can represent the historical contribution to runnable average as the
1206  * coefficients of a geometric series.  To do this we sub-divide our runnable
1207  * history into segments of approximately 1ms (1024us); label the segment that
1208  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1209  *
1210  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1211  *      p0            p1           p2
1212  *     (now)       (~1ms ago)  (~2ms ago)
1213  *
1214  * Let u_i denote the fraction of p_i that the entity was runnable.
1215  *
1216  * We then designate the fractions u_i as our co-efficients, yielding the
1217  * following representation of historical load:
1218  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1219  *
1220  * We choose y based on the with of a reasonably scheduling period, fixing:
1221  *   y^32 = 0.5
1222  *
1223  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1224  * approximately half as much as the contribution to load within the last ms
1225  * (u_0).
1226  *
1227  * When a period "rolls over" and we have new u_0`, multiplying the previous
1228  * sum again by y is sufficient to update:
1229  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1230  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1231  */
1232 static __always_inline int __update_entity_runnable_avg(u64 now,
1233 							struct sched_avg *sa,
1234 							int runnable)
1235 {
1236 	u64 delta, periods;
1237 	u32 runnable_contrib;
1238 	int delta_w, decayed = 0;
1239 
1240 	delta = now - sa->last_runnable_update;
1241 	/*
1242 	 * This should only happen when time goes backwards, which it
1243 	 * unfortunately does during sched clock init when we swap over to TSC.
1244 	 */
1245 	if ((s64)delta < 0) {
1246 		sa->last_runnable_update = now;
1247 		return 0;
1248 	}
1249 
1250 	/*
1251 	 * Use 1024ns as the unit of measurement since it's a reasonable
1252 	 * approximation of 1us and fast to compute.
1253 	 */
1254 	delta >>= 10;
1255 	if (!delta)
1256 		return 0;
1257 	sa->last_runnable_update = now;
1258 
1259 	/* delta_w is the amount already accumulated against our next period */
1260 	delta_w = sa->runnable_avg_period % 1024;
1261 	if (delta + delta_w >= 1024) {
1262 		/* period roll-over */
1263 		decayed = 1;
1264 
1265 		/*
1266 		 * Now that we know we're crossing a period boundary, figure
1267 		 * out how much from delta we need to complete the current
1268 		 * period and accrue it.
1269 		 */
1270 		delta_w = 1024 - delta_w;
1271 		if (runnable)
1272 			sa->runnable_avg_sum += delta_w;
1273 		sa->runnable_avg_period += delta_w;
1274 
1275 		delta -= delta_w;
1276 
1277 		/* Figure out how many additional periods this update spans */
1278 		periods = delta / 1024;
1279 		delta %= 1024;
1280 
1281 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1282 						  periods + 1);
1283 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1284 						     periods + 1);
1285 
1286 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
1287 		runnable_contrib = __compute_runnable_contrib(periods);
1288 		if (runnable)
1289 			sa->runnable_avg_sum += runnable_contrib;
1290 		sa->runnable_avg_period += runnable_contrib;
1291 	}
1292 
1293 	/* Remainder of delta accrued against u_0` */
1294 	if (runnable)
1295 		sa->runnable_avg_sum += delta;
1296 	sa->runnable_avg_period += delta;
1297 
1298 	return decayed;
1299 }
1300 
1301 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1302 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1303 {
1304 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1305 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
1306 
1307 	decays -= se->avg.decay_count;
1308 	if (!decays)
1309 		return 0;
1310 
1311 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1312 	se->avg.decay_count = 0;
1313 
1314 	return decays;
1315 }
1316 
1317 #ifdef CONFIG_FAIR_GROUP_SCHED
1318 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1319 						 int force_update)
1320 {
1321 	struct task_group *tg = cfs_rq->tg;
1322 	s64 tg_contrib;
1323 
1324 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1325 	tg_contrib -= cfs_rq->tg_load_contrib;
1326 
1327 	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1328 		atomic64_add(tg_contrib, &tg->load_avg);
1329 		cfs_rq->tg_load_contrib += tg_contrib;
1330 	}
1331 }
1332 
1333 /*
1334  * Aggregate cfs_rq runnable averages into an equivalent task_group
1335  * representation for computing load contributions.
1336  */
1337 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1338 						  struct cfs_rq *cfs_rq)
1339 {
1340 	struct task_group *tg = cfs_rq->tg;
1341 	long contrib;
1342 
1343 	/* The fraction of a cpu used by this cfs_rq */
1344 	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1345 			  sa->runnable_avg_period + 1);
1346 	contrib -= cfs_rq->tg_runnable_contrib;
1347 
1348 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1349 		atomic_add(contrib, &tg->runnable_avg);
1350 		cfs_rq->tg_runnable_contrib += contrib;
1351 	}
1352 }
1353 
1354 static inline void __update_group_entity_contrib(struct sched_entity *se)
1355 {
1356 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
1357 	struct task_group *tg = cfs_rq->tg;
1358 	int runnable_avg;
1359 
1360 	u64 contrib;
1361 
1362 	contrib = cfs_rq->tg_load_contrib * tg->shares;
1363 	se->avg.load_avg_contrib = div64_u64(contrib,
1364 					     atomic64_read(&tg->load_avg) + 1);
1365 
1366 	/*
1367 	 * For group entities we need to compute a correction term in the case
1368 	 * that they are consuming <1 cpu so that we would contribute the same
1369 	 * load as a task of equal weight.
1370 	 *
1371 	 * Explicitly co-ordinating this measurement would be expensive, but
1372 	 * fortunately the sum of each cpus contribution forms a usable
1373 	 * lower-bound on the true value.
1374 	 *
1375 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
1376 	 * (and the sum represents true value) or they are disjoint and we are
1377 	 * understating by the aggregate of their overlap.
1378 	 *
1379 	 * Extending this to N cpus, for a given overlap, the maximum amount we
1380 	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1381 	 * cpus that overlap for this interval and w_i is the interval width.
1382 	 *
1383 	 * On a small machine; the first term is well-bounded which bounds the
1384 	 * total error since w_i is a subset of the period.  Whereas on a
1385 	 * larger machine, while this first term can be larger, if w_i is the
1386 	 * of consequential size guaranteed to see n_i*w_i quickly converge to
1387 	 * our upper bound of 1-cpu.
1388 	 */
1389 	runnable_avg = atomic_read(&tg->runnable_avg);
1390 	if (runnable_avg < NICE_0_LOAD) {
1391 		se->avg.load_avg_contrib *= runnable_avg;
1392 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1393 	}
1394 }
1395 #else
1396 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1397 						 int force_update) {}
1398 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1399 						  struct cfs_rq *cfs_rq) {}
1400 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1401 #endif
1402 
1403 static inline void __update_task_entity_contrib(struct sched_entity *se)
1404 {
1405 	u32 contrib;
1406 
1407 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1408 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1409 	contrib /= (se->avg.runnable_avg_period + 1);
1410 	se->avg.load_avg_contrib = scale_load(contrib);
1411 }
1412 
1413 /* Compute the current contribution to load_avg by se, return any delta */
1414 static long __update_entity_load_avg_contrib(struct sched_entity *se)
1415 {
1416 	long old_contrib = se->avg.load_avg_contrib;
1417 
1418 	if (entity_is_task(se)) {
1419 		__update_task_entity_contrib(se);
1420 	} else {
1421 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1422 		__update_group_entity_contrib(se);
1423 	}
1424 
1425 	return se->avg.load_avg_contrib - old_contrib;
1426 }
1427 
1428 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1429 						 long load_contrib)
1430 {
1431 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
1432 		cfs_rq->blocked_load_avg -= load_contrib;
1433 	else
1434 		cfs_rq->blocked_load_avg = 0;
1435 }
1436 
1437 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1438 
1439 /* Update a sched_entity's runnable average */
1440 static inline void update_entity_load_avg(struct sched_entity *se,
1441 					  int update_cfs_rq)
1442 {
1443 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1444 	long contrib_delta;
1445 	u64 now;
1446 
1447 	/*
1448 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
1449 	 * case they are the parent of a throttled hierarchy.
1450 	 */
1451 	if (entity_is_task(se))
1452 		now = cfs_rq_clock_task(cfs_rq);
1453 	else
1454 		now = cfs_rq_clock_task(group_cfs_rq(se));
1455 
1456 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1457 		return;
1458 
1459 	contrib_delta = __update_entity_load_avg_contrib(se);
1460 
1461 	if (!update_cfs_rq)
1462 		return;
1463 
1464 	if (se->on_rq)
1465 		cfs_rq->runnable_load_avg += contrib_delta;
1466 	else
1467 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1468 }
1469 
1470 /*
1471  * Decay the load contributed by all blocked children and account this so that
1472  * their contribution may appropriately discounted when they wake up.
1473  */
1474 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1475 {
1476 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1477 	u64 decays;
1478 
1479 	decays = now - cfs_rq->last_decay;
1480 	if (!decays && !force_update)
1481 		return;
1482 
1483 	if (atomic64_read(&cfs_rq->removed_load)) {
1484 		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1485 		subtract_blocked_load_contrib(cfs_rq, removed_load);
1486 	}
1487 
1488 	if (decays) {
1489 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1490 						      decays);
1491 		atomic64_add(decays, &cfs_rq->decay_counter);
1492 		cfs_rq->last_decay = now;
1493 	}
1494 
1495 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1496 }
1497 
1498 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1499 {
1500 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1501 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
1502 }
1503 
1504 /* Add the load generated by se into cfs_rq's child load-average */
1505 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1506 						  struct sched_entity *se,
1507 						  int wakeup)
1508 {
1509 	/*
1510 	 * We track migrations using entity decay_count <= 0, on a wake-up
1511 	 * migration we use a negative decay count to track the remote decays
1512 	 * accumulated while sleeping.
1513 	 */
1514 	if (unlikely(se->avg.decay_count <= 0)) {
1515 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1516 		if (se->avg.decay_count) {
1517 			/*
1518 			 * In a wake-up migration we have to approximate the
1519 			 * time sleeping.  This is because we can't synchronize
1520 			 * clock_task between the two cpus, and it is not
1521 			 * guaranteed to be read-safe.  Instead, we can
1522 			 * approximate this using our carried decays, which are
1523 			 * explicitly atomically readable.
1524 			 */
1525 			se->avg.last_runnable_update -= (-se->avg.decay_count)
1526 							<< 20;
1527 			update_entity_load_avg(se, 0);
1528 			/* Indicate that we're now synchronized and on-rq */
1529 			se->avg.decay_count = 0;
1530 		}
1531 		wakeup = 0;
1532 	} else {
1533 		__synchronize_entity_decay(se);
1534 	}
1535 
1536 	/* migrated tasks did not contribute to our blocked load */
1537 	if (wakeup) {
1538 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1539 		update_entity_load_avg(se, 0);
1540 	}
1541 
1542 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1543 	/* we force update consideration on load-balancer moves */
1544 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1545 }
1546 
1547 /*
1548  * Remove se's load from this cfs_rq child load-average, if the entity is
1549  * transitioning to a blocked state we track its projected decay using
1550  * blocked_load_avg.
1551  */
1552 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1553 						  struct sched_entity *se,
1554 						  int sleep)
1555 {
1556 	update_entity_load_avg(se, 1);
1557 	/* we force update consideration on load-balancer moves */
1558 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
1559 
1560 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1561 	if (sleep) {
1562 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1563 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1564 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
1565 }
1566 
1567 /*
1568  * Update the rq's load with the elapsed running time before entering
1569  * idle. if the last scheduled task is not a CFS task, idle_enter will
1570  * be the only way to update the runnable statistic.
1571  */
1572 void idle_enter_fair(struct rq *this_rq)
1573 {
1574 	update_rq_runnable_avg(this_rq, 1);
1575 }
1576 
1577 /*
1578  * Update the rq's load with the elapsed idle time before a task is
1579  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1580  * be the only way to update the runnable statistic.
1581  */
1582 void idle_exit_fair(struct rq *this_rq)
1583 {
1584 	update_rq_runnable_avg(this_rq, 0);
1585 }
1586 
1587 #else
1588 static inline void update_entity_load_avg(struct sched_entity *se,
1589 					  int update_cfs_rq) {}
1590 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1591 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1592 					   struct sched_entity *se,
1593 					   int wakeup) {}
1594 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1595 					   struct sched_entity *se,
1596 					   int sleep) {}
1597 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1598 					      int force_update) {}
1599 #endif
1600 
1601 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1602 {
1603 #ifdef CONFIG_SCHEDSTATS
1604 	struct task_struct *tsk = NULL;
1605 
1606 	if (entity_is_task(se))
1607 		tsk = task_of(se);
1608 
1609 	if (se->statistics.sleep_start) {
1610 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1611 
1612 		if ((s64)delta < 0)
1613 			delta = 0;
1614 
1615 		if (unlikely(delta > se->statistics.sleep_max))
1616 			se->statistics.sleep_max = delta;
1617 
1618 		se->statistics.sleep_start = 0;
1619 		se->statistics.sum_sleep_runtime += delta;
1620 
1621 		if (tsk) {
1622 			account_scheduler_latency(tsk, delta >> 10, 1);
1623 			trace_sched_stat_sleep(tsk, delta);
1624 		}
1625 	}
1626 	if (se->statistics.block_start) {
1627 		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1628 
1629 		if ((s64)delta < 0)
1630 			delta = 0;
1631 
1632 		if (unlikely(delta > se->statistics.block_max))
1633 			se->statistics.block_max = delta;
1634 
1635 		se->statistics.block_start = 0;
1636 		se->statistics.sum_sleep_runtime += delta;
1637 
1638 		if (tsk) {
1639 			if (tsk->in_iowait) {
1640 				se->statistics.iowait_sum += delta;
1641 				se->statistics.iowait_count++;
1642 				trace_sched_stat_iowait(tsk, delta);
1643 			}
1644 
1645 			trace_sched_stat_blocked(tsk, delta);
1646 
1647 			/*
1648 			 * Blocking time is in units of nanosecs, so shift by
1649 			 * 20 to get a milliseconds-range estimation of the
1650 			 * amount of time that the task spent sleeping:
1651 			 */
1652 			if (unlikely(prof_on == SLEEP_PROFILING)) {
1653 				profile_hits(SLEEP_PROFILING,
1654 						(void *)get_wchan(tsk),
1655 						delta >> 20);
1656 			}
1657 			account_scheduler_latency(tsk, delta >> 10, 0);
1658 		}
1659 	}
1660 #endif
1661 }
1662 
1663 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1664 {
1665 #ifdef CONFIG_SCHED_DEBUG
1666 	s64 d = se->vruntime - cfs_rq->min_vruntime;
1667 
1668 	if (d < 0)
1669 		d = -d;
1670 
1671 	if (d > 3*sysctl_sched_latency)
1672 		schedstat_inc(cfs_rq, nr_spread_over);
1673 #endif
1674 }
1675 
1676 static void
1677 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1678 {
1679 	u64 vruntime = cfs_rq->min_vruntime;
1680 
1681 	/*
1682 	 * The 'current' period is already promised to the current tasks,
1683 	 * however the extra weight of the new task will slow them down a
1684 	 * little, place the new task so that it fits in the slot that
1685 	 * stays open at the end.
1686 	 */
1687 	if (initial && sched_feat(START_DEBIT))
1688 		vruntime += sched_vslice(cfs_rq, se);
1689 
1690 	/* sleeps up to a single latency don't count. */
1691 	if (!initial) {
1692 		unsigned long thresh = sysctl_sched_latency;
1693 
1694 		/*
1695 		 * Halve their sleep time's effect, to allow
1696 		 * for a gentler effect of sleepers:
1697 		 */
1698 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1699 			thresh >>= 1;
1700 
1701 		vruntime -= thresh;
1702 	}
1703 
1704 	/* ensure we never gain time by being placed backwards. */
1705 	se->vruntime = max_vruntime(se->vruntime, vruntime);
1706 }
1707 
1708 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1709 
1710 static void
1711 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1712 {
1713 	/*
1714 	 * Update the normalized vruntime before updating min_vruntime
1715 	 * through callig update_curr().
1716 	 */
1717 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1718 		se->vruntime += cfs_rq->min_vruntime;
1719 
1720 	/*
1721 	 * Update run-time statistics of the 'current'.
1722 	 */
1723 	update_curr(cfs_rq);
1724 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1725 	account_entity_enqueue(cfs_rq, se);
1726 	update_cfs_shares(cfs_rq);
1727 
1728 	if (flags & ENQUEUE_WAKEUP) {
1729 		place_entity(cfs_rq, se, 0);
1730 		enqueue_sleeper(cfs_rq, se);
1731 	}
1732 
1733 	update_stats_enqueue(cfs_rq, se);
1734 	check_spread(cfs_rq, se);
1735 	if (se != cfs_rq->curr)
1736 		__enqueue_entity(cfs_rq, se);
1737 	se->on_rq = 1;
1738 
1739 	if (cfs_rq->nr_running == 1) {
1740 		list_add_leaf_cfs_rq(cfs_rq);
1741 		check_enqueue_throttle(cfs_rq);
1742 	}
1743 }
1744 
1745 static void __clear_buddies_last(struct sched_entity *se)
1746 {
1747 	for_each_sched_entity(se) {
1748 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1749 		if (cfs_rq->last == se)
1750 			cfs_rq->last = NULL;
1751 		else
1752 			break;
1753 	}
1754 }
1755 
1756 static void __clear_buddies_next(struct sched_entity *se)
1757 {
1758 	for_each_sched_entity(se) {
1759 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1760 		if (cfs_rq->next == se)
1761 			cfs_rq->next = NULL;
1762 		else
1763 			break;
1764 	}
1765 }
1766 
1767 static void __clear_buddies_skip(struct sched_entity *se)
1768 {
1769 	for_each_sched_entity(se) {
1770 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1771 		if (cfs_rq->skip == se)
1772 			cfs_rq->skip = NULL;
1773 		else
1774 			break;
1775 	}
1776 }
1777 
1778 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1779 {
1780 	if (cfs_rq->last == se)
1781 		__clear_buddies_last(se);
1782 
1783 	if (cfs_rq->next == se)
1784 		__clear_buddies_next(se);
1785 
1786 	if (cfs_rq->skip == se)
1787 		__clear_buddies_skip(se);
1788 }
1789 
1790 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1791 
1792 static void
1793 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1794 {
1795 	/*
1796 	 * Update run-time statistics of the 'current'.
1797 	 */
1798 	update_curr(cfs_rq);
1799 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1800 
1801 	update_stats_dequeue(cfs_rq, se);
1802 	if (flags & DEQUEUE_SLEEP) {
1803 #ifdef CONFIG_SCHEDSTATS
1804 		if (entity_is_task(se)) {
1805 			struct task_struct *tsk = task_of(se);
1806 
1807 			if (tsk->state & TASK_INTERRUPTIBLE)
1808 				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1809 			if (tsk->state & TASK_UNINTERRUPTIBLE)
1810 				se->statistics.block_start = rq_of(cfs_rq)->clock;
1811 		}
1812 #endif
1813 	}
1814 
1815 	clear_buddies(cfs_rq, se);
1816 
1817 	if (se != cfs_rq->curr)
1818 		__dequeue_entity(cfs_rq, se);
1819 	se->on_rq = 0;
1820 	account_entity_dequeue(cfs_rq, se);
1821 
1822 	/*
1823 	 * Normalize the entity after updating the min_vruntime because the
1824 	 * update can refer to the ->curr item and we need to reflect this
1825 	 * movement in our normalized position.
1826 	 */
1827 	if (!(flags & DEQUEUE_SLEEP))
1828 		se->vruntime -= cfs_rq->min_vruntime;
1829 
1830 	/* return excess runtime on last dequeue */
1831 	return_cfs_rq_runtime(cfs_rq);
1832 
1833 	update_min_vruntime(cfs_rq);
1834 	update_cfs_shares(cfs_rq);
1835 }
1836 
1837 /*
1838  * Preempt the current task with a newly woken task if needed:
1839  */
1840 static void
1841 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1842 {
1843 	unsigned long ideal_runtime, delta_exec;
1844 	struct sched_entity *se;
1845 	s64 delta;
1846 
1847 	ideal_runtime = sched_slice(cfs_rq, curr);
1848 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1849 	if (delta_exec > ideal_runtime) {
1850 		resched_task(rq_of(cfs_rq)->curr);
1851 		/*
1852 		 * The current task ran long enough, ensure it doesn't get
1853 		 * re-elected due to buddy favours.
1854 		 */
1855 		clear_buddies(cfs_rq, curr);
1856 		return;
1857 	}
1858 
1859 	/*
1860 	 * Ensure that a task that missed wakeup preemption by a
1861 	 * narrow margin doesn't have to wait for a full slice.
1862 	 * This also mitigates buddy induced latencies under load.
1863 	 */
1864 	if (delta_exec < sysctl_sched_min_granularity)
1865 		return;
1866 
1867 	se = __pick_first_entity(cfs_rq);
1868 	delta = curr->vruntime - se->vruntime;
1869 
1870 	if (delta < 0)
1871 		return;
1872 
1873 	if (delta > ideal_runtime)
1874 		resched_task(rq_of(cfs_rq)->curr);
1875 }
1876 
1877 static void
1878 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1879 {
1880 	/* 'current' is not kept within the tree. */
1881 	if (se->on_rq) {
1882 		/*
1883 		 * Any task has to be enqueued before it get to execute on
1884 		 * a CPU. So account for the time it spent waiting on the
1885 		 * runqueue.
1886 		 */
1887 		update_stats_wait_end(cfs_rq, se);
1888 		__dequeue_entity(cfs_rq, se);
1889 	}
1890 
1891 	update_stats_curr_start(cfs_rq, se);
1892 	cfs_rq->curr = se;
1893 #ifdef CONFIG_SCHEDSTATS
1894 	/*
1895 	 * Track our maximum slice length, if the CPU's load is at
1896 	 * least twice that of our own weight (i.e. dont track it
1897 	 * when there are only lesser-weight tasks around):
1898 	 */
1899 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1900 		se->statistics.slice_max = max(se->statistics.slice_max,
1901 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
1902 	}
1903 #endif
1904 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1905 }
1906 
1907 static int
1908 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1909 
1910 /*
1911  * Pick the next process, keeping these things in mind, in this order:
1912  * 1) keep things fair between processes/task groups
1913  * 2) pick the "next" process, since someone really wants that to run
1914  * 3) pick the "last" process, for cache locality
1915  * 4) do not run the "skip" process, if something else is available
1916  */
1917 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1918 {
1919 	struct sched_entity *se = __pick_first_entity(cfs_rq);
1920 	struct sched_entity *left = se;
1921 
1922 	/*
1923 	 * Avoid running the skip buddy, if running something else can
1924 	 * be done without getting too unfair.
1925 	 */
1926 	if (cfs_rq->skip == se) {
1927 		struct sched_entity *second = __pick_next_entity(se);
1928 		if (second && wakeup_preempt_entity(second, left) < 1)
1929 			se = second;
1930 	}
1931 
1932 	/*
1933 	 * Prefer last buddy, try to return the CPU to a preempted task.
1934 	 */
1935 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1936 		se = cfs_rq->last;
1937 
1938 	/*
1939 	 * Someone really wants this to run. If it's not unfair, run it.
1940 	 */
1941 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1942 		se = cfs_rq->next;
1943 
1944 	clear_buddies(cfs_rq, se);
1945 
1946 	return se;
1947 }
1948 
1949 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1950 
1951 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1952 {
1953 	/*
1954 	 * If still on the runqueue then deactivate_task()
1955 	 * was not called and update_curr() has to be done:
1956 	 */
1957 	if (prev->on_rq)
1958 		update_curr(cfs_rq);
1959 
1960 	/* throttle cfs_rqs exceeding runtime */
1961 	check_cfs_rq_runtime(cfs_rq);
1962 
1963 	check_spread(cfs_rq, prev);
1964 	if (prev->on_rq) {
1965 		update_stats_wait_start(cfs_rq, prev);
1966 		/* Put 'current' back into the tree. */
1967 		__enqueue_entity(cfs_rq, prev);
1968 		/* in !on_rq case, update occurred at dequeue */
1969 		update_entity_load_avg(prev, 1);
1970 	}
1971 	cfs_rq->curr = NULL;
1972 }
1973 
1974 static void
1975 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1976 {
1977 	/*
1978 	 * Update run-time statistics of the 'current'.
1979 	 */
1980 	update_curr(cfs_rq);
1981 
1982 	/*
1983 	 * Ensure that runnable average is periodically updated.
1984 	 */
1985 	update_entity_load_avg(curr, 1);
1986 	update_cfs_rq_blocked_load(cfs_rq, 1);
1987 
1988 #ifdef CONFIG_SCHED_HRTICK
1989 	/*
1990 	 * queued ticks are scheduled to match the slice, so don't bother
1991 	 * validating it and just reschedule.
1992 	 */
1993 	if (queued) {
1994 		resched_task(rq_of(cfs_rq)->curr);
1995 		return;
1996 	}
1997 	/*
1998 	 * don't let the period tick interfere with the hrtick preemption
1999 	 */
2000 	if (!sched_feat(DOUBLE_TICK) &&
2001 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2002 		return;
2003 #endif
2004 
2005 	if (cfs_rq->nr_running > 1)
2006 		check_preempt_tick(cfs_rq, curr);
2007 }
2008 
2009 
2010 /**************************************************
2011  * CFS bandwidth control machinery
2012  */
2013 
2014 #ifdef CONFIG_CFS_BANDWIDTH
2015 
2016 #ifdef HAVE_JUMP_LABEL
2017 static struct static_key __cfs_bandwidth_used;
2018 
2019 static inline bool cfs_bandwidth_used(void)
2020 {
2021 	return static_key_false(&__cfs_bandwidth_used);
2022 }
2023 
2024 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2025 {
2026 	/* only need to count groups transitioning between enabled/!enabled */
2027 	if (enabled && !was_enabled)
2028 		static_key_slow_inc(&__cfs_bandwidth_used);
2029 	else if (!enabled && was_enabled)
2030 		static_key_slow_dec(&__cfs_bandwidth_used);
2031 }
2032 #else /* HAVE_JUMP_LABEL */
2033 static bool cfs_bandwidth_used(void)
2034 {
2035 	return true;
2036 }
2037 
2038 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2039 #endif /* HAVE_JUMP_LABEL */
2040 
2041 /*
2042  * default period for cfs group bandwidth.
2043  * default: 0.1s, units: nanoseconds
2044  */
2045 static inline u64 default_cfs_period(void)
2046 {
2047 	return 100000000ULL;
2048 }
2049 
2050 static inline u64 sched_cfs_bandwidth_slice(void)
2051 {
2052 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2053 }
2054 
2055 /*
2056  * Replenish runtime according to assigned quota and update expiration time.
2057  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2058  * additional synchronization around rq->lock.
2059  *
2060  * requires cfs_b->lock
2061  */
2062 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2063 {
2064 	u64 now;
2065 
2066 	if (cfs_b->quota == RUNTIME_INF)
2067 		return;
2068 
2069 	now = sched_clock_cpu(smp_processor_id());
2070 	cfs_b->runtime = cfs_b->quota;
2071 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2072 }
2073 
2074 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2075 {
2076 	return &tg->cfs_bandwidth;
2077 }
2078 
2079 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2080 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2081 {
2082 	if (unlikely(cfs_rq->throttle_count))
2083 		return cfs_rq->throttled_clock_task;
2084 
2085 	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2086 }
2087 
2088 /* returns 0 on failure to allocate runtime */
2089 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2090 {
2091 	struct task_group *tg = cfs_rq->tg;
2092 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2093 	u64 amount = 0, min_amount, expires;
2094 
2095 	/* note: this is a positive sum as runtime_remaining <= 0 */
2096 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2097 
2098 	raw_spin_lock(&cfs_b->lock);
2099 	if (cfs_b->quota == RUNTIME_INF)
2100 		amount = min_amount;
2101 	else {
2102 		/*
2103 		 * If the bandwidth pool has become inactive, then at least one
2104 		 * period must have elapsed since the last consumption.
2105 		 * Refresh the global state and ensure bandwidth timer becomes
2106 		 * active.
2107 		 */
2108 		if (!cfs_b->timer_active) {
2109 			__refill_cfs_bandwidth_runtime(cfs_b);
2110 			__start_cfs_bandwidth(cfs_b);
2111 		}
2112 
2113 		if (cfs_b->runtime > 0) {
2114 			amount = min(cfs_b->runtime, min_amount);
2115 			cfs_b->runtime -= amount;
2116 			cfs_b->idle = 0;
2117 		}
2118 	}
2119 	expires = cfs_b->runtime_expires;
2120 	raw_spin_unlock(&cfs_b->lock);
2121 
2122 	cfs_rq->runtime_remaining += amount;
2123 	/*
2124 	 * we may have advanced our local expiration to account for allowed
2125 	 * spread between our sched_clock and the one on which runtime was
2126 	 * issued.
2127 	 */
2128 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2129 		cfs_rq->runtime_expires = expires;
2130 
2131 	return cfs_rq->runtime_remaining > 0;
2132 }
2133 
2134 /*
2135  * Note: This depends on the synchronization provided by sched_clock and the
2136  * fact that rq->clock snapshots this value.
2137  */
2138 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139 {
2140 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2141 	struct rq *rq = rq_of(cfs_rq);
2142 
2143 	/* if the deadline is ahead of our clock, nothing to do */
2144 	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2145 		return;
2146 
2147 	if (cfs_rq->runtime_remaining < 0)
2148 		return;
2149 
2150 	/*
2151 	 * If the local deadline has passed we have to consider the
2152 	 * possibility that our sched_clock is 'fast' and the global deadline
2153 	 * has not truly expired.
2154 	 *
2155 	 * Fortunately we can check determine whether this the case by checking
2156 	 * whether the global deadline has advanced.
2157 	 */
2158 
2159 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2160 		/* extend local deadline, drift is bounded above by 2 ticks */
2161 		cfs_rq->runtime_expires += TICK_NSEC;
2162 	} else {
2163 		/* global deadline is ahead, expiration has passed */
2164 		cfs_rq->runtime_remaining = 0;
2165 	}
2166 }
2167 
2168 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2169 				     unsigned long delta_exec)
2170 {
2171 	/* dock delta_exec before expiring quota (as it could span periods) */
2172 	cfs_rq->runtime_remaining -= delta_exec;
2173 	expire_cfs_rq_runtime(cfs_rq);
2174 
2175 	if (likely(cfs_rq->runtime_remaining > 0))
2176 		return;
2177 
2178 	/*
2179 	 * if we're unable to extend our runtime we resched so that the active
2180 	 * hierarchy can be throttled
2181 	 */
2182 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2183 		resched_task(rq_of(cfs_rq)->curr);
2184 }
2185 
2186 static __always_inline
2187 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2188 {
2189 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2190 		return;
2191 
2192 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
2193 }
2194 
2195 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2196 {
2197 	return cfs_bandwidth_used() && cfs_rq->throttled;
2198 }
2199 
2200 /* check whether cfs_rq, or any parent, is throttled */
2201 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2202 {
2203 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
2204 }
2205 
2206 /*
2207  * Ensure that neither of the group entities corresponding to src_cpu or
2208  * dest_cpu are members of a throttled hierarchy when performing group
2209  * load-balance operations.
2210  */
2211 static inline int throttled_lb_pair(struct task_group *tg,
2212 				    int src_cpu, int dest_cpu)
2213 {
2214 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2215 
2216 	src_cfs_rq = tg->cfs_rq[src_cpu];
2217 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
2218 
2219 	return throttled_hierarchy(src_cfs_rq) ||
2220 	       throttled_hierarchy(dest_cfs_rq);
2221 }
2222 
2223 /* updated child weight may affect parent so we have to do this bottom up */
2224 static int tg_unthrottle_up(struct task_group *tg, void *data)
2225 {
2226 	struct rq *rq = data;
2227 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2228 
2229 	cfs_rq->throttle_count--;
2230 #ifdef CONFIG_SMP
2231 	if (!cfs_rq->throttle_count) {
2232 		/* adjust cfs_rq_clock_task() */
2233 		cfs_rq->throttled_clock_task_time += rq->clock_task -
2234 					     cfs_rq->throttled_clock_task;
2235 	}
2236 #endif
2237 
2238 	return 0;
2239 }
2240 
2241 static int tg_throttle_down(struct task_group *tg, void *data)
2242 {
2243 	struct rq *rq = data;
2244 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2245 
2246 	/* group is entering throttled state, stop time */
2247 	if (!cfs_rq->throttle_count)
2248 		cfs_rq->throttled_clock_task = rq->clock_task;
2249 	cfs_rq->throttle_count++;
2250 
2251 	return 0;
2252 }
2253 
2254 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2255 {
2256 	struct rq *rq = rq_of(cfs_rq);
2257 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2258 	struct sched_entity *se;
2259 	long task_delta, dequeue = 1;
2260 
2261 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2262 
2263 	/* freeze hierarchy runnable averages while throttled */
2264 	rcu_read_lock();
2265 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2266 	rcu_read_unlock();
2267 
2268 	task_delta = cfs_rq->h_nr_running;
2269 	for_each_sched_entity(se) {
2270 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2271 		/* throttled entity or throttle-on-deactivate */
2272 		if (!se->on_rq)
2273 			break;
2274 
2275 		if (dequeue)
2276 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2277 		qcfs_rq->h_nr_running -= task_delta;
2278 
2279 		if (qcfs_rq->load.weight)
2280 			dequeue = 0;
2281 	}
2282 
2283 	if (!se)
2284 		rq->nr_running -= task_delta;
2285 
2286 	cfs_rq->throttled = 1;
2287 	cfs_rq->throttled_clock = rq->clock;
2288 	raw_spin_lock(&cfs_b->lock);
2289 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2290 	raw_spin_unlock(&cfs_b->lock);
2291 }
2292 
2293 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2294 {
2295 	struct rq *rq = rq_of(cfs_rq);
2296 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2297 	struct sched_entity *se;
2298 	int enqueue = 1;
2299 	long task_delta;
2300 
2301 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2302 
2303 	cfs_rq->throttled = 0;
2304 	raw_spin_lock(&cfs_b->lock);
2305 	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2306 	list_del_rcu(&cfs_rq->throttled_list);
2307 	raw_spin_unlock(&cfs_b->lock);
2308 
2309 	update_rq_clock(rq);
2310 	/* update hierarchical throttle state */
2311 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2312 
2313 	if (!cfs_rq->load.weight)
2314 		return;
2315 
2316 	task_delta = cfs_rq->h_nr_running;
2317 	for_each_sched_entity(se) {
2318 		if (se->on_rq)
2319 			enqueue = 0;
2320 
2321 		cfs_rq = cfs_rq_of(se);
2322 		if (enqueue)
2323 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2324 		cfs_rq->h_nr_running += task_delta;
2325 
2326 		if (cfs_rq_throttled(cfs_rq))
2327 			break;
2328 	}
2329 
2330 	if (!se)
2331 		rq->nr_running += task_delta;
2332 
2333 	/* determine whether we need to wake up potentially idle cpu */
2334 	if (rq->curr == rq->idle && rq->cfs.nr_running)
2335 		resched_task(rq->curr);
2336 }
2337 
2338 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2339 		u64 remaining, u64 expires)
2340 {
2341 	struct cfs_rq *cfs_rq;
2342 	u64 runtime = remaining;
2343 
2344 	rcu_read_lock();
2345 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2346 				throttled_list) {
2347 		struct rq *rq = rq_of(cfs_rq);
2348 
2349 		raw_spin_lock(&rq->lock);
2350 		if (!cfs_rq_throttled(cfs_rq))
2351 			goto next;
2352 
2353 		runtime = -cfs_rq->runtime_remaining + 1;
2354 		if (runtime > remaining)
2355 			runtime = remaining;
2356 		remaining -= runtime;
2357 
2358 		cfs_rq->runtime_remaining += runtime;
2359 		cfs_rq->runtime_expires = expires;
2360 
2361 		/* we check whether we're throttled above */
2362 		if (cfs_rq->runtime_remaining > 0)
2363 			unthrottle_cfs_rq(cfs_rq);
2364 
2365 next:
2366 		raw_spin_unlock(&rq->lock);
2367 
2368 		if (!remaining)
2369 			break;
2370 	}
2371 	rcu_read_unlock();
2372 
2373 	return remaining;
2374 }
2375 
2376 /*
2377  * Responsible for refilling a task_group's bandwidth and unthrottling its
2378  * cfs_rqs as appropriate. If there has been no activity within the last
2379  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2380  * used to track this state.
2381  */
2382 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2383 {
2384 	u64 runtime, runtime_expires;
2385 	int idle = 1, throttled;
2386 
2387 	raw_spin_lock(&cfs_b->lock);
2388 	/* no need to continue the timer with no bandwidth constraint */
2389 	if (cfs_b->quota == RUNTIME_INF)
2390 		goto out_unlock;
2391 
2392 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2393 	/* idle depends on !throttled (for the case of a large deficit) */
2394 	idle = cfs_b->idle && !throttled;
2395 	cfs_b->nr_periods += overrun;
2396 
2397 	/* if we're going inactive then everything else can be deferred */
2398 	if (idle)
2399 		goto out_unlock;
2400 
2401 	__refill_cfs_bandwidth_runtime(cfs_b);
2402 
2403 	if (!throttled) {
2404 		/* mark as potentially idle for the upcoming period */
2405 		cfs_b->idle = 1;
2406 		goto out_unlock;
2407 	}
2408 
2409 	/* account preceding periods in which throttling occurred */
2410 	cfs_b->nr_throttled += overrun;
2411 
2412 	/*
2413 	 * There are throttled entities so we must first use the new bandwidth
2414 	 * to unthrottle them before making it generally available.  This
2415 	 * ensures that all existing debts will be paid before a new cfs_rq is
2416 	 * allowed to run.
2417 	 */
2418 	runtime = cfs_b->runtime;
2419 	runtime_expires = cfs_b->runtime_expires;
2420 	cfs_b->runtime = 0;
2421 
2422 	/*
2423 	 * This check is repeated as we are holding onto the new bandwidth
2424 	 * while we unthrottle.  This can potentially race with an unthrottled
2425 	 * group trying to acquire new bandwidth from the global pool.
2426 	 */
2427 	while (throttled && runtime > 0) {
2428 		raw_spin_unlock(&cfs_b->lock);
2429 		/* we can't nest cfs_b->lock while distributing bandwidth */
2430 		runtime = distribute_cfs_runtime(cfs_b, runtime,
2431 						 runtime_expires);
2432 		raw_spin_lock(&cfs_b->lock);
2433 
2434 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2435 	}
2436 
2437 	/* return (any) remaining runtime */
2438 	cfs_b->runtime = runtime;
2439 	/*
2440 	 * While we are ensured activity in the period following an
2441 	 * unthrottle, this also covers the case in which the new bandwidth is
2442 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
2443 	 * timer to remain active while there are any throttled entities.)
2444 	 */
2445 	cfs_b->idle = 0;
2446 out_unlock:
2447 	if (idle)
2448 		cfs_b->timer_active = 0;
2449 	raw_spin_unlock(&cfs_b->lock);
2450 
2451 	return idle;
2452 }
2453 
2454 /* a cfs_rq won't donate quota below this amount */
2455 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2456 /* minimum remaining period time to redistribute slack quota */
2457 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2458 /* how long we wait to gather additional slack before distributing */
2459 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2460 
2461 /* are we near the end of the current quota period? */
2462 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2463 {
2464 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
2465 	u64 remaining;
2466 
2467 	/* if the call-back is running a quota refresh is already occurring */
2468 	if (hrtimer_callback_running(refresh_timer))
2469 		return 1;
2470 
2471 	/* is a quota refresh about to occur? */
2472 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2473 	if (remaining < min_expire)
2474 		return 1;
2475 
2476 	return 0;
2477 }
2478 
2479 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2480 {
2481 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2482 
2483 	/* if there's a quota refresh soon don't bother with slack */
2484 	if (runtime_refresh_within(cfs_b, min_left))
2485 		return;
2486 
2487 	start_bandwidth_timer(&cfs_b->slack_timer,
2488 				ns_to_ktime(cfs_bandwidth_slack_period));
2489 }
2490 
2491 /* we know any runtime found here is valid as update_curr() precedes return */
2492 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2493 {
2494 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2495 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2496 
2497 	if (slack_runtime <= 0)
2498 		return;
2499 
2500 	raw_spin_lock(&cfs_b->lock);
2501 	if (cfs_b->quota != RUNTIME_INF &&
2502 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2503 		cfs_b->runtime += slack_runtime;
2504 
2505 		/* we are under rq->lock, defer unthrottling using a timer */
2506 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2507 		    !list_empty(&cfs_b->throttled_cfs_rq))
2508 			start_cfs_slack_bandwidth(cfs_b);
2509 	}
2510 	raw_spin_unlock(&cfs_b->lock);
2511 
2512 	/* even if it's not valid for return we don't want to try again */
2513 	cfs_rq->runtime_remaining -= slack_runtime;
2514 }
2515 
2516 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2517 {
2518 	if (!cfs_bandwidth_used())
2519 		return;
2520 
2521 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2522 		return;
2523 
2524 	__return_cfs_rq_runtime(cfs_rq);
2525 }
2526 
2527 /*
2528  * This is done with a timer (instead of inline with bandwidth return) since
2529  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2530  */
2531 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2532 {
2533 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2534 	u64 expires;
2535 
2536 	/* confirm we're still not at a refresh boundary */
2537 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2538 		return;
2539 
2540 	raw_spin_lock(&cfs_b->lock);
2541 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2542 		runtime = cfs_b->runtime;
2543 		cfs_b->runtime = 0;
2544 	}
2545 	expires = cfs_b->runtime_expires;
2546 	raw_spin_unlock(&cfs_b->lock);
2547 
2548 	if (!runtime)
2549 		return;
2550 
2551 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2552 
2553 	raw_spin_lock(&cfs_b->lock);
2554 	if (expires == cfs_b->runtime_expires)
2555 		cfs_b->runtime = runtime;
2556 	raw_spin_unlock(&cfs_b->lock);
2557 }
2558 
2559 /*
2560  * When a group wakes up we want to make sure that its quota is not already
2561  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2562  * runtime as update_curr() throttling can not not trigger until it's on-rq.
2563  */
2564 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2565 {
2566 	if (!cfs_bandwidth_used())
2567 		return;
2568 
2569 	/* an active group must be handled by the update_curr()->put() path */
2570 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2571 		return;
2572 
2573 	/* ensure the group is not already throttled */
2574 	if (cfs_rq_throttled(cfs_rq))
2575 		return;
2576 
2577 	/* update runtime allocation */
2578 	account_cfs_rq_runtime(cfs_rq, 0);
2579 	if (cfs_rq->runtime_remaining <= 0)
2580 		throttle_cfs_rq(cfs_rq);
2581 }
2582 
2583 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2584 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2585 {
2586 	if (!cfs_bandwidth_used())
2587 		return;
2588 
2589 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2590 		return;
2591 
2592 	/*
2593 	 * it's possible for a throttled entity to be forced into a running
2594 	 * state (e.g. set_curr_task), in this case we're finished.
2595 	 */
2596 	if (cfs_rq_throttled(cfs_rq))
2597 		return;
2598 
2599 	throttle_cfs_rq(cfs_rq);
2600 }
2601 
2602 static inline u64 default_cfs_period(void);
2603 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2604 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2605 
2606 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2607 {
2608 	struct cfs_bandwidth *cfs_b =
2609 		container_of(timer, struct cfs_bandwidth, slack_timer);
2610 	do_sched_cfs_slack_timer(cfs_b);
2611 
2612 	return HRTIMER_NORESTART;
2613 }
2614 
2615 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2616 {
2617 	struct cfs_bandwidth *cfs_b =
2618 		container_of(timer, struct cfs_bandwidth, period_timer);
2619 	ktime_t now;
2620 	int overrun;
2621 	int idle = 0;
2622 
2623 	for (;;) {
2624 		now = hrtimer_cb_get_time(timer);
2625 		overrun = hrtimer_forward(timer, now, cfs_b->period);
2626 
2627 		if (!overrun)
2628 			break;
2629 
2630 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
2631 	}
2632 
2633 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2634 }
2635 
2636 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2637 {
2638 	raw_spin_lock_init(&cfs_b->lock);
2639 	cfs_b->runtime = 0;
2640 	cfs_b->quota = RUNTIME_INF;
2641 	cfs_b->period = ns_to_ktime(default_cfs_period());
2642 
2643 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2644 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2645 	cfs_b->period_timer.function = sched_cfs_period_timer;
2646 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2647 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
2648 }
2649 
2650 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2651 {
2652 	cfs_rq->runtime_enabled = 0;
2653 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2654 }
2655 
2656 /* requires cfs_b->lock, may release to reprogram timer */
2657 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2658 {
2659 	/*
2660 	 * The timer may be active because we're trying to set a new bandwidth
2661 	 * period or because we're racing with the tear-down path
2662 	 * (timer_active==0 becomes visible before the hrtimer call-back
2663 	 * terminates).  In either case we ensure that it's re-programmed
2664 	 */
2665 	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2666 		raw_spin_unlock(&cfs_b->lock);
2667 		/* ensure cfs_b->lock is available while we wait */
2668 		hrtimer_cancel(&cfs_b->period_timer);
2669 
2670 		raw_spin_lock(&cfs_b->lock);
2671 		/* if someone else restarted the timer then we're done */
2672 		if (cfs_b->timer_active)
2673 			return;
2674 	}
2675 
2676 	cfs_b->timer_active = 1;
2677 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2678 }
2679 
2680 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2681 {
2682 	hrtimer_cancel(&cfs_b->period_timer);
2683 	hrtimer_cancel(&cfs_b->slack_timer);
2684 }
2685 
2686 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2687 {
2688 	struct cfs_rq *cfs_rq;
2689 
2690 	for_each_leaf_cfs_rq(rq, cfs_rq) {
2691 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2692 
2693 		if (!cfs_rq->runtime_enabled)
2694 			continue;
2695 
2696 		/*
2697 		 * clock_task is not advancing so we just need to make sure
2698 		 * there's some valid quota amount
2699 		 */
2700 		cfs_rq->runtime_remaining = cfs_b->quota;
2701 		if (cfs_rq_throttled(cfs_rq))
2702 			unthrottle_cfs_rq(cfs_rq);
2703 	}
2704 }
2705 
2706 #else /* CONFIG_CFS_BANDWIDTH */
2707 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2708 {
2709 	return rq_of(cfs_rq)->clock_task;
2710 }
2711 
2712 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2713 				     unsigned long delta_exec) {}
2714 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2715 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2716 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2717 
2718 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2719 {
2720 	return 0;
2721 }
2722 
2723 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2724 {
2725 	return 0;
2726 }
2727 
2728 static inline int throttled_lb_pair(struct task_group *tg,
2729 				    int src_cpu, int dest_cpu)
2730 {
2731 	return 0;
2732 }
2733 
2734 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2735 
2736 #ifdef CONFIG_FAIR_GROUP_SCHED
2737 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2738 #endif
2739 
2740 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2741 {
2742 	return NULL;
2743 }
2744 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2745 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2746 
2747 #endif /* CONFIG_CFS_BANDWIDTH */
2748 
2749 /**************************************************
2750  * CFS operations on tasks:
2751  */
2752 
2753 #ifdef CONFIG_SCHED_HRTICK
2754 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2755 {
2756 	struct sched_entity *se = &p->se;
2757 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2758 
2759 	WARN_ON(task_rq(p) != rq);
2760 
2761 	if (cfs_rq->nr_running > 1) {
2762 		u64 slice = sched_slice(cfs_rq, se);
2763 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2764 		s64 delta = slice - ran;
2765 
2766 		if (delta < 0) {
2767 			if (rq->curr == p)
2768 				resched_task(p);
2769 			return;
2770 		}
2771 
2772 		/*
2773 		 * Don't schedule slices shorter than 10000ns, that just
2774 		 * doesn't make sense. Rely on vruntime for fairness.
2775 		 */
2776 		if (rq->curr != p)
2777 			delta = max_t(s64, 10000LL, delta);
2778 
2779 		hrtick_start(rq, delta);
2780 	}
2781 }
2782 
2783 /*
2784  * called from enqueue/dequeue and updates the hrtick when the
2785  * current task is from our class and nr_running is low enough
2786  * to matter.
2787  */
2788 static void hrtick_update(struct rq *rq)
2789 {
2790 	struct task_struct *curr = rq->curr;
2791 
2792 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2793 		return;
2794 
2795 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2796 		hrtick_start_fair(rq, curr);
2797 }
2798 #else /* !CONFIG_SCHED_HRTICK */
2799 static inline void
2800 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2801 {
2802 }
2803 
2804 static inline void hrtick_update(struct rq *rq)
2805 {
2806 }
2807 #endif
2808 
2809 /*
2810  * The enqueue_task method is called before nr_running is
2811  * increased. Here we update the fair scheduling stats and
2812  * then put the task into the rbtree:
2813  */
2814 static void
2815 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2816 {
2817 	struct cfs_rq *cfs_rq;
2818 	struct sched_entity *se = &p->se;
2819 
2820 	for_each_sched_entity(se) {
2821 		if (se->on_rq)
2822 			break;
2823 		cfs_rq = cfs_rq_of(se);
2824 		enqueue_entity(cfs_rq, se, flags);
2825 
2826 		/*
2827 		 * end evaluation on encountering a throttled cfs_rq
2828 		 *
2829 		 * note: in the case of encountering a throttled cfs_rq we will
2830 		 * post the final h_nr_running increment below.
2831 		*/
2832 		if (cfs_rq_throttled(cfs_rq))
2833 			break;
2834 		cfs_rq->h_nr_running++;
2835 
2836 		flags = ENQUEUE_WAKEUP;
2837 	}
2838 
2839 	for_each_sched_entity(se) {
2840 		cfs_rq = cfs_rq_of(se);
2841 		cfs_rq->h_nr_running++;
2842 
2843 		if (cfs_rq_throttled(cfs_rq))
2844 			break;
2845 
2846 		update_cfs_shares(cfs_rq);
2847 		update_entity_load_avg(se, 1);
2848 	}
2849 
2850 	if (!se) {
2851 		update_rq_runnable_avg(rq, rq->nr_running);
2852 		inc_nr_running(rq);
2853 	}
2854 	hrtick_update(rq);
2855 }
2856 
2857 static void set_next_buddy(struct sched_entity *se);
2858 
2859 /*
2860  * The dequeue_task method is called before nr_running is
2861  * decreased. We remove the task from the rbtree and
2862  * update the fair scheduling stats:
2863  */
2864 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2865 {
2866 	struct cfs_rq *cfs_rq;
2867 	struct sched_entity *se = &p->se;
2868 	int task_sleep = flags & DEQUEUE_SLEEP;
2869 
2870 	for_each_sched_entity(se) {
2871 		cfs_rq = cfs_rq_of(se);
2872 		dequeue_entity(cfs_rq, se, flags);
2873 
2874 		/*
2875 		 * end evaluation on encountering a throttled cfs_rq
2876 		 *
2877 		 * note: in the case of encountering a throttled cfs_rq we will
2878 		 * post the final h_nr_running decrement below.
2879 		*/
2880 		if (cfs_rq_throttled(cfs_rq))
2881 			break;
2882 		cfs_rq->h_nr_running--;
2883 
2884 		/* Don't dequeue parent if it has other entities besides us */
2885 		if (cfs_rq->load.weight) {
2886 			/*
2887 			 * Bias pick_next to pick a task from this cfs_rq, as
2888 			 * p is sleeping when it is within its sched_slice.
2889 			 */
2890 			if (task_sleep && parent_entity(se))
2891 				set_next_buddy(parent_entity(se));
2892 
2893 			/* avoid re-evaluating load for this entity */
2894 			se = parent_entity(se);
2895 			break;
2896 		}
2897 		flags |= DEQUEUE_SLEEP;
2898 	}
2899 
2900 	for_each_sched_entity(se) {
2901 		cfs_rq = cfs_rq_of(se);
2902 		cfs_rq->h_nr_running--;
2903 
2904 		if (cfs_rq_throttled(cfs_rq))
2905 			break;
2906 
2907 		update_cfs_shares(cfs_rq);
2908 		update_entity_load_avg(se, 1);
2909 	}
2910 
2911 	if (!se) {
2912 		dec_nr_running(rq);
2913 		update_rq_runnable_avg(rq, 1);
2914 	}
2915 	hrtick_update(rq);
2916 }
2917 
2918 #ifdef CONFIG_SMP
2919 /* Used instead of source_load when we know the type == 0 */
2920 static unsigned long weighted_cpuload(const int cpu)
2921 {
2922 	return cpu_rq(cpu)->load.weight;
2923 }
2924 
2925 /*
2926  * Return a low guess at the load of a migration-source cpu weighted
2927  * according to the scheduling class and "nice" value.
2928  *
2929  * We want to under-estimate the load of migration sources, to
2930  * balance conservatively.
2931  */
2932 static unsigned long source_load(int cpu, int type)
2933 {
2934 	struct rq *rq = cpu_rq(cpu);
2935 	unsigned long total = weighted_cpuload(cpu);
2936 
2937 	if (type == 0 || !sched_feat(LB_BIAS))
2938 		return total;
2939 
2940 	return min(rq->cpu_load[type-1], total);
2941 }
2942 
2943 /*
2944  * Return a high guess at the load of a migration-target cpu weighted
2945  * according to the scheduling class and "nice" value.
2946  */
2947 static unsigned long target_load(int cpu, int type)
2948 {
2949 	struct rq *rq = cpu_rq(cpu);
2950 	unsigned long total = weighted_cpuload(cpu);
2951 
2952 	if (type == 0 || !sched_feat(LB_BIAS))
2953 		return total;
2954 
2955 	return max(rq->cpu_load[type-1], total);
2956 }
2957 
2958 static unsigned long power_of(int cpu)
2959 {
2960 	return cpu_rq(cpu)->cpu_power;
2961 }
2962 
2963 static unsigned long cpu_avg_load_per_task(int cpu)
2964 {
2965 	struct rq *rq = cpu_rq(cpu);
2966 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2967 
2968 	if (nr_running)
2969 		return rq->load.weight / nr_running;
2970 
2971 	return 0;
2972 }
2973 
2974 
2975 static void task_waking_fair(struct task_struct *p)
2976 {
2977 	struct sched_entity *se = &p->se;
2978 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2979 	u64 min_vruntime;
2980 
2981 #ifndef CONFIG_64BIT
2982 	u64 min_vruntime_copy;
2983 
2984 	do {
2985 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
2986 		smp_rmb();
2987 		min_vruntime = cfs_rq->min_vruntime;
2988 	} while (min_vruntime != min_vruntime_copy);
2989 #else
2990 	min_vruntime = cfs_rq->min_vruntime;
2991 #endif
2992 
2993 	se->vruntime -= min_vruntime;
2994 }
2995 
2996 #ifdef CONFIG_FAIR_GROUP_SCHED
2997 /*
2998  * effective_load() calculates the load change as seen from the root_task_group
2999  *
3000  * Adding load to a group doesn't make a group heavier, but can cause movement
3001  * of group shares between cpus. Assuming the shares were perfectly aligned one
3002  * can calculate the shift in shares.
3003  *
3004  * Calculate the effective load difference if @wl is added (subtracted) to @tg
3005  * on this @cpu and results in a total addition (subtraction) of @wg to the
3006  * total group weight.
3007  *
3008  * Given a runqueue weight distribution (rw_i) we can compute a shares
3009  * distribution (s_i) using:
3010  *
3011  *   s_i = rw_i / \Sum rw_j						(1)
3012  *
3013  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3014  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3015  * shares distribution (s_i):
3016  *
3017  *   rw_i = {   2,   4,   1,   0 }
3018  *   s_i  = { 2/7, 4/7, 1/7,   0 }
3019  *
3020  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3021  * task used to run on and the CPU the waker is running on), we need to
3022  * compute the effect of waking a task on either CPU and, in case of a sync
3023  * wakeup, compute the effect of the current task going to sleep.
3024  *
3025  * So for a change of @wl to the local @cpu with an overall group weight change
3026  * of @wl we can compute the new shares distribution (s'_i) using:
3027  *
3028  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
3029  *
3030  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3031  * differences in waking a task to CPU 0. The additional task changes the
3032  * weight and shares distributions like:
3033  *
3034  *   rw'_i = {   3,   4,   1,   0 }
3035  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3036  *
3037  * We can then compute the difference in effective weight by using:
3038  *
3039  *   dw_i = S * (s'_i - s_i)						(3)
3040  *
3041  * Where 'S' is the group weight as seen by its parent.
3042  *
3043  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3044  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3045  * 4/7) times the weight of the group.
3046  */
3047 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3048 {
3049 	struct sched_entity *se = tg->se[cpu];
3050 
3051 	if (!tg->parent)	/* the trivial, non-cgroup case */
3052 		return wl;
3053 
3054 	for_each_sched_entity(se) {
3055 		long w, W;
3056 
3057 		tg = se->my_q->tg;
3058 
3059 		/*
3060 		 * W = @wg + \Sum rw_j
3061 		 */
3062 		W = wg + calc_tg_weight(tg, se->my_q);
3063 
3064 		/*
3065 		 * w = rw_i + @wl
3066 		 */
3067 		w = se->my_q->load.weight + wl;
3068 
3069 		/*
3070 		 * wl = S * s'_i; see (2)
3071 		 */
3072 		if (W > 0 && w < W)
3073 			wl = (w * tg->shares) / W;
3074 		else
3075 			wl = tg->shares;
3076 
3077 		/*
3078 		 * Per the above, wl is the new se->load.weight value; since
3079 		 * those are clipped to [MIN_SHARES, ...) do so now. See
3080 		 * calc_cfs_shares().
3081 		 */
3082 		if (wl < MIN_SHARES)
3083 			wl = MIN_SHARES;
3084 
3085 		/*
3086 		 * wl = dw_i = S * (s'_i - s_i); see (3)
3087 		 */
3088 		wl -= se->load.weight;
3089 
3090 		/*
3091 		 * Recursively apply this logic to all parent groups to compute
3092 		 * the final effective load change on the root group. Since
3093 		 * only the @tg group gets extra weight, all parent groups can
3094 		 * only redistribute existing shares. @wl is the shift in shares
3095 		 * resulting from this level per the above.
3096 		 */
3097 		wg = 0;
3098 	}
3099 
3100 	return wl;
3101 }
3102 #else
3103 
3104 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3105 		unsigned long wl, unsigned long wg)
3106 {
3107 	return wl;
3108 }
3109 
3110 #endif
3111 
3112 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3113 {
3114 	s64 this_load, load;
3115 	int idx, this_cpu, prev_cpu;
3116 	unsigned long tl_per_task;
3117 	struct task_group *tg;
3118 	unsigned long weight;
3119 	int balanced;
3120 
3121 	idx	  = sd->wake_idx;
3122 	this_cpu  = smp_processor_id();
3123 	prev_cpu  = task_cpu(p);
3124 	load	  = source_load(prev_cpu, idx);
3125 	this_load = target_load(this_cpu, idx);
3126 
3127 	/*
3128 	 * If sync wakeup then subtract the (maximum possible)
3129 	 * effect of the currently running task from the load
3130 	 * of the current CPU:
3131 	 */
3132 	if (sync) {
3133 		tg = task_group(current);
3134 		weight = current->se.load.weight;
3135 
3136 		this_load += effective_load(tg, this_cpu, -weight, -weight);
3137 		load += effective_load(tg, prev_cpu, 0, -weight);
3138 	}
3139 
3140 	tg = task_group(p);
3141 	weight = p->se.load.weight;
3142 
3143 	/*
3144 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3145 	 * due to the sync cause above having dropped this_load to 0, we'll
3146 	 * always have an imbalance, but there's really nothing you can do
3147 	 * about that, so that's good too.
3148 	 *
3149 	 * Otherwise check if either cpus are near enough in load to allow this
3150 	 * task to be woken on this_cpu.
3151 	 */
3152 	if (this_load > 0) {
3153 		s64 this_eff_load, prev_eff_load;
3154 
3155 		this_eff_load = 100;
3156 		this_eff_load *= power_of(prev_cpu);
3157 		this_eff_load *= this_load +
3158 			effective_load(tg, this_cpu, weight, weight);
3159 
3160 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3161 		prev_eff_load *= power_of(this_cpu);
3162 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3163 
3164 		balanced = this_eff_load <= prev_eff_load;
3165 	} else
3166 		balanced = true;
3167 
3168 	/*
3169 	 * If the currently running task will sleep within
3170 	 * a reasonable amount of time then attract this newly
3171 	 * woken task:
3172 	 */
3173 	if (sync && balanced)
3174 		return 1;
3175 
3176 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3177 	tl_per_task = cpu_avg_load_per_task(this_cpu);
3178 
3179 	if (balanced ||
3180 	    (this_load <= load &&
3181 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3182 		/*
3183 		 * This domain has SD_WAKE_AFFINE and
3184 		 * p is cache cold in this domain, and
3185 		 * there is no bad imbalance.
3186 		 */
3187 		schedstat_inc(sd, ttwu_move_affine);
3188 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
3189 
3190 		return 1;
3191 	}
3192 	return 0;
3193 }
3194 
3195 /*
3196  * find_idlest_group finds and returns the least busy CPU group within the
3197  * domain.
3198  */
3199 static struct sched_group *
3200 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3201 		  int this_cpu, int load_idx)
3202 {
3203 	struct sched_group *idlest = NULL, *group = sd->groups;
3204 	unsigned long min_load = ULONG_MAX, this_load = 0;
3205 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
3206 
3207 	do {
3208 		unsigned long load, avg_load;
3209 		int local_group;
3210 		int i;
3211 
3212 		/* Skip over this group if it has no CPUs allowed */
3213 		if (!cpumask_intersects(sched_group_cpus(group),
3214 					tsk_cpus_allowed(p)))
3215 			continue;
3216 
3217 		local_group = cpumask_test_cpu(this_cpu,
3218 					       sched_group_cpus(group));
3219 
3220 		/* Tally up the load of all CPUs in the group */
3221 		avg_load = 0;
3222 
3223 		for_each_cpu(i, sched_group_cpus(group)) {
3224 			/* Bias balancing toward cpus of our domain */
3225 			if (local_group)
3226 				load = source_load(i, load_idx);
3227 			else
3228 				load = target_load(i, load_idx);
3229 
3230 			avg_load += load;
3231 		}
3232 
3233 		/* Adjust by relative CPU power of the group */
3234 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3235 
3236 		if (local_group) {
3237 			this_load = avg_load;
3238 		} else if (avg_load < min_load) {
3239 			min_load = avg_load;
3240 			idlest = group;
3241 		}
3242 	} while (group = group->next, group != sd->groups);
3243 
3244 	if (!idlest || 100*this_load < imbalance*min_load)
3245 		return NULL;
3246 	return idlest;
3247 }
3248 
3249 /*
3250  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3251  */
3252 static int
3253 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3254 {
3255 	unsigned long load, min_load = ULONG_MAX;
3256 	int idlest = -1;
3257 	int i;
3258 
3259 	/* Traverse only the allowed CPUs */
3260 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3261 		load = weighted_cpuload(i);
3262 
3263 		if (load < min_load || (load == min_load && i == this_cpu)) {
3264 			min_load = load;
3265 			idlest = i;
3266 		}
3267 	}
3268 
3269 	return idlest;
3270 }
3271 
3272 /*
3273  * Try and locate an idle CPU in the sched_domain.
3274  */
3275 static int select_idle_sibling(struct task_struct *p, int target)
3276 {
3277 	struct sched_domain *sd;
3278 	struct sched_group *sg;
3279 	int i = task_cpu(p);
3280 
3281 	if (idle_cpu(target))
3282 		return target;
3283 
3284 	/*
3285 	 * If the prevous cpu is cache affine and idle, don't be stupid.
3286 	 */
3287 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3288 		return i;
3289 
3290 	/*
3291 	 * Otherwise, iterate the domains and find an elegible idle cpu.
3292 	 */
3293 	sd = rcu_dereference(per_cpu(sd_llc, target));
3294 	for_each_lower_domain(sd) {
3295 		sg = sd->groups;
3296 		do {
3297 			if (!cpumask_intersects(sched_group_cpus(sg),
3298 						tsk_cpus_allowed(p)))
3299 				goto next;
3300 
3301 			for_each_cpu(i, sched_group_cpus(sg)) {
3302 				if (i == target || !idle_cpu(i))
3303 					goto next;
3304 			}
3305 
3306 			target = cpumask_first_and(sched_group_cpus(sg),
3307 					tsk_cpus_allowed(p));
3308 			goto done;
3309 next:
3310 			sg = sg->next;
3311 		} while (sg != sd->groups);
3312 	}
3313 done:
3314 	return target;
3315 }
3316 
3317 /*
3318  * sched_balance_self: balance the current task (running on cpu) in domains
3319  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3320  * SD_BALANCE_EXEC.
3321  *
3322  * Balance, ie. select the least loaded group.
3323  *
3324  * Returns the target CPU number, or the same CPU if no balancing is needed.
3325  *
3326  * preempt must be disabled.
3327  */
3328 static int
3329 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3330 {
3331 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3332 	int cpu = smp_processor_id();
3333 	int prev_cpu = task_cpu(p);
3334 	int new_cpu = cpu;
3335 	int want_affine = 0;
3336 	int sync = wake_flags & WF_SYNC;
3337 
3338 	if (p->nr_cpus_allowed == 1)
3339 		return prev_cpu;
3340 
3341 	if (sd_flag & SD_BALANCE_WAKE) {
3342 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3343 			want_affine = 1;
3344 		new_cpu = prev_cpu;
3345 	}
3346 
3347 	rcu_read_lock();
3348 	for_each_domain(cpu, tmp) {
3349 		if (!(tmp->flags & SD_LOAD_BALANCE))
3350 			continue;
3351 
3352 		/*
3353 		 * If both cpu and prev_cpu are part of this domain,
3354 		 * cpu is a valid SD_WAKE_AFFINE target.
3355 		 */
3356 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3357 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3358 			affine_sd = tmp;
3359 			break;
3360 		}
3361 
3362 		if (tmp->flags & sd_flag)
3363 			sd = tmp;
3364 	}
3365 
3366 	if (affine_sd) {
3367 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3368 			prev_cpu = cpu;
3369 
3370 		new_cpu = select_idle_sibling(p, prev_cpu);
3371 		goto unlock;
3372 	}
3373 
3374 	while (sd) {
3375 		int load_idx = sd->forkexec_idx;
3376 		struct sched_group *group;
3377 		int weight;
3378 
3379 		if (!(sd->flags & sd_flag)) {
3380 			sd = sd->child;
3381 			continue;
3382 		}
3383 
3384 		if (sd_flag & SD_BALANCE_WAKE)
3385 			load_idx = sd->wake_idx;
3386 
3387 		group = find_idlest_group(sd, p, cpu, load_idx);
3388 		if (!group) {
3389 			sd = sd->child;
3390 			continue;
3391 		}
3392 
3393 		new_cpu = find_idlest_cpu(group, p, cpu);
3394 		if (new_cpu == -1 || new_cpu == cpu) {
3395 			/* Now try balancing at a lower domain level of cpu */
3396 			sd = sd->child;
3397 			continue;
3398 		}
3399 
3400 		/* Now try balancing at a lower domain level of new_cpu */
3401 		cpu = new_cpu;
3402 		weight = sd->span_weight;
3403 		sd = NULL;
3404 		for_each_domain(cpu, tmp) {
3405 			if (weight <= tmp->span_weight)
3406 				break;
3407 			if (tmp->flags & sd_flag)
3408 				sd = tmp;
3409 		}
3410 		/* while loop will break here if sd == NULL */
3411 	}
3412 unlock:
3413 	rcu_read_unlock();
3414 
3415 	return new_cpu;
3416 }
3417 
3418 /*
3419  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3420  * removed when useful for applications beyond shares distribution (e.g.
3421  * load-balance).
3422  */
3423 #ifdef CONFIG_FAIR_GROUP_SCHED
3424 /*
3425  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3426  * cfs_rq_of(p) references at time of call are still valid and identify the
3427  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
3428  * other assumptions, including the state of rq->lock, should be made.
3429  */
3430 static void
3431 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3432 {
3433 	struct sched_entity *se = &p->se;
3434 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3435 
3436 	/*
3437 	 * Load tracking: accumulate removed load so that it can be processed
3438 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
3439 	 * to blocked load iff they have a positive decay-count.  It can never
3440 	 * be negative here since on-rq tasks have decay-count == 0.
3441 	 */
3442 	if (se->avg.decay_count) {
3443 		se->avg.decay_count = -__synchronize_entity_decay(se);
3444 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3445 	}
3446 }
3447 #endif
3448 #endif /* CONFIG_SMP */
3449 
3450 static unsigned long
3451 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3452 {
3453 	unsigned long gran = sysctl_sched_wakeup_granularity;
3454 
3455 	/*
3456 	 * Since its curr running now, convert the gran from real-time
3457 	 * to virtual-time in his units.
3458 	 *
3459 	 * By using 'se' instead of 'curr' we penalize light tasks, so
3460 	 * they get preempted easier. That is, if 'se' < 'curr' then
3461 	 * the resulting gran will be larger, therefore penalizing the
3462 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3463 	 * be smaller, again penalizing the lighter task.
3464 	 *
3465 	 * This is especially important for buddies when the leftmost
3466 	 * task is higher priority than the buddy.
3467 	 */
3468 	return calc_delta_fair(gran, se);
3469 }
3470 
3471 /*
3472  * Should 'se' preempt 'curr'.
3473  *
3474  *             |s1
3475  *        |s2
3476  *   |s3
3477  *         g
3478  *      |<--->|c
3479  *
3480  *  w(c, s1) = -1
3481  *  w(c, s2) =  0
3482  *  w(c, s3) =  1
3483  *
3484  */
3485 static int
3486 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3487 {
3488 	s64 gran, vdiff = curr->vruntime - se->vruntime;
3489 
3490 	if (vdiff <= 0)
3491 		return -1;
3492 
3493 	gran = wakeup_gran(curr, se);
3494 	if (vdiff > gran)
3495 		return 1;
3496 
3497 	return 0;
3498 }
3499 
3500 static void set_last_buddy(struct sched_entity *se)
3501 {
3502 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3503 		return;
3504 
3505 	for_each_sched_entity(se)
3506 		cfs_rq_of(se)->last = se;
3507 }
3508 
3509 static void set_next_buddy(struct sched_entity *se)
3510 {
3511 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3512 		return;
3513 
3514 	for_each_sched_entity(se)
3515 		cfs_rq_of(se)->next = se;
3516 }
3517 
3518 static void set_skip_buddy(struct sched_entity *se)
3519 {
3520 	for_each_sched_entity(se)
3521 		cfs_rq_of(se)->skip = se;
3522 }
3523 
3524 /*
3525  * Preempt the current task with a newly woken task if needed:
3526  */
3527 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3528 {
3529 	struct task_struct *curr = rq->curr;
3530 	struct sched_entity *se = &curr->se, *pse = &p->se;
3531 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3532 	int scale = cfs_rq->nr_running >= sched_nr_latency;
3533 	int next_buddy_marked = 0;
3534 
3535 	if (unlikely(se == pse))
3536 		return;
3537 
3538 	/*
3539 	 * This is possible from callers such as move_task(), in which we
3540 	 * unconditionally check_prempt_curr() after an enqueue (which may have
3541 	 * lead to a throttle).  This both saves work and prevents false
3542 	 * next-buddy nomination below.
3543 	 */
3544 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3545 		return;
3546 
3547 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3548 		set_next_buddy(pse);
3549 		next_buddy_marked = 1;
3550 	}
3551 
3552 	/*
3553 	 * We can come here with TIF_NEED_RESCHED already set from new task
3554 	 * wake up path.
3555 	 *
3556 	 * Note: this also catches the edge-case of curr being in a throttled
3557 	 * group (e.g. via set_curr_task), since update_curr() (in the
3558 	 * enqueue of curr) will have resulted in resched being set.  This
3559 	 * prevents us from potentially nominating it as a false LAST_BUDDY
3560 	 * below.
3561 	 */
3562 	if (test_tsk_need_resched(curr))
3563 		return;
3564 
3565 	/* Idle tasks are by definition preempted by non-idle tasks. */
3566 	if (unlikely(curr->policy == SCHED_IDLE) &&
3567 	    likely(p->policy != SCHED_IDLE))
3568 		goto preempt;
3569 
3570 	/*
3571 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3572 	 * is driven by the tick):
3573 	 */
3574 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3575 		return;
3576 
3577 	find_matching_se(&se, &pse);
3578 	update_curr(cfs_rq_of(se));
3579 	BUG_ON(!pse);
3580 	if (wakeup_preempt_entity(se, pse) == 1) {
3581 		/*
3582 		 * Bias pick_next to pick the sched entity that is
3583 		 * triggering this preemption.
3584 		 */
3585 		if (!next_buddy_marked)
3586 			set_next_buddy(pse);
3587 		goto preempt;
3588 	}
3589 
3590 	return;
3591 
3592 preempt:
3593 	resched_task(curr);
3594 	/*
3595 	 * Only set the backward buddy when the current task is still
3596 	 * on the rq. This can happen when a wakeup gets interleaved
3597 	 * with schedule on the ->pre_schedule() or idle_balance()
3598 	 * point, either of which can * drop the rq lock.
3599 	 *
3600 	 * Also, during early boot the idle thread is in the fair class,
3601 	 * for obvious reasons its a bad idea to schedule back to it.
3602 	 */
3603 	if (unlikely(!se->on_rq || curr == rq->idle))
3604 		return;
3605 
3606 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3607 		set_last_buddy(se);
3608 }
3609 
3610 static struct task_struct *pick_next_task_fair(struct rq *rq)
3611 {
3612 	struct task_struct *p;
3613 	struct cfs_rq *cfs_rq = &rq->cfs;
3614 	struct sched_entity *se;
3615 
3616 	if (!cfs_rq->nr_running)
3617 		return NULL;
3618 
3619 	do {
3620 		se = pick_next_entity(cfs_rq);
3621 		set_next_entity(cfs_rq, se);
3622 		cfs_rq = group_cfs_rq(se);
3623 	} while (cfs_rq);
3624 
3625 	p = task_of(se);
3626 	if (hrtick_enabled(rq))
3627 		hrtick_start_fair(rq, p);
3628 
3629 	return p;
3630 }
3631 
3632 /*
3633  * Account for a descheduled task:
3634  */
3635 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3636 {
3637 	struct sched_entity *se = &prev->se;
3638 	struct cfs_rq *cfs_rq;
3639 
3640 	for_each_sched_entity(se) {
3641 		cfs_rq = cfs_rq_of(se);
3642 		put_prev_entity(cfs_rq, se);
3643 	}
3644 }
3645 
3646 /*
3647  * sched_yield() is very simple
3648  *
3649  * The magic of dealing with the ->skip buddy is in pick_next_entity.
3650  */
3651 static void yield_task_fair(struct rq *rq)
3652 {
3653 	struct task_struct *curr = rq->curr;
3654 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3655 	struct sched_entity *se = &curr->se;
3656 
3657 	/*
3658 	 * Are we the only task in the tree?
3659 	 */
3660 	if (unlikely(rq->nr_running == 1))
3661 		return;
3662 
3663 	clear_buddies(cfs_rq, se);
3664 
3665 	if (curr->policy != SCHED_BATCH) {
3666 		update_rq_clock(rq);
3667 		/*
3668 		 * Update run-time statistics of the 'current'.
3669 		 */
3670 		update_curr(cfs_rq);
3671 		/*
3672 		 * Tell update_rq_clock() that we've just updated,
3673 		 * so we don't do microscopic update in schedule()
3674 		 * and double the fastpath cost.
3675 		 */
3676 		 rq->skip_clock_update = 1;
3677 	}
3678 
3679 	set_skip_buddy(se);
3680 }
3681 
3682 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3683 {
3684 	struct sched_entity *se = &p->se;
3685 
3686 	/* throttled hierarchies are not runnable */
3687 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3688 		return false;
3689 
3690 	/* Tell the scheduler that we'd really like pse to run next. */
3691 	set_next_buddy(se);
3692 
3693 	yield_task_fair(rq);
3694 
3695 	return true;
3696 }
3697 
3698 #ifdef CONFIG_SMP
3699 /**************************************************
3700  * Fair scheduling class load-balancing methods.
3701  *
3702  * BASICS
3703  *
3704  * The purpose of load-balancing is to achieve the same basic fairness the
3705  * per-cpu scheduler provides, namely provide a proportional amount of compute
3706  * time to each task. This is expressed in the following equation:
3707  *
3708  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
3709  *
3710  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3711  * W_i,0 is defined as:
3712  *
3713  *   W_i,0 = \Sum_j w_i,j                                             (2)
3714  *
3715  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3716  * is derived from the nice value as per prio_to_weight[].
3717  *
3718  * The weight average is an exponential decay average of the instantaneous
3719  * weight:
3720  *
3721  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
3722  *
3723  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3724  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3725  * can also include other factors [XXX].
3726  *
3727  * To achieve this balance we define a measure of imbalance which follows
3728  * directly from (1):
3729  *
3730  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
3731  *
3732  * We them move tasks around to minimize the imbalance. In the continuous
3733  * function space it is obvious this converges, in the discrete case we get
3734  * a few fun cases generally called infeasible weight scenarios.
3735  *
3736  * [XXX expand on:
3737  *     - infeasible weights;
3738  *     - local vs global optima in the discrete case. ]
3739  *
3740  *
3741  * SCHED DOMAINS
3742  *
3743  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3744  * for all i,j solution, we create a tree of cpus that follows the hardware
3745  * topology where each level pairs two lower groups (or better). This results
3746  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3747  * tree to only the first of the previous level and we decrease the frequency
3748  * of load-balance at each level inv. proportional to the number of cpus in
3749  * the groups.
3750  *
3751  * This yields:
3752  *
3753  *     log_2 n     1     n
3754  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
3755  *     i = 0      2^i   2^i
3756  *                               `- size of each group
3757  *         |         |     `- number of cpus doing load-balance
3758  *         |         `- freq
3759  *         `- sum over all levels
3760  *
3761  * Coupled with a limit on how many tasks we can migrate every balance pass,
3762  * this makes (5) the runtime complexity of the balancer.
3763  *
3764  * An important property here is that each CPU is still (indirectly) connected
3765  * to every other cpu in at most O(log n) steps:
3766  *
3767  * The adjacency matrix of the resulting graph is given by:
3768  *
3769  *             log_2 n
3770  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
3771  *             k = 0
3772  *
3773  * And you'll find that:
3774  *
3775  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
3776  *
3777  * Showing there's indeed a path between every cpu in at most O(log n) steps.
3778  * The task movement gives a factor of O(m), giving a convergence complexity
3779  * of:
3780  *
3781  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
3782  *
3783  *
3784  * WORK CONSERVING
3785  *
3786  * In order to avoid CPUs going idle while there's still work to do, new idle
3787  * balancing is more aggressive and has the newly idle cpu iterate up the domain
3788  * tree itself instead of relying on other CPUs to bring it work.
3789  *
3790  * This adds some complexity to both (5) and (8) but it reduces the total idle
3791  * time.
3792  *
3793  * [XXX more?]
3794  *
3795  *
3796  * CGROUPS
3797  *
3798  * Cgroups make a horror show out of (2), instead of a simple sum we get:
3799  *
3800  *                                s_k,i
3801  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
3802  *                                 S_k
3803  *
3804  * Where
3805  *
3806  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
3807  *
3808  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3809  *
3810  * The big problem is S_k, its a global sum needed to compute a local (W_i)
3811  * property.
3812  *
3813  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3814  *      rewrite all of this once again.]
3815  */
3816 
3817 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3818 
3819 #define LBF_ALL_PINNED	0x01
3820 #define LBF_NEED_BREAK	0x02
3821 #define LBF_SOME_PINNED 0x04
3822 
3823 struct lb_env {
3824 	struct sched_domain	*sd;
3825 
3826 	struct rq		*src_rq;
3827 	int			src_cpu;
3828 
3829 	int			dst_cpu;
3830 	struct rq		*dst_rq;
3831 
3832 	struct cpumask		*dst_grpmask;
3833 	int			new_dst_cpu;
3834 	enum cpu_idle_type	idle;
3835 	long			imbalance;
3836 	/* The set of CPUs under consideration for load-balancing */
3837 	struct cpumask		*cpus;
3838 
3839 	unsigned int		flags;
3840 
3841 	unsigned int		loop;
3842 	unsigned int		loop_break;
3843 	unsigned int		loop_max;
3844 };
3845 
3846 /*
3847  * move_task - move a task from one runqueue to another runqueue.
3848  * Both runqueues must be locked.
3849  */
3850 static void move_task(struct task_struct *p, struct lb_env *env)
3851 {
3852 	deactivate_task(env->src_rq, p, 0);
3853 	set_task_cpu(p, env->dst_cpu);
3854 	activate_task(env->dst_rq, p, 0);
3855 	check_preempt_curr(env->dst_rq, p, 0);
3856 }
3857 
3858 /*
3859  * Is this task likely cache-hot:
3860  */
3861 static int
3862 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3863 {
3864 	s64 delta;
3865 
3866 	if (p->sched_class != &fair_sched_class)
3867 		return 0;
3868 
3869 	if (unlikely(p->policy == SCHED_IDLE))
3870 		return 0;
3871 
3872 	/*
3873 	 * Buddy candidates are cache hot:
3874 	 */
3875 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3876 			(&p->se == cfs_rq_of(&p->se)->next ||
3877 			 &p->se == cfs_rq_of(&p->se)->last))
3878 		return 1;
3879 
3880 	if (sysctl_sched_migration_cost == -1)
3881 		return 1;
3882 	if (sysctl_sched_migration_cost == 0)
3883 		return 0;
3884 
3885 	delta = now - p->se.exec_start;
3886 
3887 	return delta < (s64)sysctl_sched_migration_cost;
3888 }
3889 
3890 /*
3891  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3892  */
3893 static
3894 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3895 {
3896 	int tsk_cache_hot = 0;
3897 	/*
3898 	 * We do not migrate tasks that are:
3899 	 * 1) throttled_lb_pair, or
3900 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3901 	 * 3) running (obviously), or
3902 	 * 4) are cache-hot on their current CPU.
3903 	 */
3904 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 		return 0;
3906 
3907 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3908 		int cpu;
3909 
3910 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3911 
3912 		/*
3913 		 * Remember if this task can be migrated to any other cpu in
3914 		 * our sched_group. We may want to revisit it if we couldn't
3915 		 * meet load balance goals by pulling other tasks on src_cpu.
3916 		 *
3917 		 * Also avoid computing new_dst_cpu if we have already computed
3918 		 * one in current iteration.
3919 		 */
3920 		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3921 			return 0;
3922 
3923 		/* Prevent to re-select dst_cpu via env's cpus */
3924 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3925 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3926 				env->flags |= LBF_SOME_PINNED;
3927 				env->new_dst_cpu = cpu;
3928 				break;
3929 			}
3930 		}
3931 
3932 		return 0;
3933 	}
3934 
3935 	/* Record that we found atleast one task that could run on dst_cpu */
3936 	env->flags &= ~LBF_ALL_PINNED;
3937 
3938 	if (task_running(env->src_rq, p)) {
3939 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3940 		return 0;
3941 	}
3942 
3943 	/*
3944 	 * Aggressive migration if:
3945 	 * 1) task is cache cold, or
3946 	 * 2) too many balance attempts have failed.
3947 	 */
3948 
3949 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3950 	if (!tsk_cache_hot ||
3951 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3952 
3953 		if (tsk_cache_hot) {
3954 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3955 			schedstat_inc(p, se.statistics.nr_forced_migrations);
3956 		}
3957 
3958 		return 1;
3959 	}
3960 
3961 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3962 	return 0;
3963 }
3964 
3965 /*
3966  * move_one_task tries to move exactly one task from busiest to this_rq, as
3967  * part of active balancing operations within "domain".
3968  * Returns 1 if successful and 0 otherwise.
3969  *
3970  * Called with both runqueues locked.
3971  */
3972 static int move_one_task(struct lb_env *env)
3973 {
3974 	struct task_struct *p, *n;
3975 
3976 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3977 		if (!can_migrate_task(p, env))
3978 			continue;
3979 
3980 		move_task(p, env);
3981 		/*
3982 		 * Right now, this is only the second place move_task()
3983 		 * is called, so we can safely collect move_task()
3984 		 * stats here rather than inside move_task().
3985 		 */
3986 		schedstat_inc(env->sd, lb_gained[env->idle]);
3987 		return 1;
3988 	}
3989 	return 0;
3990 }
3991 
3992 static unsigned long task_h_load(struct task_struct *p);
3993 
3994 static const unsigned int sched_nr_migrate_break = 32;
3995 
3996 /*
3997  * move_tasks tries to move up to imbalance weighted load from busiest to
3998  * this_rq, as part of a balancing operation within domain "sd".
3999  * Returns 1 if successful and 0 otherwise.
4000  *
4001  * Called with both runqueues locked.
4002  */
4003 static int move_tasks(struct lb_env *env)
4004 {
4005 	struct list_head *tasks = &env->src_rq->cfs_tasks;
4006 	struct task_struct *p;
4007 	unsigned long load;
4008 	int pulled = 0;
4009 
4010 	if (env->imbalance <= 0)
4011 		return 0;
4012 
4013 	while (!list_empty(tasks)) {
4014 		p = list_first_entry(tasks, struct task_struct, se.group_node);
4015 
4016 		env->loop++;
4017 		/* We've more or less seen every task there is, call it quits */
4018 		if (env->loop > env->loop_max)
4019 			break;
4020 
4021 		/* take a breather every nr_migrate tasks */
4022 		if (env->loop > env->loop_break) {
4023 			env->loop_break += sched_nr_migrate_break;
4024 			env->flags |= LBF_NEED_BREAK;
4025 			break;
4026 		}
4027 
4028 		if (!can_migrate_task(p, env))
4029 			goto next;
4030 
4031 		load = task_h_load(p);
4032 
4033 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4034 			goto next;
4035 
4036 		if ((load / 2) > env->imbalance)
4037 			goto next;
4038 
4039 		move_task(p, env);
4040 		pulled++;
4041 		env->imbalance -= load;
4042 
4043 #ifdef CONFIG_PREEMPT
4044 		/*
4045 		 * NEWIDLE balancing is a source of latency, so preemptible
4046 		 * kernels will stop after the first task is pulled to minimize
4047 		 * the critical section.
4048 		 */
4049 		if (env->idle == CPU_NEWLY_IDLE)
4050 			break;
4051 #endif
4052 
4053 		/*
4054 		 * We only want to steal up to the prescribed amount of
4055 		 * weighted load.
4056 		 */
4057 		if (env->imbalance <= 0)
4058 			break;
4059 
4060 		continue;
4061 next:
4062 		list_move_tail(&p->se.group_node, tasks);
4063 	}
4064 
4065 	/*
4066 	 * Right now, this is one of only two places move_task() is called,
4067 	 * so we can safely collect move_task() stats here rather than
4068 	 * inside move_task().
4069 	 */
4070 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
4071 
4072 	return pulled;
4073 }
4074 
4075 #ifdef CONFIG_FAIR_GROUP_SCHED
4076 /*
4077  * update tg->load_weight by folding this cpu's load_avg
4078  */
4079 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4080 {
4081 	struct sched_entity *se = tg->se[cpu];
4082 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4083 
4084 	/* throttled entities do not contribute to load */
4085 	if (throttled_hierarchy(cfs_rq))
4086 		return;
4087 
4088 	update_cfs_rq_blocked_load(cfs_rq, 1);
4089 
4090 	if (se) {
4091 		update_entity_load_avg(se, 1);
4092 		/*
4093 		 * We pivot on our runnable average having decayed to zero for
4094 		 * list removal.  This generally implies that all our children
4095 		 * have also been removed (modulo rounding error or bandwidth
4096 		 * control); however, such cases are rare and we can fix these
4097 		 * at enqueue.
4098 		 *
4099 		 * TODO: fix up out-of-order children on enqueue.
4100 		 */
4101 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4102 			list_del_leaf_cfs_rq(cfs_rq);
4103 	} else {
4104 		struct rq *rq = rq_of(cfs_rq);
4105 		update_rq_runnable_avg(rq, rq->nr_running);
4106 	}
4107 }
4108 
4109 static void update_blocked_averages(int cpu)
4110 {
4111 	struct rq *rq = cpu_rq(cpu);
4112 	struct cfs_rq *cfs_rq;
4113 	unsigned long flags;
4114 
4115 	raw_spin_lock_irqsave(&rq->lock, flags);
4116 	update_rq_clock(rq);
4117 	/*
4118 	 * Iterates the task_group tree in a bottom up fashion, see
4119 	 * list_add_leaf_cfs_rq() for details.
4120 	 */
4121 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4122 		/*
4123 		 * Note: We may want to consider periodically releasing
4124 		 * rq->lock about these updates so that creating many task
4125 		 * groups does not result in continually extending hold time.
4126 		 */
4127 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4128 	}
4129 
4130 	raw_spin_unlock_irqrestore(&rq->lock, flags);
4131 }
4132 
4133 /*
4134  * Compute the cpu's hierarchical load factor for each task group.
4135  * This needs to be done in a top-down fashion because the load of a child
4136  * group is a fraction of its parents load.
4137  */
4138 static int tg_load_down(struct task_group *tg, void *data)
4139 {
4140 	unsigned long load;
4141 	long cpu = (long)data;
4142 
4143 	if (!tg->parent) {
4144 		load = cpu_rq(cpu)->load.weight;
4145 	} else {
4146 		load = tg->parent->cfs_rq[cpu]->h_load;
4147 		load *= tg->se[cpu]->load.weight;
4148 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4149 	}
4150 
4151 	tg->cfs_rq[cpu]->h_load = load;
4152 
4153 	return 0;
4154 }
4155 
4156 static void update_h_load(long cpu)
4157 {
4158 	struct rq *rq = cpu_rq(cpu);
4159 	unsigned long now = jiffies;
4160 
4161 	if (rq->h_load_throttle == now)
4162 		return;
4163 
4164 	rq->h_load_throttle = now;
4165 
4166 	rcu_read_lock();
4167 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4168 	rcu_read_unlock();
4169 }
4170 
4171 static unsigned long task_h_load(struct task_struct *p)
4172 {
4173 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 	unsigned long load;
4175 
4176 	load = p->se.load.weight;
4177 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4178 
4179 	return load;
4180 }
4181 #else
4182 static inline void update_blocked_averages(int cpu)
4183 {
4184 }
4185 
4186 static inline void update_h_load(long cpu)
4187 {
4188 }
4189 
4190 static unsigned long task_h_load(struct task_struct *p)
4191 {
4192 	return p->se.load.weight;
4193 }
4194 #endif
4195 
4196 /********** Helpers for find_busiest_group ************************/
4197 /*
4198  * sd_lb_stats - Structure to store the statistics of a sched_domain
4199  * 		during load balancing.
4200  */
4201 struct sd_lb_stats {
4202 	struct sched_group *busiest; /* Busiest group in this sd */
4203 	struct sched_group *this;  /* Local group in this sd */
4204 	unsigned long total_load;  /* Total load of all groups in sd */
4205 	unsigned long total_pwr;   /*	Total power of all groups in sd */
4206 	unsigned long avg_load;	   /* Average load across all groups in sd */
4207 
4208 	/** Statistics of this group */
4209 	unsigned long this_load;
4210 	unsigned long this_load_per_task;
4211 	unsigned long this_nr_running;
4212 	unsigned long this_has_capacity;
4213 	unsigned int  this_idle_cpus;
4214 
4215 	/* Statistics of the busiest group */
4216 	unsigned int  busiest_idle_cpus;
4217 	unsigned long max_load;
4218 	unsigned long busiest_load_per_task;
4219 	unsigned long busiest_nr_running;
4220 	unsigned long busiest_group_capacity;
4221 	unsigned long busiest_has_capacity;
4222 	unsigned int  busiest_group_weight;
4223 
4224 	int group_imb; /* Is there imbalance in this sd */
4225 };
4226 
4227 /*
4228  * sg_lb_stats - stats of a sched_group required for load_balancing
4229  */
4230 struct sg_lb_stats {
4231 	unsigned long avg_load; /*Avg load across the CPUs of the group */
4232 	unsigned long group_load; /* Total load over the CPUs of the group */
4233 	unsigned long sum_nr_running; /* Nr tasks running in the group */
4234 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4235 	unsigned long group_capacity;
4236 	unsigned long idle_cpus;
4237 	unsigned long group_weight;
4238 	int group_imb; /* Is there an imbalance in the group ? */
4239 	int group_has_capacity; /* Is there extra capacity in the group? */
4240 };
4241 
4242 /**
4243  * get_sd_load_idx - Obtain the load index for a given sched domain.
4244  * @sd: The sched_domain whose load_idx is to be obtained.
4245  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4246  */
4247 static inline int get_sd_load_idx(struct sched_domain *sd,
4248 					enum cpu_idle_type idle)
4249 {
4250 	int load_idx;
4251 
4252 	switch (idle) {
4253 	case CPU_NOT_IDLE:
4254 		load_idx = sd->busy_idx;
4255 		break;
4256 
4257 	case CPU_NEWLY_IDLE:
4258 		load_idx = sd->newidle_idx;
4259 		break;
4260 	default:
4261 		load_idx = sd->idle_idx;
4262 		break;
4263 	}
4264 
4265 	return load_idx;
4266 }
4267 
4268 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4269 {
4270 	return SCHED_POWER_SCALE;
4271 }
4272 
4273 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4274 {
4275 	return default_scale_freq_power(sd, cpu);
4276 }
4277 
4278 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4279 {
4280 	unsigned long weight = sd->span_weight;
4281 	unsigned long smt_gain = sd->smt_gain;
4282 
4283 	smt_gain /= weight;
4284 
4285 	return smt_gain;
4286 }
4287 
4288 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4289 {
4290 	return default_scale_smt_power(sd, cpu);
4291 }
4292 
4293 static unsigned long scale_rt_power(int cpu)
4294 {
4295 	struct rq *rq = cpu_rq(cpu);
4296 	u64 total, available, age_stamp, avg;
4297 
4298 	/*
4299 	 * Since we're reading these variables without serialization make sure
4300 	 * we read them once before doing sanity checks on them.
4301 	 */
4302 	age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 	avg = ACCESS_ONCE(rq->rt_avg);
4304 
4305 	total = sched_avg_period() + (rq->clock - age_stamp);
4306 
4307 	if (unlikely(total < avg)) {
4308 		/* Ensures that power won't end up being negative */
4309 		available = 0;
4310 	} else {
4311 		available = total - avg;
4312 	}
4313 
4314 	if (unlikely((s64)total < SCHED_POWER_SCALE))
4315 		total = SCHED_POWER_SCALE;
4316 
4317 	total >>= SCHED_POWER_SHIFT;
4318 
4319 	return div_u64(available, total);
4320 }
4321 
4322 static void update_cpu_power(struct sched_domain *sd, int cpu)
4323 {
4324 	unsigned long weight = sd->span_weight;
4325 	unsigned long power = SCHED_POWER_SCALE;
4326 	struct sched_group *sdg = sd->groups;
4327 
4328 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4329 		if (sched_feat(ARCH_POWER))
4330 			power *= arch_scale_smt_power(sd, cpu);
4331 		else
4332 			power *= default_scale_smt_power(sd, cpu);
4333 
4334 		power >>= SCHED_POWER_SHIFT;
4335 	}
4336 
4337 	sdg->sgp->power_orig = power;
4338 
4339 	if (sched_feat(ARCH_POWER))
4340 		power *= arch_scale_freq_power(sd, cpu);
4341 	else
4342 		power *= default_scale_freq_power(sd, cpu);
4343 
4344 	power >>= SCHED_POWER_SHIFT;
4345 
4346 	power *= scale_rt_power(cpu);
4347 	power >>= SCHED_POWER_SHIFT;
4348 
4349 	if (!power)
4350 		power = 1;
4351 
4352 	cpu_rq(cpu)->cpu_power = power;
4353 	sdg->sgp->power = power;
4354 }
4355 
4356 void update_group_power(struct sched_domain *sd, int cpu)
4357 {
4358 	struct sched_domain *child = sd->child;
4359 	struct sched_group *group, *sdg = sd->groups;
4360 	unsigned long power;
4361 	unsigned long interval;
4362 
4363 	interval = msecs_to_jiffies(sd->balance_interval);
4364 	interval = clamp(interval, 1UL, max_load_balance_interval);
4365 	sdg->sgp->next_update = jiffies + interval;
4366 
4367 	if (!child) {
4368 		update_cpu_power(sd, cpu);
4369 		return;
4370 	}
4371 
4372 	power = 0;
4373 
4374 	if (child->flags & SD_OVERLAP) {
4375 		/*
4376 		 * SD_OVERLAP domains cannot assume that child groups
4377 		 * span the current group.
4378 		 */
4379 
4380 		for_each_cpu(cpu, sched_group_cpus(sdg))
4381 			power += power_of(cpu);
4382 	} else  {
4383 		/*
4384 		 * !SD_OVERLAP domains can assume that child groups
4385 		 * span the current group.
4386 		 */
4387 
4388 		group = child->groups;
4389 		do {
4390 			power += group->sgp->power;
4391 			group = group->next;
4392 		} while (group != child->groups);
4393 	}
4394 
4395 	sdg->sgp->power_orig = sdg->sgp->power = power;
4396 }
4397 
4398 /*
4399  * Try and fix up capacity for tiny siblings, this is needed when
4400  * things like SD_ASYM_PACKING need f_b_g to select another sibling
4401  * which on its own isn't powerful enough.
4402  *
4403  * See update_sd_pick_busiest() and check_asym_packing().
4404  */
4405 static inline int
4406 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4407 {
4408 	/*
4409 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
4410 	 */
4411 	if (!(sd->flags & SD_SHARE_CPUPOWER))
4412 		return 0;
4413 
4414 	/*
4415 	 * If ~90% of the cpu_power is still there, we're good.
4416 	 */
4417 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4418 		return 1;
4419 
4420 	return 0;
4421 }
4422 
4423 /**
4424  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4425  * @env: The load balancing environment.
4426  * @group: sched_group whose statistics are to be updated.
4427  * @load_idx: Load index of sched_domain of this_cpu for load calc.
4428  * @local_group: Does group contain this_cpu.
4429  * @balance: Should we balance.
4430  * @sgs: variable to hold the statistics for this group.
4431  */
4432 static inline void update_sg_lb_stats(struct lb_env *env,
4433 			struct sched_group *group, int load_idx,
4434 			int local_group, int *balance, struct sg_lb_stats *sgs)
4435 {
4436 	unsigned long nr_running, max_nr_running, min_nr_running;
4437 	unsigned long load, max_cpu_load, min_cpu_load;
4438 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
4439 	unsigned long avg_load_per_task = 0;
4440 	int i;
4441 
4442 	if (local_group)
4443 		balance_cpu = group_balance_cpu(group);
4444 
4445 	/* Tally up the load of all CPUs in the group */
4446 	max_cpu_load = 0;
4447 	min_cpu_load = ~0UL;
4448 	max_nr_running = 0;
4449 	min_nr_running = ~0UL;
4450 
4451 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4452 		struct rq *rq = cpu_rq(i);
4453 
4454 		nr_running = rq->nr_running;
4455 
4456 		/* Bias balancing toward cpus of our domain */
4457 		if (local_group) {
4458 			if (idle_cpu(i) && !first_idle_cpu &&
4459 					cpumask_test_cpu(i, sched_group_mask(group))) {
4460 				first_idle_cpu = 1;
4461 				balance_cpu = i;
4462 			}
4463 
4464 			load = target_load(i, load_idx);
4465 		} else {
4466 			load = source_load(i, load_idx);
4467 			if (load > max_cpu_load)
4468 				max_cpu_load = load;
4469 			if (min_cpu_load > load)
4470 				min_cpu_load = load;
4471 
4472 			if (nr_running > max_nr_running)
4473 				max_nr_running = nr_running;
4474 			if (min_nr_running > nr_running)
4475 				min_nr_running = nr_running;
4476 		}
4477 
4478 		sgs->group_load += load;
4479 		sgs->sum_nr_running += nr_running;
4480 		sgs->sum_weighted_load += weighted_cpuload(i);
4481 		if (idle_cpu(i))
4482 			sgs->idle_cpus++;
4483 	}
4484 
4485 	/*
4486 	 * First idle cpu or the first cpu(busiest) in this sched group
4487 	 * is eligible for doing load balancing at this and above
4488 	 * domains. In the newly idle case, we will allow all the cpu's
4489 	 * to do the newly idle load balance.
4490 	 */
4491 	if (local_group) {
4492 		if (env->idle != CPU_NEWLY_IDLE) {
4493 			if (balance_cpu != env->dst_cpu) {
4494 				*balance = 0;
4495 				return;
4496 			}
4497 			update_group_power(env->sd, env->dst_cpu);
4498 		} else if (time_after_eq(jiffies, group->sgp->next_update))
4499 			update_group_power(env->sd, env->dst_cpu);
4500 	}
4501 
4502 	/* Adjust by relative CPU power of the group */
4503 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4504 
4505 	/*
4506 	 * Consider the group unbalanced when the imbalance is larger
4507 	 * than the average weight of a task.
4508 	 *
4509 	 * APZ: with cgroup the avg task weight can vary wildly and
4510 	 *      might not be a suitable number - should we keep a
4511 	 *      normalized nr_running number somewhere that negates
4512 	 *      the hierarchy?
4513 	 */
4514 	if (sgs->sum_nr_running)
4515 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4516 
4517 	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4518 	    (max_nr_running - min_nr_running) > 1)
4519 		sgs->group_imb = 1;
4520 
4521 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4522 						SCHED_POWER_SCALE);
4523 	if (!sgs->group_capacity)
4524 		sgs->group_capacity = fix_small_capacity(env->sd, group);
4525 	sgs->group_weight = group->group_weight;
4526 
4527 	if (sgs->group_capacity > sgs->sum_nr_running)
4528 		sgs->group_has_capacity = 1;
4529 }
4530 
4531 /**
4532  * update_sd_pick_busiest - return 1 on busiest group
4533  * @env: The load balancing environment.
4534  * @sds: sched_domain statistics
4535  * @sg: sched_group candidate to be checked for being the busiest
4536  * @sgs: sched_group statistics
4537  *
4538  * Determine if @sg is a busier group than the previously selected
4539  * busiest group.
4540  */
4541 static bool update_sd_pick_busiest(struct lb_env *env,
4542 				   struct sd_lb_stats *sds,
4543 				   struct sched_group *sg,
4544 				   struct sg_lb_stats *sgs)
4545 {
4546 	if (sgs->avg_load <= sds->max_load)
4547 		return false;
4548 
4549 	if (sgs->sum_nr_running > sgs->group_capacity)
4550 		return true;
4551 
4552 	if (sgs->group_imb)
4553 		return true;
4554 
4555 	/*
4556 	 * ASYM_PACKING needs to move all the work to the lowest
4557 	 * numbered CPUs in the group, therefore mark all groups
4558 	 * higher than ourself as busy.
4559 	 */
4560 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4561 	    env->dst_cpu < group_first_cpu(sg)) {
4562 		if (!sds->busiest)
4563 			return true;
4564 
4565 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4566 			return true;
4567 	}
4568 
4569 	return false;
4570 }
4571 
4572 /**
4573  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4574  * @env: The load balancing environment.
4575  * @balance: Should we balance.
4576  * @sds: variable to hold the statistics for this sched_domain.
4577  */
4578 static inline void update_sd_lb_stats(struct lb_env *env,
4579 					int *balance, struct sd_lb_stats *sds)
4580 {
4581 	struct sched_domain *child = env->sd->child;
4582 	struct sched_group *sg = env->sd->groups;
4583 	struct sg_lb_stats sgs;
4584 	int load_idx, prefer_sibling = 0;
4585 
4586 	if (child && child->flags & SD_PREFER_SIBLING)
4587 		prefer_sibling = 1;
4588 
4589 	load_idx = get_sd_load_idx(env->sd, env->idle);
4590 
4591 	do {
4592 		int local_group;
4593 
4594 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4595 		memset(&sgs, 0, sizeof(sgs));
4596 		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4597 
4598 		if (local_group && !(*balance))
4599 			return;
4600 
4601 		sds->total_load += sgs.group_load;
4602 		sds->total_pwr += sg->sgp->power;
4603 
4604 		/*
4605 		 * In case the child domain prefers tasks go to siblings
4606 		 * first, lower the sg capacity to one so that we'll try
4607 		 * and move all the excess tasks away. We lower the capacity
4608 		 * of a group only if the local group has the capacity to fit
4609 		 * these excess tasks, i.e. nr_running < group_capacity. The
4610 		 * extra check prevents the case where you always pull from the
4611 		 * heaviest group when it is already under-utilized (possible
4612 		 * with a large weight task outweighs the tasks on the system).
4613 		 */
4614 		if (prefer_sibling && !local_group && sds->this_has_capacity)
4615 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
4616 
4617 		if (local_group) {
4618 			sds->this_load = sgs.avg_load;
4619 			sds->this = sg;
4620 			sds->this_nr_running = sgs.sum_nr_running;
4621 			sds->this_load_per_task = sgs.sum_weighted_load;
4622 			sds->this_has_capacity = sgs.group_has_capacity;
4623 			sds->this_idle_cpus = sgs.idle_cpus;
4624 		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4625 			sds->max_load = sgs.avg_load;
4626 			sds->busiest = sg;
4627 			sds->busiest_nr_running = sgs.sum_nr_running;
4628 			sds->busiest_idle_cpus = sgs.idle_cpus;
4629 			sds->busiest_group_capacity = sgs.group_capacity;
4630 			sds->busiest_load_per_task = sgs.sum_weighted_load;
4631 			sds->busiest_has_capacity = sgs.group_has_capacity;
4632 			sds->busiest_group_weight = sgs.group_weight;
4633 			sds->group_imb = sgs.group_imb;
4634 		}
4635 
4636 		sg = sg->next;
4637 	} while (sg != env->sd->groups);
4638 }
4639 
4640 /**
4641  * check_asym_packing - Check to see if the group is packed into the
4642  *			sched doman.
4643  *
4644  * This is primarily intended to used at the sibling level.  Some
4645  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
4646  * case of POWER7, it can move to lower SMT modes only when higher
4647  * threads are idle.  When in lower SMT modes, the threads will
4648  * perform better since they share less core resources.  Hence when we
4649  * have idle threads, we want them to be the higher ones.
4650  *
4651  * This packing function is run on idle threads.  It checks to see if
4652  * the busiest CPU in this domain (core in the P7 case) has a higher
4653  * CPU number than the packing function is being run on.  Here we are
4654  * assuming lower CPU number will be equivalent to lower a SMT thread
4655  * number.
4656  *
4657  * Returns 1 when packing is required and a task should be moved to
4658  * this CPU.  The amount of the imbalance is returned in *imbalance.
4659  *
4660  * @env: The load balancing environment.
4661  * @sds: Statistics of the sched_domain which is to be packed
4662  */
4663 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4664 {
4665 	int busiest_cpu;
4666 
4667 	if (!(env->sd->flags & SD_ASYM_PACKING))
4668 		return 0;
4669 
4670 	if (!sds->busiest)
4671 		return 0;
4672 
4673 	busiest_cpu = group_first_cpu(sds->busiest);
4674 	if (env->dst_cpu > busiest_cpu)
4675 		return 0;
4676 
4677 	env->imbalance = DIV_ROUND_CLOSEST(
4678 		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4679 
4680 	return 1;
4681 }
4682 
4683 /**
4684  * fix_small_imbalance - Calculate the minor imbalance that exists
4685  *			amongst the groups of a sched_domain, during
4686  *			load balancing.
4687  * @env: The load balancing environment.
4688  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4689  */
4690 static inline
4691 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4692 {
4693 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
4694 	unsigned int imbn = 2;
4695 	unsigned long scaled_busy_load_per_task;
4696 
4697 	if (sds->this_nr_running) {
4698 		sds->this_load_per_task /= sds->this_nr_running;
4699 		if (sds->busiest_load_per_task >
4700 				sds->this_load_per_task)
4701 			imbn = 1;
4702 	} else {
4703 		sds->this_load_per_task =
4704 			cpu_avg_load_per_task(env->dst_cpu);
4705 	}
4706 
4707 	scaled_busy_load_per_task = sds->busiest_load_per_task
4708 					 * SCHED_POWER_SCALE;
4709 	scaled_busy_load_per_task /= sds->busiest->sgp->power;
4710 
4711 	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4712 			(scaled_busy_load_per_task * imbn)) {
4713 		env->imbalance = sds->busiest_load_per_task;
4714 		return;
4715 	}
4716 
4717 	/*
4718 	 * OK, we don't have enough imbalance to justify moving tasks,
4719 	 * however we may be able to increase total CPU power used by
4720 	 * moving them.
4721 	 */
4722 
4723 	pwr_now += sds->busiest->sgp->power *
4724 			min(sds->busiest_load_per_task, sds->max_load);
4725 	pwr_now += sds->this->sgp->power *
4726 			min(sds->this_load_per_task, sds->this_load);
4727 	pwr_now /= SCHED_POWER_SCALE;
4728 
4729 	/* Amount of load we'd subtract */
4730 	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4731 		sds->busiest->sgp->power;
4732 	if (sds->max_load > tmp)
4733 		pwr_move += sds->busiest->sgp->power *
4734 			min(sds->busiest_load_per_task, sds->max_load - tmp);
4735 
4736 	/* Amount of load we'd add */
4737 	if (sds->max_load * sds->busiest->sgp->power <
4738 		sds->busiest_load_per_task * SCHED_POWER_SCALE)
4739 		tmp = (sds->max_load * sds->busiest->sgp->power) /
4740 			sds->this->sgp->power;
4741 	else
4742 		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4743 			sds->this->sgp->power;
4744 	pwr_move += sds->this->sgp->power *
4745 			min(sds->this_load_per_task, sds->this_load + tmp);
4746 	pwr_move /= SCHED_POWER_SCALE;
4747 
4748 	/* Move if we gain throughput */
4749 	if (pwr_move > pwr_now)
4750 		env->imbalance = sds->busiest_load_per_task;
4751 }
4752 
4753 /**
4754  * calculate_imbalance - Calculate the amount of imbalance present within the
4755  *			 groups of a given sched_domain during load balance.
4756  * @env: load balance environment
4757  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4758  */
4759 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4760 {
4761 	unsigned long max_pull, load_above_capacity = ~0UL;
4762 
4763 	sds->busiest_load_per_task /= sds->busiest_nr_running;
4764 	if (sds->group_imb) {
4765 		sds->busiest_load_per_task =
4766 			min(sds->busiest_load_per_task, sds->avg_load);
4767 	}
4768 
4769 	/*
4770 	 * In the presence of smp nice balancing, certain scenarios can have
4771 	 * max load less than avg load(as we skip the groups at or below
4772 	 * its cpu_power, while calculating max_load..)
4773 	 */
4774 	if (sds->max_load < sds->avg_load) {
4775 		env->imbalance = 0;
4776 		return fix_small_imbalance(env, sds);
4777 	}
4778 
4779 	if (!sds->group_imb) {
4780 		/*
4781 		 * Don't want to pull so many tasks that a group would go idle.
4782 		 */
4783 		load_above_capacity = (sds->busiest_nr_running -
4784 						sds->busiest_group_capacity);
4785 
4786 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4787 
4788 		load_above_capacity /= sds->busiest->sgp->power;
4789 	}
4790 
4791 	/*
4792 	 * We're trying to get all the cpus to the average_load, so we don't
4793 	 * want to push ourselves above the average load, nor do we wish to
4794 	 * reduce the max loaded cpu below the average load. At the same time,
4795 	 * we also don't want to reduce the group load below the group capacity
4796 	 * (so that we can implement power-savings policies etc). Thus we look
4797 	 * for the minimum possible imbalance.
4798 	 * Be careful of negative numbers as they'll appear as very large values
4799 	 * with unsigned longs.
4800 	 */
4801 	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4802 
4803 	/* How much load to actually move to equalise the imbalance */
4804 	env->imbalance = min(max_pull * sds->busiest->sgp->power,
4805 		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
4806 			/ SCHED_POWER_SCALE;
4807 
4808 	/*
4809 	 * if *imbalance is less than the average load per runnable task
4810 	 * there is no guarantee that any tasks will be moved so we'll have
4811 	 * a think about bumping its value to force at least one task to be
4812 	 * moved
4813 	 */
4814 	if (env->imbalance < sds->busiest_load_per_task)
4815 		return fix_small_imbalance(env, sds);
4816 
4817 }
4818 
4819 /******* find_busiest_group() helpers end here *********************/
4820 
4821 /**
4822  * find_busiest_group - Returns the busiest group within the sched_domain
4823  * if there is an imbalance. If there isn't an imbalance, and
4824  * the user has opted for power-savings, it returns a group whose
4825  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4826  * such a group exists.
4827  *
4828  * Also calculates the amount of weighted load which should be moved
4829  * to restore balance.
4830  *
4831  * @env: The load balancing environment.
4832  * @balance: Pointer to a variable indicating if this_cpu
4833  *	is the appropriate cpu to perform load balancing at this_level.
4834  *
4835  * Returns:	- the busiest group if imbalance exists.
4836  *		- If no imbalance and user has opted for power-savings balance,
4837  *		   return the least loaded group whose CPUs can be
4838  *		   put to idle by rebalancing its tasks onto our group.
4839  */
4840 static struct sched_group *
4841 find_busiest_group(struct lb_env *env, int *balance)
4842 {
4843 	struct sd_lb_stats sds;
4844 
4845 	memset(&sds, 0, sizeof(sds));
4846 
4847 	/*
4848 	 * Compute the various statistics relavent for load balancing at
4849 	 * this level.
4850 	 */
4851 	update_sd_lb_stats(env, balance, &sds);
4852 
4853 	/*
4854 	 * this_cpu is not the appropriate cpu to perform load balancing at
4855 	 * this level.
4856 	 */
4857 	if (!(*balance))
4858 		goto ret;
4859 
4860 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4861 	    check_asym_packing(env, &sds))
4862 		return sds.busiest;
4863 
4864 	/* There is no busy sibling group to pull tasks from */
4865 	if (!sds.busiest || sds.busiest_nr_running == 0)
4866 		goto out_balanced;
4867 
4868 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4869 
4870 	/*
4871 	 * If the busiest group is imbalanced the below checks don't
4872 	 * work because they assumes all things are equal, which typically
4873 	 * isn't true due to cpus_allowed constraints and the like.
4874 	 */
4875 	if (sds.group_imb)
4876 		goto force_balance;
4877 
4878 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4879 	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4880 			!sds.busiest_has_capacity)
4881 		goto force_balance;
4882 
4883 	/*
4884 	 * If the local group is more busy than the selected busiest group
4885 	 * don't try and pull any tasks.
4886 	 */
4887 	if (sds.this_load >= sds.max_load)
4888 		goto out_balanced;
4889 
4890 	/*
4891 	 * Don't pull any tasks if this group is already above the domain
4892 	 * average load.
4893 	 */
4894 	if (sds.this_load >= sds.avg_load)
4895 		goto out_balanced;
4896 
4897 	if (env->idle == CPU_IDLE) {
4898 		/*
4899 		 * This cpu is idle. If the busiest group load doesn't
4900 		 * have more tasks than the number of available cpu's and
4901 		 * there is no imbalance between this and busiest group
4902 		 * wrt to idle cpu's, it is balanced.
4903 		 */
4904 		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4905 		    sds.busiest_nr_running <= sds.busiest_group_weight)
4906 			goto out_balanced;
4907 	} else {
4908 		/*
4909 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4910 		 * imbalance_pct to be conservative.
4911 		 */
4912 		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4913 			goto out_balanced;
4914 	}
4915 
4916 force_balance:
4917 	/* Looks like there is an imbalance. Compute it */
4918 	calculate_imbalance(env, &sds);
4919 	return sds.busiest;
4920 
4921 out_balanced:
4922 ret:
4923 	env->imbalance = 0;
4924 	return NULL;
4925 }
4926 
4927 /*
4928  * find_busiest_queue - find the busiest runqueue among the cpus in group.
4929  */
4930 static struct rq *find_busiest_queue(struct lb_env *env,
4931 				     struct sched_group *group)
4932 {
4933 	struct rq *busiest = NULL, *rq;
4934 	unsigned long max_load = 0;
4935 	int i;
4936 
4937 	for_each_cpu(i, sched_group_cpus(group)) {
4938 		unsigned long power = power_of(i);
4939 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
4940 							   SCHED_POWER_SCALE);
4941 		unsigned long wl;
4942 
4943 		if (!capacity)
4944 			capacity = fix_small_capacity(env->sd, group);
4945 
4946 		if (!cpumask_test_cpu(i, env->cpus))
4947 			continue;
4948 
4949 		rq = cpu_rq(i);
4950 		wl = weighted_cpuload(i);
4951 
4952 		/*
4953 		 * When comparing with imbalance, use weighted_cpuload()
4954 		 * which is not scaled with the cpu power.
4955 		 */
4956 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4957 			continue;
4958 
4959 		/*
4960 		 * For the load comparisons with the other cpu's, consider
4961 		 * the weighted_cpuload() scaled with the cpu power, so that
4962 		 * the load can be moved away from the cpu that is potentially
4963 		 * running at a lower capacity.
4964 		 */
4965 		wl = (wl * SCHED_POWER_SCALE) / power;
4966 
4967 		if (wl > max_load) {
4968 			max_load = wl;
4969 			busiest = rq;
4970 		}
4971 	}
4972 
4973 	return busiest;
4974 }
4975 
4976 /*
4977  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4978  * so long as it is large enough.
4979  */
4980 #define MAX_PINNED_INTERVAL	512
4981 
4982 /* Working cpumask for load_balance and load_balance_newidle. */
4983 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4984 
4985 static int need_active_balance(struct lb_env *env)
4986 {
4987 	struct sched_domain *sd = env->sd;
4988 
4989 	if (env->idle == CPU_NEWLY_IDLE) {
4990 
4991 		/*
4992 		 * ASYM_PACKING needs to force migrate tasks from busy but
4993 		 * higher numbered CPUs in order to pack all tasks in the
4994 		 * lowest numbered CPUs.
4995 		 */
4996 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4997 			return 1;
4998 	}
4999 
5000 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5001 }
5002 
5003 static int active_load_balance_cpu_stop(void *data);
5004 
5005 /*
5006  * Check this_cpu to ensure it is balanced within domain. Attempt to move
5007  * tasks if there is an imbalance.
5008  */
5009 static int load_balance(int this_cpu, struct rq *this_rq,
5010 			struct sched_domain *sd, enum cpu_idle_type idle,
5011 			int *balance)
5012 {
5013 	int ld_moved, cur_ld_moved, active_balance = 0;
5014 	struct sched_group *group;
5015 	struct rq *busiest;
5016 	unsigned long flags;
5017 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
5018 
5019 	struct lb_env env = {
5020 		.sd		= sd,
5021 		.dst_cpu	= this_cpu,
5022 		.dst_rq		= this_rq,
5023 		.dst_grpmask    = sched_group_cpus(sd->groups),
5024 		.idle		= idle,
5025 		.loop_break	= sched_nr_migrate_break,
5026 		.cpus		= cpus,
5027 	};
5028 
5029 	/*
5030 	 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 	 * other cpus in our group
5032 	 */
5033 	if (idle == CPU_NEWLY_IDLE)
5034 		env.dst_grpmask = NULL;
5035 
5036 	cpumask_copy(cpus, cpu_active_mask);
5037 
5038 	schedstat_inc(sd, lb_count[idle]);
5039 
5040 redo:
5041 	group = find_busiest_group(&env, balance);
5042 
5043 	if (*balance == 0)
5044 		goto out_balanced;
5045 
5046 	if (!group) {
5047 		schedstat_inc(sd, lb_nobusyg[idle]);
5048 		goto out_balanced;
5049 	}
5050 
5051 	busiest = find_busiest_queue(&env, group);
5052 	if (!busiest) {
5053 		schedstat_inc(sd, lb_nobusyq[idle]);
5054 		goto out_balanced;
5055 	}
5056 
5057 	BUG_ON(busiest == env.dst_rq);
5058 
5059 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5060 
5061 	ld_moved = 0;
5062 	if (busiest->nr_running > 1) {
5063 		/*
5064 		 * Attempt to move tasks. If find_busiest_group has found
5065 		 * an imbalance but busiest->nr_running <= 1, the group is
5066 		 * still unbalanced. ld_moved simply stays zero, so it is
5067 		 * correctly treated as an imbalance.
5068 		 */
5069 		env.flags |= LBF_ALL_PINNED;
5070 		env.src_cpu   = busiest->cpu;
5071 		env.src_rq    = busiest;
5072 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
5073 
5074 		update_h_load(env.src_cpu);
5075 more_balance:
5076 		local_irq_save(flags);
5077 		double_rq_lock(env.dst_rq, busiest);
5078 
5079 		/*
5080 		 * cur_ld_moved - load moved in current iteration
5081 		 * ld_moved     - cumulative load moved across iterations
5082 		 */
5083 		cur_ld_moved = move_tasks(&env);
5084 		ld_moved += cur_ld_moved;
5085 		double_rq_unlock(env.dst_rq, busiest);
5086 		local_irq_restore(flags);
5087 
5088 		/*
5089 		 * some other cpu did the load balance for us.
5090 		 */
5091 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5092 			resched_cpu(env.dst_cpu);
5093 
5094 		if (env.flags & LBF_NEED_BREAK) {
5095 			env.flags &= ~LBF_NEED_BREAK;
5096 			goto more_balance;
5097 		}
5098 
5099 		/*
5100 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5101 		 * us and move them to an alternate dst_cpu in our sched_group
5102 		 * where they can run. The upper limit on how many times we
5103 		 * iterate on same src_cpu is dependent on number of cpus in our
5104 		 * sched_group.
5105 		 *
5106 		 * This changes load balance semantics a bit on who can move
5107 		 * load to a given_cpu. In addition to the given_cpu itself
5108 		 * (or a ilb_cpu acting on its behalf where given_cpu is
5109 		 * nohz-idle), we now have balance_cpu in a position to move
5110 		 * load to given_cpu. In rare situations, this may cause
5111 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5112 		 * _independently_ and at _same_ time to move some load to
5113 		 * given_cpu) causing exceess load to be moved to given_cpu.
5114 		 * This however should not happen so much in practice and
5115 		 * moreover subsequent load balance cycles should correct the
5116 		 * excess load moved.
5117 		 */
5118 		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5119 
5120 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
5121 			env.dst_cpu	 = env.new_dst_cpu;
5122 			env.flags	&= ~LBF_SOME_PINNED;
5123 			env.loop	 = 0;
5124 			env.loop_break	 = sched_nr_migrate_break;
5125 
5126 			/* Prevent to re-select dst_cpu via env's cpus */
5127 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128 
5129 			/*
5130 			 * Go back to "more_balance" rather than "redo" since we
5131 			 * need to continue with same src_cpu.
5132 			 */
5133 			goto more_balance;
5134 		}
5135 
5136 		/* All tasks on this runqueue were pinned by CPU affinity */
5137 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
5138 			cpumask_clear_cpu(cpu_of(busiest), cpus);
5139 			if (!cpumask_empty(cpus)) {
5140 				env.loop = 0;
5141 				env.loop_break = sched_nr_migrate_break;
5142 				goto redo;
5143 			}
5144 			goto out_balanced;
5145 		}
5146 	}
5147 
5148 	if (!ld_moved) {
5149 		schedstat_inc(sd, lb_failed[idle]);
5150 		/*
5151 		 * Increment the failure counter only on periodic balance.
5152 		 * We do not want newidle balance, which can be very
5153 		 * frequent, pollute the failure counter causing
5154 		 * excessive cache_hot migrations and active balances.
5155 		 */
5156 		if (idle != CPU_NEWLY_IDLE)
5157 			sd->nr_balance_failed++;
5158 
5159 		if (need_active_balance(&env)) {
5160 			raw_spin_lock_irqsave(&busiest->lock, flags);
5161 
5162 			/* don't kick the active_load_balance_cpu_stop,
5163 			 * if the curr task on busiest cpu can't be
5164 			 * moved to this_cpu
5165 			 */
5166 			if (!cpumask_test_cpu(this_cpu,
5167 					tsk_cpus_allowed(busiest->curr))) {
5168 				raw_spin_unlock_irqrestore(&busiest->lock,
5169 							    flags);
5170 				env.flags |= LBF_ALL_PINNED;
5171 				goto out_one_pinned;
5172 			}
5173 
5174 			/*
5175 			 * ->active_balance synchronizes accesses to
5176 			 * ->active_balance_work.  Once set, it's cleared
5177 			 * only after active load balance is finished.
5178 			 */
5179 			if (!busiest->active_balance) {
5180 				busiest->active_balance = 1;
5181 				busiest->push_cpu = this_cpu;
5182 				active_balance = 1;
5183 			}
5184 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
5185 
5186 			if (active_balance) {
5187 				stop_one_cpu_nowait(cpu_of(busiest),
5188 					active_load_balance_cpu_stop, busiest,
5189 					&busiest->active_balance_work);
5190 			}
5191 
5192 			/*
5193 			 * We've kicked active balancing, reset the failure
5194 			 * counter.
5195 			 */
5196 			sd->nr_balance_failed = sd->cache_nice_tries+1;
5197 		}
5198 	} else
5199 		sd->nr_balance_failed = 0;
5200 
5201 	if (likely(!active_balance)) {
5202 		/* We were unbalanced, so reset the balancing interval */
5203 		sd->balance_interval = sd->min_interval;
5204 	} else {
5205 		/*
5206 		 * If we've begun active balancing, start to back off. This
5207 		 * case may not be covered by the all_pinned logic if there
5208 		 * is only 1 task on the busy runqueue (because we don't call
5209 		 * move_tasks).
5210 		 */
5211 		if (sd->balance_interval < sd->max_interval)
5212 			sd->balance_interval *= 2;
5213 	}
5214 
5215 	goto out;
5216 
5217 out_balanced:
5218 	schedstat_inc(sd, lb_balanced[idle]);
5219 
5220 	sd->nr_balance_failed = 0;
5221 
5222 out_one_pinned:
5223 	/* tune up the balancing interval */
5224 	if (((env.flags & LBF_ALL_PINNED) &&
5225 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
5226 			(sd->balance_interval < sd->max_interval))
5227 		sd->balance_interval *= 2;
5228 
5229 	ld_moved = 0;
5230 out:
5231 	return ld_moved;
5232 }
5233 
5234 /*
5235  * idle_balance is called by schedule() if this_cpu is about to become
5236  * idle. Attempts to pull tasks from other CPUs.
5237  */
5238 void idle_balance(int this_cpu, struct rq *this_rq)
5239 {
5240 	struct sched_domain *sd;
5241 	int pulled_task = 0;
5242 	unsigned long next_balance = jiffies + HZ;
5243 
5244 	this_rq->idle_stamp = this_rq->clock;
5245 
5246 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 		return;
5248 
5249 	/*
5250 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
5251 	 */
5252 	raw_spin_unlock(&this_rq->lock);
5253 
5254 	update_blocked_averages(this_cpu);
5255 	rcu_read_lock();
5256 	for_each_domain(this_cpu, sd) {
5257 		unsigned long interval;
5258 		int balance = 1;
5259 
5260 		if (!(sd->flags & SD_LOAD_BALANCE))
5261 			continue;
5262 
5263 		if (sd->flags & SD_BALANCE_NEWIDLE) {
5264 			/* If we've pulled tasks over stop searching: */
5265 			pulled_task = load_balance(this_cpu, this_rq,
5266 						   sd, CPU_NEWLY_IDLE, &balance);
5267 		}
5268 
5269 		interval = msecs_to_jiffies(sd->balance_interval);
5270 		if (time_after(next_balance, sd->last_balance + interval))
5271 			next_balance = sd->last_balance + interval;
5272 		if (pulled_task) {
5273 			this_rq->idle_stamp = 0;
5274 			break;
5275 		}
5276 	}
5277 	rcu_read_unlock();
5278 
5279 	raw_spin_lock(&this_rq->lock);
5280 
5281 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5282 		/*
5283 		 * We are going idle. next_balance may be set based on
5284 		 * a busy processor. So reset next_balance.
5285 		 */
5286 		this_rq->next_balance = next_balance;
5287 	}
5288 }
5289 
5290 /*
5291  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5292  * running tasks off the busiest CPU onto idle CPUs. It requires at
5293  * least 1 task to be running on each physical CPU where possible, and
5294  * avoids physical / logical imbalances.
5295  */
5296 static int active_load_balance_cpu_stop(void *data)
5297 {
5298 	struct rq *busiest_rq = data;
5299 	int busiest_cpu = cpu_of(busiest_rq);
5300 	int target_cpu = busiest_rq->push_cpu;
5301 	struct rq *target_rq = cpu_rq(target_cpu);
5302 	struct sched_domain *sd;
5303 
5304 	raw_spin_lock_irq(&busiest_rq->lock);
5305 
5306 	/* make sure the requested cpu hasn't gone down in the meantime */
5307 	if (unlikely(busiest_cpu != smp_processor_id() ||
5308 		     !busiest_rq->active_balance))
5309 		goto out_unlock;
5310 
5311 	/* Is there any task to move? */
5312 	if (busiest_rq->nr_running <= 1)
5313 		goto out_unlock;
5314 
5315 	/*
5316 	 * This condition is "impossible", if it occurs
5317 	 * we need to fix it. Originally reported by
5318 	 * Bjorn Helgaas on a 128-cpu setup.
5319 	 */
5320 	BUG_ON(busiest_rq == target_rq);
5321 
5322 	/* move a task from busiest_rq to target_rq */
5323 	double_lock_balance(busiest_rq, target_rq);
5324 
5325 	/* Search for an sd spanning us and the target CPU. */
5326 	rcu_read_lock();
5327 	for_each_domain(target_cpu, sd) {
5328 		if ((sd->flags & SD_LOAD_BALANCE) &&
5329 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5330 				break;
5331 	}
5332 
5333 	if (likely(sd)) {
5334 		struct lb_env env = {
5335 			.sd		= sd,
5336 			.dst_cpu	= target_cpu,
5337 			.dst_rq		= target_rq,
5338 			.src_cpu	= busiest_rq->cpu,
5339 			.src_rq		= busiest_rq,
5340 			.idle		= CPU_IDLE,
5341 		};
5342 
5343 		schedstat_inc(sd, alb_count);
5344 
5345 		if (move_one_task(&env))
5346 			schedstat_inc(sd, alb_pushed);
5347 		else
5348 			schedstat_inc(sd, alb_failed);
5349 	}
5350 	rcu_read_unlock();
5351 	double_unlock_balance(busiest_rq, target_rq);
5352 out_unlock:
5353 	busiest_rq->active_balance = 0;
5354 	raw_spin_unlock_irq(&busiest_rq->lock);
5355 	return 0;
5356 }
5357 
5358 #ifdef CONFIG_NO_HZ_COMMON
5359 /*
5360  * idle load balancing details
5361  * - When one of the busy CPUs notice that there may be an idle rebalancing
5362  *   needed, they will kick the idle load balancer, which then does idle
5363  *   load balancing for all the idle CPUs.
5364  */
5365 static struct {
5366 	cpumask_var_t idle_cpus_mask;
5367 	atomic_t nr_cpus;
5368 	unsigned long next_balance;     /* in jiffy units */
5369 } nohz ____cacheline_aligned;
5370 
5371 static inline int find_new_ilb(int call_cpu)
5372 {
5373 	int ilb = cpumask_first(nohz.idle_cpus_mask);
5374 
5375 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
5376 		return ilb;
5377 
5378 	return nr_cpu_ids;
5379 }
5380 
5381 /*
5382  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5383  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
5384  * CPU (if there is one).
5385  */
5386 static void nohz_balancer_kick(int cpu)
5387 {
5388 	int ilb_cpu;
5389 
5390 	nohz.next_balance++;
5391 
5392 	ilb_cpu = find_new_ilb(cpu);
5393 
5394 	if (ilb_cpu >= nr_cpu_ids)
5395 		return;
5396 
5397 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5398 		return;
5399 	/*
5400 	 * Use smp_send_reschedule() instead of resched_cpu().
5401 	 * This way we generate a sched IPI on the target cpu which
5402 	 * is idle. And the softirq performing nohz idle load balance
5403 	 * will be run before returning from the IPI.
5404 	 */
5405 	smp_send_reschedule(ilb_cpu);
5406 	return;
5407 }
5408 
5409 static inline void nohz_balance_exit_idle(int cpu)
5410 {
5411 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5412 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5413 		atomic_dec(&nohz.nr_cpus);
5414 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5415 	}
5416 }
5417 
5418 static inline void set_cpu_sd_state_busy(void)
5419 {
5420 	struct sched_domain *sd;
5421 	int cpu = smp_processor_id();
5422 
5423 	rcu_read_lock();
5424 	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425 
5426 	if (!sd || !sd->nohz_idle)
5427 		goto unlock;
5428 	sd->nohz_idle = 0;
5429 
5430 	for (; sd; sd = sd->parent)
5431 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432 unlock:
5433 	rcu_read_unlock();
5434 }
5435 
5436 void set_cpu_sd_state_idle(void)
5437 {
5438 	struct sched_domain *sd;
5439 	int cpu = smp_processor_id();
5440 
5441 	rcu_read_lock();
5442 	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443 
5444 	if (!sd || sd->nohz_idle)
5445 		goto unlock;
5446 	sd->nohz_idle = 1;
5447 
5448 	for (; sd; sd = sd->parent)
5449 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450 unlock:
5451 	rcu_read_unlock();
5452 }
5453 
5454 /*
5455  * This routine will record that the cpu is going idle with tick stopped.
5456  * This info will be used in performing idle load balancing in the future.
5457  */
5458 void nohz_balance_enter_idle(int cpu)
5459 {
5460 	/*
5461 	 * If this cpu is going down, then nothing needs to be done.
5462 	 */
5463 	if (!cpu_active(cpu))
5464 		return;
5465 
5466 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5467 		return;
5468 
5469 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5470 	atomic_inc(&nohz.nr_cpus);
5471 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5472 }
5473 
5474 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5475 					unsigned long action, void *hcpu)
5476 {
5477 	switch (action & ~CPU_TASKS_FROZEN) {
5478 	case CPU_DYING:
5479 		nohz_balance_exit_idle(smp_processor_id());
5480 		return NOTIFY_OK;
5481 	default:
5482 		return NOTIFY_DONE;
5483 	}
5484 }
5485 #endif
5486 
5487 static DEFINE_SPINLOCK(balancing);
5488 
5489 /*
5490  * Scale the max load_balance interval with the number of CPUs in the system.
5491  * This trades load-balance latency on larger machines for less cross talk.
5492  */
5493 void update_max_interval(void)
5494 {
5495 	max_load_balance_interval = HZ*num_online_cpus()/10;
5496 }
5497 
5498 /*
5499  * It checks each scheduling domain to see if it is due to be balanced,
5500  * and initiates a balancing operation if so.
5501  *
5502  * Balancing parameters are set up in init_sched_domains.
5503  */
5504 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5505 {
5506 	int balance = 1;
5507 	struct rq *rq = cpu_rq(cpu);
5508 	unsigned long interval;
5509 	struct sched_domain *sd;
5510 	/* Earliest time when we have to do rebalance again */
5511 	unsigned long next_balance = jiffies + 60*HZ;
5512 	int update_next_balance = 0;
5513 	int need_serialize;
5514 
5515 	update_blocked_averages(cpu);
5516 
5517 	rcu_read_lock();
5518 	for_each_domain(cpu, sd) {
5519 		if (!(sd->flags & SD_LOAD_BALANCE))
5520 			continue;
5521 
5522 		interval = sd->balance_interval;
5523 		if (idle != CPU_IDLE)
5524 			interval *= sd->busy_factor;
5525 
5526 		/* scale ms to jiffies */
5527 		interval = msecs_to_jiffies(interval);
5528 		interval = clamp(interval, 1UL, max_load_balance_interval);
5529 
5530 		need_serialize = sd->flags & SD_SERIALIZE;
5531 
5532 		if (need_serialize) {
5533 			if (!spin_trylock(&balancing))
5534 				goto out;
5535 		}
5536 
5537 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
5538 			if (load_balance(cpu, rq, sd, idle, &balance)) {
5539 				/*
5540 				 * The LBF_SOME_PINNED logic could have changed
5541 				 * env->dst_cpu, so we can't know our idle
5542 				 * state even if we migrated tasks. Update it.
5543 				 */
5544 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5545 			}
5546 			sd->last_balance = jiffies;
5547 		}
5548 		if (need_serialize)
5549 			spin_unlock(&balancing);
5550 out:
5551 		if (time_after(next_balance, sd->last_balance + interval)) {
5552 			next_balance = sd->last_balance + interval;
5553 			update_next_balance = 1;
5554 		}
5555 
5556 		/*
5557 		 * Stop the load balance at this level. There is another
5558 		 * CPU in our sched group which is doing load balancing more
5559 		 * actively.
5560 		 */
5561 		if (!balance)
5562 			break;
5563 	}
5564 	rcu_read_unlock();
5565 
5566 	/*
5567 	 * next_balance will be updated only when there is a need.
5568 	 * When the cpu is attached to null domain for ex, it will not be
5569 	 * updated.
5570 	 */
5571 	if (likely(update_next_balance))
5572 		rq->next_balance = next_balance;
5573 }
5574 
5575 #ifdef CONFIG_NO_HZ_COMMON
5576 /*
5577  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5578  * rebalancing for all the cpus for whom scheduler ticks are stopped.
5579  */
5580 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5581 {
5582 	struct rq *this_rq = cpu_rq(this_cpu);
5583 	struct rq *rq;
5584 	int balance_cpu;
5585 
5586 	if (idle != CPU_IDLE ||
5587 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5588 		goto end;
5589 
5590 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5591 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5592 			continue;
5593 
5594 		/*
5595 		 * If this cpu gets work to do, stop the load balancing
5596 		 * work being done for other cpus. Next load
5597 		 * balancing owner will pick it up.
5598 		 */
5599 		if (need_resched())
5600 			break;
5601 
5602 		rq = cpu_rq(balance_cpu);
5603 
5604 		raw_spin_lock_irq(&rq->lock);
5605 		update_rq_clock(rq);
5606 		update_idle_cpu_load(rq);
5607 		raw_spin_unlock_irq(&rq->lock);
5608 
5609 		rebalance_domains(balance_cpu, CPU_IDLE);
5610 
5611 		if (time_after(this_rq->next_balance, rq->next_balance))
5612 			this_rq->next_balance = rq->next_balance;
5613 	}
5614 	nohz.next_balance = this_rq->next_balance;
5615 end:
5616 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5617 }
5618 
5619 /*
5620  * Current heuristic for kicking the idle load balancer in the presence
5621  * of an idle cpu is the system.
5622  *   - This rq has more than one task.
5623  *   - At any scheduler domain level, this cpu's scheduler group has multiple
5624  *     busy cpu's exceeding the group's power.
5625  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
5626  *     domain span are idle.
5627  */
5628 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5629 {
5630 	unsigned long now = jiffies;
5631 	struct sched_domain *sd;
5632 
5633 	if (unlikely(idle_cpu(cpu)))
5634 		return 0;
5635 
5636        /*
5637 	* We may be recently in ticked or tickless idle mode. At the first
5638 	* busy tick after returning from idle, we will update the busy stats.
5639 	*/
5640 	set_cpu_sd_state_busy();
5641 	nohz_balance_exit_idle(cpu);
5642 
5643 	/*
5644 	 * None are in tickless mode and hence no need for NOHZ idle load
5645 	 * balancing.
5646 	 */
5647 	if (likely(!atomic_read(&nohz.nr_cpus)))
5648 		return 0;
5649 
5650 	if (time_before(now, nohz.next_balance))
5651 		return 0;
5652 
5653 	if (rq->nr_running >= 2)
5654 		goto need_kick;
5655 
5656 	rcu_read_lock();
5657 	for_each_domain(cpu, sd) {
5658 		struct sched_group *sg = sd->groups;
5659 		struct sched_group_power *sgp = sg->sgp;
5660 		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5661 
5662 		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5663 			goto need_kick_unlock;
5664 
5665 		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5666 		    && (cpumask_first_and(nohz.idle_cpus_mask,
5667 					  sched_domain_span(sd)) < cpu))
5668 			goto need_kick_unlock;
5669 
5670 		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5671 			break;
5672 	}
5673 	rcu_read_unlock();
5674 	return 0;
5675 
5676 need_kick_unlock:
5677 	rcu_read_unlock();
5678 need_kick:
5679 	return 1;
5680 }
5681 #else
5682 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5683 #endif
5684 
5685 /*
5686  * run_rebalance_domains is triggered when needed from the scheduler tick.
5687  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5688  */
5689 static void run_rebalance_domains(struct softirq_action *h)
5690 {
5691 	int this_cpu = smp_processor_id();
5692 	struct rq *this_rq = cpu_rq(this_cpu);
5693 	enum cpu_idle_type idle = this_rq->idle_balance ?
5694 						CPU_IDLE : CPU_NOT_IDLE;
5695 
5696 	rebalance_domains(this_cpu, idle);
5697 
5698 	/*
5699 	 * If this cpu has a pending nohz_balance_kick, then do the
5700 	 * balancing on behalf of the other idle cpus whose ticks are
5701 	 * stopped.
5702 	 */
5703 	nohz_idle_balance(this_cpu, idle);
5704 }
5705 
5706 static inline int on_null_domain(int cpu)
5707 {
5708 	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5709 }
5710 
5711 /*
5712  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5713  */
5714 void trigger_load_balance(struct rq *rq, int cpu)
5715 {
5716 	/* Don't need to rebalance while attached to NULL domain */
5717 	if (time_after_eq(jiffies, rq->next_balance) &&
5718 	    likely(!on_null_domain(cpu)))
5719 		raise_softirq(SCHED_SOFTIRQ);
5720 #ifdef CONFIG_NO_HZ_COMMON
5721 	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5722 		nohz_balancer_kick(cpu);
5723 #endif
5724 }
5725 
5726 static void rq_online_fair(struct rq *rq)
5727 {
5728 	update_sysctl();
5729 }
5730 
5731 static void rq_offline_fair(struct rq *rq)
5732 {
5733 	update_sysctl();
5734 
5735 	/* Ensure any throttled groups are reachable by pick_next_task */
5736 	unthrottle_offline_cfs_rqs(rq);
5737 }
5738 
5739 #endif /* CONFIG_SMP */
5740 
5741 /*
5742  * scheduler tick hitting a task of our scheduling class:
5743  */
5744 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5745 {
5746 	struct cfs_rq *cfs_rq;
5747 	struct sched_entity *se = &curr->se;
5748 
5749 	for_each_sched_entity(se) {
5750 		cfs_rq = cfs_rq_of(se);
5751 		entity_tick(cfs_rq, se, queued);
5752 	}
5753 
5754 	if (sched_feat_numa(NUMA))
5755 		task_tick_numa(rq, curr);
5756 
5757 	update_rq_runnable_avg(rq, 1);
5758 }
5759 
5760 /*
5761  * called on fork with the child task as argument from the parent's context
5762  *  - child not yet on the tasklist
5763  *  - preemption disabled
5764  */
5765 static void task_fork_fair(struct task_struct *p)
5766 {
5767 	struct cfs_rq *cfs_rq;
5768 	struct sched_entity *se = &p->se, *curr;
5769 	int this_cpu = smp_processor_id();
5770 	struct rq *rq = this_rq();
5771 	unsigned long flags;
5772 
5773 	raw_spin_lock_irqsave(&rq->lock, flags);
5774 
5775 	update_rq_clock(rq);
5776 
5777 	cfs_rq = task_cfs_rq(current);
5778 	curr = cfs_rq->curr;
5779 
5780 	if (unlikely(task_cpu(p) != this_cpu)) {
5781 		rcu_read_lock();
5782 		__set_task_cpu(p, this_cpu);
5783 		rcu_read_unlock();
5784 	}
5785 
5786 	update_curr(cfs_rq);
5787 
5788 	if (curr)
5789 		se->vruntime = curr->vruntime;
5790 	place_entity(cfs_rq, se, 1);
5791 
5792 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5793 		/*
5794 		 * Upon rescheduling, sched_class::put_prev_task() will place
5795 		 * 'current' within the tree based on its new key value.
5796 		 */
5797 		swap(curr->vruntime, se->vruntime);
5798 		resched_task(rq->curr);
5799 	}
5800 
5801 	se->vruntime -= cfs_rq->min_vruntime;
5802 
5803 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5804 }
5805 
5806 /*
5807  * Priority of the task has changed. Check to see if we preempt
5808  * the current task.
5809  */
5810 static void
5811 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5812 {
5813 	if (!p->se.on_rq)
5814 		return;
5815 
5816 	/*
5817 	 * Reschedule if we are currently running on this runqueue and
5818 	 * our priority decreased, or if we are not currently running on
5819 	 * this runqueue and our priority is higher than the current's
5820 	 */
5821 	if (rq->curr == p) {
5822 		if (p->prio > oldprio)
5823 			resched_task(rq->curr);
5824 	} else
5825 		check_preempt_curr(rq, p, 0);
5826 }
5827 
5828 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5829 {
5830 	struct sched_entity *se = &p->se;
5831 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5832 
5833 	/*
5834 	 * Ensure the task's vruntime is normalized, so that when its
5835 	 * switched back to the fair class the enqueue_entity(.flags=0) will
5836 	 * do the right thing.
5837 	 *
5838 	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5839 	 * have normalized the vruntime, if it was !on_rq, then only when
5840 	 * the task is sleeping will it still have non-normalized vruntime.
5841 	 */
5842 	if (!se->on_rq && p->state != TASK_RUNNING) {
5843 		/*
5844 		 * Fix up our vruntime so that the current sleep doesn't
5845 		 * cause 'unlimited' sleep bonus.
5846 		 */
5847 		place_entity(cfs_rq, se, 0);
5848 		se->vruntime -= cfs_rq->min_vruntime;
5849 	}
5850 
5851 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5852 	/*
5853 	* Remove our load from contribution when we leave sched_fair
5854 	* and ensure we don't carry in an old decay_count if we
5855 	* switch back.
5856 	*/
5857 	if (p->se.avg.decay_count) {
5858 		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5859 		__synchronize_entity_decay(&p->se);
5860 		subtract_blocked_load_contrib(cfs_rq,
5861 				p->se.avg.load_avg_contrib);
5862 	}
5863 #endif
5864 }
5865 
5866 /*
5867  * We switched to the sched_fair class.
5868  */
5869 static void switched_to_fair(struct rq *rq, struct task_struct *p)
5870 {
5871 	if (!p->se.on_rq)
5872 		return;
5873 
5874 	/*
5875 	 * We were most likely switched from sched_rt, so
5876 	 * kick off the schedule if running, otherwise just see
5877 	 * if we can still preempt the current task.
5878 	 */
5879 	if (rq->curr == p)
5880 		resched_task(rq->curr);
5881 	else
5882 		check_preempt_curr(rq, p, 0);
5883 }
5884 
5885 /* Account for a task changing its policy or group.
5886  *
5887  * This routine is mostly called to set cfs_rq->curr field when a task
5888  * migrates between groups/classes.
5889  */
5890 static void set_curr_task_fair(struct rq *rq)
5891 {
5892 	struct sched_entity *se = &rq->curr->se;
5893 
5894 	for_each_sched_entity(se) {
5895 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
5896 
5897 		set_next_entity(cfs_rq, se);
5898 		/* ensure bandwidth has been allocated on our new cfs_rq */
5899 		account_cfs_rq_runtime(cfs_rq, 0);
5900 	}
5901 }
5902 
5903 void init_cfs_rq(struct cfs_rq *cfs_rq)
5904 {
5905 	cfs_rq->tasks_timeline = RB_ROOT;
5906 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5907 #ifndef CONFIG_64BIT
5908 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909 #endif
5910 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5911 	atomic64_set(&cfs_rq->decay_counter, 1);
5912 	atomic64_set(&cfs_rq->removed_load, 0);
5913 #endif
5914 }
5915 
5916 #ifdef CONFIG_FAIR_GROUP_SCHED
5917 static void task_move_group_fair(struct task_struct *p, int on_rq)
5918 {
5919 	struct cfs_rq *cfs_rq;
5920 	/*
5921 	 * If the task was not on the rq at the time of this cgroup movement
5922 	 * it must have been asleep, sleeping tasks keep their ->vruntime
5923 	 * absolute on their old rq until wakeup (needed for the fair sleeper
5924 	 * bonus in place_entity()).
5925 	 *
5926 	 * If it was on the rq, we've just 'preempted' it, which does convert
5927 	 * ->vruntime to a relative base.
5928 	 *
5929 	 * Make sure both cases convert their relative position when migrating
5930 	 * to another cgroup's rq. This does somewhat interfere with the
5931 	 * fair sleeper stuff for the first placement, but who cares.
5932 	 */
5933 	/*
5934 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
5935 	 * But there are some cases where it has already been normalized:
5936 	 *
5937 	 * - Moving a forked child which is waiting for being woken up by
5938 	 *   wake_up_new_task().
5939 	 * - Moving a task which has been woken up by try_to_wake_up() and
5940 	 *   waiting for actually being woken up by sched_ttwu_pending().
5941 	 *
5942 	 * To prevent boost or penalty in the new cfs_rq caused by delta
5943 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5944 	 */
5945 	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5946 		on_rq = 1;
5947 
5948 	if (!on_rq)
5949 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5950 	set_task_rq(p, task_cpu(p));
5951 	if (!on_rq) {
5952 		cfs_rq = cfs_rq_of(&p->se);
5953 		p->se.vruntime += cfs_rq->min_vruntime;
5954 #ifdef CONFIG_SMP
5955 		/*
5956 		 * migrate_task_rq_fair() will have removed our previous
5957 		 * contribution, but we must synchronize for ongoing future
5958 		 * decay.
5959 		 */
5960 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5961 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5962 #endif
5963 	}
5964 }
5965 
5966 void free_fair_sched_group(struct task_group *tg)
5967 {
5968 	int i;
5969 
5970 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5971 
5972 	for_each_possible_cpu(i) {
5973 		if (tg->cfs_rq)
5974 			kfree(tg->cfs_rq[i]);
5975 		if (tg->se)
5976 			kfree(tg->se[i]);
5977 	}
5978 
5979 	kfree(tg->cfs_rq);
5980 	kfree(tg->se);
5981 }
5982 
5983 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5984 {
5985 	struct cfs_rq *cfs_rq;
5986 	struct sched_entity *se;
5987 	int i;
5988 
5989 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5990 	if (!tg->cfs_rq)
5991 		goto err;
5992 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5993 	if (!tg->se)
5994 		goto err;
5995 
5996 	tg->shares = NICE_0_LOAD;
5997 
5998 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5999 
6000 	for_each_possible_cpu(i) {
6001 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
6002 				      GFP_KERNEL, cpu_to_node(i));
6003 		if (!cfs_rq)
6004 			goto err;
6005 
6006 		se = kzalloc_node(sizeof(struct sched_entity),
6007 				  GFP_KERNEL, cpu_to_node(i));
6008 		if (!se)
6009 			goto err_free_rq;
6010 
6011 		init_cfs_rq(cfs_rq);
6012 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
6013 	}
6014 
6015 	return 1;
6016 
6017 err_free_rq:
6018 	kfree(cfs_rq);
6019 err:
6020 	return 0;
6021 }
6022 
6023 void unregister_fair_sched_group(struct task_group *tg, int cpu)
6024 {
6025 	struct rq *rq = cpu_rq(cpu);
6026 	unsigned long flags;
6027 
6028 	/*
6029 	* Only empty task groups can be destroyed; so we can speculatively
6030 	* check on_list without danger of it being re-added.
6031 	*/
6032 	if (!tg->cfs_rq[cpu]->on_list)
6033 		return;
6034 
6035 	raw_spin_lock_irqsave(&rq->lock, flags);
6036 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6037 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6038 }
6039 
6040 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6041 			struct sched_entity *se, int cpu,
6042 			struct sched_entity *parent)
6043 {
6044 	struct rq *rq = cpu_rq(cpu);
6045 
6046 	cfs_rq->tg = tg;
6047 	cfs_rq->rq = rq;
6048 	init_cfs_rq_runtime(cfs_rq);
6049 
6050 	tg->cfs_rq[cpu] = cfs_rq;
6051 	tg->se[cpu] = se;
6052 
6053 	/* se could be NULL for root_task_group */
6054 	if (!se)
6055 		return;
6056 
6057 	if (!parent)
6058 		se->cfs_rq = &rq->cfs;
6059 	else
6060 		se->cfs_rq = parent->my_q;
6061 
6062 	se->my_q = cfs_rq;
6063 	update_load_set(&se->load, 0);
6064 	se->parent = parent;
6065 }
6066 
6067 static DEFINE_MUTEX(shares_mutex);
6068 
6069 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6070 {
6071 	int i;
6072 	unsigned long flags;
6073 
6074 	/*
6075 	 * We can't change the weight of the root cgroup.
6076 	 */
6077 	if (!tg->se[0])
6078 		return -EINVAL;
6079 
6080 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6081 
6082 	mutex_lock(&shares_mutex);
6083 	if (tg->shares == shares)
6084 		goto done;
6085 
6086 	tg->shares = shares;
6087 	for_each_possible_cpu(i) {
6088 		struct rq *rq = cpu_rq(i);
6089 		struct sched_entity *se;
6090 
6091 		se = tg->se[i];
6092 		/* Propagate contribution to hierarchy */
6093 		raw_spin_lock_irqsave(&rq->lock, flags);
6094 		for_each_sched_entity(se)
6095 			update_cfs_shares(group_cfs_rq(se));
6096 		raw_spin_unlock_irqrestore(&rq->lock, flags);
6097 	}
6098 
6099 done:
6100 	mutex_unlock(&shares_mutex);
6101 	return 0;
6102 }
6103 #else /* CONFIG_FAIR_GROUP_SCHED */
6104 
6105 void free_fair_sched_group(struct task_group *tg) { }
6106 
6107 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6108 {
6109 	return 1;
6110 }
6111 
6112 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6113 
6114 #endif /* CONFIG_FAIR_GROUP_SCHED */
6115 
6116 
6117 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6118 {
6119 	struct sched_entity *se = &task->se;
6120 	unsigned int rr_interval = 0;
6121 
6122 	/*
6123 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6124 	 * idle runqueue:
6125 	 */
6126 	if (rq->cfs.load.weight)
6127 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6128 
6129 	return rr_interval;
6130 }
6131 
6132 /*
6133  * All the scheduling class methods:
6134  */
6135 const struct sched_class fair_sched_class = {
6136 	.next			= &idle_sched_class,
6137 	.enqueue_task		= enqueue_task_fair,
6138 	.dequeue_task		= dequeue_task_fair,
6139 	.yield_task		= yield_task_fair,
6140 	.yield_to_task		= yield_to_task_fair,
6141 
6142 	.check_preempt_curr	= check_preempt_wakeup,
6143 
6144 	.pick_next_task		= pick_next_task_fair,
6145 	.put_prev_task		= put_prev_task_fair,
6146 
6147 #ifdef CONFIG_SMP
6148 	.select_task_rq		= select_task_rq_fair,
6149 #ifdef CONFIG_FAIR_GROUP_SCHED
6150 	.migrate_task_rq	= migrate_task_rq_fair,
6151 #endif
6152 	.rq_online		= rq_online_fair,
6153 	.rq_offline		= rq_offline_fair,
6154 
6155 	.task_waking		= task_waking_fair,
6156 #endif
6157 
6158 	.set_curr_task          = set_curr_task_fair,
6159 	.task_tick		= task_tick_fair,
6160 	.task_fork		= task_fork_fair,
6161 
6162 	.prio_changed		= prio_changed_fair,
6163 	.switched_from		= switched_from_fair,
6164 	.switched_to		= switched_to_fair,
6165 
6166 	.get_rr_interval	= get_rr_interval_fair,
6167 
6168 #ifdef CONFIG_FAIR_GROUP_SCHED
6169 	.task_move_group	= task_move_group_fair,
6170 #endif
6171 };
6172 
6173 #ifdef CONFIG_SCHED_DEBUG
6174 void print_cfs_stats(struct seq_file *m, int cpu)
6175 {
6176 	struct cfs_rq *cfs_rq;
6177 
6178 	rcu_read_lock();
6179 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6180 		print_cfs_rq(m, cpu, cfs_rq);
6181 	rcu_read_unlock();
6182 }
6183 #endif
6184 
6185 __init void init_sched_fair_class(void)
6186 {
6187 #ifdef CONFIG_SMP
6188 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6189 
6190 #ifdef CONFIG_NO_HZ_COMMON
6191 	nohz.next_balance = jiffies;
6192 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6193 	cpu_notifier(sched_ilb_notifier, 0);
6194 #endif
6195 #endif /* SMP */
6196 
6197 }
6198