xref: /openbmc/linux/kernel/sched/fair.c (revision 089a49b6)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32 
33 #include <trace/events/sched.h>
34 
35 #include "sched.h"
36 
37 /*
38  * Targeted preemption latency for CPU-bound tasks:
39  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40  *
41  * NOTE: this latency value is not the same as the concept of
42  * 'timeslice length' - timeslices in CFS are of variable length
43  * and have no persistent notion like in traditional, time-slice
44  * based scheduling concepts.
45  *
46  * (to see the precise effective timeslice length of your workload,
47  *  run vmstat and monitor the context-switches (cs) field)
48  */
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51 
52 /*
53  * The initial- and re-scaling of tunables is configurable
54  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55  *
56  * Options are:
57  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
59  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60  */
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 	= SCHED_TUNABLESCALING_LOG;
63 
64 /*
65  * Minimal preemption granularity for CPU-bound tasks:
66  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67  */
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70 
71 /*
72  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73  */
74 static unsigned int sched_nr_latency = 8;
75 
76 /*
77  * After fork, child runs first. If set to 0 (default) then
78  * parent will (try to) run first.
79  */
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
81 
82 /*
83  * SCHED_OTHER wake-up granularity.
84  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85  *
86  * This option delays the preemption effects of decoupled workloads
87  * and reduces their over-scheduling. Synchronous workloads will still
88  * have immediate wakeup/sleep latencies.
89  */
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92 
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94 
95 /*
96  * The exponential sliding  window over which load is averaged for shares
97  * distribution.
98  * (default: 10msec)
99  */
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101 
102 #ifdef CONFIG_CFS_BANDWIDTH
103 /*
104  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105  * each time a cfs_rq requests quota.
106  *
107  * Note: in the case that the slice exceeds the runtime remaining (either due
108  * to consumption or the quota being specified to be smaller than the slice)
109  * we will always only issue the remaining available time.
110  *
111  * default: 5 msec, units: microseconds
112   */
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114 #endif
115 
116 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117 {
118 	lw->weight += inc;
119 	lw->inv_weight = 0;
120 }
121 
122 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123 {
124 	lw->weight -= dec;
125 	lw->inv_weight = 0;
126 }
127 
128 static inline void update_load_set(struct load_weight *lw, unsigned long w)
129 {
130 	lw->weight = w;
131 	lw->inv_weight = 0;
132 }
133 
134 /*
135  * Increase the granularity value when there are more CPUs,
136  * because with more CPUs the 'effective latency' as visible
137  * to users decreases. But the relationship is not linear,
138  * so pick a second-best guess by going with the log2 of the
139  * number of CPUs.
140  *
141  * This idea comes from the SD scheduler of Con Kolivas:
142  */
143 static int get_update_sysctl_factor(void)
144 {
145 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
146 	unsigned int factor;
147 
148 	switch (sysctl_sched_tunable_scaling) {
149 	case SCHED_TUNABLESCALING_NONE:
150 		factor = 1;
151 		break;
152 	case SCHED_TUNABLESCALING_LINEAR:
153 		factor = cpus;
154 		break;
155 	case SCHED_TUNABLESCALING_LOG:
156 	default:
157 		factor = 1 + ilog2(cpus);
158 		break;
159 	}
160 
161 	return factor;
162 }
163 
164 static void update_sysctl(void)
165 {
166 	unsigned int factor = get_update_sysctl_factor();
167 
168 #define SET_SYSCTL(name) \
169 	(sysctl_##name = (factor) * normalized_sysctl_##name)
170 	SET_SYSCTL(sched_min_granularity);
171 	SET_SYSCTL(sched_latency);
172 	SET_SYSCTL(sched_wakeup_granularity);
173 #undef SET_SYSCTL
174 }
175 
176 void sched_init_granularity(void)
177 {
178 	update_sysctl();
179 }
180 
181 #if BITS_PER_LONG == 32
182 # define WMULT_CONST	(~0UL)
183 #else
184 # define WMULT_CONST	(1UL << 32)
185 #endif
186 
187 #define WMULT_SHIFT	32
188 
189 /*
190  * Shift right and round:
191  */
192 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
193 
194 /*
195  * delta *= weight / lw
196  */
197 static unsigned long
198 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 		struct load_weight *lw)
200 {
201 	u64 tmp;
202 
203 	/*
204 	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 	 * 2^SCHED_LOAD_RESOLUTION.
207 	 */
208 	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 		tmp = (u64)delta_exec * scale_load_down(weight);
210 	else
211 		tmp = (u64)delta_exec;
212 
213 	if (!lw->inv_weight) {
214 		unsigned long w = scale_load_down(lw->weight);
215 
216 		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
217 			lw->inv_weight = 1;
218 		else if (unlikely(!w))
219 			lw->inv_weight = WMULT_CONST;
220 		else
221 			lw->inv_weight = WMULT_CONST / w;
222 	}
223 
224 	/*
225 	 * Check whether we'd overflow the 64-bit multiplication:
226 	 */
227 	if (unlikely(tmp > WMULT_CONST))
228 		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
229 			WMULT_SHIFT/2);
230 	else
231 		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232 
233 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
234 }
235 
236 
237 const struct sched_class fair_sched_class;
238 
239 /**************************************************************
240  * CFS operations on generic schedulable entities:
241  */
242 
243 #ifdef CONFIG_FAIR_GROUP_SCHED
244 
245 /* cpu runqueue to which this cfs_rq is attached */
246 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
247 {
248 	return cfs_rq->rq;
249 }
250 
251 /* An entity is a task if it doesn't "own" a runqueue */
252 #define entity_is_task(se)	(!se->my_q)
253 
254 static inline struct task_struct *task_of(struct sched_entity *se)
255 {
256 #ifdef CONFIG_SCHED_DEBUG
257 	WARN_ON_ONCE(!entity_is_task(se));
258 #endif
259 	return container_of(se, struct task_struct, se);
260 }
261 
262 /* Walk up scheduling entities hierarchy */
263 #define for_each_sched_entity(se) \
264 		for (; se; se = se->parent)
265 
266 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
267 {
268 	return p->se.cfs_rq;
269 }
270 
271 /* runqueue on which this entity is (to be) queued */
272 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
273 {
274 	return se->cfs_rq;
275 }
276 
277 /* runqueue "owned" by this group */
278 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
279 {
280 	return grp->my_q;
281 }
282 
283 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
284 				       int force_update);
285 
286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287 {
288 	if (!cfs_rq->on_list) {
289 		/*
290 		 * Ensure we either appear before our parent (if already
291 		 * enqueued) or force our parent to appear after us when it is
292 		 * enqueued.  The fact that we always enqueue bottom-up
293 		 * reduces this to two cases.
294 		 */
295 		if (cfs_rq->tg->parent &&
296 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
299 		} else {
300 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		}
303 
304 		cfs_rq->on_list = 1;
305 		/* We should have no load, but we need to update last_decay. */
306 		update_cfs_rq_blocked_load(cfs_rq, 0);
307 	}
308 }
309 
310 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
311 {
312 	if (cfs_rq->on_list) {
313 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
314 		cfs_rq->on_list = 0;
315 	}
316 }
317 
318 /* Iterate thr' all leaf cfs_rq's on a runqueue */
319 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
320 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
321 
322 /* Do the two (enqueued) entities belong to the same group ? */
323 static inline int
324 is_same_group(struct sched_entity *se, struct sched_entity *pse)
325 {
326 	if (se->cfs_rq == pse->cfs_rq)
327 		return 1;
328 
329 	return 0;
330 }
331 
332 static inline struct sched_entity *parent_entity(struct sched_entity *se)
333 {
334 	return se->parent;
335 }
336 
337 /* return depth at which a sched entity is present in the hierarchy */
338 static inline int depth_se(struct sched_entity *se)
339 {
340 	int depth = 0;
341 
342 	for_each_sched_entity(se)
343 		depth++;
344 
345 	return depth;
346 }
347 
348 static void
349 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
350 {
351 	int se_depth, pse_depth;
352 
353 	/*
354 	 * preemption test can be made between sibling entities who are in the
355 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
356 	 * both tasks until we find their ancestors who are siblings of common
357 	 * parent.
358 	 */
359 
360 	/* First walk up until both entities are at same depth */
361 	se_depth = depth_se(*se);
362 	pse_depth = depth_se(*pse);
363 
364 	while (se_depth > pse_depth) {
365 		se_depth--;
366 		*se = parent_entity(*se);
367 	}
368 
369 	while (pse_depth > se_depth) {
370 		pse_depth--;
371 		*pse = parent_entity(*pse);
372 	}
373 
374 	while (!is_same_group(*se, *pse)) {
375 		*se = parent_entity(*se);
376 		*pse = parent_entity(*pse);
377 	}
378 }
379 
380 #else	/* !CONFIG_FAIR_GROUP_SCHED */
381 
382 static inline struct task_struct *task_of(struct sched_entity *se)
383 {
384 	return container_of(se, struct task_struct, se);
385 }
386 
387 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
388 {
389 	return container_of(cfs_rq, struct rq, cfs);
390 }
391 
392 #define entity_is_task(se)	1
393 
394 #define for_each_sched_entity(se) \
395 		for (; se; se = NULL)
396 
397 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
398 {
399 	return &task_rq(p)->cfs;
400 }
401 
402 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
403 {
404 	struct task_struct *p = task_of(se);
405 	struct rq *rq = task_rq(p);
406 
407 	return &rq->cfs;
408 }
409 
410 /* runqueue "owned" by this group */
411 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
412 {
413 	return NULL;
414 }
415 
416 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
417 {
418 }
419 
420 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
421 {
422 }
423 
424 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
425 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
426 
427 static inline int
428 is_same_group(struct sched_entity *se, struct sched_entity *pse)
429 {
430 	return 1;
431 }
432 
433 static inline struct sched_entity *parent_entity(struct sched_entity *se)
434 {
435 	return NULL;
436 }
437 
438 static inline void
439 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
440 {
441 }
442 
443 #endif	/* CONFIG_FAIR_GROUP_SCHED */
444 
445 static __always_inline
446 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
447 
448 /**************************************************************
449  * Scheduling class tree data structure manipulation methods:
450  */
451 
452 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
453 {
454 	s64 delta = (s64)(vruntime - max_vruntime);
455 	if (delta > 0)
456 		max_vruntime = vruntime;
457 
458 	return max_vruntime;
459 }
460 
461 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
462 {
463 	s64 delta = (s64)(vruntime - min_vruntime);
464 	if (delta < 0)
465 		min_vruntime = vruntime;
466 
467 	return min_vruntime;
468 }
469 
470 static inline int entity_before(struct sched_entity *a,
471 				struct sched_entity *b)
472 {
473 	return (s64)(a->vruntime - b->vruntime) < 0;
474 }
475 
476 static void update_min_vruntime(struct cfs_rq *cfs_rq)
477 {
478 	u64 vruntime = cfs_rq->min_vruntime;
479 
480 	if (cfs_rq->curr)
481 		vruntime = cfs_rq->curr->vruntime;
482 
483 	if (cfs_rq->rb_leftmost) {
484 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
485 						   struct sched_entity,
486 						   run_node);
487 
488 		if (!cfs_rq->curr)
489 			vruntime = se->vruntime;
490 		else
491 			vruntime = min_vruntime(vruntime, se->vruntime);
492 	}
493 
494 	/* ensure we never gain time by being placed backwards. */
495 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
496 #ifndef CONFIG_64BIT
497 	smp_wmb();
498 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
499 #endif
500 }
501 
502 /*
503  * Enqueue an entity into the rb-tree:
504  */
505 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
506 {
507 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
508 	struct rb_node *parent = NULL;
509 	struct sched_entity *entry;
510 	int leftmost = 1;
511 
512 	/*
513 	 * Find the right place in the rbtree:
514 	 */
515 	while (*link) {
516 		parent = *link;
517 		entry = rb_entry(parent, struct sched_entity, run_node);
518 		/*
519 		 * We dont care about collisions. Nodes with
520 		 * the same key stay together.
521 		 */
522 		if (entity_before(se, entry)) {
523 			link = &parent->rb_left;
524 		} else {
525 			link = &parent->rb_right;
526 			leftmost = 0;
527 		}
528 	}
529 
530 	/*
531 	 * Maintain a cache of leftmost tree entries (it is frequently
532 	 * used):
533 	 */
534 	if (leftmost)
535 		cfs_rq->rb_leftmost = &se->run_node;
536 
537 	rb_link_node(&se->run_node, parent, link);
538 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
539 }
540 
541 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
542 {
543 	if (cfs_rq->rb_leftmost == &se->run_node) {
544 		struct rb_node *next_node;
545 
546 		next_node = rb_next(&se->run_node);
547 		cfs_rq->rb_leftmost = next_node;
548 	}
549 
550 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
551 }
552 
553 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
554 {
555 	struct rb_node *left = cfs_rq->rb_leftmost;
556 
557 	if (!left)
558 		return NULL;
559 
560 	return rb_entry(left, struct sched_entity, run_node);
561 }
562 
563 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
564 {
565 	struct rb_node *next = rb_next(&se->run_node);
566 
567 	if (!next)
568 		return NULL;
569 
570 	return rb_entry(next, struct sched_entity, run_node);
571 }
572 
573 #ifdef CONFIG_SCHED_DEBUG
574 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
575 {
576 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
577 
578 	if (!last)
579 		return NULL;
580 
581 	return rb_entry(last, struct sched_entity, run_node);
582 }
583 
584 /**************************************************************
585  * Scheduling class statistics methods:
586  */
587 
588 int sched_proc_update_handler(struct ctl_table *table, int write,
589 		void __user *buffer, size_t *lenp,
590 		loff_t *ppos)
591 {
592 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
593 	int factor = get_update_sysctl_factor();
594 
595 	if (ret || !write)
596 		return ret;
597 
598 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
599 					sysctl_sched_min_granularity);
600 
601 #define WRT_SYSCTL(name) \
602 	(normalized_sysctl_##name = sysctl_##name / (factor))
603 	WRT_SYSCTL(sched_min_granularity);
604 	WRT_SYSCTL(sched_latency);
605 	WRT_SYSCTL(sched_wakeup_granularity);
606 #undef WRT_SYSCTL
607 
608 	return 0;
609 }
610 #endif
611 
612 /*
613  * delta /= w
614  */
615 static inline unsigned long
616 calc_delta_fair(unsigned long delta, struct sched_entity *se)
617 {
618 	if (unlikely(se->load.weight != NICE_0_LOAD))
619 		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
620 
621 	return delta;
622 }
623 
624 /*
625  * The idea is to set a period in which each task runs once.
626  *
627  * When there are too many tasks (sched_nr_latency) we have to stretch
628  * this period because otherwise the slices get too small.
629  *
630  * p = (nr <= nl) ? l : l*nr/nl
631  */
632 static u64 __sched_period(unsigned long nr_running)
633 {
634 	u64 period = sysctl_sched_latency;
635 	unsigned long nr_latency = sched_nr_latency;
636 
637 	if (unlikely(nr_running > nr_latency)) {
638 		period = sysctl_sched_min_granularity;
639 		period *= nr_running;
640 	}
641 
642 	return period;
643 }
644 
645 /*
646  * We calculate the wall-time slice from the period by taking a part
647  * proportional to the weight.
648  *
649  * s = p*P[w/rw]
650  */
651 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652 {
653 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
654 
655 	for_each_sched_entity(se) {
656 		struct load_weight *load;
657 		struct load_weight lw;
658 
659 		cfs_rq = cfs_rq_of(se);
660 		load = &cfs_rq->load;
661 
662 		if (unlikely(!se->on_rq)) {
663 			lw = cfs_rq->load;
664 
665 			update_load_add(&lw, se->load.weight);
666 			load = &lw;
667 		}
668 		slice = calc_delta_mine(slice, se->load.weight, load);
669 	}
670 	return slice;
671 }
672 
673 /*
674  * We calculate the vruntime slice of a to-be-inserted task.
675  *
676  * vs = s/w
677  */
678 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
679 {
680 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
681 }
682 
683 #ifdef CONFIG_SMP
684 static inline void __update_task_entity_contrib(struct sched_entity *se);
685 
686 /* Give new task start runnable values to heavy its load in infant time */
687 void init_task_runnable_average(struct task_struct *p)
688 {
689 	u32 slice;
690 
691 	p->se.avg.decay_count = 0;
692 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 	p->se.avg.runnable_avg_sum = slice;
694 	p->se.avg.runnable_avg_period = slice;
695 	__update_task_entity_contrib(&p->se);
696 }
697 #else
698 void init_task_runnable_average(struct task_struct *p)
699 {
700 }
701 #endif
702 
703 /*
704  * Update the current task's runtime statistics. Skip current tasks that
705  * are not in our scheduling class.
706  */
707 static inline void
708 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
709 	      unsigned long delta_exec)
710 {
711 	unsigned long delta_exec_weighted;
712 
713 	schedstat_set(curr->statistics.exec_max,
714 		      max((u64)delta_exec, curr->statistics.exec_max));
715 
716 	curr->sum_exec_runtime += delta_exec;
717 	schedstat_add(cfs_rq, exec_clock, delta_exec);
718 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
719 
720 	curr->vruntime += delta_exec_weighted;
721 	update_min_vruntime(cfs_rq);
722 }
723 
724 static void update_curr(struct cfs_rq *cfs_rq)
725 {
726 	struct sched_entity *curr = cfs_rq->curr;
727 	u64 now = rq_clock_task(rq_of(cfs_rq));
728 	unsigned long delta_exec;
729 
730 	if (unlikely(!curr))
731 		return;
732 
733 	/*
734 	 * Get the amount of time the current task was running
735 	 * since the last time we changed load (this cannot
736 	 * overflow on 32 bits):
737 	 */
738 	delta_exec = (unsigned long)(now - curr->exec_start);
739 	if (!delta_exec)
740 		return;
741 
742 	__update_curr(cfs_rq, curr, delta_exec);
743 	curr->exec_start = now;
744 
745 	if (entity_is_task(curr)) {
746 		struct task_struct *curtask = task_of(curr);
747 
748 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
749 		cpuacct_charge(curtask, delta_exec);
750 		account_group_exec_runtime(curtask, delta_exec);
751 	}
752 
753 	account_cfs_rq_runtime(cfs_rq, delta_exec);
754 }
755 
756 static inline void
757 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
758 {
759 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
760 }
761 
762 /*
763  * Task is being enqueued - update stats:
764  */
765 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
766 {
767 	/*
768 	 * Are we enqueueing a waiting task? (for current tasks
769 	 * a dequeue/enqueue event is a NOP)
770 	 */
771 	if (se != cfs_rq->curr)
772 		update_stats_wait_start(cfs_rq, se);
773 }
774 
775 static void
776 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
777 {
778 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
779 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
780 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
781 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
782 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
783 #ifdef CONFIG_SCHEDSTATS
784 	if (entity_is_task(se)) {
785 		trace_sched_stat_wait(task_of(se),
786 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
787 	}
788 #endif
789 	schedstat_set(se->statistics.wait_start, 0);
790 }
791 
792 static inline void
793 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794 {
795 	/*
796 	 * Mark the end of the wait period if dequeueing a
797 	 * waiting task:
798 	 */
799 	if (se != cfs_rq->curr)
800 		update_stats_wait_end(cfs_rq, se);
801 }
802 
803 /*
804  * We are picking a new current task - update its stats:
805  */
806 static inline void
807 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
808 {
809 	/*
810 	 * We are starting a new run period:
811 	 */
812 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
813 }
814 
815 /**************************************************
816  * Scheduling class queueing methods:
817  */
818 
819 #ifdef CONFIG_NUMA_BALANCING
820 /*
821  * numa task sample period in ms
822  */
823 unsigned int sysctl_numa_balancing_scan_period_min = 100;
824 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
825 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 
827 /* Portion of address space to scan in MB */
828 unsigned int sysctl_numa_balancing_scan_size = 256;
829 
830 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831 unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 
833 static void task_numa_placement(struct task_struct *p)
834 {
835 	int seq;
836 
837 	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
838 		return;
839 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 	if (p->numa_scan_seq == seq)
841 		return;
842 	p->numa_scan_seq = seq;
843 
844 	/* FIXME: Scheduling placement policy hints go here */
845 }
846 
847 /*
848  * Got a PROT_NONE fault for a page on @node.
849  */
850 void task_numa_fault(int node, int pages, bool migrated)
851 {
852 	struct task_struct *p = current;
853 
854 	if (!numabalancing_enabled)
855 		return;
856 
857 	/* FIXME: Allocate task-specific structure for placement policy here */
858 
859 	/*
860 	 * If pages are properly placed (did not migrate) then scan slower.
861 	 * This is reset periodically in case of phase changes
862 	 */
863         if (!migrated)
864 		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
865 			p->numa_scan_period + jiffies_to_msecs(10));
866 
867 	task_numa_placement(p);
868 }
869 
870 static void reset_ptenuma_scan(struct task_struct *p)
871 {
872 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
873 	p->mm->numa_scan_offset = 0;
874 }
875 
876 /*
877  * The expensive part of numa migration is done from task_work context.
878  * Triggered from task_tick_numa().
879  */
880 void task_numa_work(struct callback_head *work)
881 {
882 	unsigned long migrate, next_scan, now = jiffies;
883 	struct task_struct *p = current;
884 	struct mm_struct *mm = p->mm;
885 	struct vm_area_struct *vma;
886 	unsigned long start, end;
887 	long pages;
888 
889 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
890 
891 	work->next = work; /* protect against double add */
892 	/*
893 	 * Who cares about NUMA placement when they're dying.
894 	 *
895 	 * NOTE: make sure not to dereference p->mm before this check,
896 	 * exit_task_work() happens _after_ exit_mm() so we could be called
897 	 * without p->mm even though we still had it when we enqueued this
898 	 * work.
899 	 */
900 	if (p->flags & PF_EXITING)
901 		return;
902 
903 	/*
904 	 * We do not care about task placement until a task runs on a node
905 	 * other than the first one used by the address space. This is
906 	 * largely because migrations are driven by what CPU the task
907 	 * is running on. If it's never scheduled on another node, it'll
908 	 * not migrate so why bother trapping the fault.
909 	 */
910 	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 		mm->first_nid = numa_node_id();
912 	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 		/* Are we running on a new node yet? */
914 		if (numa_node_id() == mm->first_nid &&
915 		    !sched_feat_numa(NUMA_FORCE))
916 			return;
917 
918 		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 	}
920 
921 	/*
922 	 * Reset the scan period if enough time has gone by. Objective is that
923 	 * scanning will be reduced if pages are properly placed. As tasks
924 	 * can enter different phases this needs to be re-examined. Lacking
925 	 * proper tracking of reference behaviour, this blunt hammer is used.
926 	 */
927 	migrate = mm->numa_next_reset;
928 	if (time_after(now, migrate)) {
929 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 		xchg(&mm->numa_next_reset, next_scan);
932 	}
933 
934 	/*
935 	 * Enforce maximal scan/migration frequency..
936 	 */
937 	migrate = mm->numa_next_scan;
938 	if (time_before(now, migrate))
939 		return;
940 
941 	if (p->numa_scan_period == 0)
942 		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
943 
944 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 		return;
947 
948 	/*
949 	 * Do not set pte_numa if the current running node is rate-limited.
950 	 * This loses statistics on the fault but if we are unwilling to
951 	 * migrate to this node, it is less likely we can do useful work
952 	 */
953 	if (migrate_ratelimited(numa_node_id()))
954 		return;
955 
956 	start = mm->numa_scan_offset;
957 	pages = sysctl_numa_balancing_scan_size;
958 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
959 	if (!pages)
960 		return;
961 
962 	down_read(&mm->mmap_sem);
963 	vma = find_vma(mm, start);
964 	if (!vma) {
965 		reset_ptenuma_scan(p);
966 		start = 0;
967 		vma = mm->mmap;
968 	}
969 	for (; vma; vma = vma->vm_next) {
970 		if (!vma_migratable(vma))
971 			continue;
972 
973 		/* Skip small VMAs. They are not likely to be of relevance */
974 		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
975 			continue;
976 
977 		do {
978 			start = max(start, vma->vm_start);
979 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 			end = min(end, vma->vm_end);
981 			pages -= change_prot_numa(vma, start, end);
982 
983 			start = end;
984 			if (pages <= 0)
985 				goto out;
986 		} while (end != vma->vm_end);
987 	}
988 
989 out:
990 	/*
991 	 * It is possible to reach the end of the VMA list but the last few VMAs are
992 	 * not guaranteed to the vma_migratable. If they are not, we would find the
993 	 * !migratable VMA on the next scan but not reset the scanner to the start
994 	 * so check it now.
995 	 */
996 	if (vma)
997 		mm->numa_scan_offset = start;
998 	else
999 		reset_ptenuma_scan(p);
1000 	up_read(&mm->mmap_sem);
1001 }
1002 
1003 /*
1004  * Drive the periodic memory faults..
1005  */
1006 void task_tick_numa(struct rq *rq, struct task_struct *curr)
1007 {
1008 	struct callback_head *work = &curr->numa_work;
1009 	u64 period, now;
1010 
1011 	/*
1012 	 * We don't care about NUMA placement if we don't have memory.
1013 	 */
1014 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1015 		return;
1016 
1017 	/*
1018 	 * Using runtime rather than walltime has the dual advantage that
1019 	 * we (mostly) drive the selection from busy threads and that the
1020 	 * task needs to have done some actual work before we bother with
1021 	 * NUMA placement.
1022 	 */
1023 	now = curr->se.sum_exec_runtime;
1024 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1025 
1026 	if (now - curr->node_stamp > period) {
1027 		if (!curr->node_stamp)
1028 			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1029 		curr->node_stamp = now;
1030 
1031 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1033 			task_work_add(curr, work, true);
1034 		}
1035 	}
1036 }
1037 #else
1038 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039 {
1040 }
1041 #endif /* CONFIG_NUMA_BALANCING */
1042 
1043 static void
1044 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1045 {
1046 	update_load_add(&cfs_rq->load, se->load.weight);
1047 	if (!parent_entity(se))
1048 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049 #ifdef CONFIG_SMP
1050 	if (entity_is_task(se))
1051 		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1052 #endif
1053 	cfs_rq->nr_running++;
1054 }
1055 
1056 static void
1057 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1058 {
1059 	update_load_sub(&cfs_rq->load, se->load.weight);
1060 	if (!parent_entity(se))
1061 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 	if (entity_is_task(se))
1063 		list_del_init(&se->group_node);
1064 	cfs_rq->nr_running--;
1065 }
1066 
1067 #ifdef CONFIG_FAIR_GROUP_SCHED
1068 # ifdef CONFIG_SMP
1069 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1070 {
1071 	long tg_weight;
1072 
1073 	/*
1074 	 * Use this CPU's actual weight instead of the last load_contribution
1075 	 * to gain a more accurate current total weight. See
1076 	 * update_cfs_rq_load_contribution().
1077 	 */
1078 	tg_weight = atomic_long_read(&tg->load_avg);
1079 	tg_weight -= cfs_rq->tg_load_contrib;
1080 	tg_weight += cfs_rq->load.weight;
1081 
1082 	return tg_weight;
1083 }
1084 
1085 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1086 {
1087 	long tg_weight, load, shares;
1088 
1089 	tg_weight = calc_tg_weight(tg, cfs_rq);
1090 	load = cfs_rq->load.weight;
1091 
1092 	shares = (tg->shares * load);
1093 	if (tg_weight)
1094 		shares /= tg_weight;
1095 
1096 	if (shares < MIN_SHARES)
1097 		shares = MIN_SHARES;
1098 	if (shares > tg->shares)
1099 		shares = tg->shares;
1100 
1101 	return shares;
1102 }
1103 # else /* CONFIG_SMP */
1104 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1105 {
1106 	return tg->shares;
1107 }
1108 # endif /* CONFIG_SMP */
1109 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1110 			    unsigned long weight)
1111 {
1112 	if (se->on_rq) {
1113 		/* commit outstanding execution time */
1114 		if (cfs_rq->curr == se)
1115 			update_curr(cfs_rq);
1116 		account_entity_dequeue(cfs_rq, se);
1117 	}
1118 
1119 	update_load_set(&se->load, weight);
1120 
1121 	if (se->on_rq)
1122 		account_entity_enqueue(cfs_rq, se);
1123 }
1124 
1125 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1126 
1127 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1128 {
1129 	struct task_group *tg;
1130 	struct sched_entity *se;
1131 	long shares;
1132 
1133 	tg = cfs_rq->tg;
1134 	se = tg->se[cpu_of(rq_of(cfs_rq))];
1135 	if (!se || throttled_hierarchy(cfs_rq))
1136 		return;
1137 #ifndef CONFIG_SMP
1138 	if (likely(se->load.weight == tg->shares))
1139 		return;
1140 #endif
1141 	shares = calc_cfs_shares(cfs_rq, tg);
1142 
1143 	reweight_entity(cfs_rq_of(se), se, shares);
1144 }
1145 #else /* CONFIG_FAIR_GROUP_SCHED */
1146 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1147 {
1148 }
1149 #endif /* CONFIG_FAIR_GROUP_SCHED */
1150 
1151 #ifdef CONFIG_SMP
1152 /*
1153  * We choose a half-life close to 1 scheduling period.
1154  * Note: The tables below are dependent on this value.
1155  */
1156 #define LOAD_AVG_PERIOD 32
1157 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1158 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1159 
1160 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1161 static const u32 runnable_avg_yN_inv[] = {
1162 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1163 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1164 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1165 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1166 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1167 	0x85aac367, 0x82cd8698,
1168 };
1169 
1170 /*
1171  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1172  * over-estimates when re-combining.
1173  */
1174 static const u32 runnable_avg_yN_sum[] = {
1175 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1176 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1177 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1178 };
1179 
1180 /*
1181  * Approximate:
1182  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1183  */
1184 static __always_inline u64 decay_load(u64 val, u64 n)
1185 {
1186 	unsigned int local_n;
1187 
1188 	if (!n)
1189 		return val;
1190 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1191 		return 0;
1192 
1193 	/* after bounds checking we can collapse to 32-bit */
1194 	local_n = n;
1195 
1196 	/*
1197 	 * As y^PERIOD = 1/2, we can combine
1198 	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1199 	 * With a look-up table which covers k^n (n<PERIOD)
1200 	 *
1201 	 * To achieve constant time decay_load.
1202 	 */
1203 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1204 		val >>= local_n / LOAD_AVG_PERIOD;
1205 		local_n %= LOAD_AVG_PERIOD;
1206 	}
1207 
1208 	val *= runnable_avg_yN_inv[local_n];
1209 	/* We don't use SRR here since we always want to round down. */
1210 	return val >> 32;
1211 }
1212 
1213 /*
1214  * For updates fully spanning n periods, the contribution to runnable
1215  * average will be: \Sum 1024*y^n
1216  *
1217  * We can compute this reasonably efficiently by combining:
1218  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1219  */
1220 static u32 __compute_runnable_contrib(u64 n)
1221 {
1222 	u32 contrib = 0;
1223 
1224 	if (likely(n <= LOAD_AVG_PERIOD))
1225 		return runnable_avg_yN_sum[n];
1226 	else if (unlikely(n >= LOAD_AVG_MAX_N))
1227 		return LOAD_AVG_MAX;
1228 
1229 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1230 	do {
1231 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1232 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1233 
1234 		n -= LOAD_AVG_PERIOD;
1235 	} while (n > LOAD_AVG_PERIOD);
1236 
1237 	contrib = decay_load(contrib, n);
1238 	return contrib + runnable_avg_yN_sum[n];
1239 }
1240 
1241 /*
1242  * We can represent the historical contribution to runnable average as the
1243  * coefficients of a geometric series.  To do this we sub-divide our runnable
1244  * history into segments of approximately 1ms (1024us); label the segment that
1245  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1246  *
1247  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1248  *      p0            p1           p2
1249  *     (now)       (~1ms ago)  (~2ms ago)
1250  *
1251  * Let u_i denote the fraction of p_i that the entity was runnable.
1252  *
1253  * We then designate the fractions u_i as our co-efficients, yielding the
1254  * following representation of historical load:
1255  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1256  *
1257  * We choose y based on the with of a reasonably scheduling period, fixing:
1258  *   y^32 = 0.5
1259  *
1260  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1261  * approximately half as much as the contribution to load within the last ms
1262  * (u_0).
1263  *
1264  * When a period "rolls over" and we have new u_0`, multiplying the previous
1265  * sum again by y is sufficient to update:
1266  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1267  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1268  */
1269 static __always_inline int __update_entity_runnable_avg(u64 now,
1270 							struct sched_avg *sa,
1271 							int runnable)
1272 {
1273 	u64 delta, periods;
1274 	u32 runnable_contrib;
1275 	int delta_w, decayed = 0;
1276 
1277 	delta = now - sa->last_runnable_update;
1278 	/*
1279 	 * This should only happen when time goes backwards, which it
1280 	 * unfortunately does during sched clock init when we swap over to TSC.
1281 	 */
1282 	if ((s64)delta < 0) {
1283 		sa->last_runnable_update = now;
1284 		return 0;
1285 	}
1286 
1287 	/*
1288 	 * Use 1024ns as the unit of measurement since it's a reasonable
1289 	 * approximation of 1us and fast to compute.
1290 	 */
1291 	delta >>= 10;
1292 	if (!delta)
1293 		return 0;
1294 	sa->last_runnable_update = now;
1295 
1296 	/* delta_w is the amount already accumulated against our next period */
1297 	delta_w = sa->runnable_avg_period % 1024;
1298 	if (delta + delta_w >= 1024) {
1299 		/* period roll-over */
1300 		decayed = 1;
1301 
1302 		/*
1303 		 * Now that we know we're crossing a period boundary, figure
1304 		 * out how much from delta we need to complete the current
1305 		 * period and accrue it.
1306 		 */
1307 		delta_w = 1024 - delta_w;
1308 		if (runnable)
1309 			sa->runnable_avg_sum += delta_w;
1310 		sa->runnable_avg_period += delta_w;
1311 
1312 		delta -= delta_w;
1313 
1314 		/* Figure out how many additional periods this update spans */
1315 		periods = delta / 1024;
1316 		delta %= 1024;
1317 
1318 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1319 						  periods + 1);
1320 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1321 						     periods + 1);
1322 
1323 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
1324 		runnable_contrib = __compute_runnable_contrib(periods);
1325 		if (runnable)
1326 			sa->runnable_avg_sum += runnable_contrib;
1327 		sa->runnable_avg_period += runnable_contrib;
1328 	}
1329 
1330 	/* Remainder of delta accrued against u_0` */
1331 	if (runnable)
1332 		sa->runnable_avg_sum += delta;
1333 	sa->runnable_avg_period += delta;
1334 
1335 	return decayed;
1336 }
1337 
1338 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1339 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1340 {
1341 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1342 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
1343 
1344 	decays -= se->avg.decay_count;
1345 	if (!decays)
1346 		return 0;
1347 
1348 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1349 	se->avg.decay_count = 0;
1350 
1351 	return decays;
1352 }
1353 
1354 #ifdef CONFIG_FAIR_GROUP_SCHED
1355 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1356 						 int force_update)
1357 {
1358 	struct task_group *tg = cfs_rq->tg;
1359 	long tg_contrib;
1360 
1361 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1362 	tg_contrib -= cfs_rq->tg_load_contrib;
1363 
1364 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1365 		atomic_long_add(tg_contrib, &tg->load_avg);
1366 		cfs_rq->tg_load_contrib += tg_contrib;
1367 	}
1368 }
1369 
1370 /*
1371  * Aggregate cfs_rq runnable averages into an equivalent task_group
1372  * representation for computing load contributions.
1373  */
1374 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1375 						  struct cfs_rq *cfs_rq)
1376 {
1377 	struct task_group *tg = cfs_rq->tg;
1378 	long contrib;
1379 
1380 	/* The fraction of a cpu used by this cfs_rq */
1381 	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1382 			  sa->runnable_avg_period + 1);
1383 	contrib -= cfs_rq->tg_runnable_contrib;
1384 
1385 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1386 		atomic_add(contrib, &tg->runnable_avg);
1387 		cfs_rq->tg_runnable_contrib += contrib;
1388 	}
1389 }
1390 
1391 static inline void __update_group_entity_contrib(struct sched_entity *se)
1392 {
1393 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
1394 	struct task_group *tg = cfs_rq->tg;
1395 	int runnable_avg;
1396 
1397 	u64 contrib;
1398 
1399 	contrib = cfs_rq->tg_load_contrib * tg->shares;
1400 	se->avg.load_avg_contrib = div_u64(contrib,
1401 				     atomic_long_read(&tg->load_avg) + 1);
1402 
1403 	/*
1404 	 * For group entities we need to compute a correction term in the case
1405 	 * that they are consuming <1 cpu so that we would contribute the same
1406 	 * load as a task of equal weight.
1407 	 *
1408 	 * Explicitly co-ordinating this measurement would be expensive, but
1409 	 * fortunately the sum of each cpus contribution forms a usable
1410 	 * lower-bound on the true value.
1411 	 *
1412 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
1413 	 * (and the sum represents true value) or they are disjoint and we are
1414 	 * understating by the aggregate of their overlap.
1415 	 *
1416 	 * Extending this to N cpus, for a given overlap, the maximum amount we
1417 	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1418 	 * cpus that overlap for this interval and w_i is the interval width.
1419 	 *
1420 	 * On a small machine; the first term is well-bounded which bounds the
1421 	 * total error since w_i is a subset of the period.  Whereas on a
1422 	 * larger machine, while this first term can be larger, if w_i is the
1423 	 * of consequential size guaranteed to see n_i*w_i quickly converge to
1424 	 * our upper bound of 1-cpu.
1425 	 */
1426 	runnable_avg = atomic_read(&tg->runnable_avg);
1427 	if (runnable_avg < NICE_0_LOAD) {
1428 		se->avg.load_avg_contrib *= runnable_avg;
1429 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1430 	}
1431 }
1432 #else
1433 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1434 						 int force_update) {}
1435 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1436 						  struct cfs_rq *cfs_rq) {}
1437 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1438 #endif
1439 
1440 static inline void __update_task_entity_contrib(struct sched_entity *se)
1441 {
1442 	u32 contrib;
1443 
1444 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1445 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1446 	contrib /= (se->avg.runnable_avg_period + 1);
1447 	se->avg.load_avg_contrib = scale_load(contrib);
1448 }
1449 
1450 /* Compute the current contribution to load_avg by se, return any delta */
1451 static long __update_entity_load_avg_contrib(struct sched_entity *se)
1452 {
1453 	long old_contrib = se->avg.load_avg_contrib;
1454 
1455 	if (entity_is_task(se)) {
1456 		__update_task_entity_contrib(se);
1457 	} else {
1458 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1459 		__update_group_entity_contrib(se);
1460 	}
1461 
1462 	return se->avg.load_avg_contrib - old_contrib;
1463 }
1464 
1465 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1466 						 long load_contrib)
1467 {
1468 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
1469 		cfs_rq->blocked_load_avg -= load_contrib;
1470 	else
1471 		cfs_rq->blocked_load_avg = 0;
1472 }
1473 
1474 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1475 
1476 /* Update a sched_entity's runnable average */
1477 static inline void update_entity_load_avg(struct sched_entity *se,
1478 					  int update_cfs_rq)
1479 {
1480 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1481 	long contrib_delta;
1482 	u64 now;
1483 
1484 	/*
1485 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
1486 	 * case they are the parent of a throttled hierarchy.
1487 	 */
1488 	if (entity_is_task(se))
1489 		now = cfs_rq_clock_task(cfs_rq);
1490 	else
1491 		now = cfs_rq_clock_task(group_cfs_rq(se));
1492 
1493 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1494 		return;
1495 
1496 	contrib_delta = __update_entity_load_avg_contrib(se);
1497 
1498 	if (!update_cfs_rq)
1499 		return;
1500 
1501 	if (se->on_rq)
1502 		cfs_rq->runnable_load_avg += contrib_delta;
1503 	else
1504 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1505 }
1506 
1507 /*
1508  * Decay the load contributed by all blocked children and account this so that
1509  * their contribution may appropriately discounted when they wake up.
1510  */
1511 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1512 {
1513 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1514 	u64 decays;
1515 
1516 	decays = now - cfs_rq->last_decay;
1517 	if (!decays && !force_update)
1518 		return;
1519 
1520 	if (atomic_long_read(&cfs_rq->removed_load)) {
1521 		unsigned long removed_load;
1522 		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
1523 		subtract_blocked_load_contrib(cfs_rq, removed_load);
1524 	}
1525 
1526 	if (decays) {
1527 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1528 						      decays);
1529 		atomic64_add(decays, &cfs_rq->decay_counter);
1530 		cfs_rq->last_decay = now;
1531 	}
1532 
1533 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1534 }
1535 
1536 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1537 {
1538 	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
1539 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
1540 }
1541 
1542 /* Add the load generated by se into cfs_rq's child load-average */
1543 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1544 						  struct sched_entity *se,
1545 						  int wakeup)
1546 {
1547 	/*
1548 	 * We track migrations using entity decay_count <= 0, on a wake-up
1549 	 * migration we use a negative decay count to track the remote decays
1550 	 * accumulated while sleeping.
1551 	 *
1552 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
1553 	 * are seen by enqueue_entity_load_avg() as a migration with an already
1554 	 * constructed load_avg_contrib.
1555 	 */
1556 	if (unlikely(se->avg.decay_count <= 0)) {
1557 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
1558 		if (se->avg.decay_count) {
1559 			/*
1560 			 * In a wake-up migration we have to approximate the
1561 			 * time sleeping.  This is because we can't synchronize
1562 			 * clock_task between the two cpus, and it is not
1563 			 * guaranteed to be read-safe.  Instead, we can
1564 			 * approximate this using our carried decays, which are
1565 			 * explicitly atomically readable.
1566 			 */
1567 			se->avg.last_runnable_update -= (-se->avg.decay_count)
1568 							<< 20;
1569 			update_entity_load_avg(se, 0);
1570 			/* Indicate that we're now synchronized and on-rq */
1571 			se->avg.decay_count = 0;
1572 		}
1573 		wakeup = 0;
1574 	} else {
1575 		/*
1576 		 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1577 		 * would have made count negative); we must be careful to avoid
1578 		 * double-accounting blocked time after synchronizing decays.
1579 		 */
1580 		se->avg.last_runnable_update += __synchronize_entity_decay(se)
1581 							<< 20;
1582 	}
1583 
1584 	/* migrated tasks did not contribute to our blocked load */
1585 	if (wakeup) {
1586 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1587 		update_entity_load_avg(se, 0);
1588 	}
1589 
1590 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1591 	/* we force update consideration on load-balancer moves */
1592 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1593 }
1594 
1595 /*
1596  * Remove se's load from this cfs_rq child load-average, if the entity is
1597  * transitioning to a blocked state we track its projected decay using
1598  * blocked_load_avg.
1599  */
1600 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1601 						  struct sched_entity *se,
1602 						  int sleep)
1603 {
1604 	update_entity_load_avg(se, 1);
1605 	/* we force update consideration on load-balancer moves */
1606 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
1607 
1608 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1609 	if (sleep) {
1610 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1611 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1612 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
1613 }
1614 
1615 /*
1616  * Update the rq's load with the elapsed running time before entering
1617  * idle. if the last scheduled task is not a CFS task, idle_enter will
1618  * be the only way to update the runnable statistic.
1619  */
1620 void idle_enter_fair(struct rq *this_rq)
1621 {
1622 	update_rq_runnable_avg(this_rq, 1);
1623 }
1624 
1625 /*
1626  * Update the rq's load with the elapsed idle time before a task is
1627  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1628  * be the only way to update the runnable statistic.
1629  */
1630 void idle_exit_fair(struct rq *this_rq)
1631 {
1632 	update_rq_runnable_avg(this_rq, 0);
1633 }
1634 
1635 #else
1636 static inline void update_entity_load_avg(struct sched_entity *se,
1637 					  int update_cfs_rq) {}
1638 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1639 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1640 					   struct sched_entity *se,
1641 					   int wakeup) {}
1642 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1643 					   struct sched_entity *se,
1644 					   int sleep) {}
1645 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1646 					      int force_update) {}
1647 #endif
1648 
1649 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1650 {
1651 #ifdef CONFIG_SCHEDSTATS
1652 	struct task_struct *tsk = NULL;
1653 
1654 	if (entity_is_task(se))
1655 		tsk = task_of(se);
1656 
1657 	if (se->statistics.sleep_start) {
1658 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
1659 
1660 		if ((s64)delta < 0)
1661 			delta = 0;
1662 
1663 		if (unlikely(delta > se->statistics.sleep_max))
1664 			se->statistics.sleep_max = delta;
1665 
1666 		se->statistics.sleep_start = 0;
1667 		se->statistics.sum_sleep_runtime += delta;
1668 
1669 		if (tsk) {
1670 			account_scheduler_latency(tsk, delta >> 10, 1);
1671 			trace_sched_stat_sleep(tsk, delta);
1672 		}
1673 	}
1674 	if (se->statistics.block_start) {
1675 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
1676 
1677 		if ((s64)delta < 0)
1678 			delta = 0;
1679 
1680 		if (unlikely(delta > se->statistics.block_max))
1681 			se->statistics.block_max = delta;
1682 
1683 		se->statistics.block_start = 0;
1684 		se->statistics.sum_sleep_runtime += delta;
1685 
1686 		if (tsk) {
1687 			if (tsk->in_iowait) {
1688 				se->statistics.iowait_sum += delta;
1689 				se->statistics.iowait_count++;
1690 				trace_sched_stat_iowait(tsk, delta);
1691 			}
1692 
1693 			trace_sched_stat_blocked(tsk, delta);
1694 
1695 			/*
1696 			 * Blocking time is in units of nanosecs, so shift by
1697 			 * 20 to get a milliseconds-range estimation of the
1698 			 * amount of time that the task spent sleeping:
1699 			 */
1700 			if (unlikely(prof_on == SLEEP_PROFILING)) {
1701 				profile_hits(SLEEP_PROFILING,
1702 						(void *)get_wchan(tsk),
1703 						delta >> 20);
1704 			}
1705 			account_scheduler_latency(tsk, delta >> 10, 0);
1706 		}
1707 	}
1708 #endif
1709 }
1710 
1711 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1712 {
1713 #ifdef CONFIG_SCHED_DEBUG
1714 	s64 d = se->vruntime - cfs_rq->min_vruntime;
1715 
1716 	if (d < 0)
1717 		d = -d;
1718 
1719 	if (d > 3*sysctl_sched_latency)
1720 		schedstat_inc(cfs_rq, nr_spread_over);
1721 #endif
1722 }
1723 
1724 static void
1725 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1726 {
1727 	u64 vruntime = cfs_rq->min_vruntime;
1728 
1729 	/*
1730 	 * The 'current' period is already promised to the current tasks,
1731 	 * however the extra weight of the new task will slow them down a
1732 	 * little, place the new task so that it fits in the slot that
1733 	 * stays open at the end.
1734 	 */
1735 	if (initial && sched_feat(START_DEBIT))
1736 		vruntime += sched_vslice(cfs_rq, se);
1737 
1738 	/* sleeps up to a single latency don't count. */
1739 	if (!initial) {
1740 		unsigned long thresh = sysctl_sched_latency;
1741 
1742 		/*
1743 		 * Halve their sleep time's effect, to allow
1744 		 * for a gentler effect of sleepers:
1745 		 */
1746 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
1747 			thresh >>= 1;
1748 
1749 		vruntime -= thresh;
1750 	}
1751 
1752 	/* ensure we never gain time by being placed backwards. */
1753 	se->vruntime = max_vruntime(se->vruntime, vruntime);
1754 }
1755 
1756 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1757 
1758 static void
1759 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1760 {
1761 	/*
1762 	 * Update the normalized vruntime before updating min_vruntime
1763 	 * through calling update_curr().
1764 	 */
1765 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1766 		se->vruntime += cfs_rq->min_vruntime;
1767 
1768 	/*
1769 	 * Update run-time statistics of the 'current'.
1770 	 */
1771 	update_curr(cfs_rq);
1772 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1773 	account_entity_enqueue(cfs_rq, se);
1774 	update_cfs_shares(cfs_rq);
1775 
1776 	if (flags & ENQUEUE_WAKEUP) {
1777 		place_entity(cfs_rq, se, 0);
1778 		enqueue_sleeper(cfs_rq, se);
1779 	}
1780 
1781 	update_stats_enqueue(cfs_rq, se);
1782 	check_spread(cfs_rq, se);
1783 	if (se != cfs_rq->curr)
1784 		__enqueue_entity(cfs_rq, se);
1785 	se->on_rq = 1;
1786 
1787 	if (cfs_rq->nr_running == 1) {
1788 		list_add_leaf_cfs_rq(cfs_rq);
1789 		check_enqueue_throttle(cfs_rq);
1790 	}
1791 }
1792 
1793 static void __clear_buddies_last(struct sched_entity *se)
1794 {
1795 	for_each_sched_entity(se) {
1796 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1797 		if (cfs_rq->last == se)
1798 			cfs_rq->last = NULL;
1799 		else
1800 			break;
1801 	}
1802 }
1803 
1804 static void __clear_buddies_next(struct sched_entity *se)
1805 {
1806 	for_each_sched_entity(se) {
1807 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1808 		if (cfs_rq->next == se)
1809 			cfs_rq->next = NULL;
1810 		else
1811 			break;
1812 	}
1813 }
1814 
1815 static void __clear_buddies_skip(struct sched_entity *se)
1816 {
1817 	for_each_sched_entity(se) {
1818 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
1819 		if (cfs_rq->skip == se)
1820 			cfs_rq->skip = NULL;
1821 		else
1822 			break;
1823 	}
1824 }
1825 
1826 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1827 {
1828 	if (cfs_rq->last == se)
1829 		__clear_buddies_last(se);
1830 
1831 	if (cfs_rq->next == se)
1832 		__clear_buddies_next(se);
1833 
1834 	if (cfs_rq->skip == se)
1835 		__clear_buddies_skip(se);
1836 }
1837 
1838 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1839 
1840 static void
1841 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1842 {
1843 	/*
1844 	 * Update run-time statistics of the 'current'.
1845 	 */
1846 	update_curr(cfs_rq);
1847 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1848 
1849 	update_stats_dequeue(cfs_rq, se);
1850 	if (flags & DEQUEUE_SLEEP) {
1851 #ifdef CONFIG_SCHEDSTATS
1852 		if (entity_is_task(se)) {
1853 			struct task_struct *tsk = task_of(se);
1854 
1855 			if (tsk->state & TASK_INTERRUPTIBLE)
1856 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
1857 			if (tsk->state & TASK_UNINTERRUPTIBLE)
1858 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
1859 		}
1860 #endif
1861 	}
1862 
1863 	clear_buddies(cfs_rq, se);
1864 
1865 	if (se != cfs_rq->curr)
1866 		__dequeue_entity(cfs_rq, se);
1867 	se->on_rq = 0;
1868 	account_entity_dequeue(cfs_rq, se);
1869 
1870 	/*
1871 	 * Normalize the entity after updating the min_vruntime because the
1872 	 * update can refer to the ->curr item and we need to reflect this
1873 	 * movement in our normalized position.
1874 	 */
1875 	if (!(flags & DEQUEUE_SLEEP))
1876 		se->vruntime -= cfs_rq->min_vruntime;
1877 
1878 	/* return excess runtime on last dequeue */
1879 	return_cfs_rq_runtime(cfs_rq);
1880 
1881 	update_min_vruntime(cfs_rq);
1882 	update_cfs_shares(cfs_rq);
1883 }
1884 
1885 /*
1886  * Preempt the current task with a newly woken task if needed:
1887  */
1888 static void
1889 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1890 {
1891 	unsigned long ideal_runtime, delta_exec;
1892 	struct sched_entity *se;
1893 	s64 delta;
1894 
1895 	ideal_runtime = sched_slice(cfs_rq, curr);
1896 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1897 	if (delta_exec > ideal_runtime) {
1898 		resched_task(rq_of(cfs_rq)->curr);
1899 		/*
1900 		 * The current task ran long enough, ensure it doesn't get
1901 		 * re-elected due to buddy favours.
1902 		 */
1903 		clear_buddies(cfs_rq, curr);
1904 		return;
1905 	}
1906 
1907 	/*
1908 	 * Ensure that a task that missed wakeup preemption by a
1909 	 * narrow margin doesn't have to wait for a full slice.
1910 	 * This also mitigates buddy induced latencies under load.
1911 	 */
1912 	if (delta_exec < sysctl_sched_min_granularity)
1913 		return;
1914 
1915 	se = __pick_first_entity(cfs_rq);
1916 	delta = curr->vruntime - se->vruntime;
1917 
1918 	if (delta < 0)
1919 		return;
1920 
1921 	if (delta > ideal_runtime)
1922 		resched_task(rq_of(cfs_rq)->curr);
1923 }
1924 
1925 static void
1926 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1927 {
1928 	/* 'current' is not kept within the tree. */
1929 	if (se->on_rq) {
1930 		/*
1931 		 * Any task has to be enqueued before it get to execute on
1932 		 * a CPU. So account for the time it spent waiting on the
1933 		 * runqueue.
1934 		 */
1935 		update_stats_wait_end(cfs_rq, se);
1936 		__dequeue_entity(cfs_rq, se);
1937 	}
1938 
1939 	update_stats_curr_start(cfs_rq, se);
1940 	cfs_rq->curr = se;
1941 #ifdef CONFIG_SCHEDSTATS
1942 	/*
1943 	 * Track our maximum slice length, if the CPU's load is at
1944 	 * least twice that of our own weight (i.e. dont track it
1945 	 * when there are only lesser-weight tasks around):
1946 	 */
1947 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1948 		se->statistics.slice_max = max(se->statistics.slice_max,
1949 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
1950 	}
1951 #endif
1952 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
1953 }
1954 
1955 static int
1956 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1957 
1958 /*
1959  * Pick the next process, keeping these things in mind, in this order:
1960  * 1) keep things fair between processes/task groups
1961  * 2) pick the "next" process, since someone really wants that to run
1962  * 3) pick the "last" process, for cache locality
1963  * 4) do not run the "skip" process, if something else is available
1964  */
1965 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1966 {
1967 	struct sched_entity *se = __pick_first_entity(cfs_rq);
1968 	struct sched_entity *left = se;
1969 
1970 	/*
1971 	 * Avoid running the skip buddy, if running something else can
1972 	 * be done without getting too unfair.
1973 	 */
1974 	if (cfs_rq->skip == se) {
1975 		struct sched_entity *second = __pick_next_entity(se);
1976 		if (second && wakeup_preempt_entity(second, left) < 1)
1977 			se = second;
1978 	}
1979 
1980 	/*
1981 	 * Prefer last buddy, try to return the CPU to a preempted task.
1982 	 */
1983 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1984 		se = cfs_rq->last;
1985 
1986 	/*
1987 	 * Someone really wants this to run. If it's not unfair, run it.
1988 	 */
1989 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1990 		se = cfs_rq->next;
1991 
1992 	clear_buddies(cfs_rq, se);
1993 
1994 	return se;
1995 }
1996 
1997 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1998 
1999 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2000 {
2001 	/*
2002 	 * If still on the runqueue then deactivate_task()
2003 	 * was not called and update_curr() has to be done:
2004 	 */
2005 	if (prev->on_rq)
2006 		update_curr(cfs_rq);
2007 
2008 	/* throttle cfs_rqs exceeding runtime */
2009 	check_cfs_rq_runtime(cfs_rq);
2010 
2011 	check_spread(cfs_rq, prev);
2012 	if (prev->on_rq) {
2013 		update_stats_wait_start(cfs_rq, prev);
2014 		/* Put 'current' back into the tree. */
2015 		__enqueue_entity(cfs_rq, prev);
2016 		/* in !on_rq case, update occurred at dequeue */
2017 		update_entity_load_avg(prev, 1);
2018 	}
2019 	cfs_rq->curr = NULL;
2020 }
2021 
2022 static void
2023 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2024 {
2025 	/*
2026 	 * Update run-time statistics of the 'current'.
2027 	 */
2028 	update_curr(cfs_rq);
2029 
2030 	/*
2031 	 * Ensure that runnable average is periodically updated.
2032 	 */
2033 	update_entity_load_avg(curr, 1);
2034 	update_cfs_rq_blocked_load(cfs_rq, 1);
2035 	update_cfs_shares(cfs_rq);
2036 
2037 #ifdef CONFIG_SCHED_HRTICK
2038 	/*
2039 	 * queued ticks are scheduled to match the slice, so don't bother
2040 	 * validating it and just reschedule.
2041 	 */
2042 	if (queued) {
2043 		resched_task(rq_of(cfs_rq)->curr);
2044 		return;
2045 	}
2046 	/*
2047 	 * don't let the period tick interfere with the hrtick preemption
2048 	 */
2049 	if (!sched_feat(DOUBLE_TICK) &&
2050 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2051 		return;
2052 #endif
2053 
2054 	if (cfs_rq->nr_running > 1)
2055 		check_preempt_tick(cfs_rq, curr);
2056 }
2057 
2058 
2059 /**************************************************
2060  * CFS bandwidth control machinery
2061  */
2062 
2063 #ifdef CONFIG_CFS_BANDWIDTH
2064 
2065 #ifdef HAVE_JUMP_LABEL
2066 static struct static_key __cfs_bandwidth_used;
2067 
2068 static inline bool cfs_bandwidth_used(void)
2069 {
2070 	return static_key_false(&__cfs_bandwidth_used);
2071 }
2072 
2073 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2074 {
2075 	/* only need to count groups transitioning between enabled/!enabled */
2076 	if (enabled && !was_enabled)
2077 		static_key_slow_inc(&__cfs_bandwidth_used);
2078 	else if (!enabled && was_enabled)
2079 		static_key_slow_dec(&__cfs_bandwidth_used);
2080 }
2081 #else /* HAVE_JUMP_LABEL */
2082 static bool cfs_bandwidth_used(void)
2083 {
2084 	return true;
2085 }
2086 
2087 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2088 #endif /* HAVE_JUMP_LABEL */
2089 
2090 /*
2091  * default period for cfs group bandwidth.
2092  * default: 0.1s, units: nanoseconds
2093  */
2094 static inline u64 default_cfs_period(void)
2095 {
2096 	return 100000000ULL;
2097 }
2098 
2099 static inline u64 sched_cfs_bandwidth_slice(void)
2100 {
2101 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2102 }
2103 
2104 /*
2105  * Replenish runtime according to assigned quota and update expiration time.
2106  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2107  * additional synchronization around rq->lock.
2108  *
2109  * requires cfs_b->lock
2110  */
2111 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2112 {
2113 	u64 now;
2114 
2115 	if (cfs_b->quota == RUNTIME_INF)
2116 		return;
2117 
2118 	now = sched_clock_cpu(smp_processor_id());
2119 	cfs_b->runtime = cfs_b->quota;
2120 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2121 }
2122 
2123 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2124 {
2125 	return &tg->cfs_bandwidth;
2126 }
2127 
2128 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2129 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2130 {
2131 	if (unlikely(cfs_rq->throttle_count))
2132 		return cfs_rq->throttled_clock_task;
2133 
2134 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2135 }
2136 
2137 /* returns 0 on failure to allocate runtime */
2138 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139 {
2140 	struct task_group *tg = cfs_rq->tg;
2141 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2142 	u64 amount = 0, min_amount, expires;
2143 
2144 	/* note: this is a positive sum as runtime_remaining <= 0 */
2145 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2146 
2147 	raw_spin_lock(&cfs_b->lock);
2148 	if (cfs_b->quota == RUNTIME_INF)
2149 		amount = min_amount;
2150 	else {
2151 		/*
2152 		 * If the bandwidth pool has become inactive, then at least one
2153 		 * period must have elapsed since the last consumption.
2154 		 * Refresh the global state and ensure bandwidth timer becomes
2155 		 * active.
2156 		 */
2157 		if (!cfs_b->timer_active) {
2158 			__refill_cfs_bandwidth_runtime(cfs_b);
2159 			__start_cfs_bandwidth(cfs_b);
2160 		}
2161 
2162 		if (cfs_b->runtime > 0) {
2163 			amount = min(cfs_b->runtime, min_amount);
2164 			cfs_b->runtime -= amount;
2165 			cfs_b->idle = 0;
2166 		}
2167 	}
2168 	expires = cfs_b->runtime_expires;
2169 	raw_spin_unlock(&cfs_b->lock);
2170 
2171 	cfs_rq->runtime_remaining += amount;
2172 	/*
2173 	 * we may have advanced our local expiration to account for allowed
2174 	 * spread between our sched_clock and the one on which runtime was
2175 	 * issued.
2176 	 */
2177 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2178 		cfs_rq->runtime_expires = expires;
2179 
2180 	return cfs_rq->runtime_remaining > 0;
2181 }
2182 
2183 /*
2184  * Note: This depends on the synchronization provided by sched_clock and the
2185  * fact that rq->clock snapshots this value.
2186  */
2187 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2188 {
2189 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2190 
2191 	/* if the deadline is ahead of our clock, nothing to do */
2192 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2193 		return;
2194 
2195 	if (cfs_rq->runtime_remaining < 0)
2196 		return;
2197 
2198 	/*
2199 	 * If the local deadline has passed we have to consider the
2200 	 * possibility that our sched_clock is 'fast' and the global deadline
2201 	 * has not truly expired.
2202 	 *
2203 	 * Fortunately we can check determine whether this the case by checking
2204 	 * whether the global deadline has advanced.
2205 	 */
2206 
2207 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2208 		/* extend local deadline, drift is bounded above by 2 ticks */
2209 		cfs_rq->runtime_expires += TICK_NSEC;
2210 	} else {
2211 		/* global deadline is ahead, expiration has passed */
2212 		cfs_rq->runtime_remaining = 0;
2213 	}
2214 }
2215 
2216 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2217 				     unsigned long delta_exec)
2218 {
2219 	/* dock delta_exec before expiring quota (as it could span periods) */
2220 	cfs_rq->runtime_remaining -= delta_exec;
2221 	expire_cfs_rq_runtime(cfs_rq);
2222 
2223 	if (likely(cfs_rq->runtime_remaining > 0))
2224 		return;
2225 
2226 	/*
2227 	 * if we're unable to extend our runtime we resched so that the active
2228 	 * hierarchy can be throttled
2229 	 */
2230 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2231 		resched_task(rq_of(cfs_rq)->curr);
2232 }
2233 
2234 static __always_inline
2235 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2236 {
2237 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2238 		return;
2239 
2240 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
2241 }
2242 
2243 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2244 {
2245 	return cfs_bandwidth_used() && cfs_rq->throttled;
2246 }
2247 
2248 /* check whether cfs_rq, or any parent, is throttled */
2249 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2250 {
2251 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
2252 }
2253 
2254 /*
2255  * Ensure that neither of the group entities corresponding to src_cpu or
2256  * dest_cpu are members of a throttled hierarchy when performing group
2257  * load-balance operations.
2258  */
2259 static inline int throttled_lb_pair(struct task_group *tg,
2260 				    int src_cpu, int dest_cpu)
2261 {
2262 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2263 
2264 	src_cfs_rq = tg->cfs_rq[src_cpu];
2265 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
2266 
2267 	return throttled_hierarchy(src_cfs_rq) ||
2268 	       throttled_hierarchy(dest_cfs_rq);
2269 }
2270 
2271 /* updated child weight may affect parent so we have to do this bottom up */
2272 static int tg_unthrottle_up(struct task_group *tg, void *data)
2273 {
2274 	struct rq *rq = data;
2275 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2276 
2277 	cfs_rq->throttle_count--;
2278 #ifdef CONFIG_SMP
2279 	if (!cfs_rq->throttle_count) {
2280 		/* adjust cfs_rq_clock_task() */
2281 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
2282 					     cfs_rq->throttled_clock_task;
2283 	}
2284 #endif
2285 
2286 	return 0;
2287 }
2288 
2289 static int tg_throttle_down(struct task_group *tg, void *data)
2290 {
2291 	struct rq *rq = data;
2292 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2293 
2294 	/* group is entering throttled state, stop time */
2295 	if (!cfs_rq->throttle_count)
2296 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
2297 	cfs_rq->throttle_count++;
2298 
2299 	return 0;
2300 }
2301 
2302 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2303 {
2304 	struct rq *rq = rq_of(cfs_rq);
2305 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2306 	struct sched_entity *se;
2307 	long task_delta, dequeue = 1;
2308 
2309 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2310 
2311 	/* freeze hierarchy runnable averages while throttled */
2312 	rcu_read_lock();
2313 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2314 	rcu_read_unlock();
2315 
2316 	task_delta = cfs_rq->h_nr_running;
2317 	for_each_sched_entity(se) {
2318 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2319 		/* throttled entity or throttle-on-deactivate */
2320 		if (!se->on_rq)
2321 			break;
2322 
2323 		if (dequeue)
2324 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2325 		qcfs_rq->h_nr_running -= task_delta;
2326 
2327 		if (qcfs_rq->load.weight)
2328 			dequeue = 0;
2329 	}
2330 
2331 	if (!se)
2332 		rq->nr_running -= task_delta;
2333 
2334 	cfs_rq->throttled = 1;
2335 	cfs_rq->throttled_clock = rq_clock(rq);
2336 	raw_spin_lock(&cfs_b->lock);
2337 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2338 	raw_spin_unlock(&cfs_b->lock);
2339 }
2340 
2341 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2342 {
2343 	struct rq *rq = rq_of(cfs_rq);
2344 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2345 	struct sched_entity *se;
2346 	int enqueue = 1;
2347 	long task_delta;
2348 
2349 	se = cfs_rq->tg->se[cpu_of(rq)];
2350 
2351 	cfs_rq->throttled = 0;
2352 
2353 	update_rq_clock(rq);
2354 
2355 	raw_spin_lock(&cfs_b->lock);
2356 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
2357 	list_del_rcu(&cfs_rq->throttled_list);
2358 	raw_spin_unlock(&cfs_b->lock);
2359 
2360 	/* update hierarchical throttle state */
2361 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2362 
2363 	if (!cfs_rq->load.weight)
2364 		return;
2365 
2366 	task_delta = cfs_rq->h_nr_running;
2367 	for_each_sched_entity(se) {
2368 		if (se->on_rq)
2369 			enqueue = 0;
2370 
2371 		cfs_rq = cfs_rq_of(se);
2372 		if (enqueue)
2373 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2374 		cfs_rq->h_nr_running += task_delta;
2375 
2376 		if (cfs_rq_throttled(cfs_rq))
2377 			break;
2378 	}
2379 
2380 	if (!se)
2381 		rq->nr_running += task_delta;
2382 
2383 	/* determine whether we need to wake up potentially idle cpu */
2384 	if (rq->curr == rq->idle && rq->cfs.nr_running)
2385 		resched_task(rq->curr);
2386 }
2387 
2388 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2389 		u64 remaining, u64 expires)
2390 {
2391 	struct cfs_rq *cfs_rq;
2392 	u64 runtime = remaining;
2393 
2394 	rcu_read_lock();
2395 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2396 				throttled_list) {
2397 		struct rq *rq = rq_of(cfs_rq);
2398 
2399 		raw_spin_lock(&rq->lock);
2400 		if (!cfs_rq_throttled(cfs_rq))
2401 			goto next;
2402 
2403 		runtime = -cfs_rq->runtime_remaining + 1;
2404 		if (runtime > remaining)
2405 			runtime = remaining;
2406 		remaining -= runtime;
2407 
2408 		cfs_rq->runtime_remaining += runtime;
2409 		cfs_rq->runtime_expires = expires;
2410 
2411 		/* we check whether we're throttled above */
2412 		if (cfs_rq->runtime_remaining > 0)
2413 			unthrottle_cfs_rq(cfs_rq);
2414 
2415 next:
2416 		raw_spin_unlock(&rq->lock);
2417 
2418 		if (!remaining)
2419 			break;
2420 	}
2421 	rcu_read_unlock();
2422 
2423 	return remaining;
2424 }
2425 
2426 /*
2427  * Responsible for refilling a task_group's bandwidth and unthrottling its
2428  * cfs_rqs as appropriate. If there has been no activity within the last
2429  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2430  * used to track this state.
2431  */
2432 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2433 {
2434 	u64 runtime, runtime_expires;
2435 	int idle = 1, throttled;
2436 
2437 	raw_spin_lock(&cfs_b->lock);
2438 	/* no need to continue the timer with no bandwidth constraint */
2439 	if (cfs_b->quota == RUNTIME_INF)
2440 		goto out_unlock;
2441 
2442 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2443 	/* idle depends on !throttled (for the case of a large deficit) */
2444 	idle = cfs_b->idle && !throttled;
2445 	cfs_b->nr_periods += overrun;
2446 
2447 	/* if we're going inactive then everything else can be deferred */
2448 	if (idle)
2449 		goto out_unlock;
2450 
2451 	__refill_cfs_bandwidth_runtime(cfs_b);
2452 
2453 	if (!throttled) {
2454 		/* mark as potentially idle for the upcoming period */
2455 		cfs_b->idle = 1;
2456 		goto out_unlock;
2457 	}
2458 
2459 	/* account preceding periods in which throttling occurred */
2460 	cfs_b->nr_throttled += overrun;
2461 
2462 	/*
2463 	 * There are throttled entities so we must first use the new bandwidth
2464 	 * to unthrottle them before making it generally available.  This
2465 	 * ensures that all existing debts will be paid before a new cfs_rq is
2466 	 * allowed to run.
2467 	 */
2468 	runtime = cfs_b->runtime;
2469 	runtime_expires = cfs_b->runtime_expires;
2470 	cfs_b->runtime = 0;
2471 
2472 	/*
2473 	 * This check is repeated as we are holding onto the new bandwidth
2474 	 * while we unthrottle.  This can potentially race with an unthrottled
2475 	 * group trying to acquire new bandwidth from the global pool.
2476 	 */
2477 	while (throttled && runtime > 0) {
2478 		raw_spin_unlock(&cfs_b->lock);
2479 		/* we can't nest cfs_b->lock while distributing bandwidth */
2480 		runtime = distribute_cfs_runtime(cfs_b, runtime,
2481 						 runtime_expires);
2482 		raw_spin_lock(&cfs_b->lock);
2483 
2484 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2485 	}
2486 
2487 	/* return (any) remaining runtime */
2488 	cfs_b->runtime = runtime;
2489 	/*
2490 	 * While we are ensured activity in the period following an
2491 	 * unthrottle, this also covers the case in which the new bandwidth is
2492 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
2493 	 * timer to remain active while there are any throttled entities.)
2494 	 */
2495 	cfs_b->idle = 0;
2496 out_unlock:
2497 	if (idle)
2498 		cfs_b->timer_active = 0;
2499 	raw_spin_unlock(&cfs_b->lock);
2500 
2501 	return idle;
2502 }
2503 
2504 /* a cfs_rq won't donate quota below this amount */
2505 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2506 /* minimum remaining period time to redistribute slack quota */
2507 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508 /* how long we wait to gather additional slack before distributing */
2509 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 
2511 /* are we near the end of the current quota period? */
2512 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513 {
2514 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
2515 	u64 remaining;
2516 
2517 	/* if the call-back is running a quota refresh is already occurring */
2518 	if (hrtimer_callback_running(refresh_timer))
2519 		return 1;
2520 
2521 	/* is a quota refresh about to occur? */
2522 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2523 	if (remaining < min_expire)
2524 		return 1;
2525 
2526 	return 0;
2527 }
2528 
2529 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2530 {
2531 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2532 
2533 	/* if there's a quota refresh soon don't bother with slack */
2534 	if (runtime_refresh_within(cfs_b, min_left))
2535 		return;
2536 
2537 	start_bandwidth_timer(&cfs_b->slack_timer,
2538 				ns_to_ktime(cfs_bandwidth_slack_period));
2539 }
2540 
2541 /* we know any runtime found here is valid as update_curr() precedes return */
2542 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2543 {
2544 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2545 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2546 
2547 	if (slack_runtime <= 0)
2548 		return;
2549 
2550 	raw_spin_lock(&cfs_b->lock);
2551 	if (cfs_b->quota != RUNTIME_INF &&
2552 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2553 		cfs_b->runtime += slack_runtime;
2554 
2555 		/* we are under rq->lock, defer unthrottling using a timer */
2556 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2557 		    !list_empty(&cfs_b->throttled_cfs_rq))
2558 			start_cfs_slack_bandwidth(cfs_b);
2559 	}
2560 	raw_spin_unlock(&cfs_b->lock);
2561 
2562 	/* even if it's not valid for return we don't want to try again */
2563 	cfs_rq->runtime_remaining -= slack_runtime;
2564 }
2565 
2566 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2567 {
2568 	if (!cfs_bandwidth_used())
2569 		return;
2570 
2571 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2572 		return;
2573 
2574 	__return_cfs_rq_runtime(cfs_rq);
2575 }
2576 
2577 /*
2578  * This is done with a timer (instead of inline with bandwidth return) since
2579  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2580  */
2581 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2582 {
2583 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2584 	u64 expires;
2585 
2586 	/* confirm we're still not at a refresh boundary */
2587 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2588 		return;
2589 
2590 	raw_spin_lock(&cfs_b->lock);
2591 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 		runtime = cfs_b->runtime;
2593 		cfs_b->runtime = 0;
2594 	}
2595 	expires = cfs_b->runtime_expires;
2596 	raw_spin_unlock(&cfs_b->lock);
2597 
2598 	if (!runtime)
2599 		return;
2600 
2601 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2602 
2603 	raw_spin_lock(&cfs_b->lock);
2604 	if (expires == cfs_b->runtime_expires)
2605 		cfs_b->runtime = runtime;
2606 	raw_spin_unlock(&cfs_b->lock);
2607 }
2608 
2609 /*
2610  * When a group wakes up we want to make sure that its quota is not already
2611  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2612  * runtime as update_curr() throttling can not not trigger until it's on-rq.
2613  */
2614 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2615 {
2616 	if (!cfs_bandwidth_used())
2617 		return;
2618 
2619 	/* an active group must be handled by the update_curr()->put() path */
2620 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2621 		return;
2622 
2623 	/* ensure the group is not already throttled */
2624 	if (cfs_rq_throttled(cfs_rq))
2625 		return;
2626 
2627 	/* update runtime allocation */
2628 	account_cfs_rq_runtime(cfs_rq, 0);
2629 	if (cfs_rq->runtime_remaining <= 0)
2630 		throttle_cfs_rq(cfs_rq);
2631 }
2632 
2633 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2634 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2635 {
2636 	if (!cfs_bandwidth_used())
2637 		return;
2638 
2639 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2640 		return;
2641 
2642 	/*
2643 	 * it's possible for a throttled entity to be forced into a running
2644 	 * state (e.g. set_curr_task), in this case we're finished.
2645 	 */
2646 	if (cfs_rq_throttled(cfs_rq))
2647 		return;
2648 
2649 	throttle_cfs_rq(cfs_rq);
2650 }
2651 
2652 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2653 {
2654 	struct cfs_bandwidth *cfs_b =
2655 		container_of(timer, struct cfs_bandwidth, slack_timer);
2656 	do_sched_cfs_slack_timer(cfs_b);
2657 
2658 	return HRTIMER_NORESTART;
2659 }
2660 
2661 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2662 {
2663 	struct cfs_bandwidth *cfs_b =
2664 		container_of(timer, struct cfs_bandwidth, period_timer);
2665 	ktime_t now;
2666 	int overrun;
2667 	int idle = 0;
2668 
2669 	for (;;) {
2670 		now = hrtimer_cb_get_time(timer);
2671 		overrun = hrtimer_forward(timer, now, cfs_b->period);
2672 
2673 		if (!overrun)
2674 			break;
2675 
2676 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
2677 	}
2678 
2679 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2680 }
2681 
2682 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2683 {
2684 	raw_spin_lock_init(&cfs_b->lock);
2685 	cfs_b->runtime = 0;
2686 	cfs_b->quota = RUNTIME_INF;
2687 	cfs_b->period = ns_to_ktime(default_cfs_period());
2688 
2689 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2690 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2691 	cfs_b->period_timer.function = sched_cfs_period_timer;
2692 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2693 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
2694 }
2695 
2696 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2697 {
2698 	cfs_rq->runtime_enabled = 0;
2699 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
2700 }
2701 
2702 /* requires cfs_b->lock, may release to reprogram timer */
2703 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2704 {
2705 	/*
2706 	 * The timer may be active because we're trying to set a new bandwidth
2707 	 * period or because we're racing with the tear-down path
2708 	 * (timer_active==0 becomes visible before the hrtimer call-back
2709 	 * terminates).  In either case we ensure that it's re-programmed
2710 	 */
2711 	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2712 		raw_spin_unlock(&cfs_b->lock);
2713 		/* ensure cfs_b->lock is available while we wait */
2714 		hrtimer_cancel(&cfs_b->period_timer);
2715 
2716 		raw_spin_lock(&cfs_b->lock);
2717 		/* if someone else restarted the timer then we're done */
2718 		if (cfs_b->timer_active)
2719 			return;
2720 	}
2721 
2722 	cfs_b->timer_active = 1;
2723 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2724 }
2725 
2726 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2727 {
2728 	hrtimer_cancel(&cfs_b->period_timer);
2729 	hrtimer_cancel(&cfs_b->slack_timer);
2730 }
2731 
2732 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2733 {
2734 	struct cfs_rq *cfs_rq;
2735 
2736 	for_each_leaf_cfs_rq(rq, cfs_rq) {
2737 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2738 
2739 		if (!cfs_rq->runtime_enabled)
2740 			continue;
2741 
2742 		/*
2743 		 * clock_task is not advancing so we just need to make sure
2744 		 * there's some valid quota amount
2745 		 */
2746 		cfs_rq->runtime_remaining = cfs_b->quota;
2747 		if (cfs_rq_throttled(cfs_rq))
2748 			unthrottle_cfs_rq(cfs_rq);
2749 	}
2750 }
2751 
2752 #else /* CONFIG_CFS_BANDWIDTH */
2753 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2754 {
2755 	return rq_clock_task(rq_of(cfs_rq));
2756 }
2757 
2758 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2759 				     unsigned long delta_exec) {}
2760 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2761 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2762 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2763 
2764 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2765 {
2766 	return 0;
2767 }
2768 
2769 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2770 {
2771 	return 0;
2772 }
2773 
2774 static inline int throttled_lb_pair(struct task_group *tg,
2775 				    int src_cpu, int dest_cpu)
2776 {
2777 	return 0;
2778 }
2779 
2780 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2781 
2782 #ifdef CONFIG_FAIR_GROUP_SCHED
2783 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2784 #endif
2785 
2786 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2787 {
2788 	return NULL;
2789 }
2790 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2791 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2792 
2793 #endif /* CONFIG_CFS_BANDWIDTH */
2794 
2795 /**************************************************
2796  * CFS operations on tasks:
2797  */
2798 
2799 #ifdef CONFIG_SCHED_HRTICK
2800 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2801 {
2802 	struct sched_entity *se = &p->se;
2803 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2804 
2805 	WARN_ON(task_rq(p) != rq);
2806 
2807 	if (cfs_rq->nr_running > 1) {
2808 		u64 slice = sched_slice(cfs_rq, se);
2809 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2810 		s64 delta = slice - ran;
2811 
2812 		if (delta < 0) {
2813 			if (rq->curr == p)
2814 				resched_task(p);
2815 			return;
2816 		}
2817 
2818 		/*
2819 		 * Don't schedule slices shorter than 10000ns, that just
2820 		 * doesn't make sense. Rely on vruntime for fairness.
2821 		 */
2822 		if (rq->curr != p)
2823 			delta = max_t(s64, 10000LL, delta);
2824 
2825 		hrtick_start(rq, delta);
2826 	}
2827 }
2828 
2829 /*
2830  * called from enqueue/dequeue and updates the hrtick when the
2831  * current task is from our class and nr_running is low enough
2832  * to matter.
2833  */
2834 static void hrtick_update(struct rq *rq)
2835 {
2836 	struct task_struct *curr = rq->curr;
2837 
2838 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2839 		return;
2840 
2841 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2842 		hrtick_start_fair(rq, curr);
2843 }
2844 #else /* !CONFIG_SCHED_HRTICK */
2845 static inline void
2846 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2847 {
2848 }
2849 
2850 static inline void hrtick_update(struct rq *rq)
2851 {
2852 }
2853 #endif
2854 
2855 /*
2856  * The enqueue_task method is called before nr_running is
2857  * increased. Here we update the fair scheduling stats and
2858  * then put the task into the rbtree:
2859  */
2860 static void
2861 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2862 {
2863 	struct cfs_rq *cfs_rq;
2864 	struct sched_entity *se = &p->se;
2865 
2866 	for_each_sched_entity(se) {
2867 		if (se->on_rq)
2868 			break;
2869 		cfs_rq = cfs_rq_of(se);
2870 		enqueue_entity(cfs_rq, se, flags);
2871 
2872 		/*
2873 		 * end evaluation on encountering a throttled cfs_rq
2874 		 *
2875 		 * note: in the case of encountering a throttled cfs_rq we will
2876 		 * post the final h_nr_running increment below.
2877 		*/
2878 		if (cfs_rq_throttled(cfs_rq))
2879 			break;
2880 		cfs_rq->h_nr_running++;
2881 
2882 		flags = ENQUEUE_WAKEUP;
2883 	}
2884 
2885 	for_each_sched_entity(se) {
2886 		cfs_rq = cfs_rq_of(se);
2887 		cfs_rq->h_nr_running++;
2888 
2889 		if (cfs_rq_throttled(cfs_rq))
2890 			break;
2891 
2892 		update_cfs_shares(cfs_rq);
2893 		update_entity_load_avg(se, 1);
2894 	}
2895 
2896 	if (!se) {
2897 		update_rq_runnable_avg(rq, rq->nr_running);
2898 		inc_nr_running(rq);
2899 	}
2900 	hrtick_update(rq);
2901 }
2902 
2903 static void set_next_buddy(struct sched_entity *se);
2904 
2905 /*
2906  * The dequeue_task method is called before nr_running is
2907  * decreased. We remove the task from the rbtree and
2908  * update the fair scheduling stats:
2909  */
2910 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2911 {
2912 	struct cfs_rq *cfs_rq;
2913 	struct sched_entity *se = &p->se;
2914 	int task_sleep = flags & DEQUEUE_SLEEP;
2915 
2916 	for_each_sched_entity(se) {
2917 		cfs_rq = cfs_rq_of(se);
2918 		dequeue_entity(cfs_rq, se, flags);
2919 
2920 		/*
2921 		 * end evaluation on encountering a throttled cfs_rq
2922 		 *
2923 		 * note: in the case of encountering a throttled cfs_rq we will
2924 		 * post the final h_nr_running decrement below.
2925 		*/
2926 		if (cfs_rq_throttled(cfs_rq))
2927 			break;
2928 		cfs_rq->h_nr_running--;
2929 
2930 		/* Don't dequeue parent if it has other entities besides us */
2931 		if (cfs_rq->load.weight) {
2932 			/*
2933 			 * Bias pick_next to pick a task from this cfs_rq, as
2934 			 * p is sleeping when it is within its sched_slice.
2935 			 */
2936 			if (task_sleep && parent_entity(se))
2937 				set_next_buddy(parent_entity(se));
2938 
2939 			/* avoid re-evaluating load for this entity */
2940 			se = parent_entity(se);
2941 			break;
2942 		}
2943 		flags |= DEQUEUE_SLEEP;
2944 	}
2945 
2946 	for_each_sched_entity(se) {
2947 		cfs_rq = cfs_rq_of(se);
2948 		cfs_rq->h_nr_running--;
2949 
2950 		if (cfs_rq_throttled(cfs_rq))
2951 			break;
2952 
2953 		update_cfs_shares(cfs_rq);
2954 		update_entity_load_avg(se, 1);
2955 	}
2956 
2957 	if (!se) {
2958 		dec_nr_running(rq);
2959 		update_rq_runnable_avg(rq, 1);
2960 	}
2961 	hrtick_update(rq);
2962 }
2963 
2964 #ifdef CONFIG_SMP
2965 /* Used instead of source_load when we know the type == 0 */
2966 static unsigned long weighted_cpuload(const int cpu)
2967 {
2968 	return cpu_rq(cpu)->cfs.runnable_load_avg;
2969 }
2970 
2971 /*
2972  * Return a low guess at the load of a migration-source cpu weighted
2973  * according to the scheduling class and "nice" value.
2974  *
2975  * We want to under-estimate the load of migration sources, to
2976  * balance conservatively.
2977  */
2978 static unsigned long source_load(int cpu, int type)
2979 {
2980 	struct rq *rq = cpu_rq(cpu);
2981 	unsigned long total = weighted_cpuload(cpu);
2982 
2983 	if (type == 0 || !sched_feat(LB_BIAS))
2984 		return total;
2985 
2986 	return min(rq->cpu_load[type-1], total);
2987 }
2988 
2989 /*
2990  * Return a high guess at the load of a migration-target cpu weighted
2991  * according to the scheduling class and "nice" value.
2992  */
2993 static unsigned long target_load(int cpu, int type)
2994 {
2995 	struct rq *rq = cpu_rq(cpu);
2996 	unsigned long total = weighted_cpuload(cpu);
2997 
2998 	if (type == 0 || !sched_feat(LB_BIAS))
2999 		return total;
3000 
3001 	return max(rq->cpu_load[type-1], total);
3002 }
3003 
3004 static unsigned long power_of(int cpu)
3005 {
3006 	return cpu_rq(cpu)->cpu_power;
3007 }
3008 
3009 static unsigned long cpu_avg_load_per_task(int cpu)
3010 {
3011 	struct rq *rq = cpu_rq(cpu);
3012 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3013 	unsigned long load_avg = rq->cfs.runnable_load_avg;
3014 
3015 	if (nr_running)
3016 		return load_avg / nr_running;
3017 
3018 	return 0;
3019 }
3020 
3021 static void record_wakee(struct task_struct *p)
3022 {
3023 	/*
3024 	 * Rough decay (wiping) for cost saving, don't worry
3025 	 * about the boundary, really active task won't care
3026 	 * about the loss.
3027 	 */
3028 	if (jiffies > current->wakee_flip_decay_ts + HZ) {
3029 		current->wakee_flips = 0;
3030 		current->wakee_flip_decay_ts = jiffies;
3031 	}
3032 
3033 	if (current->last_wakee != p) {
3034 		current->last_wakee = p;
3035 		current->wakee_flips++;
3036 	}
3037 }
3038 
3039 static void task_waking_fair(struct task_struct *p)
3040 {
3041 	struct sched_entity *se = &p->se;
3042 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3043 	u64 min_vruntime;
3044 
3045 #ifndef CONFIG_64BIT
3046 	u64 min_vruntime_copy;
3047 
3048 	do {
3049 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
3050 		smp_rmb();
3051 		min_vruntime = cfs_rq->min_vruntime;
3052 	} while (min_vruntime != min_vruntime_copy);
3053 #else
3054 	min_vruntime = cfs_rq->min_vruntime;
3055 #endif
3056 
3057 	se->vruntime -= min_vruntime;
3058 	record_wakee(p);
3059 }
3060 
3061 #ifdef CONFIG_FAIR_GROUP_SCHED
3062 /*
3063  * effective_load() calculates the load change as seen from the root_task_group
3064  *
3065  * Adding load to a group doesn't make a group heavier, but can cause movement
3066  * of group shares between cpus. Assuming the shares were perfectly aligned one
3067  * can calculate the shift in shares.
3068  *
3069  * Calculate the effective load difference if @wl is added (subtracted) to @tg
3070  * on this @cpu and results in a total addition (subtraction) of @wg to the
3071  * total group weight.
3072  *
3073  * Given a runqueue weight distribution (rw_i) we can compute a shares
3074  * distribution (s_i) using:
3075  *
3076  *   s_i = rw_i / \Sum rw_j						(1)
3077  *
3078  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3079  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3080  * shares distribution (s_i):
3081  *
3082  *   rw_i = {   2,   4,   1,   0 }
3083  *   s_i  = { 2/7, 4/7, 1/7,   0 }
3084  *
3085  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3086  * task used to run on and the CPU the waker is running on), we need to
3087  * compute the effect of waking a task on either CPU and, in case of a sync
3088  * wakeup, compute the effect of the current task going to sleep.
3089  *
3090  * So for a change of @wl to the local @cpu with an overall group weight change
3091  * of @wl we can compute the new shares distribution (s'_i) using:
3092  *
3093  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
3094  *
3095  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3096  * differences in waking a task to CPU 0. The additional task changes the
3097  * weight and shares distributions like:
3098  *
3099  *   rw'_i = {   3,   4,   1,   0 }
3100  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3101  *
3102  * We can then compute the difference in effective weight by using:
3103  *
3104  *   dw_i = S * (s'_i - s_i)						(3)
3105  *
3106  * Where 'S' is the group weight as seen by its parent.
3107  *
3108  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3109  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3110  * 4/7) times the weight of the group.
3111  */
3112 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113 {
3114 	struct sched_entity *se = tg->se[cpu];
3115 
3116 	if (!tg->parent)	/* the trivial, non-cgroup case */
3117 		return wl;
3118 
3119 	for_each_sched_entity(se) {
3120 		long w, W;
3121 
3122 		tg = se->my_q->tg;
3123 
3124 		/*
3125 		 * W = @wg + \Sum rw_j
3126 		 */
3127 		W = wg + calc_tg_weight(tg, se->my_q);
3128 
3129 		/*
3130 		 * w = rw_i + @wl
3131 		 */
3132 		w = se->my_q->load.weight + wl;
3133 
3134 		/*
3135 		 * wl = S * s'_i; see (2)
3136 		 */
3137 		if (W > 0 && w < W)
3138 			wl = (w * tg->shares) / W;
3139 		else
3140 			wl = tg->shares;
3141 
3142 		/*
3143 		 * Per the above, wl is the new se->load.weight value; since
3144 		 * those are clipped to [MIN_SHARES, ...) do so now. See
3145 		 * calc_cfs_shares().
3146 		 */
3147 		if (wl < MIN_SHARES)
3148 			wl = MIN_SHARES;
3149 
3150 		/*
3151 		 * wl = dw_i = S * (s'_i - s_i); see (3)
3152 		 */
3153 		wl -= se->load.weight;
3154 
3155 		/*
3156 		 * Recursively apply this logic to all parent groups to compute
3157 		 * the final effective load change on the root group. Since
3158 		 * only the @tg group gets extra weight, all parent groups can
3159 		 * only redistribute existing shares. @wl is the shift in shares
3160 		 * resulting from this level per the above.
3161 		 */
3162 		wg = 0;
3163 	}
3164 
3165 	return wl;
3166 }
3167 #else
3168 
3169 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3170 		unsigned long wl, unsigned long wg)
3171 {
3172 	return wl;
3173 }
3174 
3175 #endif
3176 
3177 static int wake_wide(struct task_struct *p)
3178 {
3179 	int factor = this_cpu_read(sd_llc_size);
3180 
3181 	/*
3182 	 * Yeah, it's the switching-frequency, could means many wakee or
3183 	 * rapidly switch, use factor here will just help to automatically
3184 	 * adjust the loose-degree, so bigger node will lead to more pull.
3185 	 */
3186 	if (p->wakee_flips > factor) {
3187 		/*
3188 		 * wakee is somewhat hot, it needs certain amount of cpu
3189 		 * resource, so if waker is far more hot, prefer to leave
3190 		 * it alone.
3191 		 */
3192 		if (current->wakee_flips > (factor * p->wakee_flips))
3193 			return 1;
3194 	}
3195 
3196 	return 0;
3197 }
3198 
3199 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3200 {
3201 	s64 this_load, load;
3202 	int idx, this_cpu, prev_cpu;
3203 	unsigned long tl_per_task;
3204 	struct task_group *tg;
3205 	unsigned long weight;
3206 	int balanced;
3207 
3208 	/*
3209 	 * If we wake multiple tasks be careful to not bounce
3210 	 * ourselves around too much.
3211 	 */
3212 	if (wake_wide(p))
3213 		return 0;
3214 
3215 	idx	  = sd->wake_idx;
3216 	this_cpu  = smp_processor_id();
3217 	prev_cpu  = task_cpu(p);
3218 	load	  = source_load(prev_cpu, idx);
3219 	this_load = target_load(this_cpu, idx);
3220 
3221 	/*
3222 	 * If sync wakeup then subtract the (maximum possible)
3223 	 * effect of the currently running task from the load
3224 	 * of the current CPU:
3225 	 */
3226 	if (sync) {
3227 		tg = task_group(current);
3228 		weight = current->se.load.weight;
3229 
3230 		this_load += effective_load(tg, this_cpu, -weight, -weight);
3231 		load += effective_load(tg, prev_cpu, 0, -weight);
3232 	}
3233 
3234 	tg = task_group(p);
3235 	weight = p->se.load.weight;
3236 
3237 	/*
3238 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3239 	 * due to the sync cause above having dropped this_load to 0, we'll
3240 	 * always have an imbalance, but there's really nothing you can do
3241 	 * about that, so that's good too.
3242 	 *
3243 	 * Otherwise check if either cpus are near enough in load to allow this
3244 	 * task to be woken on this_cpu.
3245 	 */
3246 	if (this_load > 0) {
3247 		s64 this_eff_load, prev_eff_load;
3248 
3249 		this_eff_load = 100;
3250 		this_eff_load *= power_of(prev_cpu);
3251 		this_eff_load *= this_load +
3252 			effective_load(tg, this_cpu, weight, weight);
3253 
3254 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3255 		prev_eff_load *= power_of(this_cpu);
3256 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3257 
3258 		balanced = this_eff_load <= prev_eff_load;
3259 	} else
3260 		balanced = true;
3261 
3262 	/*
3263 	 * If the currently running task will sleep within
3264 	 * a reasonable amount of time then attract this newly
3265 	 * woken task:
3266 	 */
3267 	if (sync && balanced)
3268 		return 1;
3269 
3270 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3271 	tl_per_task = cpu_avg_load_per_task(this_cpu);
3272 
3273 	if (balanced ||
3274 	    (this_load <= load &&
3275 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3276 		/*
3277 		 * This domain has SD_WAKE_AFFINE and
3278 		 * p is cache cold in this domain, and
3279 		 * there is no bad imbalance.
3280 		 */
3281 		schedstat_inc(sd, ttwu_move_affine);
3282 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
3283 
3284 		return 1;
3285 	}
3286 	return 0;
3287 }
3288 
3289 /*
3290  * find_idlest_group finds and returns the least busy CPU group within the
3291  * domain.
3292  */
3293 static struct sched_group *
3294 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3295 		  int this_cpu, int load_idx)
3296 {
3297 	struct sched_group *idlest = NULL, *group = sd->groups;
3298 	unsigned long min_load = ULONG_MAX, this_load = 0;
3299 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
3300 
3301 	do {
3302 		unsigned long load, avg_load;
3303 		int local_group;
3304 		int i;
3305 
3306 		/* Skip over this group if it has no CPUs allowed */
3307 		if (!cpumask_intersects(sched_group_cpus(group),
3308 					tsk_cpus_allowed(p)))
3309 			continue;
3310 
3311 		local_group = cpumask_test_cpu(this_cpu,
3312 					       sched_group_cpus(group));
3313 
3314 		/* Tally up the load of all CPUs in the group */
3315 		avg_load = 0;
3316 
3317 		for_each_cpu(i, sched_group_cpus(group)) {
3318 			/* Bias balancing toward cpus of our domain */
3319 			if (local_group)
3320 				load = source_load(i, load_idx);
3321 			else
3322 				load = target_load(i, load_idx);
3323 
3324 			avg_load += load;
3325 		}
3326 
3327 		/* Adjust by relative CPU power of the group */
3328 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3329 
3330 		if (local_group) {
3331 			this_load = avg_load;
3332 		} else if (avg_load < min_load) {
3333 			min_load = avg_load;
3334 			idlest = group;
3335 		}
3336 	} while (group = group->next, group != sd->groups);
3337 
3338 	if (!idlest || 100*this_load < imbalance*min_load)
3339 		return NULL;
3340 	return idlest;
3341 }
3342 
3343 /*
3344  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3345  */
3346 static int
3347 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3348 {
3349 	unsigned long load, min_load = ULONG_MAX;
3350 	int idlest = -1;
3351 	int i;
3352 
3353 	/* Traverse only the allowed CPUs */
3354 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3355 		load = weighted_cpuload(i);
3356 
3357 		if (load < min_load || (load == min_load && i == this_cpu)) {
3358 			min_load = load;
3359 			idlest = i;
3360 		}
3361 	}
3362 
3363 	return idlest;
3364 }
3365 
3366 /*
3367  * Try and locate an idle CPU in the sched_domain.
3368  */
3369 static int select_idle_sibling(struct task_struct *p, int target)
3370 {
3371 	struct sched_domain *sd;
3372 	struct sched_group *sg;
3373 	int i = task_cpu(p);
3374 
3375 	if (idle_cpu(target))
3376 		return target;
3377 
3378 	/*
3379 	 * If the prevous cpu is cache affine and idle, don't be stupid.
3380 	 */
3381 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3382 		return i;
3383 
3384 	/*
3385 	 * Otherwise, iterate the domains and find an elegible idle cpu.
3386 	 */
3387 	sd = rcu_dereference(per_cpu(sd_llc, target));
3388 	for_each_lower_domain(sd) {
3389 		sg = sd->groups;
3390 		do {
3391 			if (!cpumask_intersects(sched_group_cpus(sg),
3392 						tsk_cpus_allowed(p)))
3393 				goto next;
3394 
3395 			for_each_cpu(i, sched_group_cpus(sg)) {
3396 				if (i == target || !idle_cpu(i))
3397 					goto next;
3398 			}
3399 
3400 			target = cpumask_first_and(sched_group_cpus(sg),
3401 					tsk_cpus_allowed(p));
3402 			goto done;
3403 next:
3404 			sg = sg->next;
3405 		} while (sg != sd->groups);
3406 	}
3407 done:
3408 	return target;
3409 }
3410 
3411 /*
3412  * sched_balance_self: balance the current task (running on cpu) in domains
3413  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3414  * SD_BALANCE_EXEC.
3415  *
3416  * Balance, ie. select the least loaded group.
3417  *
3418  * Returns the target CPU number, or the same CPU if no balancing is needed.
3419  *
3420  * preempt must be disabled.
3421  */
3422 static int
3423 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3424 {
3425 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 	int cpu = smp_processor_id();
3427 	int prev_cpu = task_cpu(p);
3428 	int new_cpu = cpu;
3429 	int want_affine = 0;
3430 	int sync = wake_flags & WF_SYNC;
3431 
3432 	if (p->nr_cpus_allowed == 1)
3433 		return prev_cpu;
3434 
3435 	if (sd_flag & SD_BALANCE_WAKE) {
3436 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3437 			want_affine = 1;
3438 		new_cpu = prev_cpu;
3439 	}
3440 
3441 	rcu_read_lock();
3442 	for_each_domain(cpu, tmp) {
3443 		if (!(tmp->flags & SD_LOAD_BALANCE))
3444 			continue;
3445 
3446 		/*
3447 		 * If both cpu and prev_cpu are part of this domain,
3448 		 * cpu is a valid SD_WAKE_AFFINE target.
3449 		 */
3450 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3451 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3452 			affine_sd = tmp;
3453 			break;
3454 		}
3455 
3456 		if (tmp->flags & sd_flag)
3457 			sd = tmp;
3458 	}
3459 
3460 	if (affine_sd) {
3461 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3462 			prev_cpu = cpu;
3463 
3464 		new_cpu = select_idle_sibling(p, prev_cpu);
3465 		goto unlock;
3466 	}
3467 
3468 	while (sd) {
3469 		int load_idx = sd->forkexec_idx;
3470 		struct sched_group *group;
3471 		int weight;
3472 
3473 		if (!(sd->flags & sd_flag)) {
3474 			sd = sd->child;
3475 			continue;
3476 		}
3477 
3478 		if (sd_flag & SD_BALANCE_WAKE)
3479 			load_idx = sd->wake_idx;
3480 
3481 		group = find_idlest_group(sd, p, cpu, load_idx);
3482 		if (!group) {
3483 			sd = sd->child;
3484 			continue;
3485 		}
3486 
3487 		new_cpu = find_idlest_cpu(group, p, cpu);
3488 		if (new_cpu == -1 || new_cpu == cpu) {
3489 			/* Now try balancing at a lower domain level of cpu */
3490 			sd = sd->child;
3491 			continue;
3492 		}
3493 
3494 		/* Now try balancing at a lower domain level of new_cpu */
3495 		cpu = new_cpu;
3496 		weight = sd->span_weight;
3497 		sd = NULL;
3498 		for_each_domain(cpu, tmp) {
3499 			if (weight <= tmp->span_weight)
3500 				break;
3501 			if (tmp->flags & sd_flag)
3502 				sd = tmp;
3503 		}
3504 		/* while loop will break here if sd == NULL */
3505 	}
3506 unlock:
3507 	rcu_read_unlock();
3508 
3509 	return new_cpu;
3510 }
3511 
3512 /*
3513  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3514  * cfs_rq_of(p) references at time of call are still valid and identify the
3515  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
3516  * other assumptions, including the state of rq->lock, should be made.
3517  */
3518 static void
3519 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3520 {
3521 	struct sched_entity *se = &p->se;
3522 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3523 
3524 	/*
3525 	 * Load tracking: accumulate removed load so that it can be processed
3526 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
3527 	 * to blocked load iff they have a positive decay-count.  It can never
3528 	 * be negative here since on-rq tasks have decay-count == 0.
3529 	 */
3530 	if (se->avg.decay_count) {
3531 		se->avg.decay_count = -__synchronize_entity_decay(se);
3532 		atomic_long_add(se->avg.load_avg_contrib,
3533 						&cfs_rq->removed_load);
3534 	}
3535 }
3536 #endif /* CONFIG_SMP */
3537 
3538 static unsigned long
3539 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3540 {
3541 	unsigned long gran = sysctl_sched_wakeup_granularity;
3542 
3543 	/*
3544 	 * Since its curr running now, convert the gran from real-time
3545 	 * to virtual-time in his units.
3546 	 *
3547 	 * By using 'se' instead of 'curr' we penalize light tasks, so
3548 	 * they get preempted easier. That is, if 'se' < 'curr' then
3549 	 * the resulting gran will be larger, therefore penalizing the
3550 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3551 	 * be smaller, again penalizing the lighter task.
3552 	 *
3553 	 * This is especially important for buddies when the leftmost
3554 	 * task is higher priority than the buddy.
3555 	 */
3556 	return calc_delta_fair(gran, se);
3557 }
3558 
3559 /*
3560  * Should 'se' preempt 'curr'.
3561  *
3562  *             |s1
3563  *        |s2
3564  *   |s3
3565  *         g
3566  *      |<--->|c
3567  *
3568  *  w(c, s1) = -1
3569  *  w(c, s2) =  0
3570  *  w(c, s3) =  1
3571  *
3572  */
3573 static int
3574 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3575 {
3576 	s64 gran, vdiff = curr->vruntime - se->vruntime;
3577 
3578 	if (vdiff <= 0)
3579 		return -1;
3580 
3581 	gran = wakeup_gran(curr, se);
3582 	if (vdiff > gran)
3583 		return 1;
3584 
3585 	return 0;
3586 }
3587 
3588 static void set_last_buddy(struct sched_entity *se)
3589 {
3590 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3591 		return;
3592 
3593 	for_each_sched_entity(se)
3594 		cfs_rq_of(se)->last = se;
3595 }
3596 
3597 static void set_next_buddy(struct sched_entity *se)
3598 {
3599 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3600 		return;
3601 
3602 	for_each_sched_entity(se)
3603 		cfs_rq_of(se)->next = se;
3604 }
3605 
3606 static void set_skip_buddy(struct sched_entity *se)
3607 {
3608 	for_each_sched_entity(se)
3609 		cfs_rq_of(se)->skip = se;
3610 }
3611 
3612 /*
3613  * Preempt the current task with a newly woken task if needed:
3614  */
3615 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3616 {
3617 	struct task_struct *curr = rq->curr;
3618 	struct sched_entity *se = &curr->se, *pse = &p->se;
3619 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3620 	int scale = cfs_rq->nr_running >= sched_nr_latency;
3621 	int next_buddy_marked = 0;
3622 
3623 	if (unlikely(se == pse))
3624 		return;
3625 
3626 	/*
3627 	 * This is possible from callers such as move_task(), in which we
3628 	 * unconditionally check_prempt_curr() after an enqueue (which may have
3629 	 * lead to a throttle).  This both saves work and prevents false
3630 	 * next-buddy nomination below.
3631 	 */
3632 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3633 		return;
3634 
3635 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3636 		set_next_buddy(pse);
3637 		next_buddy_marked = 1;
3638 	}
3639 
3640 	/*
3641 	 * We can come here with TIF_NEED_RESCHED already set from new task
3642 	 * wake up path.
3643 	 *
3644 	 * Note: this also catches the edge-case of curr being in a throttled
3645 	 * group (e.g. via set_curr_task), since update_curr() (in the
3646 	 * enqueue of curr) will have resulted in resched being set.  This
3647 	 * prevents us from potentially nominating it as a false LAST_BUDDY
3648 	 * below.
3649 	 */
3650 	if (test_tsk_need_resched(curr))
3651 		return;
3652 
3653 	/* Idle tasks are by definition preempted by non-idle tasks. */
3654 	if (unlikely(curr->policy == SCHED_IDLE) &&
3655 	    likely(p->policy != SCHED_IDLE))
3656 		goto preempt;
3657 
3658 	/*
3659 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3660 	 * is driven by the tick):
3661 	 */
3662 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3663 		return;
3664 
3665 	find_matching_se(&se, &pse);
3666 	update_curr(cfs_rq_of(se));
3667 	BUG_ON(!pse);
3668 	if (wakeup_preempt_entity(se, pse) == 1) {
3669 		/*
3670 		 * Bias pick_next to pick the sched entity that is
3671 		 * triggering this preemption.
3672 		 */
3673 		if (!next_buddy_marked)
3674 			set_next_buddy(pse);
3675 		goto preempt;
3676 	}
3677 
3678 	return;
3679 
3680 preempt:
3681 	resched_task(curr);
3682 	/*
3683 	 * Only set the backward buddy when the current task is still
3684 	 * on the rq. This can happen when a wakeup gets interleaved
3685 	 * with schedule on the ->pre_schedule() or idle_balance()
3686 	 * point, either of which can * drop the rq lock.
3687 	 *
3688 	 * Also, during early boot the idle thread is in the fair class,
3689 	 * for obvious reasons its a bad idea to schedule back to it.
3690 	 */
3691 	if (unlikely(!se->on_rq || curr == rq->idle))
3692 		return;
3693 
3694 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3695 		set_last_buddy(se);
3696 }
3697 
3698 static struct task_struct *pick_next_task_fair(struct rq *rq)
3699 {
3700 	struct task_struct *p;
3701 	struct cfs_rq *cfs_rq = &rq->cfs;
3702 	struct sched_entity *se;
3703 
3704 	if (!cfs_rq->nr_running)
3705 		return NULL;
3706 
3707 	do {
3708 		se = pick_next_entity(cfs_rq);
3709 		set_next_entity(cfs_rq, se);
3710 		cfs_rq = group_cfs_rq(se);
3711 	} while (cfs_rq);
3712 
3713 	p = task_of(se);
3714 	if (hrtick_enabled(rq))
3715 		hrtick_start_fair(rq, p);
3716 
3717 	return p;
3718 }
3719 
3720 /*
3721  * Account for a descheduled task:
3722  */
3723 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3724 {
3725 	struct sched_entity *se = &prev->se;
3726 	struct cfs_rq *cfs_rq;
3727 
3728 	for_each_sched_entity(se) {
3729 		cfs_rq = cfs_rq_of(se);
3730 		put_prev_entity(cfs_rq, se);
3731 	}
3732 }
3733 
3734 /*
3735  * sched_yield() is very simple
3736  *
3737  * The magic of dealing with the ->skip buddy is in pick_next_entity.
3738  */
3739 static void yield_task_fair(struct rq *rq)
3740 {
3741 	struct task_struct *curr = rq->curr;
3742 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3743 	struct sched_entity *se = &curr->se;
3744 
3745 	/*
3746 	 * Are we the only task in the tree?
3747 	 */
3748 	if (unlikely(rq->nr_running == 1))
3749 		return;
3750 
3751 	clear_buddies(cfs_rq, se);
3752 
3753 	if (curr->policy != SCHED_BATCH) {
3754 		update_rq_clock(rq);
3755 		/*
3756 		 * Update run-time statistics of the 'current'.
3757 		 */
3758 		update_curr(cfs_rq);
3759 		/*
3760 		 * Tell update_rq_clock() that we've just updated,
3761 		 * so we don't do microscopic update in schedule()
3762 		 * and double the fastpath cost.
3763 		 */
3764 		 rq->skip_clock_update = 1;
3765 	}
3766 
3767 	set_skip_buddy(se);
3768 }
3769 
3770 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3771 {
3772 	struct sched_entity *se = &p->se;
3773 
3774 	/* throttled hierarchies are not runnable */
3775 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3776 		return false;
3777 
3778 	/* Tell the scheduler that we'd really like pse to run next. */
3779 	set_next_buddy(se);
3780 
3781 	yield_task_fair(rq);
3782 
3783 	return true;
3784 }
3785 
3786 #ifdef CONFIG_SMP
3787 /**************************************************
3788  * Fair scheduling class load-balancing methods.
3789  *
3790  * BASICS
3791  *
3792  * The purpose of load-balancing is to achieve the same basic fairness the
3793  * per-cpu scheduler provides, namely provide a proportional amount of compute
3794  * time to each task. This is expressed in the following equation:
3795  *
3796  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
3797  *
3798  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3799  * W_i,0 is defined as:
3800  *
3801  *   W_i,0 = \Sum_j w_i,j                                             (2)
3802  *
3803  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3804  * is derived from the nice value as per prio_to_weight[].
3805  *
3806  * The weight average is an exponential decay average of the instantaneous
3807  * weight:
3808  *
3809  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
3810  *
3811  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3812  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3813  * can also include other factors [XXX].
3814  *
3815  * To achieve this balance we define a measure of imbalance which follows
3816  * directly from (1):
3817  *
3818  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
3819  *
3820  * We them move tasks around to minimize the imbalance. In the continuous
3821  * function space it is obvious this converges, in the discrete case we get
3822  * a few fun cases generally called infeasible weight scenarios.
3823  *
3824  * [XXX expand on:
3825  *     - infeasible weights;
3826  *     - local vs global optima in the discrete case. ]
3827  *
3828  *
3829  * SCHED DOMAINS
3830  *
3831  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3832  * for all i,j solution, we create a tree of cpus that follows the hardware
3833  * topology where each level pairs two lower groups (or better). This results
3834  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3835  * tree to only the first of the previous level and we decrease the frequency
3836  * of load-balance at each level inv. proportional to the number of cpus in
3837  * the groups.
3838  *
3839  * This yields:
3840  *
3841  *     log_2 n     1     n
3842  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
3843  *     i = 0      2^i   2^i
3844  *                               `- size of each group
3845  *         |         |     `- number of cpus doing load-balance
3846  *         |         `- freq
3847  *         `- sum over all levels
3848  *
3849  * Coupled with a limit on how many tasks we can migrate every balance pass,
3850  * this makes (5) the runtime complexity of the balancer.
3851  *
3852  * An important property here is that each CPU is still (indirectly) connected
3853  * to every other cpu in at most O(log n) steps:
3854  *
3855  * The adjacency matrix of the resulting graph is given by:
3856  *
3857  *             log_2 n
3858  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
3859  *             k = 0
3860  *
3861  * And you'll find that:
3862  *
3863  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
3864  *
3865  * Showing there's indeed a path between every cpu in at most O(log n) steps.
3866  * The task movement gives a factor of O(m), giving a convergence complexity
3867  * of:
3868  *
3869  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
3870  *
3871  *
3872  * WORK CONSERVING
3873  *
3874  * In order to avoid CPUs going idle while there's still work to do, new idle
3875  * balancing is more aggressive and has the newly idle cpu iterate up the domain
3876  * tree itself instead of relying on other CPUs to bring it work.
3877  *
3878  * This adds some complexity to both (5) and (8) but it reduces the total idle
3879  * time.
3880  *
3881  * [XXX more?]
3882  *
3883  *
3884  * CGROUPS
3885  *
3886  * Cgroups make a horror show out of (2), instead of a simple sum we get:
3887  *
3888  *                                s_k,i
3889  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
3890  *                                 S_k
3891  *
3892  * Where
3893  *
3894  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
3895  *
3896  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3897  *
3898  * The big problem is S_k, its a global sum needed to compute a local (W_i)
3899  * property.
3900  *
3901  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3902  *      rewrite all of this once again.]
3903  */
3904 
3905 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 
3907 #define LBF_ALL_PINNED	0x01
3908 #define LBF_NEED_BREAK	0x02
3909 #define LBF_SOME_PINNED 0x04
3910 
3911 struct lb_env {
3912 	struct sched_domain	*sd;
3913 
3914 	struct rq		*src_rq;
3915 	int			src_cpu;
3916 
3917 	int			dst_cpu;
3918 	struct rq		*dst_rq;
3919 
3920 	struct cpumask		*dst_grpmask;
3921 	int			new_dst_cpu;
3922 	enum cpu_idle_type	idle;
3923 	long			imbalance;
3924 	/* The set of CPUs under consideration for load-balancing */
3925 	struct cpumask		*cpus;
3926 
3927 	unsigned int		flags;
3928 
3929 	unsigned int		loop;
3930 	unsigned int		loop_break;
3931 	unsigned int		loop_max;
3932 };
3933 
3934 /*
3935  * move_task - move a task from one runqueue to another runqueue.
3936  * Both runqueues must be locked.
3937  */
3938 static void move_task(struct task_struct *p, struct lb_env *env)
3939 {
3940 	deactivate_task(env->src_rq, p, 0);
3941 	set_task_cpu(p, env->dst_cpu);
3942 	activate_task(env->dst_rq, p, 0);
3943 	check_preempt_curr(env->dst_rq, p, 0);
3944 }
3945 
3946 /*
3947  * Is this task likely cache-hot:
3948  */
3949 static int
3950 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3951 {
3952 	s64 delta;
3953 
3954 	if (p->sched_class != &fair_sched_class)
3955 		return 0;
3956 
3957 	if (unlikely(p->policy == SCHED_IDLE))
3958 		return 0;
3959 
3960 	/*
3961 	 * Buddy candidates are cache hot:
3962 	 */
3963 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3964 			(&p->se == cfs_rq_of(&p->se)->next ||
3965 			 &p->se == cfs_rq_of(&p->se)->last))
3966 		return 1;
3967 
3968 	if (sysctl_sched_migration_cost == -1)
3969 		return 1;
3970 	if (sysctl_sched_migration_cost == 0)
3971 		return 0;
3972 
3973 	delta = now - p->se.exec_start;
3974 
3975 	return delta < (s64)sysctl_sched_migration_cost;
3976 }
3977 
3978 /*
3979  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980  */
3981 static
3982 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3983 {
3984 	int tsk_cache_hot = 0;
3985 	/*
3986 	 * We do not migrate tasks that are:
3987 	 * 1) throttled_lb_pair, or
3988 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3989 	 * 3) running (obviously), or
3990 	 * 4) are cache-hot on their current CPU.
3991 	 */
3992 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3993 		return 0;
3994 
3995 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3996 		int cpu;
3997 
3998 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 
4000 		/*
4001 		 * Remember if this task can be migrated to any other cpu in
4002 		 * our sched_group. We may want to revisit it if we couldn't
4003 		 * meet load balance goals by pulling other tasks on src_cpu.
4004 		 *
4005 		 * Also avoid computing new_dst_cpu if we have already computed
4006 		 * one in current iteration.
4007 		 */
4008 		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
4009 			return 0;
4010 
4011 		/* Prevent to re-select dst_cpu via env's cpus */
4012 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 				env->flags |= LBF_SOME_PINNED;
4015 				env->new_dst_cpu = cpu;
4016 				break;
4017 			}
4018 		}
4019 
4020 		return 0;
4021 	}
4022 
4023 	/* Record that we found atleast one task that could run on dst_cpu */
4024 	env->flags &= ~LBF_ALL_PINNED;
4025 
4026 	if (task_running(env->src_rq, p)) {
4027 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
4028 		return 0;
4029 	}
4030 
4031 	/*
4032 	 * Aggressive migration if:
4033 	 * 1) task is cache cold, or
4034 	 * 2) too many balance attempts have failed.
4035 	 */
4036 
4037 	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4038 	if (!tsk_cache_hot ||
4039 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 
4041 		if (tsk_cache_hot) {
4042 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4043 			schedstat_inc(p, se.statistics.nr_forced_migrations);
4044 		}
4045 
4046 		return 1;
4047 	}
4048 
4049 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4050 	return 0;
4051 }
4052 
4053 /*
4054  * move_one_task tries to move exactly one task from busiest to this_rq, as
4055  * part of active balancing operations within "domain".
4056  * Returns 1 if successful and 0 otherwise.
4057  *
4058  * Called with both runqueues locked.
4059  */
4060 static int move_one_task(struct lb_env *env)
4061 {
4062 	struct task_struct *p, *n;
4063 
4064 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4065 		if (!can_migrate_task(p, env))
4066 			continue;
4067 
4068 		move_task(p, env);
4069 		/*
4070 		 * Right now, this is only the second place move_task()
4071 		 * is called, so we can safely collect move_task()
4072 		 * stats here rather than inside move_task().
4073 		 */
4074 		schedstat_inc(env->sd, lb_gained[env->idle]);
4075 		return 1;
4076 	}
4077 	return 0;
4078 }
4079 
4080 static unsigned long task_h_load(struct task_struct *p);
4081 
4082 static const unsigned int sched_nr_migrate_break = 32;
4083 
4084 /*
4085  * move_tasks tries to move up to imbalance weighted load from busiest to
4086  * this_rq, as part of a balancing operation within domain "sd".
4087  * Returns 1 if successful and 0 otherwise.
4088  *
4089  * Called with both runqueues locked.
4090  */
4091 static int move_tasks(struct lb_env *env)
4092 {
4093 	struct list_head *tasks = &env->src_rq->cfs_tasks;
4094 	struct task_struct *p;
4095 	unsigned long load;
4096 	int pulled = 0;
4097 
4098 	if (env->imbalance <= 0)
4099 		return 0;
4100 
4101 	while (!list_empty(tasks)) {
4102 		p = list_first_entry(tasks, struct task_struct, se.group_node);
4103 
4104 		env->loop++;
4105 		/* We've more or less seen every task there is, call it quits */
4106 		if (env->loop > env->loop_max)
4107 			break;
4108 
4109 		/* take a breather every nr_migrate tasks */
4110 		if (env->loop > env->loop_break) {
4111 			env->loop_break += sched_nr_migrate_break;
4112 			env->flags |= LBF_NEED_BREAK;
4113 			break;
4114 		}
4115 
4116 		if (!can_migrate_task(p, env))
4117 			goto next;
4118 
4119 		load = task_h_load(p);
4120 
4121 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4122 			goto next;
4123 
4124 		if ((load / 2) > env->imbalance)
4125 			goto next;
4126 
4127 		move_task(p, env);
4128 		pulled++;
4129 		env->imbalance -= load;
4130 
4131 #ifdef CONFIG_PREEMPT
4132 		/*
4133 		 * NEWIDLE balancing is a source of latency, so preemptible
4134 		 * kernels will stop after the first task is pulled to minimize
4135 		 * the critical section.
4136 		 */
4137 		if (env->idle == CPU_NEWLY_IDLE)
4138 			break;
4139 #endif
4140 
4141 		/*
4142 		 * We only want to steal up to the prescribed amount of
4143 		 * weighted load.
4144 		 */
4145 		if (env->imbalance <= 0)
4146 			break;
4147 
4148 		continue;
4149 next:
4150 		list_move_tail(&p->se.group_node, tasks);
4151 	}
4152 
4153 	/*
4154 	 * Right now, this is one of only two places move_task() is called,
4155 	 * so we can safely collect move_task() stats here rather than
4156 	 * inside move_task().
4157 	 */
4158 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
4159 
4160 	return pulled;
4161 }
4162 
4163 #ifdef CONFIG_FAIR_GROUP_SCHED
4164 /*
4165  * update tg->load_weight by folding this cpu's load_avg
4166  */
4167 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4168 {
4169 	struct sched_entity *se = tg->se[cpu];
4170 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4171 
4172 	/* throttled entities do not contribute to load */
4173 	if (throttled_hierarchy(cfs_rq))
4174 		return;
4175 
4176 	update_cfs_rq_blocked_load(cfs_rq, 1);
4177 
4178 	if (se) {
4179 		update_entity_load_avg(se, 1);
4180 		/*
4181 		 * We pivot on our runnable average having decayed to zero for
4182 		 * list removal.  This generally implies that all our children
4183 		 * have also been removed (modulo rounding error or bandwidth
4184 		 * control); however, such cases are rare and we can fix these
4185 		 * at enqueue.
4186 		 *
4187 		 * TODO: fix up out-of-order children on enqueue.
4188 		 */
4189 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4190 			list_del_leaf_cfs_rq(cfs_rq);
4191 	} else {
4192 		struct rq *rq = rq_of(cfs_rq);
4193 		update_rq_runnable_avg(rq, rq->nr_running);
4194 	}
4195 }
4196 
4197 static void update_blocked_averages(int cpu)
4198 {
4199 	struct rq *rq = cpu_rq(cpu);
4200 	struct cfs_rq *cfs_rq;
4201 	unsigned long flags;
4202 
4203 	raw_spin_lock_irqsave(&rq->lock, flags);
4204 	update_rq_clock(rq);
4205 	/*
4206 	 * Iterates the task_group tree in a bottom up fashion, see
4207 	 * list_add_leaf_cfs_rq() for details.
4208 	 */
4209 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4210 		/*
4211 		 * Note: We may want to consider periodically releasing
4212 		 * rq->lock about these updates so that creating many task
4213 		 * groups does not result in continually extending hold time.
4214 		 */
4215 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4216 	}
4217 
4218 	raw_spin_unlock_irqrestore(&rq->lock, flags);
4219 }
4220 
4221 /*
4222  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
4223  * This needs to be done in a top-down fashion because the load of a child
4224  * group is a fraction of its parents load.
4225  */
4226 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4227 {
4228 	struct rq *rq = rq_of(cfs_rq);
4229 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4230 	unsigned long now = jiffies;
4231 	unsigned long load;
4232 
4233 	if (cfs_rq->last_h_load_update == now)
4234 		return;
4235 
4236 	cfs_rq->h_load_next = NULL;
4237 	for_each_sched_entity(se) {
4238 		cfs_rq = cfs_rq_of(se);
4239 		cfs_rq->h_load_next = se;
4240 		if (cfs_rq->last_h_load_update == now)
4241 			break;
4242 	}
4243 
4244 	if (!se) {
4245 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
4246 		cfs_rq->last_h_load_update = now;
4247 	}
4248 
4249 	while ((se = cfs_rq->h_load_next) != NULL) {
4250 		load = cfs_rq->h_load;
4251 		load = div64_ul(load * se->avg.load_avg_contrib,
4252 				cfs_rq->runnable_load_avg + 1);
4253 		cfs_rq = group_cfs_rq(se);
4254 		cfs_rq->h_load = load;
4255 		cfs_rq->last_h_load_update = now;
4256 	}
4257 }
4258 
4259 static unsigned long task_h_load(struct task_struct *p)
4260 {
4261 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
4262 
4263 	update_cfs_rq_h_load(cfs_rq);
4264 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4265 			cfs_rq->runnable_load_avg + 1);
4266 }
4267 #else
4268 static inline void update_blocked_averages(int cpu)
4269 {
4270 }
4271 
4272 static unsigned long task_h_load(struct task_struct *p)
4273 {
4274 	return p->se.avg.load_avg_contrib;
4275 }
4276 #endif
4277 
4278 /********** Helpers for find_busiest_group ************************/
4279 /*
4280  * sg_lb_stats - stats of a sched_group required for load_balancing
4281  */
4282 struct sg_lb_stats {
4283 	unsigned long avg_load; /*Avg load across the CPUs of the group */
4284 	unsigned long group_load; /* Total load over the CPUs of the group */
4285 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4286 	unsigned long load_per_task;
4287 	unsigned long group_power;
4288 	unsigned int sum_nr_running; /* Nr tasks running in the group */
4289 	unsigned int group_capacity;
4290 	unsigned int idle_cpus;
4291 	unsigned int group_weight;
4292 	int group_imb; /* Is there an imbalance in the group ? */
4293 	int group_has_capacity; /* Is there extra capacity in the group? */
4294 };
4295 
4296 /*
4297  * sd_lb_stats - Structure to store the statistics of a sched_domain
4298  *		 during load balancing.
4299  */
4300 struct sd_lb_stats {
4301 	struct sched_group *busiest;	/* Busiest group in this sd */
4302 	struct sched_group *local;	/* Local group in this sd */
4303 	unsigned long total_load;	/* Total load of all groups in sd */
4304 	unsigned long total_pwr;	/* Total power of all groups in sd */
4305 	unsigned long avg_load;	/* Average load across all groups in sd */
4306 
4307 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
4308 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
4309 };
4310 
4311 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4312 {
4313 	/*
4314 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
4315 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
4316 	 * We must however clear busiest_stat::avg_load because
4317 	 * update_sd_pick_busiest() reads this before assignment.
4318 	 */
4319 	*sds = (struct sd_lb_stats){
4320 		.busiest = NULL,
4321 		.local = NULL,
4322 		.total_load = 0UL,
4323 		.total_pwr = 0UL,
4324 		.busiest_stat = {
4325 			.avg_load = 0UL,
4326 		},
4327 	};
4328 }
4329 
4330 /**
4331  * get_sd_load_idx - Obtain the load index for a given sched domain.
4332  * @sd: The sched_domain whose load_idx is to be obtained.
4333  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4334  *
4335  * Return: The load index.
4336  */
4337 static inline int get_sd_load_idx(struct sched_domain *sd,
4338 					enum cpu_idle_type idle)
4339 {
4340 	int load_idx;
4341 
4342 	switch (idle) {
4343 	case CPU_NOT_IDLE:
4344 		load_idx = sd->busy_idx;
4345 		break;
4346 
4347 	case CPU_NEWLY_IDLE:
4348 		load_idx = sd->newidle_idx;
4349 		break;
4350 	default:
4351 		load_idx = sd->idle_idx;
4352 		break;
4353 	}
4354 
4355 	return load_idx;
4356 }
4357 
4358 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4359 {
4360 	return SCHED_POWER_SCALE;
4361 }
4362 
4363 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4364 {
4365 	return default_scale_freq_power(sd, cpu);
4366 }
4367 
4368 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4369 {
4370 	unsigned long weight = sd->span_weight;
4371 	unsigned long smt_gain = sd->smt_gain;
4372 
4373 	smt_gain /= weight;
4374 
4375 	return smt_gain;
4376 }
4377 
4378 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4379 {
4380 	return default_scale_smt_power(sd, cpu);
4381 }
4382 
4383 static unsigned long scale_rt_power(int cpu)
4384 {
4385 	struct rq *rq = cpu_rq(cpu);
4386 	u64 total, available, age_stamp, avg;
4387 
4388 	/*
4389 	 * Since we're reading these variables without serialization make sure
4390 	 * we read them once before doing sanity checks on them.
4391 	 */
4392 	age_stamp = ACCESS_ONCE(rq->age_stamp);
4393 	avg = ACCESS_ONCE(rq->rt_avg);
4394 
4395 	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4396 
4397 	if (unlikely(total < avg)) {
4398 		/* Ensures that power won't end up being negative */
4399 		available = 0;
4400 	} else {
4401 		available = total - avg;
4402 	}
4403 
4404 	if (unlikely((s64)total < SCHED_POWER_SCALE))
4405 		total = SCHED_POWER_SCALE;
4406 
4407 	total >>= SCHED_POWER_SHIFT;
4408 
4409 	return div_u64(available, total);
4410 }
4411 
4412 static void update_cpu_power(struct sched_domain *sd, int cpu)
4413 {
4414 	unsigned long weight = sd->span_weight;
4415 	unsigned long power = SCHED_POWER_SCALE;
4416 	struct sched_group *sdg = sd->groups;
4417 
4418 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4419 		if (sched_feat(ARCH_POWER))
4420 			power *= arch_scale_smt_power(sd, cpu);
4421 		else
4422 			power *= default_scale_smt_power(sd, cpu);
4423 
4424 		power >>= SCHED_POWER_SHIFT;
4425 	}
4426 
4427 	sdg->sgp->power_orig = power;
4428 
4429 	if (sched_feat(ARCH_POWER))
4430 		power *= arch_scale_freq_power(sd, cpu);
4431 	else
4432 		power *= default_scale_freq_power(sd, cpu);
4433 
4434 	power >>= SCHED_POWER_SHIFT;
4435 
4436 	power *= scale_rt_power(cpu);
4437 	power >>= SCHED_POWER_SHIFT;
4438 
4439 	if (!power)
4440 		power = 1;
4441 
4442 	cpu_rq(cpu)->cpu_power = power;
4443 	sdg->sgp->power = power;
4444 }
4445 
4446 void update_group_power(struct sched_domain *sd, int cpu)
4447 {
4448 	struct sched_domain *child = sd->child;
4449 	struct sched_group *group, *sdg = sd->groups;
4450 	unsigned long power;
4451 	unsigned long interval;
4452 
4453 	interval = msecs_to_jiffies(sd->balance_interval);
4454 	interval = clamp(interval, 1UL, max_load_balance_interval);
4455 	sdg->sgp->next_update = jiffies + interval;
4456 
4457 	if (!child) {
4458 		update_cpu_power(sd, cpu);
4459 		return;
4460 	}
4461 
4462 	power = 0;
4463 
4464 	if (child->flags & SD_OVERLAP) {
4465 		/*
4466 		 * SD_OVERLAP domains cannot assume that child groups
4467 		 * span the current group.
4468 		 */
4469 
4470 		for_each_cpu(cpu, sched_group_cpus(sdg))
4471 			power += power_of(cpu);
4472 	} else  {
4473 		/*
4474 		 * !SD_OVERLAP domains can assume that child groups
4475 		 * span the current group.
4476 		 */
4477 
4478 		group = child->groups;
4479 		do {
4480 			power += group->sgp->power;
4481 			group = group->next;
4482 		} while (group != child->groups);
4483 	}
4484 
4485 	sdg->sgp->power_orig = sdg->sgp->power = power;
4486 }
4487 
4488 /*
4489  * Try and fix up capacity for tiny siblings, this is needed when
4490  * things like SD_ASYM_PACKING need f_b_g to select another sibling
4491  * which on its own isn't powerful enough.
4492  *
4493  * See update_sd_pick_busiest() and check_asym_packing().
4494  */
4495 static inline int
4496 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4497 {
4498 	/*
4499 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
4500 	 */
4501 	if (!(sd->flags & SD_SHARE_CPUPOWER))
4502 		return 0;
4503 
4504 	/*
4505 	 * If ~90% of the cpu_power is still there, we're good.
4506 	 */
4507 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4508 		return 1;
4509 
4510 	return 0;
4511 }
4512 
4513 /*
4514  * Group imbalance indicates (and tries to solve) the problem where balancing
4515  * groups is inadequate due to tsk_cpus_allowed() constraints.
4516  *
4517  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
4518  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
4519  * Something like:
4520  *
4521  * 	{ 0 1 2 3 } { 4 5 6 7 }
4522  * 	        *     * * *
4523  *
4524  * If we were to balance group-wise we'd place two tasks in the first group and
4525  * two tasks in the second group. Clearly this is undesired as it will overload
4526  * cpu 3 and leave one of the cpus in the second group unused.
4527  *
4528  * The current solution to this issue is detecting the skew in the first group
4529  * by noticing it has a cpu that is overloaded while the remaining cpus are
4530  * idle -- or rather, there's a distinct imbalance in the cpus; see
4531  * sg_imbalanced().
4532  *
4533  * When this is so detected; this group becomes a candidate for busiest; see
4534  * update_sd_pick_busiest(). And calculcate_imbalance() and
4535  * find_busiest_group() avoid some of the usual balance conditional to allow it
4536  * to create an effective group imbalance.
4537  *
4538  * This is a somewhat tricky proposition since the next run might not find the
4539  * group imbalance and decide the groups need to be balanced again. A most
4540  * subtle and fragile situation.
4541  */
4542 
4543 struct sg_imb_stats {
4544 	unsigned long max_nr_running, min_nr_running;
4545 	unsigned long max_cpu_load, min_cpu_load;
4546 };
4547 
4548 static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549 {
4550 	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
4551 	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552 }
4553 
4554 static inline void
4555 update_sg_imb_stats(struct sg_imb_stats *sgi,
4556 		    unsigned long load, unsigned long nr_running)
4557 {
4558 	if (load > sgi->max_cpu_load)
4559 		sgi->max_cpu_load = load;
4560 	if (sgi->min_cpu_load > load)
4561 		sgi->min_cpu_load = load;
4562 
4563 	if (nr_running > sgi->max_nr_running)
4564 		sgi->max_nr_running = nr_running;
4565 	if (sgi->min_nr_running > nr_running)
4566 		sgi->min_nr_running = nr_running;
4567 }
4568 
4569 static inline int
4570 sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
4571 {
4572 	/*
4573 	 * Consider the group unbalanced when the imbalance is larger
4574 	 * than the average weight of a task.
4575 	 *
4576 	 * APZ: with cgroup the avg task weight can vary wildly and
4577 	 *      might not be a suitable number - should we keep a
4578 	 *      normalized nr_running number somewhere that negates
4579 	 *      the hierarchy?
4580 	 */
4581 	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 		return 1;
4584 
4585 	return 0;
4586 }
4587 
4588 /**
4589  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4590  * @env: The load balancing environment.
4591  * @group: sched_group whose statistics are to be updated.
4592  * @load_idx: Load index of sched_domain of this_cpu for load calc.
4593  * @local_group: Does group contain this_cpu.
4594  * @sgs: variable to hold the statistics for this group.
4595  */
4596 static inline void update_sg_lb_stats(struct lb_env *env,
4597 			struct sched_group *group, int load_idx,
4598 			int local_group, struct sg_lb_stats *sgs)
4599 {
4600 	struct sg_imb_stats sgi;
4601 	unsigned long nr_running;
4602 	unsigned long load;
4603 	int i;
4604 
4605 	init_sg_imb_stats(&sgi);
4606 
4607 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 		struct rq *rq = cpu_rq(i);
4609 
4610 		nr_running = rq->nr_running;
4611 
4612 		/* Bias balancing toward cpus of our domain */
4613 		if (local_group) {
4614 			load = target_load(i, load_idx);
4615 		} else {
4616 			load = source_load(i, load_idx);
4617 			update_sg_imb_stats(&sgi, load, nr_running);
4618 		}
4619 
4620 		sgs->group_load += load;
4621 		sgs->sum_nr_running += nr_running;
4622 		sgs->sum_weighted_load += weighted_cpuload(i);
4623 		if (idle_cpu(i))
4624 			sgs->idle_cpus++;
4625 	}
4626 
4627 	if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 			time_after_eq(jiffies, group->sgp->next_update)))
4629 		update_group_power(env->sd, env->dst_cpu);
4630 
4631 	/* Adjust by relative CPU power of the group */
4632 	sgs->group_power = group->sgp->power;
4633 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4634 
4635 	if (sgs->sum_nr_running)
4636 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 
4638 	sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639 
4640 	sgs->group_capacity =
4641 		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642 
4643 	if (!sgs->group_capacity)
4644 		sgs->group_capacity = fix_small_capacity(env->sd, group);
4645 
4646 	sgs->group_weight = group->group_weight;
4647 
4648 	if (sgs->group_capacity > sgs->sum_nr_running)
4649 		sgs->group_has_capacity = 1;
4650 }
4651 
4652 /**
4653  * update_sd_pick_busiest - return 1 on busiest group
4654  * @env: The load balancing environment.
4655  * @sds: sched_domain statistics
4656  * @sg: sched_group candidate to be checked for being the busiest
4657  * @sgs: sched_group statistics
4658  *
4659  * Determine if @sg is a busier group than the previously selected
4660  * busiest group.
4661  *
4662  * Return: %true if @sg is a busier group than the previously selected
4663  * busiest group. %false otherwise.
4664  */
4665 static bool update_sd_pick_busiest(struct lb_env *env,
4666 				   struct sd_lb_stats *sds,
4667 				   struct sched_group *sg,
4668 				   struct sg_lb_stats *sgs)
4669 {
4670 	if (sgs->avg_load <= sds->busiest_stat.avg_load)
4671 		return false;
4672 
4673 	if (sgs->sum_nr_running > sgs->group_capacity)
4674 		return true;
4675 
4676 	if (sgs->group_imb)
4677 		return true;
4678 
4679 	/*
4680 	 * ASYM_PACKING needs to move all the work to the lowest
4681 	 * numbered CPUs in the group, therefore mark all groups
4682 	 * higher than ourself as busy.
4683 	 */
4684 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4685 	    env->dst_cpu < group_first_cpu(sg)) {
4686 		if (!sds->busiest)
4687 			return true;
4688 
4689 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4690 			return true;
4691 	}
4692 
4693 	return false;
4694 }
4695 
4696 /**
4697  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698  * @env: The load balancing environment.
4699  * @balance: Should we balance.
4700  * @sds: variable to hold the statistics for this sched_domain.
4701  */
4702 static inline void update_sd_lb_stats(struct lb_env *env,
4703 					struct sd_lb_stats *sds)
4704 {
4705 	struct sched_domain *child = env->sd->child;
4706 	struct sched_group *sg = env->sd->groups;
4707 	struct sg_lb_stats tmp_sgs;
4708 	int load_idx, prefer_sibling = 0;
4709 
4710 	if (child && child->flags & SD_PREFER_SIBLING)
4711 		prefer_sibling = 1;
4712 
4713 	load_idx = get_sd_load_idx(env->sd, env->idle);
4714 
4715 	do {
4716 		struct sg_lb_stats *sgs = &tmp_sgs;
4717 		int local_group;
4718 
4719 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4720 		if (local_group) {
4721 			sds->local = sg;
4722 			sgs = &sds->local_stat;
4723 		}
4724 
4725 		memset(sgs, 0, sizeof(*sgs));
4726 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 
4728 		/*
4729 		 * In case the child domain prefers tasks go to siblings
4730 		 * first, lower the sg capacity to one so that we'll try
4731 		 * and move all the excess tasks away. We lower the capacity
4732 		 * of a group only if the local group has the capacity to fit
4733 		 * these excess tasks, i.e. nr_running < group_capacity. The
4734 		 * extra check prevents the case where you always pull from the
4735 		 * heaviest group when it is already under-utilized (possible
4736 		 * with a large weight task outweighs the tasks on the system).
4737 		 */
4738 		if (prefer_sibling && !local_group &&
4739 				sds->local && sds->local_stat.group_has_capacity)
4740 			sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 
4742 		/* Now, start updating sd_lb_stats */
4743 		sds->total_load += sgs->group_load;
4744 		sds->total_pwr += sgs->group_power;
4745 
4746 		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 			sds->busiest = sg;
4748 			sds->busiest_stat = *sgs;
4749 		}
4750 
4751 		sg = sg->next;
4752 	} while (sg != env->sd->groups);
4753 }
4754 
4755 /**
4756  * check_asym_packing - Check to see if the group is packed into the
4757  *			sched doman.
4758  *
4759  * This is primarily intended to used at the sibling level.  Some
4760  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
4761  * case of POWER7, it can move to lower SMT modes only when higher
4762  * threads are idle.  When in lower SMT modes, the threads will
4763  * perform better since they share less core resources.  Hence when we
4764  * have idle threads, we want them to be the higher ones.
4765  *
4766  * This packing function is run on idle threads.  It checks to see if
4767  * the busiest CPU in this domain (core in the P7 case) has a higher
4768  * CPU number than the packing function is being run on.  Here we are
4769  * assuming lower CPU number will be equivalent to lower a SMT thread
4770  * number.
4771  *
4772  * Return: 1 when packing is required and a task should be moved to
4773  * this CPU.  The amount of the imbalance is returned in *imbalance.
4774  *
4775  * @env: The load balancing environment.
4776  * @sds: Statistics of the sched_domain which is to be packed
4777  */
4778 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4779 {
4780 	int busiest_cpu;
4781 
4782 	if (!(env->sd->flags & SD_ASYM_PACKING))
4783 		return 0;
4784 
4785 	if (!sds->busiest)
4786 		return 0;
4787 
4788 	busiest_cpu = group_first_cpu(sds->busiest);
4789 	if (env->dst_cpu > busiest_cpu)
4790 		return 0;
4791 
4792 	env->imbalance = DIV_ROUND_CLOSEST(
4793 		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
4794 		SCHED_POWER_SCALE);
4795 
4796 	return 1;
4797 }
4798 
4799 /**
4800  * fix_small_imbalance - Calculate the minor imbalance that exists
4801  *			amongst the groups of a sched_domain, during
4802  *			load balancing.
4803  * @env: The load balancing environment.
4804  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4805  */
4806 static inline
4807 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4808 {
4809 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
4810 	unsigned int imbn = 2;
4811 	unsigned long scaled_busy_load_per_task;
4812 	struct sg_lb_stats *local, *busiest;
4813 
4814 	local = &sds->local_stat;
4815 	busiest = &sds->busiest_stat;
4816 
4817 	if (!local->sum_nr_running)
4818 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4819 	else if (busiest->load_per_task > local->load_per_task)
4820 		imbn = 1;
4821 
4822 	scaled_busy_load_per_task =
4823 		(busiest->load_per_task * SCHED_POWER_SCALE) /
4824 		busiest->group_power;
4825 
4826 	if (busiest->avg_load + scaled_busy_load_per_task >=
4827 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
4828 		env->imbalance = busiest->load_per_task;
4829 		return;
4830 	}
4831 
4832 	/*
4833 	 * OK, we don't have enough imbalance to justify moving tasks,
4834 	 * however we may be able to increase total CPU power used by
4835 	 * moving them.
4836 	 */
4837 
4838 	pwr_now += busiest->group_power *
4839 			min(busiest->load_per_task, busiest->avg_load);
4840 	pwr_now += local->group_power *
4841 			min(local->load_per_task, local->avg_load);
4842 	pwr_now /= SCHED_POWER_SCALE;
4843 
4844 	/* Amount of load we'd subtract */
4845 	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4846 		busiest->group_power;
4847 	if (busiest->avg_load > tmp) {
4848 		pwr_move += busiest->group_power *
4849 			    min(busiest->load_per_task,
4850 				busiest->avg_load - tmp);
4851 	}
4852 
4853 	/* Amount of load we'd add */
4854 	if (busiest->avg_load * busiest->group_power <
4855 	    busiest->load_per_task * SCHED_POWER_SCALE) {
4856 		tmp = (busiest->avg_load * busiest->group_power) /
4857 		      local->group_power;
4858 	} else {
4859 		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4860 		      local->group_power;
4861 	}
4862 	pwr_move += local->group_power *
4863 		    min(local->load_per_task, local->avg_load + tmp);
4864 	pwr_move /= SCHED_POWER_SCALE;
4865 
4866 	/* Move if we gain throughput */
4867 	if (pwr_move > pwr_now)
4868 		env->imbalance = busiest->load_per_task;
4869 }
4870 
4871 /**
4872  * calculate_imbalance - Calculate the amount of imbalance present within the
4873  *			 groups of a given sched_domain during load balance.
4874  * @env: load balance environment
4875  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4876  */
4877 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4878 {
4879 	unsigned long max_pull, load_above_capacity = ~0UL;
4880 	struct sg_lb_stats *local, *busiest;
4881 
4882 	local = &sds->local_stat;
4883 	busiest = &sds->busiest_stat;
4884 
4885 	if (busiest->group_imb) {
4886 		/*
4887 		 * In the group_imb case we cannot rely on group-wide averages
4888 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
4889 		 */
4890 		busiest->load_per_task =
4891 			min(busiest->load_per_task, sds->avg_load);
4892 	}
4893 
4894 	/*
4895 	 * In the presence of smp nice balancing, certain scenarios can have
4896 	 * max load less than avg load(as we skip the groups at or below
4897 	 * its cpu_power, while calculating max_load..)
4898 	 */
4899 	if (busiest->avg_load <= sds->avg_load ||
4900 	    local->avg_load >= sds->avg_load) {
4901 		env->imbalance = 0;
4902 		return fix_small_imbalance(env, sds);
4903 	}
4904 
4905 	if (!busiest->group_imb) {
4906 		/*
4907 		 * Don't want to pull so many tasks that a group would go idle.
4908 		 * Except of course for the group_imb case, since then we might
4909 		 * have to drop below capacity to reach cpu-load equilibrium.
4910 		 */
4911 		load_above_capacity =
4912 			(busiest->sum_nr_running - busiest->group_capacity);
4913 
4914 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4915 		load_above_capacity /= busiest->group_power;
4916 	}
4917 
4918 	/*
4919 	 * We're trying to get all the cpus to the average_load, so we don't
4920 	 * want to push ourselves above the average load, nor do we wish to
4921 	 * reduce the max loaded cpu below the average load. At the same time,
4922 	 * we also don't want to reduce the group load below the group capacity
4923 	 * (so that we can implement power-savings policies etc). Thus we look
4924 	 * for the minimum possible imbalance.
4925 	 */
4926 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4927 
4928 	/* How much load to actually move to equalise the imbalance */
4929 	env->imbalance = min(
4930 		max_pull * busiest->group_power,
4931 		(sds->avg_load - local->avg_load) * local->group_power
4932 	) / SCHED_POWER_SCALE;
4933 
4934 	/*
4935 	 * if *imbalance is less than the average load per runnable task
4936 	 * there is no guarantee that any tasks will be moved so we'll have
4937 	 * a think about bumping its value to force at least one task to be
4938 	 * moved
4939 	 */
4940 	if (env->imbalance < busiest->load_per_task)
4941 		return fix_small_imbalance(env, sds);
4942 }
4943 
4944 /******* find_busiest_group() helpers end here *********************/
4945 
4946 /**
4947  * find_busiest_group - Returns the busiest group within the sched_domain
4948  * if there is an imbalance. If there isn't an imbalance, and
4949  * the user has opted for power-savings, it returns a group whose
4950  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4951  * such a group exists.
4952  *
4953  * Also calculates the amount of weighted load which should be moved
4954  * to restore balance.
4955  *
4956  * @env: The load balancing environment.
4957  *
4958  * Return:	- The busiest group if imbalance exists.
4959  *		- If no imbalance and user has opted for power-savings balance,
4960  *		   return the least loaded group whose CPUs can be
4961  *		   put to idle by rebalancing its tasks onto our group.
4962  */
4963 static struct sched_group *find_busiest_group(struct lb_env *env)
4964 {
4965 	struct sg_lb_stats *local, *busiest;
4966 	struct sd_lb_stats sds;
4967 
4968 	init_sd_lb_stats(&sds);
4969 
4970 	/*
4971 	 * Compute the various statistics relavent for load balancing at
4972 	 * this level.
4973 	 */
4974 	update_sd_lb_stats(env, &sds);
4975 	local = &sds.local_stat;
4976 	busiest = &sds.busiest_stat;
4977 
4978 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4979 	    check_asym_packing(env, &sds))
4980 		return sds.busiest;
4981 
4982 	/* There is no busy sibling group to pull tasks from */
4983 	if (!sds.busiest || busiest->sum_nr_running == 0)
4984 		goto out_balanced;
4985 
4986 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4987 
4988 	/*
4989 	 * If the busiest group is imbalanced the below checks don't
4990 	 * work because they assume all things are equal, which typically
4991 	 * isn't true due to cpus_allowed constraints and the like.
4992 	 */
4993 	if (busiest->group_imb)
4994 		goto force_balance;
4995 
4996 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4997 	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4998 	    !busiest->group_has_capacity)
4999 		goto force_balance;
5000 
5001 	/*
5002 	 * If the local group is more busy than the selected busiest group
5003 	 * don't try and pull any tasks.
5004 	 */
5005 	if (local->avg_load >= busiest->avg_load)
5006 		goto out_balanced;
5007 
5008 	/*
5009 	 * Don't pull any tasks if this group is already above the domain
5010 	 * average load.
5011 	 */
5012 	if (local->avg_load >= sds.avg_load)
5013 		goto out_balanced;
5014 
5015 	if (env->idle == CPU_IDLE) {
5016 		/*
5017 		 * This cpu is idle. If the busiest group load doesn't
5018 		 * have more tasks than the number of available cpu's and
5019 		 * there is no imbalance between this and busiest group
5020 		 * wrt to idle cpu's, it is balanced.
5021 		 */
5022 		if ((local->idle_cpus < busiest->idle_cpus) &&
5023 		    busiest->sum_nr_running <= busiest->group_weight)
5024 			goto out_balanced;
5025 	} else {
5026 		/*
5027 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
5028 		 * imbalance_pct to be conservative.
5029 		 */
5030 		if (100 * busiest->avg_load <=
5031 				env->sd->imbalance_pct * local->avg_load)
5032 			goto out_balanced;
5033 	}
5034 
5035 force_balance:
5036 	/* Looks like there is an imbalance. Compute it */
5037 	calculate_imbalance(env, &sds);
5038 	return sds.busiest;
5039 
5040 out_balanced:
5041 	env->imbalance = 0;
5042 	return NULL;
5043 }
5044 
5045 /*
5046  * find_busiest_queue - find the busiest runqueue among the cpus in group.
5047  */
5048 static struct rq *find_busiest_queue(struct lb_env *env,
5049 				     struct sched_group *group)
5050 {
5051 	struct rq *busiest = NULL, *rq;
5052 	unsigned long busiest_load = 0, busiest_power = 1;
5053 	int i;
5054 
5055 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 		unsigned long power = power_of(i);
5057 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
5058 							   SCHED_POWER_SCALE);
5059 		unsigned long wl;
5060 
5061 		if (!capacity)
5062 			capacity = fix_small_capacity(env->sd, group);
5063 
5064 		rq = cpu_rq(i);
5065 		wl = weighted_cpuload(i);
5066 
5067 		/*
5068 		 * When comparing with imbalance, use weighted_cpuload()
5069 		 * which is not scaled with the cpu power.
5070 		 */
5071 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
5072 			continue;
5073 
5074 		/*
5075 		 * For the load comparisons with the other cpu's, consider
5076 		 * the weighted_cpuload() scaled with the cpu power, so that
5077 		 * the load can be moved away from the cpu that is potentially
5078 		 * running at a lower capacity.
5079 		 *
5080 		 * Thus we're looking for max(wl_i / power_i), crosswise
5081 		 * multiplication to rid ourselves of the division works out
5082 		 * to: wl_i * power_j > wl_j * power_i;  where j is our
5083 		 * previous maximum.
5084 		 */
5085 		if (wl * busiest_power > busiest_load * power) {
5086 			busiest_load = wl;
5087 			busiest_power = power;
5088 			busiest = rq;
5089 		}
5090 	}
5091 
5092 	return busiest;
5093 }
5094 
5095 /*
5096  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
5097  * so long as it is large enough.
5098  */
5099 #define MAX_PINNED_INTERVAL	512
5100 
5101 /* Working cpumask for load_balance and load_balance_newidle. */
5102 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5103 
5104 static int need_active_balance(struct lb_env *env)
5105 {
5106 	struct sched_domain *sd = env->sd;
5107 
5108 	if (env->idle == CPU_NEWLY_IDLE) {
5109 
5110 		/*
5111 		 * ASYM_PACKING needs to force migrate tasks from busy but
5112 		 * higher numbered CPUs in order to pack all tasks in the
5113 		 * lowest numbered CPUs.
5114 		 */
5115 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
5116 			return 1;
5117 	}
5118 
5119 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5120 }
5121 
5122 static int active_load_balance_cpu_stop(void *data);
5123 
5124 static int should_we_balance(struct lb_env *env)
5125 {
5126 	struct sched_group *sg = env->sd->groups;
5127 	struct cpumask *sg_cpus, *sg_mask;
5128 	int cpu, balance_cpu = -1;
5129 
5130 	/*
5131 	 * In the newly idle case, we will allow all the cpu's
5132 	 * to do the newly idle load balance.
5133 	 */
5134 	if (env->idle == CPU_NEWLY_IDLE)
5135 		return 1;
5136 
5137 	sg_cpus = sched_group_cpus(sg);
5138 	sg_mask = sched_group_mask(sg);
5139 	/* Try to find first idle cpu */
5140 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
5141 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
5142 			continue;
5143 
5144 		balance_cpu = cpu;
5145 		break;
5146 	}
5147 
5148 	if (balance_cpu == -1)
5149 		balance_cpu = group_balance_cpu(sg);
5150 
5151 	/*
5152 	 * First idle cpu or the first cpu(busiest) in this sched group
5153 	 * is eligible for doing load balancing at this and above domains.
5154 	 */
5155 	return balance_cpu == env->dst_cpu;
5156 }
5157 
5158 /*
5159  * Check this_cpu to ensure it is balanced within domain. Attempt to move
5160  * tasks if there is an imbalance.
5161  */
5162 static int load_balance(int this_cpu, struct rq *this_rq,
5163 			struct sched_domain *sd, enum cpu_idle_type idle,
5164 			int *continue_balancing)
5165 {
5166 	int ld_moved, cur_ld_moved, active_balance = 0;
5167 	struct sched_group *group;
5168 	struct rq *busiest;
5169 	unsigned long flags;
5170 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
5171 
5172 	struct lb_env env = {
5173 		.sd		= sd,
5174 		.dst_cpu	= this_cpu,
5175 		.dst_rq		= this_rq,
5176 		.dst_grpmask    = sched_group_cpus(sd->groups),
5177 		.idle		= idle,
5178 		.loop_break	= sched_nr_migrate_break,
5179 		.cpus		= cpus,
5180 	};
5181 
5182 	/*
5183 	 * For NEWLY_IDLE load_balancing, we don't need to consider
5184 	 * other cpus in our group
5185 	 */
5186 	if (idle == CPU_NEWLY_IDLE)
5187 		env.dst_grpmask = NULL;
5188 
5189 	cpumask_copy(cpus, cpu_active_mask);
5190 
5191 	schedstat_inc(sd, lb_count[idle]);
5192 
5193 redo:
5194 	if (!should_we_balance(&env)) {
5195 		*continue_balancing = 0;
5196 		goto out_balanced;
5197 	}
5198 
5199 	group = find_busiest_group(&env);
5200 	if (!group) {
5201 		schedstat_inc(sd, lb_nobusyg[idle]);
5202 		goto out_balanced;
5203 	}
5204 
5205 	busiest = find_busiest_queue(&env, group);
5206 	if (!busiest) {
5207 		schedstat_inc(sd, lb_nobusyq[idle]);
5208 		goto out_balanced;
5209 	}
5210 
5211 	BUG_ON(busiest == env.dst_rq);
5212 
5213 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5214 
5215 	ld_moved = 0;
5216 	if (busiest->nr_running > 1) {
5217 		/*
5218 		 * Attempt to move tasks. If find_busiest_group has found
5219 		 * an imbalance but busiest->nr_running <= 1, the group is
5220 		 * still unbalanced. ld_moved simply stays zero, so it is
5221 		 * correctly treated as an imbalance.
5222 		 */
5223 		env.flags |= LBF_ALL_PINNED;
5224 		env.src_cpu   = busiest->cpu;
5225 		env.src_rq    = busiest;
5226 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
5227 
5228 more_balance:
5229 		local_irq_save(flags);
5230 		double_rq_lock(env.dst_rq, busiest);
5231 
5232 		/*
5233 		 * cur_ld_moved - load moved in current iteration
5234 		 * ld_moved     - cumulative load moved across iterations
5235 		 */
5236 		cur_ld_moved = move_tasks(&env);
5237 		ld_moved += cur_ld_moved;
5238 		double_rq_unlock(env.dst_rq, busiest);
5239 		local_irq_restore(flags);
5240 
5241 		/*
5242 		 * some other cpu did the load balance for us.
5243 		 */
5244 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5245 			resched_cpu(env.dst_cpu);
5246 
5247 		if (env.flags & LBF_NEED_BREAK) {
5248 			env.flags &= ~LBF_NEED_BREAK;
5249 			goto more_balance;
5250 		}
5251 
5252 		/*
5253 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5254 		 * us and move them to an alternate dst_cpu in our sched_group
5255 		 * where they can run. The upper limit on how many times we
5256 		 * iterate on same src_cpu is dependent on number of cpus in our
5257 		 * sched_group.
5258 		 *
5259 		 * This changes load balance semantics a bit on who can move
5260 		 * load to a given_cpu. In addition to the given_cpu itself
5261 		 * (or a ilb_cpu acting on its behalf where given_cpu is
5262 		 * nohz-idle), we now have balance_cpu in a position to move
5263 		 * load to given_cpu. In rare situations, this may cause
5264 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5265 		 * _independently_ and at _same_ time to move some load to
5266 		 * given_cpu) causing exceess load to be moved to given_cpu.
5267 		 * This however should not happen so much in practice and
5268 		 * moreover subsequent load balance cycles should correct the
5269 		 * excess load moved.
5270 		 */
5271 		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5272 
5273 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
5274 			env.dst_cpu	 = env.new_dst_cpu;
5275 			env.flags	&= ~LBF_SOME_PINNED;
5276 			env.loop	 = 0;
5277 			env.loop_break	 = sched_nr_migrate_break;
5278 
5279 			/* Prevent to re-select dst_cpu via env's cpus */
5280 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281 
5282 			/*
5283 			 * Go back to "more_balance" rather than "redo" since we
5284 			 * need to continue with same src_cpu.
5285 			 */
5286 			goto more_balance;
5287 		}
5288 
5289 		/* All tasks on this runqueue were pinned by CPU affinity */
5290 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 			cpumask_clear_cpu(cpu_of(busiest), cpus);
5292 			if (!cpumask_empty(cpus)) {
5293 				env.loop = 0;
5294 				env.loop_break = sched_nr_migrate_break;
5295 				goto redo;
5296 			}
5297 			goto out_balanced;
5298 		}
5299 	}
5300 
5301 	if (!ld_moved) {
5302 		schedstat_inc(sd, lb_failed[idle]);
5303 		/*
5304 		 * Increment the failure counter only on periodic balance.
5305 		 * We do not want newidle balance, which can be very
5306 		 * frequent, pollute the failure counter causing
5307 		 * excessive cache_hot migrations and active balances.
5308 		 */
5309 		if (idle != CPU_NEWLY_IDLE)
5310 			sd->nr_balance_failed++;
5311 
5312 		if (need_active_balance(&env)) {
5313 			raw_spin_lock_irqsave(&busiest->lock, flags);
5314 
5315 			/* don't kick the active_load_balance_cpu_stop,
5316 			 * if the curr task on busiest cpu can't be
5317 			 * moved to this_cpu
5318 			 */
5319 			if (!cpumask_test_cpu(this_cpu,
5320 					tsk_cpus_allowed(busiest->curr))) {
5321 				raw_spin_unlock_irqrestore(&busiest->lock,
5322 							    flags);
5323 				env.flags |= LBF_ALL_PINNED;
5324 				goto out_one_pinned;
5325 			}
5326 
5327 			/*
5328 			 * ->active_balance synchronizes accesses to
5329 			 * ->active_balance_work.  Once set, it's cleared
5330 			 * only after active load balance is finished.
5331 			 */
5332 			if (!busiest->active_balance) {
5333 				busiest->active_balance = 1;
5334 				busiest->push_cpu = this_cpu;
5335 				active_balance = 1;
5336 			}
5337 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
5338 
5339 			if (active_balance) {
5340 				stop_one_cpu_nowait(cpu_of(busiest),
5341 					active_load_balance_cpu_stop, busiest,
5342 					&busiest->active_balance_work);
5343 			}
5344 
5345 			/*
5346 			 * We've kicked active balancing, reset the failure
5347 			 * counter.
5348 			 */
5349 			sd->nr_balance_failed = sd->cache_nice_tries+1;
5350 		}
5351 	} else
5352 		sd->nr_balance_failed = 0;
5353 
5354 	if (likely(!active_balance)) {
5355 		/* We were unbalanced, so reset the balancing interval */
5356 		sd->balance_interval = sd->min_interval;
5357 	} else {
5358 		/*
5359 		 * If we've begun active balancing, start to back off. This
5360 		 * case may not be covered by the all_pinned logic if there
5361 		 * is only 1 task on the busy runqueue (because we don't call
5362 		 * move_tasks).
5363 		 */
5364 		if (sd->balance_interval < sd->max_interval)
5365 			sd->balance_interval *= 2;
5366 	}
5367 
5368 	goto out;
5369 
5370 out_balanced:
5371 	schedstat_inc(sd, lb_balanced[idle]);
5372 
5373 	sd->nr_balance_failed = 0;
5374 
5375 out_one_pinned:
5376 	/* tune up the balancing interval */
5377 	if (((env.flags & LBF_ALL_PINNED) &&
5378 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
5379 			(sd->balance_interval < sd->max_interval))
5380 		sd->balance_interval *= 2;
5381 
5382 	ld_moved = 0;
5383 out:
5384 	return ld_moved;
5385 }
5386 
5387 /*
5388  * idle_balance is called by schedule() if this_cpu is about to become
5389  * idle. Attempts to pull tasks from other CPUs.
5390  */
5391 void idle_balance(int this_cpu, struct rq *this_rq)
5392 {
5393 	struct sched_domain *sd;
5394 	int pulled_task = 0;
5395 	unsigned long next_balance = jiffies + HZ;
5396 
5397 	this_rq->idle_stamp = rq_clock(this_rq);
5398 
5399 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
5400 		return;
5401 
5402 	/*
5403 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
5404 	 */
5405 	raw_spin_unlock(&this_rq->lock);
5406 
5407 	update_blocked_averages(this_cpu);
5408 	rcu_read_lock();
5409 	for_each_domain(this_cpu, sd) {
5410 		unsigned long interval;
5411 		int continue_balancing = 1;
5412 
5413 		if (!(sd->flags & SD_LOAD_BALANCE))
5414 			continue;
5415 
5416 		if (sd->flags & SD_BALANCE_NEWIDLE) {
5417 			/* If we've pulled tasks over stop searching: */
5418 			pulled_task = load_balance(this_cpu, this_rq,
5419 						   sd, CPU_NEWLY_IDLE,
5420 						   &continue_balancing);
5421 		}
5422 
5423 		interval = msecs_to_jiffies(sd->balance_interval);
5424 		if (time_after(next_balance, sd->last_balance + interval))
5425 			next_balance = sd->last_balance + interval;
5426 		if (pulled_task) {
5427 			this_rq->idle_stamp = 0;
5428 			break;
5429 		}
5430 	}
5431 	rcu_read_unlock();
5432 
5433 	raw_spin_lock(&this_rq->lock);
5434 
5435 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5436 		/*
5437 		 * We are going idle. next_balance may be set based on
5438 		 * a busy processor. So reset next_balance.
5439 		 */
5440 		this_rq->next_balance = next_balance;
5441 	}
5442 }
5443 
5444 /*
5445  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5446  * running tasks off the busiest CPU onto idle CPUs. It requires at
5447  * least 1 task to be running on each physical CPU where possible, and
5448  * avoids physical / logical imbalances.
5449  */
5450 static int active_load_balance_cpu_stop(void *data)
5451 {
5452 	struct rq *busiest_rq = data;
5453 	int busiest_cpu = cpu_of(busiest_rq);
5454 	int target_cpu = busiest_rq->push_cpu;
5455 	struct rq *target_rq = cpu_rq(target_cpu);
5456 	struct sched_domain *sd;
5457 
5458 	raw_spin_lock_irq(&busiest_rq->lock);
5459 
5460 	/* make sure the requested cpu hasn't gone down in the meantime */
5461 	if (unlikely(busiest_cpu != smp_processor_id() ||
5462 		     !busiest_rq->active_balance))
5463 		goto out_unlock;
5464 
5465 	/* Is there any task to move? */
5466 	if (busiest_rq->nr_running <= 1)
5467 		goto out_unlock;
5468 
5469 	/*
5470 	 * This condition is "impossible", if it occurs
5471 	 * we need to fix it. Originally reported by
5472 	 * Bjorn Helgaas on a 128-cpu setup.
5473 	 */
5474 	BUG_ON(busiest_rq == target_rq);
5475 
5476 	/* move a task from busiest_rq to target_rq */
5477 	double_lock_balance(busiest_rq, target_rq);
5478 
5479 	/* Search for an sd spanning us and the target CPU. */
5480 	rcu_read_lock();
5481 	for_each_domain(target_cpu, sd) {
5482 		if ((sd->flags & SD_LOAD_BALANCE) &&
5483 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5484 				break;
5485 	}
5486 
5487 	if (likely(sd)) {
5488 		struct lb_env env = {
5489 			.sd		= sd,
5490 			.dst_cpu	= target_cpu,
5491 			.dst_rq		= target_rq,
5492 			.src_cpu	= busiest_rq->cpu,
5493 			.src_rq		= busiest_rq,
5494 			.idle		= CPU_IDLE,
5495 		};
5496 
5497 		schedstat_inc(sd, alb_count);
5498 
5499 		if (move_one_task(&env))
5500 			schedstat_inc(sd, alb_pushed);
5501 		else
5502 			schedstat_inc(sd, alb_failed);
5503 	}
5504 	rcu_read_unlock();
5505 	double_unlock_balance(busiest_rq, target_rq);
5506 out_unlock:
5507 	busiest_rq->active_balance = 0;
5508 	raw_spin_unlock_irq(&busiest_rq->lock);
5509 	return 0;
5510 }
5511 
5512 #ifdef CONFIG_NO_HZ_COMMON
5513 /*
5514  * idle load balancing details
5515  * - When one of the busy CPUs notice that there may be an idle rebalancing
5516  *   needed, they will kick the idle load balancer, which then does idle
5517  *   load balancing for all the idle CPUs.
5518  */
5519 static struct {
5520 	cpumask_var_t idle_cpus_mask;
5521 	atomic_t nr_cpus;
5522 	unsigned long next_balance;     /* in jiffy units */
5523 } nohz ____cacheline_aligned;
5524 
5525 static inline int find_new_ilb(int call_cpu)
5526 {
5527 	int ilb = cpumask_first(nohz.idle_cpus_mask);
5528 
5529 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
5530 		return ilb;
5531 
5532 	return nr_cpu_ids;
5533 }
5534 
5535 /*
5536  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5537  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
5538  * CPU (if there is one).
5539  */
5540 static void nohz_balancer_kick(int cpu)
5541 {
5542 	int ilb_cpu;
5543 
5544 	nohz.next_balance++;
5545 
5546 	ilb_cpu = find_new_ilb(cpu);
5547 
5548 	if (ilb_cpu >= nr_cpu_ids)
5549 		return;
5550 
5551 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5552 		return;
5553 	/*
5554 	 * Use smp_send_reschedule() instead of resched_cpu().
5555 	 * This way we generate a sched IPI on the target cpu which
5556 	 * is idle. And the softirq performing nohz idle load balance
5557 	 * will be run before returning from the IPI.
5558 	 */
5559 	smp_send_reschedule(ilb_cpu);
5560 	return;
5561 }
5562 
5563 static inline void nohz_balance_exit_idle(int cpu)
5564 {
5565 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5566 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5567 		atomic_dec(&nohz.nr_cpus);
5568 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5569 	}
5570 }
5571 
5572 static inline void set_cpu_sd_state_busy(void)
5573 {
5574 	struct sched_domain *sd;
5575 
5576 	rcu_read_lock();
5577 	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5578 
5579 	if (!sd || !sd->nohz_idle)
5580 		goto unlock;
5581 	sd->nohz_idle = 0;
5582 
5583 	for (; sd; sd = sd->parent)
5584 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5585 unlock:
5586 	rcu_read_unlock();
5587 }
5588 
5589 void set_cpu_sd_state_idle(void)
5590 {
5591 	struct sched_domain *sd;
5592 
5593 	rcu_read_lock();
5594 	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5595 
5596 	if (!sd || sd->nohz_idle)
5597 		goto unlock;
5598 	sd->nohz_idle = 1;
5599 
5600 	for (; sd; sd = sd->parent)
5601 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5602 unlock:
5603 	rcu_read_unlock();
5604 }
5605 
5606 /*
5607  * This routine will record that the cpu is going idle with tick stopped.
5608  * This info will be used in performing idle load balancing in the future.
5609  */
5610 void nohz_balance_enter_idle(int cpu)
5611 {
5612 	/*
5613 	 * If this cpu is going down, then nothing needs to be done.
5614 	 */
5615 	if (!cpu_active(cpu))
5616 		return;
5617 
5618 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5619 		return;
5620 
5621 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5622 	atomic_inc(&nohz.nr_cpus);
5623 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5624 }
5625 
5626 static int sched_ilb_notifier(struct notifier_block *nfb,
5627 					unsigned long action, void *hcpu)
5628 {
5629 	switch (action & ~CPU_TASKS_FROZEN) {
5630 	case CPU_DYING:
5631 		nohz_balance_exit_idle(smp_processor_id());
5632 		return NOTIFY_OK;
5633 	default:
5634 		return NOTIFY_DONE;
5635 	}
5636 }
5637 #endif
5638 
5639 static DEFINE_SPINLOCK(balancing);
5640 
5641 /*
5642  * Scale the max load_balance interval with the number of CPUs in the system.
5643  * This trades load-balance latency on larger machines for less cross talk.
5644  */
5645 void update_max_interval(void)
5646 {
5647 	max_load_balance_interval = HZ*num_online_cpus()/10;
5648 }
5649 
5650 /*
5651  * It checks each scheduling domain to see if it is due to be balanced,
5652  * and initiates a balancing operation if so.
5653  *
5654  * Balancing parameters are set up in init_sched_domains.
5655  */
5656 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5657 {
5658 	int continue_balancing = 1;
5659 	struct rq *rq = cpu_rq(cpu);
5660 	unsigned long interval;
5661 	struct sched_domain *sd;
5662 	/* Earliest time when we have to do rebalance again */
5663 	unsigned long next_balance = jiffies + 60*HZ;
5664 	int update_next_balance = 0;
5665 	int need_serialize;
5666 
5667 	update_blocked_averages(cpu);
5668 
5669 	rcu_read_lock();
5670 	for_each_domain(cpu, sd) {
5671 		if (!(sd->flags & SD_LOAD_BALANCE))
5672 			continue;
5673 
5674 		interval = sd->balance_interval;
5675 		if (idle != CPU_IDLE)
5676 			interval *= sd->busy_factor;
5677 
5678 		/* scale ms to jiffies */
5679 		interval = msecs_to_jiffies(interval);
5680 		interval = clamp(interval, 1UL, max_load_balance_interval);
5681 
5682 		need_serialize = sd->flags & SD_SERIALIZE;
5683 
5684 		if (need_serialize) {
5685 			if (!spin_trylock(&balancing))
5686 				goto out;
5687 		}
5688 
5689 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 				/*
5692 				 * The LBF_SOME_PINNED logic could have changed
5693 				 * env->dst_cpu, so we can't know our idle
5694 				 * state even if we migrated tasks. Update it.
5695 				 */
5696 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5697 			}
5698 			sd->last_balance = jiffies;
5699 		}
5700 		if (need_serialize)
5701 			spin_unlock(&balancing);
5702 out:
5703 		if (time_after(next_balance, sd->last_balance + interval)) {
5704 			next_balance = sd->last_balance + interval;
5705 			update_next_balance = 1;
5706 		}
5707 
5708 		/*
5709 		 * Stop the load balance at this level. There is another
5710 		 * CPU in our sched group which is doing load balancing more
5711 		 * actively.
5712 		 */
5713 		if (!continue_balancing)
5714 			break;
5715 	}
5716 	rcu_read_unlock();
5717 
5718 	/*
5719 	 * next_balance will be updated only when there is a need.
5720 	 * When the cpu is attached to null domain for ex, it will not be
5721 	 * updated.
5722 	 */
5723 	if (likely(update_next_balance))
5724 		rq->next_balance = next_balance;
5725 }
5726 
5727 #ifdef CONFIG_NO_HZ_COMMON
5728 /*
5729  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5730  * rebalancing for all the cpus for whom scheduler ticks are stopped.
5731  */
5732 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5733 {
5734 	struct rq *this_rq = cpu_rq(this_cpu);
5735 	struct rq *rq;
5736 	int balance_cpu;
5737 
5738 	if (idle != CPU_IDLE ||
5739 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5740 		goto end;
5741 
5742 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5743 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5744 			continue;
5745 
5746 		/*
5747 		 * If this cpu gets work to do, stop the load balancing
5748 		 * work being done for other cpus. Next load
5749 		 * balancing owner will pick it up.
5750 		 */
5751 		if (need_resched())
5752 			break;
5753 
5754 		rq = cpu_rq(balance_cpu);
5755 
5756 		raw_spin_lock_irq(&rq->lock);
5757 		update_rq_clock(rq);
5758 		update_idle_cpu_load(rq);
5759 		raw_spin_unlock_irq(&rq->lock);
5760 
5761 		rebalance_domains(balance_cpu, CPU_IDLE);
5762 
5763 		if (time_after(this_rq->next_balance, rq->next_balance))
5764 			this_rq->next_balance = rq->next_balance;
5765 	}
5766 	nohz.next_balance = this_rq->next_balance;
5767 end:
5768 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5769 }
5770 
5771 /*
5772  * Current heuristic for kicking the idle load balancer in the presence
5773  * of an idle cpu is the system.
5774  *   - This rq has more than one task.
5775  *   - At any scheduler domain level, this cpu's scheduler group has multiple
5776  *     busy cpu's exceeding the group's power.
5777  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
5778  *     domain span are idle.
5779  */
5780 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5781 {
5782 	unsigned long now = jiffies;
5783 	struct sched_domain *sd;
5784 
5785 	if (unlikely(idle_cpu(cpu)))
5786 		return 0;
5787 
5788        /*
5789 	* We may be recently in ticked or tickless idle mode. At the first
5790 	* busy tick after returning from idle, we will update the busy stats.
5791 	*/
5792 	set_cpu_sd_state_busy();
5793 	nohz_balance_exit_idle(cpu);
5794 
5795 	/*
5796 	 * None are in tickless mode and hence no need for NOHZ idle load
5797 	 * balancing.
5798 	 */
5799 	if (likely(!atomic_read(&nohz.nr_cpus)))
5800 		return 0;
5801 
5802 	if (time_before(now, nohz.next_balance))
5803 		return 0;
5804 
5805 	if (rq->nr_running >= 2)
5806 		goto need_kick;
5807 
5808 	rcu_read_lock();
5809 	for_each_domain(cpu, sd) {
5810 		struct sched_group *sg = sd->groups;
5811 		struct sched_group_power *sgp = sg->sgp;
5812 		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5813 
5814 		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5815 			goto need_kick_unlock;
5816 
5817 		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5818 		    && (cpumask_first_and(nohz.idle_cpus_mask,
5819 					  sched_domain_span(sd)) < cpu))
5820 			goto need_kick_unlock;
5821 
5822 		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5823 			break;
5824 	}
5825 	rcu_read_unlock();
5826 	return 0;
5827 
5828 need_kick_unlock:
5829 	rcu_read_unlock();
5830 need_kick:
5831 	return 1;
5832 }
5833 #else
5834 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5835 #endif
5836 
5837 /*
5838  * run_rebalance_domains is triggered when needed from the scheduler tick.
5839  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5840  */
5841 static void run_rebalance_domains(struct softirq_action *h)
5842 {
5843 	int this_cpu = smp_processor_id();
5844 	struct rq *this_rq = cpu_rq(this_cpu);
5845 	enum cpu_idle_type idle = this_rq->idle_balance ?
5846 						CPU_IDLE : CPU_NOT_IDLE;
5847 
5848 	rebalance_domains(this_cpu, idle);
5849 
5850 	/*
5851 	 * If this cpu has a pending nohz_balance_kick, then do the
5852 	 * balancing on behalf of the other idle cpus whose ticks are
5853 	 * stopped.
5854 	 */
5855 	nohz_idle_balance(this_cpu, idle);
5856 }
5857 
5858 static inline int on_null_domain(int cpu)
5859 {
5860 	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5861 }
5862 
5863 /*
5864  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5865  */
5866 void trigger_load_balance(struct rq *rq, int cpu)
5867 {
5868 	/* Don't need to rebalance while attached to NULL domain */
5869 	if (time_after_eq(jiffies, rq->next_balance) &&
5870 	    likely(!on_null_domain(cpu)))
5871 		raise_softirq(SCHED_SOFTIRQ);
5872 #ifdef CONFIG_NO_HZ_COMMON
5873 	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5874 		nohz_balancer_kick(cpu);
5875 #endif
5876 }
5877 
5878 static void rq_online_fair(struct rq *rq)
5879 {
5880 	update_sysctl();
5881 }
5882 
5883 static void rq_offline_fair(struct rq *rq)
5884 {
5885 	update_sysctl();
5886 
5887 	/* Ensure any throttled groups are reachable by pick_next_task */
5888 	unthrottle_offline_cfs_rqs(rq);
5889 }
5890 
5891 #endif /* CONFIG_SMP */
5892 
5893 /*
5894  * scheduler tick hitting a task of our scheduling class:
5895  */
5896 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5897 {
5898 	struct cfs_rq *cfs_rq;
5899 	struct sched_entity *se = &curr->se;
5900 
5901 	for_each_sched_entity(se) {
5902 		cfs_rq = cfs_rq_of(se);
5903 		entity_tick(cfs_rq, se, queued);
5904 	}
5905 
5906 	if (numabalancing_enabled)
5907 		task_tick_numa(rq, curr);
5908 
5909 	update_rq_runnable_avg(rq, 1);
5910 }
5911 
5912 /*
5913  * called on fork with the child task as argument from the parent's context
5914  *  - child not yet on the tasklist
5915  *  - preemption disabled
5916  */
5917 static void task_fork_fair(struct task_struct *p)
5918 {
5919 	struct cfs_rq *cfs_rq;
5920 	struct sched_entity *se = &p->se, *curr;
5921 	int this_cpu = smp_processor_id();
5922 	struct rq *rq = this_rq();
5923 	unsigned long flags;
5924 
5925 	raw_spin_lock_irqsave(&rq->lock, flags);
5926 
5927 	update_rq_clock(rq);
5928 
5929 	cfs_rq = task_cfs_rq(current);
5930 	curr = cfs_rq->curr;
5931 
5932 	/*
5933 	 * Not only the cpu but also the task_group of the parent might have
5934 	 * been changed after parent->se.parent,cfs_rq were copied to
5935 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
5936 	 * of child point to valid ones.
5937 	 */
5938 	rcu_read_lock();
5939 	__set_task_cpu(p, this_cpu);
5940 	rcu_read_unlock();
5941 
5942 	update_curr(cfs_rq);
5943 
5944 	if (curr)
5945 		se->vruntime = curr->vruntime;
5946 	place_entity(cfs_rq, se, 1);
5947 
5948 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5949 		/*
5950 		 * Upon rescheduling, sched_class::put_prev_task() will place
5951 		 * 'current' within the tree based on its new key value.
5952 		 */
5953 		swap(curr->vruntime, se->vruntime);
5954 		resched_task(rq->curr);
5955 	}
5956 
5957 	se->vruntime -= cfs_rq->min_vruntime;
5958 
5959 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5960 }
5961 
5962 /*
5963  * Priority of the task has changed. Check to see if we preempt
5964  * the current task.
5965  */
5966 static void
5967 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5968 {
5969 	if (!p->se.on_rq)
5970 		return;
5971 
5972 	/*
5973 	 * Reschedule if we are currently running on this runqueue and
5974 	 * our priority decreased, or if we are not currently running on
5975 	 * this runqueue and our priority is higher than the current's
5976 	 */
5977 	if (rq->curr == p) {
5978 		if (p->prio > oldprio)
5979 			resched_task(rq->curr);
5980 	} else
5981 		check_preempt_curr(rq, p, 0);
5982 }
5983 
5984 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5985 {
5986 	struct sched_entity *se = &p->se;
5987 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5988 
5989 	/*
5990 	 * Ensure the task's vruntime is normalized, so that when its
5991 	 * switched back to the fair class the enqueue_entity(.flags=0) will
5992 	 * do the right thing.
5993 	 *
5994 	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5995 	 * have normalized the vruntime, if it was !on_rq, then only when
5996 	 * the task is sleeping will it still have non-normalized vruntime.
5997 	 */
5998 	if (!se->on_rq && p->state != TASK_RUNNING) {
5999 		/*
6000 		 * Fix up our vruntime so that the current sleep doesn't
6001 		 * cause 'unlimited' sleep bonus.
6002 		 */
6003 		place_entity(cfs_rq, se, 0);
6004 		se->vruntime -= cfs_rq->min_vruntime;
6005 	}
6006 
6007 #ifdef CONFIG_SMP
6008 	/*
6009 	* Remove our load from contribution when we leave sched_fair
6010 	* and ensure we don't carry in an old decay_count if we
6011 	* switch back.
6012 	*/
6013 	if (se->avg.decay_count) {
6014 		__synchronize_entity_decay(se);
6015 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
6016 	}
6017 #endif
6018 }
6019 
6020 /*
6021  * We switched to the sched_fair class.
6022  */
6023 static void switched_to_fair(struct rq *rq, struct task_struct *p)
6024 {
6025 	if (!p->se.on_rq)
6026 		return;
6027 
6028 	/*
6029 	 * We were most likely switched from sched_rt, so
6030 	 * kick off the schedule if running, otherwise just see
6031 	 * if we can still preempt the current task.
6032 	 */
6033 	if (rq->curr == p)
6034 		resched_task(rq->curr);
6035 	else
6036 		check_preempt_curr(rq, p, 0);
6037 }
6038 
6039 /* Account for a task changing its policy or group.
6040  *
6041  * This routine is mostly called to set cfs_rq->curr field when a task
6042  * migrates between groups/classes.
6043  */
6044 static void set_curr_task_fair(struct rq *rq)
6045 {
6046 	struct sched_entity *se = &rq->curr->se;
6047 
6048 	for_each_sched_entity(se) {
6049 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
6050 
6051 		set_next_entity(cfs_rq, se);
6052 		/* ensure bandwidth has been allocated on our new cfs_rq */
6053 		account_cfs_rq_runtime(cfs_rq, 0);
6054 	}
6055 }
6056 
6057 void init_cfs_rq(struct cfs_rq *cfs_rq)
6058 {
6059 	cfs_rq->tasks_timeline = RB_ROOT;
6060 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6061 #ifndef CONFIG_64BIT
6062 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
6063 #endif
6064 #ifdef CONFIG_SMP
6065 	atomic64_set(&cfs_rq->decay_counter, 1);
6066 	atomic_long_set(&cfs_rq->removed_load, 0);
6067 #endif
6068 }
6069 
6070 #ifdef CONFIG_FAIR_GROUP_SCHED
6071 static void task_move_group_fair(struct task_struct *p, int on_rq)
6072 {
6073 	struct cfs_rq *cfs_rq;
6074 	/*
6075 	 * If the task was not on the rq at the time of this cgroup movement
6076 	 * it must have been asleep, sleeping tasks keep their ->vruntime
6077 	 * absolute on their old rq until wakeup (needed for the fair sleeper
6078 	 * bonus in place_entity()).
6079 	 *
6080 	 * If it was on the rq, we've just 'preempted' it, which does convert
6081 	 * ->vruntime to a relative base.
6082 	 *
6083 	 * Make sure both cases convert their relative position when migrating
6084 	 * to another cgroup's rq. This does somewhat interfere with the
6085 	 * fair sleeper stuff for the first placement, but who cares.
6086 	 */
6087 	/*
6088 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
6089 	 * But there are some cases where it has already been normalized:
6090 	 *
6091 	 * - Moving a forked child which is waiting for being woken up by
6092 	 *   wake_up_new_task().
6093 	 * - Moving a task which has been woken up by try_to_wake_up() and
6094 	 *   waiting for actually being woken up by sched_ttwu_pending().
6095 	 *
6096 	 * To prevent boost or penalty in the new cfs_rq caused by delta
6097 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
6098 	 */
6099 	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
6100 		on_rq = 1;
6101 
6102 	if (!on_rq)
6103 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
6104 	set_task_rq(p, task_cpu(p));
6105 	if (!on_rq) {
6106 		cfs_rq = cfs_rq_of(&p->se);
6107 		p->se.vruntime += cfs_rq->min_vruntime;
6108 #ifdef CONFIG_SMP
6109 		/*
6110 		 * migrate_task_rq_fair() will have removed our previous
6111 		 * contribution, but we must synchronize for ongoing future
6112 		 * decay.
6113 		 */
6114 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
6115 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
6116 #endif
6117 	}
6118 }
6119 
6120 void free_fair_sched_group(struct task_group *tg)
6121 {
6122 	int i;
6123 
6124 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
6125 
6126 	for_each_possible_cpu(i) {
6127 		if (tg->cfs_rq)
6128 			kfree(tg->cfs_rq[i]);
6129 		if (tg->se)
6130 			kfree(tg->se[i]);
6131 	}
6132 
6133 	kfree(tg->cfs_rq);
6134 	kfree(tg->se);
6135 }
6136 
6137 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6138 {
6139 	struct cfs_rq *cfs_rq;
6140 	struct sched_entity *se;
6141 	int i;
6142 
6143 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
6144 	if (!tg->cfs_rq)
6145 		goto err;
6146 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
6147 	if (!tg->se)
6148 		goto err;
6149 
6150 	tg->shares = NICE_0_LOAD;
6151 
6152 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
6153 
6154 	for_each_possible_cpu(i) {
6155 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
6156 				      GFP_KERNEL, cpu_to_node(i));
6157 		if (!cfs_rq)
6158 			goto err;
6159 
6160 		se = kzalloc_node(sizeof(struct sched_entity),
6161 				  GFP_KERNEL, cpu_to_node(i));
6162 		if (!se)
6163 			goto err_free_rq;
6164 
6165 		init_cfs_rq(cfs_rq);
6166 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
6167 	}
6168 
6169 	return 1;
6170 
6171 err_free_rq:
6172 	kfree(cfs_rq);
6173 err:
6174 	return 0;
6175 }
6176 
6177 void unregister_fair_sched_group(struct task_group *tg, int cpu)
6178 {
6179 	struct rq *rq = cpu_rq(cpu);
6180 	unsigned long flags;
6181 
6182 	/*
6183 	* Only empty task groups can be destroyed; so we can speculatively
6184 	* check on_list without danger of it being re-added.
6185 	*/
6186 	if (!tg->cfs_rq[cpu]->on_list)
6187 		return;
6188 
6189 	raw_spin_lock_irqsave(&rq->lock, flags);
6190 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6191 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6192 }
6193 
6194 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6195 			struct sched_entity *se, int cpu,
6196 			struct sched_entity *parent)
6197 {
6198 	struct rq *rq = cpu_rq(cpu);
6199 
6200 	cfs_rq->tg = tg;
6201 	cfs_rq->rq = rq;
6202 	init_cfs_rq_runtime(cfs_rq);
6203 
6204 	tg->cfs_rq[cpu] = cfs_rq;
6205 	tg->se[cpu] = se;
6206 
6207 	/* se could be NULL for root_task_group */
6208 	if (!se)
6209 		return;
6210 
6211 	if (!parent)
6212 		se->cfs_rq = &rq->cfs;
6213 	else
6214 		se->cfs_rq = parent->my_q;
6215 
6216 	se->my_q = cfs_rq;
6217 	update_load_set(&se->load, 0);
6218 	se->parent = parent;
6219 }
6220 
6221 static DEFINE_MUTEX(shares_mutex);
6222 
6223 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6224 {
6225 	int i;
6226 	unsigned long flags;
6227 
6228 	/*
6229 	 * We can't change the weight of the root cgroup.
6230 	 */
6231 	if (!tg->se[0])
6232 		return -EINVAL;
6233 
6234 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6235 
6236 	mutex_lock(&shares_mutex);
6237 	if (tg->shares == shares)
6238 		goto done;
6239 
6240 	tg->shares = shares;
6241 	for_each_possible_cpu(i) {
6242 		struct rq *rq = cpu_rq(i);
6243 		struct sched_entity *se;
6244 
6245 		se = tg->se[i];
6246 		/* Propagate contribution to hierarchy */
6247 		raw_spin_lock_irqsave(&rq->lock, flags);
6248 
6249 		/* Possible calls to update_curr() need rq clock */
6250 		update_rq_clock(rq);
6251 		for_each_sched_entity(se)
6252 			update_cfs_shares(group_cfs_rq(se));
6253 		raw_spin_unlock_irqrestore(&rq->lock, flags);
6254 	}
6255 
6256 done:
6257 	mutex_unlock(&shares_mutex);
6258 	return 0;
6259 }
6260 #else /* CONFIG_FAIR_GROUP_SCHED */
6261 
6262 void free_fair_sched_group(struct task_group *tg) { }
6263 
6264 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6265 {
6266 	return 1;
6267 }
6268 
6269 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6270 
6271 #endif /* CONFIG_FAIR_GROUP_SCHED */
6272 
6273 
6274 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6275 {
6276 	struct sched_entity *se = &task->se;
6277 	unsigned int rr_interval = 0;
6278 
6279 	/*
6280 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6281 	 * idle runqueue:
6282 	 */
6283 	if (rq->cfs.load.weight)
6284 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
6285 
6286 	return rr_interval;
6287 }
6288 
6289 /*
6290  * All the scheduling class methods:
6291  */
6292 const struct sched_class fair_sched_class = {
6293 	.next			= &idle_sched_class,
6294 	.enqueue_task		= enqueue_task_fair,
6295 	.dequeue_task		= dequeue_task_fair,
6296 	.yield_task		= yield_task_fair,
6297 	.yield_to_task		= yield_to_task_fair,
6298 
6299 	.check_preempt_curr	= check_preempt_wakeup,
6300 
6301 	.pick_next_task		= pick_next_task_fair,
6302 	.put_prev_task		= put_prev_task_fair,
6303 
6304 #ifdef CONFIG_SMP
6305 	.select_task_rq		= select_task_rq_fair,
6306 	.migrate_task_rq	= migrate_task_rq_fair,
6307 
6308 	.rq_online		= rq_online_fair,
6309 	.rq_offline		= rq_offline_fair,
6310 
6311 	.task_waking		= task_waking_fair,
6312 #endif
6313 
6314 	.set_curr_task          = set_curr_task_fair,
6315 	.task_tick		= task_tick_fair,
6316 	.task_fork		= task_fork_fair,
6317 
6318 	.prio_changed		= prio_changed_fair,
6319 	.switched_from		= switched_from_fair,
6320 	.switched_to		= switched_to_fair,
6321 
6322 	.get_rr_interval	= get_rr_interval_fair,
6323 
6324 #ifdef CONFIG_FAIR_GROUP_SCHED
6325 	.task_move_group	= task_move_group_fair,
6326 #endif
6327 };
6328 
6329 #ifdef CONFIG_SCHED_DEBUG
6330 void print_cfs_stats(struct seq_file *m, int cpu)
6331 {
6332 	struct cfs_rq *cfs_rq;
6333 
6334 	rcu_read_lock();
6335 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6336 		print_cfs_rq(m, cpu, cfs_rq);
6337 	rcu_read_unlock();
6338 }
6339 #endif
6340 
6341 __init void init_sched_fair_class(void)
6342 {
6343 #ifdef CONFIG_SMP
6344 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6345 
6346 #ifdef CONFIG_NO_HZ_COMMON
6347 	nohz.next_balance = jiffies;
6348 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6349 	cpu_notifier(sched_ilb_notifier, 0);
6350 #endif
6351 #endif /* SMP */
6352 
6353 }
6354