xref: /openbmc/linux/kernel/sched/fair.c (revision 37be287c)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32 
33 #include <trace/events/sched.h>
34 
35 #include "sched.h"
36 
37 /*
38  * Targeted preemption latency for CPU-bound tasks:
39  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40  *
41  * NOTE: this latency value is not the same as the concept of
42  * 'timeslice length' - timeslices in CFS are of variable length
43  * and have no persistent notion like in traditional, time-slice
44  * based scheduling concepts.
45  *
46  * (to see the precise effective timeslice length of your workload,
47  *  run vmstat and monitor the context-switches (cs) field)
48  */
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51 
52 /*
53  * The initial- and re-scaling of tunables is configurable
54  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
55  *
56  * Options are:
57  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60  */
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 	= SCHED_TUNABLESCALING_LOG;
63 
64 /*
65  * Minimal preemption granularity for CPU-bound tasks:
66  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67  */
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70 
71 /*
72  * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73  */
74 static unsigned int sched_nr_latency = 8;
75 
76 /*
77  * After fork, child runs first. If set to 0 (default) then
78  * parent will (try to) run first.
79  */
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
81 
82 /*
83  * SCHED_OTHER wake-up granularity.
84  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85  *
86  * This option delays the preemption effects of decoupled workloads
87  * and reduces their over-scheduling. Synchronous workloads will still
88  * have immediate wakeup/sleep latencies.
89  */
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92 
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94 
95 /*
96  * The exponential sliding window over which load is averaged for shares
97  * distribution.
98  * (default: 10msec)
99  */
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101 
102 #ifdef CONFIG_CFS_BANDWIDTH
103 /*
104  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105  * each time a cfs_rq requests quota.
106  *
107  * Note: in the case that the slice exceeds the runtime remaining (either due
108  * to consumption or the quota being specified to be smaller than the slice)
109  * we will always only issue the remaining available time.
110  *
111  * default: 5 msec, units: microseconds
112  */
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114 #endif
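/*
 * Worked example (hypothetical quota): with a 100ms period, a 20ms quota
 * and the default 5ms slice, a busy cfs_rq draws runtime from the global
 * pool in 5ms chunks, so at most four pulls exhaust the quota; if only 3ms
 * of quota remain, the pull is trimmed to those 3ms as noted above.
 */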
115 
116 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117 {
118 	lw->weight += inc;
119 	lw->inv_weight = 0;
120 }
121 
122 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123 {
124 	lw->weight -= dec;
125 	lw->inv_weight = 0;
126 }
127 
128 static inline void update_load_set(struct load_weight *lw, unsigned long w)
129 {
130 	lw->weight = w;
131 	lw->inv_weight = 0;
132 }
133 
134 /*
135  * Increase the granularity value when there are more CPUs,
136  * because with more CPUs the 'effective latency' as visible
137  * to users decreases. But the relationship is not linear,
138  * so pick a second-best guess by going with the log2 of the
139  * number of CPUs.
140  *
141  * This idea comes from the SD scheduler of Con Kolivas:
142  */
143 static int get_update_sysctl_factor(void)
144 {
145 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
146 	unsigned int factor;
147 
148 	switch (sysctl_sched_tunable_scaling) {
149 	case SCHED_TUNABLESCALING_NONE:
150 		factor = 1;
151 		break;
152 	case SCHED_TUNABLESCALING_LINEAR:
153 		factor = cpus;
154 		break;
155 	case SCHED_TUNABLESCALING_LOG:
156 	default:
157 		factor = 1 + ilog2(cpus);
158 		break;
159 	}
160 
161 	return factor;
162 }
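/*
 * Worked example (default LOG scaling): a box with 8 or more online CPUs
 * clamps cpus to 8, giving factor = 1 + ilog2(8) = 4, so the effective
 * sysctl_sched_latency becomes 4 * 6ms = 24ms and
 * sysctl_sched_min_granularity becomes 4 * 0.75ms = 3ms; a dual-core box
 * gets factor = 1 + ilog2(2) = 2.
 */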
163 
164 static void update_sysctl(void)
165 {
166 	unsigned int factor = get_update_sysctl_factor();
167 
168 #define SET_SYSCTL(name) \
169 	(sysctl_##name = (factor) * normalized_sysctl_##name)
170 	SET_SYSCTL(sched_min_granularity);
171 	SET_SYSCTL(sched_latency);
172 	SET_SYSCTL(sched_wakeup_granularity);
173 #undef SET_SYSCTL
174 }
175 
176 void sched_init_granularity(void)
177 {
178 	update_sysctl();
179 }
180 
181 #define WMULT_CONST	(~0U)
182 #define WMULT_SHIFT	32
183 
184 static void __update_inv_weight(struct load_weight *lw)
185 {
186 	unsigned long w;
187 
188 	if (likely(lw->inv_weight))
189 		return;
190 
191 	w = scale_load_down(lw->weight);
192 
193 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 		lw->inv_weight = 1;
195 	else if (unlikely(!w))
196 		lw->inv_weight = WMULT_CONST;
197 	else
198 		lw->inv_weight = WMULT_CONST / w;
199 }
200 
201 /*
202  * delta_exec * weight / lw.weight
203  *   OR
204  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205  *
206  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207  * we're guaranteed shift stays positive because inv_weight is guaranteed to
208  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209  *
210  * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
211  * weight/lw.weight <= 1, and therefore our shift will also be positive.
212  */
213 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
214 {
215 	u64 fact = scale_load_down(weight);
216 	int shift = WMULT_SHIFT;
217 
218 	__update_inv_weight(lw);
219 
220 	if (unlikely(fact >> 32)) {
221 		while (fact >> 32) {
222 			fact >>= 1;
223 			shift--;
224 		}
225 	}
226 
227 	/* hint to use a 32x32->64 mul */
228 	fact = (u64)(u32)fact * lw->inv_weight;
229 
230 	while (fact >> 32) {
231 		fact >>= 1;
232 		shift--;
233 	}
234 
235 	return mul_u64_u32_shr(delta_exec, fact, shift);
236 }
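/*
 * Worked example (hypothetical weights): for delta_exec = 6000000ns,
 * weight = NICE_0_LOAD (1024 after scale_load_down) and lw->weight = 2048
 * (two nice-0 tasks), __update_inv_weight() gives
 * inv_weight = 0xffffffff / 2048 = 2097151, fact = 1024 * 2097151 still fits
 * in 32 bits, and mul_u64_u32_shr(6000000, fact, 32) ~= 3000000ns, i.e.
 * delta_exec * 1024 / 2048.
 */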
237 
238 
239 const struct sched_class fair_sched_class;
240 
241 /**************************************************************
242  * CFS operations on generic schedulable entities:
243  */
244 
245 #ifdef CONFIG_FAIR_GROUP_SCHED
246 
247 /* cpu runqueue to which this cfs_rq is attached */
248 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
249 {
250 	return cfs_rq->rq;
251 }
252 
253 /* An entity is a task if it doesn't "own" a runqueue */
254 #define entity_is_task(se)	(!se->my_q)
255 
256 static inline struct task_struct *task_of(struct sched_entity *se)
257 {
258 #ifdef CONFIG_SCHED_DEBUG
259 	WARN_ON_ONCE(!entity_is_task(se));
260 #endif
261 	return container_of(se, struct task_struct, se);
262 }
263 
264 /* Walk up scheduling entities hierarchy */
265 #define for_each_sched_entity(se) \
266 		for (; se; se = se->parent)
267 
268 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
269 {
270 	return p->se.cfs_rq;
271 }
272 
273 /* runqueue on which this entity is (to be) queued */
274 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
275 {
276 	return se->cfs_rq;
277 }
278 
279 /* runqueue "owned" by this group */
280 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
281 {
282 	return grp->my_q;
283 }
284 
285 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
286 				       int force_update);
287 
288 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
289 {
290 	if (!cfs_rq->on_list) {
291 		/*
292 		 * Ensure we either appear before our parent (if already
293 		 * enqueued) or force our parent to appear after us when it is
294 		 * enqueued.  The fact that we always enqueue bottom-up
295 		 * reduces this to two cases.
296 		 */
297 		if (cfs_rq->tg->parent &&
298 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
299 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
300 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
301 		} else {
302 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
303 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
304 		}
305 
306 		cfs_rq->on_list = 1;
307 		/* We should have no load, but we need to update last_decay. */
308 		update_cfs_rq_blocked_load(cfs_rq, 0);
309 	}
310 }
311 
312 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
313 {
314 	if (cfs_rq->on_list) {
315 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
316 		cfs_rq->on_list = 0;
317 	}
318 }
319 
320 /* Iterate through all leaf cfs_rq's on a runqueue */
321 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
322 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 
324 /* Do the two (enqueued) entities belong to the same group? */
325 static inline int
326 is_same_group(struct sched_entity *se, struct sched_entity *pse)
327 {
328 	if (se->cfs_rq == pse->cfs_rq)
329 		return 1;
330 
331 	return 0;
332 }
333 
334 static inline struct sched_entity *parent_entity(struct sched_entity *se)
335 {
336 	return se->parent;
337 }
338 
339 /* return depth at which a sched entity is present in the hierarchy */
340 static inline int depth_se(struct sched_entity *se)
341 {
342 	int depth = 0;
343 
344 	for_each_sched_entity(se)
345 		depth++;
346 
347 	return depth;
348 }
349 
350 static void
351 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352 {
353 	int se_depth, pse_depth;
354 
355 	/*
356 	 * A preemption test can only be made between sibling entities that are
357 	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
358 	 * hierarchy of both tasks until we find ancestors that are siblings
359 	 * under a common parent.
360 	 */
361 
362 	/* First walk up until both entities are at same depth */
363 	se_depth = depth_se(*se);
364 	pse_depth = depth_se(*pse);
365 
366 	while (se_depth > pse_depth) {
367 		se_depth--;
368 		*se = parent_entity(*se);
369 	}
370 
371 	while (pse_depth > se_depth) {
372 		pse_depth--;
373 		*pse = parent_entity(*pse);
374 	}
375 
376 	while (!is_same_group(*se, *pse)) {
377 		*se = parent_entity(*se);
378 		*pse = parent_entity(*pse);
379 	}
380 }
381 
382 #else	/* !CONFIG_FAIR_GROUP_SCHED */
383 
384 static inline struct task_struct *task_of(struct sched_entity *se)
385 {
386 	return container_of(se, struct task_struct, se);
387 }
388 
389 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
390 {
391 	return container_of(cfs_rq, struct rq, cfs);
392 }
393 
394 #define entity_is_task(se)	1
395 
396 #define for_each_sched_entity(se) \
397 		for (; se; se = NULL)
398 
399 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
400 {
401 	return &task_rq(p)->cfs;
402 }
403 
404 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
405 {
406 	struct task_struct *p = task_of(se);
407 	struct rq *rq = task_rq(p);
408 
409 	return &rq->cfs;
410 }
411 
412 /* runqueue "owned" by this group */
413 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
414 {
415 	return NULL;
416 }
417 
418 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
419 {
420 }
421 
422 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
423 {
424 }
425 
426 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 
429 static inline int
430 is_same_group(struct sched_entity *se, struct sched_entity *pse)
431 {
432 	return 1;
433 }
434 
435 static inline struct sched_entity *parent_entity(struct sched_entity *se)
436 {
437 	return NULL;
438 }
439 
440 static inline void
441 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
442 {
443 }
444 
445 #endif	/* CONFIG_FAIR_GROUP_SCHED */
446 
447 static __always_inline
448 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
449 
450 /**************************************************************
451  * Scheduling class tree data structure manipulation methods:
452  */
453 
454 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
455 {
456 	s64 delta = (s64)(vruntime - max_vruntime);
457 	if (delta > 0)
458 		max_vruntime = vruntime;
459 
460 	return max_vruntime;
461 }
462 
463 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
464 {
465 	s64 delta = (s64)(vruntime - min_vruntime);
466 	if (delta < 0)
467 		min_vruntime = vruntime;
468 
469 	return min_vruntime;
470 }
471 
472 static inline int entity_before(struct sched_entity *a,
473 				struct sched_entity *b)
474 {
475 	return (s64)(a->vruntime - b->vruntime) < 0;
476 }
477 
478 static void update_min_vruntime(struct cfs_rq *cfs_rq)
479 {
480 	u64 vruntime = cfs_rq->min_vruntime;
481 
482 	if (cfs_rq->curr)
483 		vruntime = cfs_rq->curr->vruntime;
484 
485 	if (cfs_rq->rb_leftmost) {
486 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
487 						   struct sched_entity,
488 						   run_node);
489 
490 		if (!cfs_rq->curr)
491 			vruntime = se->vruntime;
492 		else
493 			vruntime = min_vruntime(vruntime, se->vruntime);
494 	}
495 
496 	/* ensure we never gain time by being placed backwards. */
497 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
498 #ifndef CONFIG_64BIT
499 	smp_wmb();
500 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
501 #endif
502 }
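/*
 * Worked example (hypothetical values): with cfs_rq->min_vruntime = 95,
 * curr->vruntime = 100 and a leftmost entity at vruntime = 90, the
 * candidate is min(100, 90) = 90, but max_vruntime(95, 90) keeps
 * min_vruntime at 95: it only ever moves forward, so entities placed
 * relative to it can never gain time from it going backwards.
 */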
503 
504 /*
505  * Enqueue an entity into the rb-tree:
506  */
507 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
508 {
509 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
510 	struct rb_node *parent = NULL;
511 	struct sched_entity *entry;
512 	int leftmost = 1;
513 
514 	/*
515 	 * Find the right place in the rbtree:
516 	 */
517 	while (*link) {
518 		parent = *link;
519 		entry = rb_entry(parent, struct sched_entity, run_node);
520 		/*
521 		 * We don't care about collisions. Nodes with
522 		 * the same key stay together.
523 		 */
524 		if (entity_before(se, entry)) {
525 			link = &parent->rb_left;
526 		} else {
527 			link = &parent->rb_right;
528 			leftmost = 0;
529 		}
530 	}
531 
532 	/*
533 	 * Maintain a cache of leftmost tree entries (it is frequently
534 	 * used):
535 	 */
536 	if (leftmost)
537 		cfs_rq->rb_leftmost = &se->run_node;
538 
539 	rb_link_node(&se->run_node, parent, link);
540 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
541 }
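/*
 * Informal note on the structure: insertion is O(log n) in the rbtree
 * keyed by vruntime, while the rb_leftmost cache maintained above lets
 * __pick_first_entity() return the next entity to run in O(1) without
 * walking down the tree.
 */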
542 
543 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
544 {
545 	if (cfs_rq->rb_leftmost == &se->run_node) {
546 		struct rb_node *next_node;
547 
548 		next_node = rb_next(&se->run_node);
549 		cfs_rq->rb_leftmost = next_node;
550 	}
551 
552 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
553 }
554 
555 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
556 {
557 	struct rb_node *left = cfs_rq->rb_leftmost;
558 
559 	if (!left)
560 		return NULL;
561 
562 	return rb_entry(left, struct sched_entity, run_node);
563 }
564 
565 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
566 {
567 	struct rb_node *next = rb_next(&se->run_node);
568 
569 	if (!next)
570 		return NULL;
571 
572 	return rb_entry(next, struct sched_entity, run_node);
573 }
574 
575 #ifdef CONFIG_SCHED_DEBUG
576 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
577 {
578 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
579 
580 	if (!last)
581 		return NULL;
582 
583 	return rb_entry(last, struct sched_entity, run_node);
584 }
585 
586 /**************************************************************
587  * Scheduling class statistics methods:
588  */
589 
590 int sched_proc_update_handler(struct ctl_table *table, int write,
591 		void __user *buffer, size_t *lenp,
592 		loff_t *ppos)
593 {
594 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
595 	int factor = get_update_sysctl_factor();
596 
597 	if (ret || !write)
598 		return ret;
599 
600 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
601 					sysctl_sched_min_granularity);
602 
603 #define WRT_SYSCTL(name) \
604 	(normalized_sysctl_##name = sysctl_##name / (factor))
605 	WRT_SYSCTL(sched_min_granularity);
606 	WRT_SYSCTL(sched_latency);
607 	WRT_SYSCTL(sched_wakeup_granularity);
608 #undef WRT_SYSCTL
609 
610 	return 0;
611 }
612 #endif
613 
614 /*
615  * delta /= w
616  */
617 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
618 {
619 	if (unlikely(se->load.weight != NICE_0_LOAD))
620 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
621 
622 	return delta;
623 }
624 
625 /*
626  * The idea is to set a period in which each task runs once.
627  *
628  * When there are more tasks than sched_nr_latency we have to stretch
629  * this period because otherwise the slices get too small.
630  *
631  * p = (nr <= nl) ? l : l*nr/nl
632  */
633 static u64 __sched_period(unsigned long nr_running)
634 {
635 	u64 period = sysctl_sched_latency;
636 	unsigned long nr_latency = sched_nr_latency;
637 
638 	if (unlikely(nr_running > nr_latency)) {
639 		period = sysctl_sched_min_granularity;
640 		period *= nr_running;
641 	}
642 
643 	return period;
644 }
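/*
 * Worked example (default, unscaled tunables): with sched_latency = 6ms,
 * sched_min_granularity = 0.75ms and sched_nr_latency = 8, five runnable
 * tasks share the 6ms period, while twelve runnable tasks stretch it to
 * 12 * 0.75ms = 9ms so each slice stays at least one granularity long.
 */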
645 
646 /*
647  * We calculate the wall-time slice from the period by taking a part
648  * proportional to the weight.
649  *
650  * s = p*P[w/rw]
651  */
652 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 {
654 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
655 
656 	for_each_sched_entity(se) {
657 		struct load_weight *load;
658 		struct load_weight lw;
659 
660 		cfs_rq = cfs_rq_of(se);
661 		load = &cfs_rq->load;
662 
663 		if (unlikely(!se->on_rq)) {
664 			lw = cfs_rq->load;
665 
666 			update_load_add(&lw, se->load.weight);
667 			load = &lw;
668 		}
669 		slice = __calc_delta(slice, se->load.weight, load);
670 	}
671 	return slice;
672 }
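/*
 * Worked example (hypothetical weights): with a 6ms period and two runnable
 * nice-0 tasks (weight 1024 each, runqueue weight 2048), each task gets a
 * 6ms * 1024 / 2048 = 3ms slice.  Pair a nice-0 task with a nice-5 task
 * (weight 335) and the nice-0 slice grows to 6ms * 1024 / 1359 ~= 4.5ms.
 */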
673 
674 /*
675  * We calculate the vruntime slice of a to-be-inserted task.
676  *
677  * vs = s/w
678  */
679 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
680 {
681 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
682 }
683 
684 #ifdef CONFIG_SMP
685 static unsigned long task_h_load(struct task_struct *p);
686 
687 static inline void __update_task_entity_contrib(struct sched_entity *se);
688 
689 /* Give a new task initial runnable-average values so it carries a full load contribution during its early life */
690 void init_task_runnable_average(struct task_struct *p)
691 {
692 	u32 slice;
693 
694 	p->se.avg.decay_count = 0;
695 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
696 	p->se.avg.runnable_avg_sum = slice;
697 	p->se.avg.runnable_avg_period = slice;
698 	__update_task_entity_contrib(&p->se);
699 }
700 #else
701 void init_task_runnable_average(struct task_struct *p)
702 {
703 }
704 #endif
705 
706 /*
707  * Update the current task's runtime statistics.
708  */
709 static void update_curr(struct cfs_rq *cfs_rq)
710 {
711 	struct sched_entity *curr = cfs_rq->curr;
712 	u64 now = rq_clock_task(rq_of(cfs_rq));
713 	u64 delta_exec;
714 
715 	if (unlikely(!curr))
716 		return;
717 
718 	delta_exec = now - curr->exec_start;
719 	if (unlikely((s64)delta_exec <= 0))
720 		return;
721 
722 	curr->exec_start = now;
723 
724 	schedstat_set(curr->statistics.exec_max,
725 		      max(delta_exec, curr->statistics.exec_max));
726 
727 	curr->sum_exec_runtime += delta_exec;
728 	schedstat_add(cfs_rq, exec_clock, delta_exec);
729 
730 	curr->vruntime += calc_delta_fair(delta_exec, curr);
731 	update_min_vruntime(cfs_rq);
732 
733 	if (entity_is_task(curr)) {
734 		struct task_struct *curtask = task_of(curr);
735 
736 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
737 		cpuacct_charge(curtask, delta_exec);
738 		account_group_exec_runtime(curtask, delta_exec);
739 	}
740 
741 	account_cfs_rq_runtime(cfs_rq, delta_exec);
742 }
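/*
 * Worked example (hypothetical weights): a nice-0 task that runs for 3ms
 * advances its vruntime by 3ms, since calc_delta_fair() is an identity for
 * NICE_0_LOAD.  A nice -5 task (weight 3121) running the same 3ms advances
 * only by 3ms * 1024 / 3121 ~= 0.98ms, which is how heavier tasks stay
 * leftmost in the timeline for longer.
 */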
743 
744 static inline void
745 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
746 {
747 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
748 }
749 
750 /*
751  * Task is being enqueued - update stats:
752  */
753 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
754 {
755 	/*
756 	 * Are we enqueueing a waiting task? (for current tasks
757 	 * a dequeue/enqueue event is a NOP)
758 	 */
759 	if (se != cfs_rq->curr)
760 		update_stats_wait_start(cfs_rq, se);
761 }
762 
763 static void
764 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
765 {
766 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
767 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
768 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
769 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
770 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
771 #ifdef CONFIG_SCHEDSTATS
772 	if (entity_is_task(se)) {
773 		trace_sched_stat_wait(task_of(se),
774 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
775 	}
776 #endif
777 	schedstat_set(se->statistics.wait_start, 0);
778 }
779 
780 static inline void
781 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
782 {
783 	/*
784 	 * Mark the end of the wait period if dequeueing a
785 	 * waiting task:
786 	 */
787 	if (se != cfs_rq->curr)
788 		update_stats_wait_end(cfs_rq, se);
789 }
790 
791 /*
792  * We are picking a new current task - update its stats:
793  */
794 static inline void
795 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
796 {
797 	/*
798 	 * We are starting a new run period:
799 	 */
800 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
801 }
802 
803 /**************************************************
804  * Scheduling class queueing methods:
805  */
806 
807 #ifdef CONFIG_NUMA_BALANCING
808 /*
809  * Approximate time to scan a full NUMA task in ms. The task scan period is
810  * calculated based on the tasks virtual memory size and
811  * numa_balancing_scan_size.
812  */
813 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
814 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
815 
816 /* Portion of address space to scan in MB */
817 unsigned int sysctl_numa_balancing_scan_size = 256;
818 
819 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820 unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 
822 /*
823  * After skipping a page migration on a shared page, skip N more numa page
824  * migrations unconditionally. This reduces the number of NUMA migrations
825  * in shared memory workloads, and has the effect of pulling tasks towards
826  * where their memory lives, over pulling the memory towards the task.
827  */
828 unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829 
830 static unsigned int task_nr_scan_windows(struct task_struct *p)
831 {
832 	unsigned long rss = 0;
833 	unsigned long nr_scan_pages;
834 
835 	/*
836 	 * Calculations based on RSS as non-present and empty pages are skipped
837 	 * by the PTE scanner and NUMA hinting faults should be trapped based
838 	 * on resident pages
839 	 */
840 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
841 	rss = get_mm_rss(p->mm);
842 	if (!rss)
843 		rss = nr_scan_pages;
844 
845 	rss = round_up(rss, nr_scan_pages);
846 	return rss / nr_scan_pages;
847 }
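/*
 * Worked example (assuming 4K pages): the default 256MB scan size is
 * 256 << (20 - 12) = 65536 pages per window, so a task with a 1GB RSS
 * (262144 pages) needs round_up(262144, 65536) / 65536 = 4 scan windows
 * to cover its resident memory.
 */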
848 
849 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
850 #define MAX_SCAN_WINDOW 2560
851 
852 static unsigned int task_scan_min(struct task_struct *p)
853 {
854 	unsigned int scan, floor;
855 	unsigned int windows = 1;
856 
857 	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
858 		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
859 	floor = 1000 / windows;
860 
861 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
862 	return max_t(unsigned int, floor, scan);
863 }
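/*
 * Worked example (defaults, the 1GB-RSS task from above): windows =
 * 2560 / 256 = 10 gives floor = 1000 / 10 = 100ms, while
 * scan = 1000ms / 4 windows = 250ms; the larger value, 250ms, becomes the
 * minimum scan period for this task.
 */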
864 
865 static unsigned int task_scan_max(struct task_struct *p)
866 {
867 	unsigned int smin = task_scan_min(p);
868 	unsigned int smax;
869 
870 	/* Watch for min being lower than max due to floor calculations */
871 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
872 	return max(smin, smax);
873 }
874 
875 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
876 {
877 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
878 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
879 }
880 
881 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
882 {
883 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
884 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
885 }
886 
887 struct numa_group {
888 	atomic_t refcount;
889 
890 	spinlock_t lock; /* nr_tasks, tasks */
891 	int nr_tasks;
892 	pid_t gid;
893 	struct list_head task_list;
894 
895 	struct rcu_head rcu;
896 	unsigned long total_faults;
897 	unsigned long faults[0];
898 };
899 
900 pid_t task_numa_group_id(struct task_struct *p)
901 {
902 	return p->numa_group ? p->numa_group->gid : 0;
903 }
904 
905 static inline int task_faults_idx(int nid, int priv)
906 {
907 	return 2 * nid + priv;
908 }
909 
910 static inline unsigned long task_faults(struct task_struct *p, int nid)
911 {
912 	if (!p->numa_faults)
913 		return 0;
914 
915 	return p->numa_faults[task_faults_idx(nid, 0)] +
916 		p->numa_faults[task_faults_idx(nid, 1)];
917 }
918 
919 static inline unsigned long group_faults(struct task_struct *p, int nid)
920 {
921 	if (!p->numa_group)
922 		return 0;
923 
924 	return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 		p->numa_group->faults[task_faults_idx(nid, 1)];
926 }
927 
928 /*
929  * These return the fraction of accesses done by a particular task, or
930  * task group, on a particular numa node.  The group weight is given a
931  * larger multiplier, in order to group tasks together that are almost
932  * evenly spread out between numa nodes.
933  */
934 static inline unsigned long task_weight(struct task_struct *p, int nid)
935 {
936 	unsigned long total_faults;
937 
938 	if (!p->numa_faults)
939 		return 0;
940 
941 	total_faults = p->total_numa_faults;
942 
943 	if (!total_faults)
944 		return 0;
945 
946 	return 1000 * task_faults(p, nid) / total_faults;
947 }
948 
949 static inline unsigned long group_weight(struct task_struct *p, int nid)
950 {
951 	if (!p->numa_group || !p->numa_group->total_faults)
952 		return 0;
953 
954 	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955 }
956 
957 static unsigned long weighted_cpuload(const int cpu);
958 static unsigned long source_load(int cpu, int type);
959 static unsigned long target_load(int cpu, int type);
960 static unsigned long power_of(int cpu);
961 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
962 
963 /* Cached statistics for all CPUs within a node */
964 struct numa_stats {
965 	unsigned long nr_running;
966 	unsigned long load;
967 
968 	/* Total compute capacity of CPUs on a node */
969 	unsigned long power;
970 
971 	/* Approximate capacity in terms of runnable tasks on a node */
972 	unsigned long capacity;
973 	int has_capacity;
974 };
975 
976 /*
977  * XXX borrowed from update_sg_lb_stats
978  */
979 static void update_numa_stats(struct numa_stats *ns, int nid)
980 {
981 	int cpu, cpus = 0;
982 
983 	memset(ns, 0, sizeof(*ns));
984 	for_each_cpu(cpu, cpumask_of_node(nid)) {
985 		struct rq *rq = cpu_rq(cpu);
986 
987 		ns->nr_running += rq->nr_running;
988 		ns->load += weighted_cpuload(cpu);
989 		ns->power += power_of(cpu);
990 
991 		cpus++;
992 	}
993 
994 	/*
995 	 * If we raced with hotplug and there are no CPUs left in our mask
996 	 * the @ns structure is left zeroed and task_numa_compare() will
997 	 * not find this node attractive.
998 	 *
999 	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
1000 	 * and bail there.
1001 	 */
1002 	if (!cpus)
1003 		return;
1004 
1005 	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1006 	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1007 	ns->has_capacity = (ns->nr_running < ns->capacity);
1008 }
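/*
 * Worked example (hypothetical node): four CPUs at full power contribute
 * power = 4 * SCHED_POWER_SCALE, so capacity = 4.  With three tasks running
 * on the node, nr_running < capacity and has_capacity is set; a fourth
 * running task clears it and makes the node unattractive to
 * task_numa_compare().
 */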
1009 
1010 struct task_numa_env {
1011 	struct task_struct *p;
1012 
1013 	int src_cpu, src_nid;
1014 	int dst_cpu, dst_nid;
1015 
1016 	struct numa_stats src_stats, dst_stats;
1017 
1018 	int imbalance_pct;
1019 
1020 	struct task_struct *best_task;
1021 	long best_imp;
1022 	int best_cpu;
1023 };
1024 
1025 static void task_numa_assign(struct task_numa_env *env,
1026 			     struct task_struct *p, long imp)
1027 {
1028 	if (env->best_task)
1029 		put_task_struct(env->best_task);
1030 	if (p)
1031 		get_task_struct(p);
1032 
1033 	env->best_task = p;
1034 	env->best_imp = imp;
1035 	env->best_cpu = env->dst_cpu;
1036 }
1037 
1038 /*
1039  * This checks whether the overall compute and NUMA accesses of the system
1040  * would be improved if the source task were migrated to the target dst_cpu,
1041  * taking into account that it might be best to exchange the task running on
1042  * the dst_cpu with the source task.
1043  */
1044 static void task_numa_compare(struct task_numa_env *env,
1045 			      long taskimp, long groupimp)
1046 {
1047 	struct rq *src_rq = cpu_rq(env->src_cpu);
1048 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1049 	struct task_struct *cur;
1050 	long dst_load, src_load;
1051 	long load;
1052 	long imp = (groupimp > 0) ? groupimp : taskimp;
1053 
1054 	rcu_read_lock();
1055 	cur = ACCESS_ONCE(dst_rq->curr);
1056 	if (cur->pid == 0) /* idle */
1057 		cur = NULL;
1058 
1059 	/*
1060 	 * "imp" is the fault differential for the source task between the
1061 	 * source and destination node. Calculate the total differential for
1062 	 * the source task and potential destination task. The more negative
1063 	 * the value is, the more remote accesses would be expected to be
1064 	 * incurred if the tasks were swapped.
1065 	 */
1066 	if (cur) {
1067 		/* Skip this swap candidate if cannot move to the source cpu */
1068 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1069 			goto unlock;
1070 
1071 		/*
1072 		 * If dst and source tasks are in the same NUMA group, or not
1073 		 * in any group then look only at task weights.
1074 		 */
1075 		if (cur->numa_group == env->p->numa_group) {
1076 			imp = taskimp + task_weight(cur, env->src_nid) -
1077 			      task_weight(cur, env->dst_nid);
1078 			/*
1079 			 * Add some hysteresis to prevent swapping the
1080 			 * tasks within a group over tiny differences.
1081 			 */
1082 			if (cur->numa_group)
1083 				imp -= imp/16;
1084 		} else {
1085 			/*
1086 			 * Compare the group weights. If a task is all by
1087 			 * itself (not part of a group), use the task weight
1088 			 * instead.
1089 			 */
1090 			if (env->p->numa_group)
1091 				imp = groupimp;
1092 			else
1093 				imp = taskimp;
1094 
1095 			if (cur->numa_group)
1096 				imp += group_weight(cur, env->src_nid) -
1097 				       group_weight(cur, env->dst_nid);
1098 			else
1099 				imp += task_weight(cur, env->src_nid) -
1100 				       task_weight(cur, env->dst_nid);
1101 		}
1102 	}
1103 
1104 	if (imp < env->best_imp)
1105 		goto unlock;
1106 
1107 	if (!cur) {
1108 		/* Is there capacity at our destination? */
1109 		if (env->src_stats.has_capacity &&
1110 		    !env->dst_stats.has_capacity)
1111 			goto unlock;
1112 
1113 		goto balance;
1114 	}
1115 
1116 	/* Balance doesn't matter much if we're running a task per cpu */
1117 	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1118 		goto assign;
1119 
1120 	/*
1121 	 * In the overloaded case, try and keep the load balanced.
1122 	 */
1123 balance:
1124 	dst_load = env->dst_stats.load;
1125 	src_load = env->src_stats.load;
1126 
1127 	/* XXX missing power terms */
1128 	load = task_h_load(env->p);
1129 	dst_load += load;
1130 	src_load -= load;
1131 
1132 	if (cur) {
1133 		load = task_h_load(cur);
1134 		dst_load -= load;
1135 		src_load += load;
1136 	}
1137 
1138 	/* make src_load the smaller */
1139 	if (dst_load < src_load)
1140 		swap(dst_load, src_load);
1141 
1142 	if (src_load * env->imbalance_pct < dst_load * 100)
1143 		goto unlock;
1144 
1145 assign:
1146 	task_numa_assign(env, cur, imp);
1147 unlock:
1148 	rcu_read_unlock();
1149 }
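/*
 * Worked example (hypothetical loads, imbalance_pct = 112): after applying
 * the task loads and swapping so src_load is the smaller, suppose
 * src_load = 900 and dst_load = 1000.  900 * 112 = 100800 is not below
 * 1000 * 100, so the imbalance is tolerated and the candidate is assigned;
 * with src_load = 880, 880 * 112 = 98560 falls below 100000 and the swap
 * is rejected as too imbalanced.
 */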
1150 
1151 static void task_numa_find_cpu(struct task_numa_env *env,
1152 				long taskimp, long groupimp)
1153 {
1154 	int cpu;
1155 
1156 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1157 		/* Skip this CPU if the source task cannot migrate */
1158 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1159 			continue;
1160 
1161 		env->dst_cpu = cpu;
1162 		task_numa_compare(env, taskimp, groupimp);
1163 	}
1164 }
1165 
1166 static int task_numa_migrate(struct task_struct *p)
1167 {
1168 	struct task_numa_env env = {
1169 		.p = p,
1170 
1171 		.src_cpu = task_cpu(p),
1172 		.src_nid = task_node(p),
1173 
1174 		.imbalance_pct = 112,
1175 
1176 		.best_task = NULL,
1177 		.best_imp = 0,
1178 		.best_cpu = -1
1179 	};
1180 	struct sched_domain *sd;
1181 	unsigned long taskweight, groupweight;
1182 	int nid, ret;
1183 	long taskimp, groupimp;
1184 
1185 	/*
1186 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1187 	 * imbalance and would be the first to start moving tasks about.
1188 	 *
1189 	 * And we want to avoid any moving of tasks about, as that would create
1190 	 * random movement of tasks -- counter the numa conditions we're trying
1191 	 * to satisfy here.
1192 	 */
1193 	rcu_read_lock();
1194 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1195 	if (sd)
1196 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1197 	rcu_read_unlock();
1198 
1199 	/*
1200 	 * Cpusets can break the scheduler domain tree into smaller
1201 	 * balance domains, some of which do not cross NUMA boundaries.
1202 	 * Tasks that are "trapped" in such domains cannot be migrated
1203 	 * elsewhere, so there is no point in (re)trying.
1204 	 */
1205 	if (unlikely(!sd)) {
1206 		p->numa_preferred_nid = task_node(p);
1207 		return -EINVAL;
1208 	}
1209 
1210 	taskweight = task_weight(p, env.src_nid);
1211 	groupweight = group_weight(p, env.src_nid);
1212 	update_numa_stats(&env.src_stats, env.src_nid);
1213 	env.dst_nid = p->numa_preferred_nid;
1214 	taskimp = task_weight(p, env.dst_nid) - taskweight;
1215 	groupimp = group_weight(p, env.dst_nid) - groupweight;
1216 	update_numa_stats(&env.dst_stats, env.dst_nid);
1217 
1218 	/* If the preferred nid has capacity, try to use it. */
1219 	if (env.dst_stats.has_capacity)
1220 		task_numa_find_cpu(&env, taskimp, groupimp);
1221 
1222 	/* No space available on the preferred nid. Look elsewhere. */
1223 	if (env.best_cpu == -1) {
1224 		for_each_online_node(nid) {
1225 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1226 				continue;
1227 
1228 			/* Only consider nodes where both task and groups benefit */
1229 			taskimp = task_weight(p, nid) - taskweight;
1230 			groupimp = group_weight(p, nid) - groupweight;
1231 			if (taskimp < 0 && groupimp < 0)
1232 				continue;
1233 
1234 			env.dst_nid = nid;
1235 			update_numa_stats(&env.dst_stats, env.dst_nid);
1236 			task_numa_find_cpu(&env, taskimp, groupimp);
1237 		}
1238 	}
1239 
1240 	/* No better CPU than the current one was found. */
1241 	if (env.best_cpu == -1)
1242 		return -EAGAIN;
1243 
1244 	sched_setnuma(p, env.dst_nid);
1245 
1246 	/*
1247 	 * Reset the scan period if the task is being rescheduled on an
1248 	 * alternative node to recheck if the task is now properly placed.
1249 	 */
1250 	p->numa_scan_period = task_scan_min(p);
1251 
1252 	if (env.best_task == NULL) {
1253 		ret = migrate_task_to(p, env.best_cpu);
1254 		if (ret != 0)
1255 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1256 		return ret;
1257 	}
1258 
1259 	ret = migrate_swap(p, env.best_task);
1260 	if (ret != 0)
1261 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1262 	put_task_struct(env.best_task);
1263 	return ret;
1264 }
1265 
1266 /* Attempt to migrate a task to a CPU on the preferred node. */
1267 static void numa_migrate_preferred(struct task_struct *p)
1268 {
1269 	/* This task has no NUMA fault statistics yet */
1270 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1271 		return;
1272 
1273 	/* Periodically retry migrating the task to the preferred node */
1274 	p->numa_migrate_retry = jiffies + HZ;
1275 
1276 	/* Success if task is already running on preferred CPU */
1277 	if (task_node(p) == p->numa_preferred_nid)
1278 		return;
1279 
1280 	/* Otherwise, try migrate to a CPU on the preferred node */
1281 	task_numa_migrate(p);
1282 }
1283 
1284 /*
1285  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286  * increments. The more local the fault statistics are, the higher the scan
1287  * period will be for the next scan window. If local/remote ratio is below
1288  * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1289  * scan period will decrease
1290  */
1291 #define NUMA_PERIOD_SLOTS 10
1292 #define NUMA_PERIOD_THRESHOLD 3
1293 
1294 /*
1295  * Increase the scan period (slow down scanning) if the majority of
1296  * our memory is already on our local node, or if the majority of
1297  * the page accesses are shared with other processes.
1298  * Otherwise, decrease the scan period.
1299  */
1300 static void update_task_scan_period(struct task_struct *p,
1301 			unsigned long shared, unsigned long private)
1302 {
1303 	unsigned int period_slot;
1304 	int ratio;
1305 	int diff;
1306 
1307 	unsigned long remote = p->numa_faults_locality[0];
1308 	unsigned long local = p->numa_faults_locality[1];
1309 
1310 	/*
1311 	 * If there were no record hinting faults then either the task is
1312 	 * completely idle or all activity is in areas that are not of interest
1313 	 * to automatic numa balancing. Scan slower
1314 	 */
1315 	if (local + shared == 0) {
1316 		p->numa_scan_period = min(p->numa_scan_period_max,
1317 			p->numa_scan_period << 1);
1318 
1319 		p->mm->numa_next_scan = jiffies +
1320 			msecs_to_jiffies(p->numa_scan_period);
1321 
1322 		return;
1323 	}
1324 
1325 	/*
1326 	 * Prepare to scale scan period relative to the current period.
1327 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1328 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1329 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1330 	 */
1331 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1332 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1333 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1334 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1335 		if (!slot)
1336 			slot = 1;
1337 		diff = slot * period_slot;
1338 	} else {
1339 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1340 
1341 		/*
1342 		 * Scale scan rate increases based on sharing. There is an
1343 		 * inverse relationship between the degree of sharing and
1344 		 * the adjustment made to the scanning period. Broadly
1345 		 * speaking the intent is that there is little point
1346 		 * scanning faster if shared accesses dominate as it may
1347 		 * simply bounce migrations uselessly
1348 		 */
1349 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1350 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1351 	}
1352 
1353 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1354 			task_scan_min(p), task_scan_max(p));
1355 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356 }
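/*
 * Worked example (hypothetical fault counts): with a 1000ms scan period,
 * period_slot = 100ms.  local = 800, remote = 200 gives ratio = 8, so
 * slot = 8 - 3 = 5 and the period grows by 500ms (scan slower).  With
 * local = 100, remote = 900 the raw decrease is -(3 - 1) * 100 = -200ms;
 * if private = 900 and shared = 100, that is scaled by 9/10 to -180ms
 * (scan faster), before clamping to [task_scan_min, task_scan_max].
 */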
1357 
1358 static void task_numa_placement(struct task_struct *p)
1359 {
1360 	int seq, nid, max_nid = -1, max_group_nid = -1;
1361 	unsigned long max_faults = 0, max_group_faults = 0;
1362 	unsigned long fault_types[2] = { 0, 0 };
1363 	spinlock_t *group_lock = NULL;
1364 
1365 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1366 	if (p->numa_scan_seq == seq)
1367 		return;
1368 	p->numa_scan_seq = seq;
1369 	p->numa_scan_period_max = task_scan_max(p);
1370 
1371 	/* If the task is part of a group prevent parallel updates to group stats */
1372 	if (p->numa_group) {
1373 		group_lock = &p->numa_group->lock;
1374 		spin_lock(group_lock);
1375 	}
1376 
1377 	/* Find the node with the highest number of faults */
1378 	for_each_online_node(nid) {
1379 		unsigned long faults = 0, group_faults = 0;
1380 		int priv, i;
1381 
1382 		for (priv = 0; priv < 2; priv++) {
1383 			long diff;
1384 
1385 			i = task_faults_idx(nid, priv);
1386 			diff = -p->numa_faults[i];
1387 
1388 			/* Decay existing window, copy faults since last scan */
1389 			p->numa_faults[i] >>= 1;
1390 			p->numa_faults[i] += p->numa_faults_buffer[i];
1391 			fault_types[priv] += p->numa_faults_buffer[i];
1392 			p->numa_faults_buffer[i] = 0;
1393 
1394 			faults += p->numa_faults[i];
1395 			diff += p->numa_faults[i];
1396 			p->total_numa_faults += diff;
1397 			if (p->numa_group) {
1398 				/* safe because we can only change our own group */
1399 				p->numa_group->faults[i] += diff;
1400 				p->numa_group->total_faults += diff;
1401 				group_faults += p->numa_group->faults[i];
1402 			}
1403 		}
1404 
1405 		if (faults > max_faults) {
1406 			max_faults = faults;
1407 			max_nid = nid;
1408 		}
1409 
1410 		if (group_faults > max_group_faults) {
1411 			max_group_faults = group_faults;
1412 			max_group_nid = nid;
1413 		}
1414 	}
1415 
1416 	update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 
1418 	if (p->numa_group) {
1419 		/*
1420 		 * If the preferred task and group nids are different,
1421 		 * iterate over the nodes again to find the best place.
1422 		 */
1423 		if (max_nid != max_group_nid) {
1424 			unsigned long weight, max_weight = 0;
1425 
1426 			for_each_online_node(nid) {
1427 				weight = task_weight(p, nid) + group_weight(p, nid);
1428 				if (weight > max_weight) {
1429 					max_weight = weight;
1430 					max_nid = nid;
1431 				}
1432 			}
1433 		}
1434 
1435 		spin_unlock(group_lock);
1436 	}
1437 
1438 	/* Preferred node as the node with the most faults */
1439 	if (max_faults && max_nid != p->numa_preferred_nid) {
1440 		/* Update the preferred nid and migrate task if possible */
1441 		sched_setnuma(p, max_nid);
1442 		numa_migrate_preferred(p);
1443 	}
1444 }
1445 
1446 static inline int get_numa_group(struct numa_group *grp)
1447 {
1448 	return atomic_inc_not_zero(&grp->refcount);
1449 }
1450 
1451 static inline void put_numa_group(struct numa_group *grp)
1452 {
1453 	if (atomic_dec_and_test(&grp->refcount))
1454 		kfree_rcu(grp, rcu);
1455 }
1456 
1457 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1458 			int *priv)
1459 {
1460 	struct numa_group *grp, *my_grp;
1461 	struct task_struct *tsk;
1462 	bool join = false;
1463 	int cpu = cpupid_to_cpu(cpupid);
1464 	int i;
1465 
1466 	if (unlikely(!p->numa_group)) {
1467 		unsigned int size = sizeof(struct numa_group) +
1468 				    2*nr_node_ids*sizeof(unsigned long);
1469 
1470 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 		if (!grp)
1472 			return;
1473 
1474 		atomic_set(&grp->refcount, 1);
1475 		spin_lock_init(&grp->lock);
1476 		INIT_LIST_HEAD(&grp->task_list);
1477 		grp->gid = p->pid;
1478 
1479 		for (i = 0; i < 2*nr_node_ids; i++)
1480 			grp->faults[i] = p->numa_faults[i];
1481 
1482 		grp->total_faults = p->total_numa_faults;
1483 
1484 		list_add(&p->numa_entry, &grp->task_list);
1485 		grp->nr_tasks++;
1486 		rcu_assign_pointer(p->numa_group, grp);
1487 	}
1488 
1489 	rcu_read_lock();
1490 	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1491 
1492 	if (!cpupid_match_pid(tsk, cpupid))
1493 		goto no_join;
1494 
1495 	grp = rcu_dereference(tsk->numa_group);
1496 	if (!grp)
1497 		goto no_join;
1498 
1499 	my_grp = p->numa_group;
1500 	if (grp == my_grp)
1501 		goto no_join;
1502 
1503 	/*
1504 	 * Only join the other group if it's bigger; if we're the bigger group,
1505 	 * the other task will join us.
1506 	 */
1507 	if (my_grp->nr_tasks > grp->nr_tasks)
1508 		goto no_join;
1509 
1510 	/*
1511 	 * Tie-break on the grp address.
1512 	 */
1513 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1514 		goto no_join;
1515 
1516 	/* Always join threads in the same process. */
1517 	if (tsk->mm == current->mm)
1518 		join = true;
1519 
1520 	/* Simple filter to avoid false positives due to PID collisions */
1521 	if (flags & TNF_SHARED)
1522 		join = true;
1523 
1524 	/* Update priv based on whether false sharing was detected */
1525 	*priv = !join;
1526 
1527 	if (join && !get_numa_group(grp))
1528 		goto no_join;
1529 
1530 	rcu_read_unlock();
1531 
1532 	if (!join)
1533 		return;
1534 
1535 	double_lock(&my_grp->lock, &grp->lock);
1536 
1537 	for (i = 0; i < 2*nr_node_ids; i++) {
1538 		my_grp->faults[i] -= p->numa_faults[i];
1539 		grp->faults[i] += p->numa_faults[i];
1540 	}
1541 	my_grp->total_faults -= p->total_numa_faults;
1542 	grp->total_faults += p->total_numa_faults;
1543 
1544 	list_move(&p->numa_entry, &grp->task_list);
1545 	my_grp->nr_tasks--;
1546 	grp->nr_tasks++;
1547 
1548 	spin_unlock(&my_grp->lock);
1549 	spin_unlock(&grp->lock);
1550 
1551 	rcu_assign_pointer(p->numa_group, grp);
1552 
1553 	put_numa_group(my_grp);
1554 	return;
1555 
1556 no_join:
1557 	rcu_read_unlock();
1558 	return;
1559 }
1560 
1561 void task_numa_free(struct task_struct *p)
1562 {
1563 	struct numa_group *grp = p->numa_group;
1564 	int i;
1565 	void *numa_faults = p->numa_faults;
1566 
1567 	if (grp) {
1568 		spin_lock(&grp->lock);
1569 		for (i = 0; i < 2*nr_node_ids; i++)
1570 			grp->faults[i] -= p->numa_faults[i];
1571 		grp->total_faults -= p->total_numa_faults;
1572 
1573 		list_del(&p->numa_entry);
1574 		grp->nr_tasks--;
1575 		spin_unlock(&grp->lock);
1576 		rcu_assign_pointer(p->numa_group, NULL);
1577 		put_numa_group(grp);
1578 	}
1579 
1580 	p->numa_faults = NULL;
1581 	p->numa_faults_buffer = NULL;
1582 	kfree(numa_faults);
1583 }
1584 
1585 /*
1586  * Got a PROT_NONE fault for a page on @node.
1587  */
1588 void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1589 {
1590 	struct task_struct *p = current;
1591 	bool migrated = flags & TNF_MIGRATED;
1592 	int priv;
1593 
1594 	if (!numabalancing_enabled)
1595 		return;
1596 
1597 	/* for example, ksmd faulting in a user's mm */
1598 	if (!p->mm)
1599 		return;
1600 
1601 	/* Do not worry about placement if exiting */
1602 	if (p->state == TASK_DEAD)
1603 		return;
1604 
1605 	/* Allocate buffer to track faults on a per-node basis */
1606 	if (unlikely(!p->numa_faults)) {
1607 		int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1608 
1609 		/* numa_faults and numa_faults_buffer share the allocation */
1610 		p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1611 		if (!p->numa_faults)
1612 			return;
1613 
1614 		BUG_ON(p->numa_faults_buffer);
1615 		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1616 		p->total_numa_faults = 0;
1617 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 	}
1619 
1620 	/*
1621 	 * First accesses are treated as private, otherwise consider accesses
1622 	 * to be private if the accessing pid has not changed
1623 	 */
1624 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
1625 		priv = 1;
1626 	} else {
1627 		priv = cpupid_match_pid(p, last_cpupid);
1628 		if (!priv && !(flags & TNF_NO_GROUP))
1629 			task_numa_group(p, last_cpupid, flags, &priv);
1630 	}
1631 
1632 	task_numa_placement(p);
1633 
1634 	/*
1635 	 * Retry migrating the task to its preferred node periodically, in case
1636 	 * it previously failed, or the scheduler moved us.
1637 	 */
1638 	if (time_after(jiffies, p->numa_migrate_retry))
1639 		numa_migrate_preferred(p);
1640 
1641 	if (migrated)
1642 		p->numa_pages_migrated += pages;
1643 
1644 	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1645 	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646 }
1647 
1648 static void reset_ptenuma_scan(struct task_struct *p)
1649 {
1650 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
1651 	p->mm->numa_scan_offset = 0;
1652 }
1653 
1654 /*
1655  * The expensive part of numa migration is done from task_work context.
1656  * Triggered from task_tick_numa().
1657  */
1658 void task_numa_work(struct callback_head *work)
1659 {
1660 	unsigned long migrate, next_scan, now = jiffies;
1661 	struct task_struct *p = current;
1662 	struct mm_struct *mm = p->mm;
1663 	struct vm_area_struct *vma;
1664 	unsigned long start, end;
1665 	unsigned long nr_pte_updates = 0;
1666 	long pages;
1667 
1668 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1669 
1670 	work->next = work; /* protect against double add */
1671 	/*
1672 	 * Who cares about NUMA placement when they're dying.
1673 	 *
1674 	 * NOTE: make sure not to dereference p->mm before this check,
1675 	 * exit_task_work() happens _after_ exit_mm() so we could be called
1676 	 * without p->mm even though we still had it when we enqueued this
1677 	 * work.
1678 	 */
1679 	if (p->flags & PF_EXITING)
1680 		return;
1681 
1682 	if (!mm->numa_next_scan) {
1683 		mm->numa_next_scan = now +
1684 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1685 	}
1686 
1687 	/*
1688 	 * Enforce maximal scan/migration frequency..
1689 	 */
1690 	migrate = mm->numa_next_scan;
1691 	if (time_before(now, migrate))
1692 		return;
1693 
1694 	if (p->numa_scan_period == 0) {
1695 		p->numa_scan_period_max = task_scan_max(p);
1696 		p->numa_scan_period = task_scan_min(p);
1697 	}
1698 
1699 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1700 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1701 		return;
1702 
1703 	/*
1704 	 * Delay this task enough that another task of this mm will likely win
1705 	 * the next time around.
1706 	 */
1707 	p->node_stamp += 2 * TICK_NSEC;
1708 
1709 	start = mm->numa_scan_offset;
1710 	pages = sysctl_numa_balancing_scan_size;
1711 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1712 	if (!pages)
1713 		return;
1714 
1715 	down_read(&mm->mmap_sem);
1716 	vma = find_vma(mm, start);
1717 	if (!vma) {
1718 		reset_ptenuma_scan(p);
1719 		start = 0;
1720 		vma = mm->mmap;
1721 	}
1722 	for (; vma; vma = vma->vm_next) {
1723 		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
1724 			continue;
1725 
1726 		/*
1727 		 * Shared library pages mapped by multiple processes are not
1728 		 * migrated as it is expected they are cache replicated. Avoid
1729 		 * hinting faults in read-only file-backed mappings or the vdso
1730 		 * as migrating the pages will be of marginal benefit.
1731 		 */
1732 		if (!vma->vm_mm ||
1733 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1734 			continue;
1735 
1736 		/*
1737 		 * Skip inaccessible VMAs to avoid any confusion between
1738 		 * PROT_NONE and NUMA hinting ptes
1739 		 */
1740 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1741 			continue;
1742 
1743 		do {
1744 			start = max(start, vma->vm_start);
1745 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1746 			end = min(end, vma->vm_end);
1747 			nr_pte_updates += change_prot_numa(vma, start, end);
1748 
1749 			/*
1750 			 * Scan sysctl_numa_balancing_scan_size but ensure that
1751 			 * at least one PTE is updated so that unused virtual
1752 			 * address space is quickly skipped.
1753 			 */
1754 			if (nr_pte_updates)
1755 				pages -= (end - start) >> PAGE_SHIFT;
1756 
1757 			start = end;
1758 			if (pages <= 0)
1759 				goto out;
1760 		} while (end != vma->vm_end);
1761 	}
1762 
1763 out:
1764 	/*
1765 	 * It is possible to reach the end of the VMA list but the last few
1766 	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
1767 	 * would find the !migratable VMA on the next scan but not reset the
1768 	 * scanner to the start so check it now.
1769 	 */
1770 	if (vma)
1771 		mm->numa_scan_offset = start;
1772 	else
1773 		reset_ptenuma_scan(p);
1774 	up_read(&mm->mmap_sem);
1775 }
1776 
1777 /*
1778  * Drive the periodic memory faults..
1779  */
1780 void task_tick_numa(struct rq *rq, struct task_struct *curr)
1781 {
1782 	struct callback_head *work = &curr->numa_work;
1783 	u64 period, now;
1784 
1785 	/*
1786 	 * We don't care about NUMA placement if we don't have memory.
1787 	 */
1788 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1789 		return;
1790 
1791 	/*
1792 	 * Using runtime rather than walltime has the dual advantage that
1793 	 * we (mostly) drive the selection from busy threads and that the
1794 	 * task needs to have done some actual work before we bother with
1795 	 * NUMA placement.
1796 	 */
1797 	now = curr->se.sum_exec_runtime;
1798 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1799 
1800 	if (now - curr->node_stamp > period) {
1801 		if (!curr->node_stamp)
1802 			curr->numa_scan_period = task_scan_min(curr);
1803 		curr->node_stamp += period;
1804 
1805 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1806 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1807 			task_work_add(curr, work, true);
1808 		}
1809 	}
1810 }
1811 #else
1812 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1813 {
1814 }
1815 
1816 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1817 {
1818 }
1819 
1820 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1821 {
1822 }
1823 #endif /* CONFIG_NUMA_BALANCING */
1824 
1825 static void
1826 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1827 {
1828 	update_load_add(&cfs_rq->load, se->load.weight);
1829 	if (!parent_entity(se))
1830 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1831 #ifdef CONFIG_SMP
1832 	if (entity_is_task(se)) {
1833 		struct rq *rq = rq_of(cfs_rq);
1834 
1835 		account_numa_enqueue(rq, task_of(se));
1836 		list_add(&se->group_node, &rq->cfs_tasks);
1837 	}
1838 #endif
1839 	cfs_rq->nr_running++;
1840 }
1841 
1842 static void
1843 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1844 {
1845 	update_load_sub(&cfs_rq->load, se->load.weight);
1846 	if (!parent_entity(se))
1847 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1848 	if (entity_is_task(se)) {
1849 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1850 		list_del_init(&se->group_node);
1851 	}
1852 	cfs_rq->nr_running--;
1853 }
1854 
1855 #ifdef CONFIG_FAIR_GROUP_SCHED
1856 # ifdef CONFIG_SMP
1857 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1858 {
1859 	long tg_weight;
1860 
1861 	/*
1862 	 * Use this CPU's actual weight instead of the last load_contribution
1863 	 * to gain a more accurate current total weight. See
1864 	 * update_cfs_rq_load_contribution().
1865 	 */
1866 	tg_weight = atomic_long_read(&tg->load_avg);
1867 	tg_weight -= cfs_rq->tg_load_contrib;
1868 	tg_weight += cfs_rq->load.weight;
1869 
1870 	return tg_weight;
1871 }
1872 
1873 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1874 {
1875 	long tg_weight, load, shares;
1876 
1877 	tg_weight = calc_tg_weight(tg, cfs_rq);
1878 	load = cfs_rq->load.weight;
1879 
1880 	shares = (tg->shares * load);
1881 	if (tg_weight)
1882 		shares /= tg_weight;
1883 
1884 	if (shares < MIN_SHARES)
1885 		shares = MIN_SHARES;
1886 	if (shares > tg->shares)
1887 		shares = tg->shares;
1888 
1889 	return shares;
1890 }
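/*
 * Worked example (hypothetical group): a group with tg->shares = 1024 whose
 * effective total weight (tg_weight) is 4096, of which this CPU's cfs_rq
 * contributes load = 2048, gets shares = 1024 * 2048 / 4096 = 512 on this
 * CPU, clamped to the [MIN_SHARES, tg->shares] range.
 */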
1891 # else /* CONFIG_SMP */
1892 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1893 {
1894 	return tg->shares;
1895 }
1896 # endif /* CONFIG_SMP */
1897 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1898 			    unsigned long weight)
1899 {
1900 	if (se->on_rq) {
1901 		/* commit outstanding execution time */
1902 		if (cfs_rq->curr == se)
1903 			update_curr(cfs_rq);
1904 		account_entity_dequeue(cfs_rq, se);
1905 	}
1906 
1907 	update_load_set(&se->load, weight);
1908 
1909 	if (se->on_rq)
1910 		account_entity_enqueue(cfs_rq, se);
1911 }
1912 
1913 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1914 
1915 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1916 {
1917 	struct task_group *tg;
1918 	struct sched_entity *se;
1919 	long shares;
1920 
1921 	tg = cfs_rq->tg;
1922 	se = tg->se[cpu_of(rq_of(cfs_rq))];
1923 	if (!se || throttled_hierarchy(cfs_rq))
1924 		return;
1925 #ifndef CONFIG_SMP
1926 	if (likely(se->load.weight == tg->shares))
1927 		return;
1928 #endif
1929 	shares = calc_cfs_shares(cfs_rq, tg);
1930 
1931 	reweight_entity(cfs_rq_of(se), se, shares);
1932 }
1933 #else /* CONFIG_FAIR_GROUP_SCHED */
1934 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1935 {
1936 }
1937 #endif /* CONFIG_FAIR_GROUP_SCHED */
1938 
1939 #ifdef CONFIG_SMP
1940 /*
1941  * We choose a half-life close to 1 scheduling period.
1942  * Note: The tables below are dependent on this value.
1943  */
1944 #define LOAD_AVG_PERIOD 32
1945 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1946 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
1947 
1948 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1949 static const u32 runnable_avg_yN_inv[] = {
1950 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1951 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1952 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1953 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1954 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1955 	0x85aac367, 0x82cd8698,
1956 };
1957 
1958 /*
1959  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1960  * over-estimates when re-combining.
1961  */
1962 static const u32 runnable_avg_yN_sum[] = {
1963 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1964 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1965 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1966 };
1967 
1968 /*
1969  * Approximate:
1970  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1971  */
1972 static __always_inline u64 decay_load(u64 val, u64 n)
1973 {
1974 	unsigned int local_n;
1975 
1976 	if (!n)
1977 		return val;
1978 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1979 		return 0;
1980 
1981 	/* after bounds checking we can collapse to 32-bit */
1982 	local_n = n;
1983 
1984 	/*
1985 	 * As y^PERIOD = 1/2, we can combine
1986 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
1987 	 * with a look-up table which covers y^n (for n < PERIOD).
1988 	 *
1989 	 * This gives decay_load() a constant running time.
1990 	 */
1991 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1992 		val >>= local_n / LOAD_AVG_PERIOD;
1993 		local_n %= LOAD_AVG_PERIOD;
1994 	}
1995 
1996 	val *= runnable_avg_yN_inv[local_n];
1997 	/* We don't use SRR here since we always want to round down. */
1998 	return val >> 32;
1999 }
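
/*
 * Illustrative sketch, not kernel code: a floating-point reference for
 * decay_load() above, using only the fact that y^32 == 0.5.  For example
 * naive_decay(1024, 32) is ~512 and naive_decay(1024, 64) is ~256; the
 * fixed-point, table-driven version matches to within rounding.
 */
static unsigned long long naive_decay(unsigned long long val, unsigned int n)
{
	const double y = 0.97857206;	/* 2^(-1/32), i.e. y^32 == 0.5 */
	double v = (double)val;

	while (n--)
		v *= y;

	return (unsigned long long)v;
}
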
2000 
2001 /*
2002  * For updates fully spanning n periods, the contribution to runnable
2003  * average will be: \Sum 1024*y^n
2004  *
2005  * We can compute this reasonably efficiently by combining:
2006  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2007  */
2008 static u32 __compute_runnable_contrib(u64 n)
2009 {
2010 	u32 contrib = 0;
2011 
2012 	if (likely(n <= LOAD_AVG_PERIOD))
2013 		return runnable_avg_yN_sum[n];
2014 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2015 		return LOAD_AVG_MAX;
2016 
2017 	/* Compute \Sum y^n combining precomputed values for y^i and \Sum y^j */
2018 	do {
2019 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2020 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2021 
2022 		n -= LOAD_AVG_PERIOD;
2023 	} while (n > LOAD_AVG_PERIOD);
2024 
2025 	contrib = decay_load(contrib, n);
2026 	return contrib + runnable_avg_yN_sum[n];
2027 }
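
/*
 * Illustrative sketch, not kernel code: a floating-point reference for
 * __compute_runnable_contrib() above.  The result tracks the
 * runnable_avg_yN_sum[] table to within a few counts of rounding and
 * levels off for large n, which is why the kernel simply saturates the
 * result at LOAD_AVG_MAX once n reaches LOAD_AVG_MAX_N.
 */
static unsigned int naive_runnable_contrib(unsigned int n)
{
	const double y = 0.97857206;	/* y^32 == 0.5 */
	double term = 1024.0, sum = 0.0;
	unsigned int k;

	for (k = 1; k <= n; k++) {
		term *= y;	/* term is now 1024 * y^k */
		sum += term;
	}

	return (unsigned int)sum;
}
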
2028 
2029 /*
2030  * We can represent the historical contribution to runnable average as the
2031  * coefficients of a geometric series.  To do this we sub-divide our runnable
2032  * history into segments of approximately 1ms (1024us); label the segment that
2033  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2034  *
2035  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2036  *      p0            p1           p2
2037  *     (now)       (~1ms ago)  (~2ms ago)
2038  *
2039  * Let u_i denote the fraction of p_i that the entity was runnable.
2040  *
2041  * We then designate the fractions u_i as our coefficients, yielding the
2042  * following representation of historical load:
2043  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2044  *
2045  * We choose y based on the width of a reasonable scheduling period, fixing:
2046  *   y^32 = 0.5
2047  *
2048  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2049  * approximately half as much as the contribution to load within the last ms
2050  * (u_0).
2051  *
2052  * When a period "rolls over" and we have a new u_0`, multiplying the previous
2053  * sum again by y is sufficient to update:
2054  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2055  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2056  */
2057 static __always_inline int __update_entity_runnable_avg(u64 now,
2058 							struct sched_avg *sa,
2059 							int runnable)
2060 {
2061 	u64 delta, periods;
2062 	u32 runnable_contrib;
2063 	int delta_w, decayed = 0;
2064 
2065 	delta = now - sa->last_runnable_update;
2066 	/*
2067 	 * This should only happen when time goes backwards, which it
2068 	 * unfortunately does during sched clock init when we swap over to TSC.
2069 	 */
2070 	if ((s64)delta < 0) {
2071 		sa->last_runnable_update = now;
2072 		return 0;
2073 	}
2074 
2075 	/*
2076 	 * Use 1024ns as the unit of measurement since it's a reasonable
2077 	 * approximation of 1us and fast to compute.
2078 	 */
2079 	delta >>= 10;
2080 	if (!delta)
2081 		return 0;
2082 	sa->last_runnable_update = now;
2083 
2084 	/* delta_w is the amount already accumulated against our next period */
2085 	delta_w = sa->runnable_avg_period % 1024;
2086 	if (delta + delta_w >= 1024) {
2087 		/* period roll-over */
2088 		decayed = 1;
2089 
2090 		/*
2091 		 * Now that we know we're crossing a period boundary, figure
2092 		 * out how much from delta we need to complete the current
2093 		 * period and accrue it.
2094 		 */
2095 		delta_w = 1024 - delta_w;
2096 		if (runnable)
2097 			sa->runnable_avg_sum += delta_w;
2098 		sa->runnable_avg_period += delta_w;
2099 
2100 		delta -= delta_w;
2101 
2102 		/* Figure out how many additional periods this update spans */
2103 		periods = delta / 1024;
2104 		delta %= 1024;
2105 
2106 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2107 						  periods + 1);
2108 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
2109 						     periods + 1);
2110 
2111 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2112 		runnable_contrib = __compute_runnable_contrib(periods);
2113 		if (runnable)
2114 			sa->runnable_avg_sum += runnable_contrib;
2115 		sa->runnable_avg_period += runnable_contrib;
2116 	}
2117 
2118 	/* Remainder of delta accrued against u_0` */
2119 	if (runnable)
2120 		sa->runnable_avg_sum += delta;
2121 	sa->runnable_avg_period += delta;
2122 
2123 	return decayed;
2124 }
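
/*
 * Illustrative sketch, not kernel code: the per-period update performed
 * by __update_entity_runnable_avg() above, in floating point.  Each
 * ~1ms period the accumulated history is scaled by y (y^32 == 0.5) and
 * the new period contributes up to 1024, so runnable_sum/total_period
 * converges on the fraction of recent time the entity was runnable.
 */
struct toy_avg {
	double runnable_sum;
	double total_period;
};

static void toy_period_rollover(struct toy_avg *a, int was_runnable)
{
	const double y = 0.97857206;	/* per-period decay, y^32 == 0.5 */

	a->runnable_sum = a->runnable_sum * y + (was_runnable ? 1024.0 : 0.0);
	a->total_period = a->total_period * y + 1024.0;
}
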
2125 
2126 /* Synchronize an entity's decay with its parenting cfs_rq.*/
2127 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2128 {
2129 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2130 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
2131 
2132 	decays -= se->avg.decay_count;
2133 	if (!decays)
2134 		return 0;
2135 
2136 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2137 	se->avg.decay_count = 0;
2138 
2139 	return decays;
2140 }
2141 
2142 #ifdef CONFIG_FAIR_GROUP_SCHED
2143 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2144 						 int force_update)
2145 {
2146 	struct task_group *tg = cfs_rq->tg;
2147 	long tg_contrib;
2148 
2149 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2150 	tg_contrib -= cfs_rq->tg_load_contrib;
2151 
2152 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2153 		atomic_long_add(tg_contrib, &tg->load_avg);
2154 		cfs_rq->tg_load_contrib += tg_contrib;
2155 	}
2156 }
2157 
2158 /*
2159  * Aggregate cfs_rq runnable averages into an equivalent task_group
2160  * representation for computing load contributions.
2161  */
2162 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2163 						  struct cfs_rq *cfs_rq)
2164 {
2165 	struct task_group *tg = cfs_rq->tg;
2166 	long contrib;
2167 
2168 	/* The fraction of a cpu used by this cfs_rq */
2169 	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2170 			  sa->runnable_avg_period + 1);
2171 	contrib -= cfs_rq->tg_runnable_contrib;
2172 
2173 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2174 		atomic_add(contrib, &tg->runnable_avg);
2175 		cfs_rq->tg_runnable_contrib += contrib;
2176 	}
2177 }
2178 
2179 static inline void __update_group_entity_contrib(struct sched_entity *se)
2180 {
2181 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2182 	struct task_group *tg = cfs_rq->tg;
2183 	int runnable_avg;
2184 
2185 	u64 contrib;
2186 
2187 	contrib = cfs_rq->tg_load_contrib * tg->shares;
2188 	se->avg.load_avg_contrib = div_u64(contrib,
2189 				     atomic_long_read(&tg->load_avg) + 1);
2190 
2191 	/*
2192 	 * For group entities we need to compute a correction term in the case
2193 	 * that they are consuming <1 cpu so that we would contribute the same
2194 	 * load as a task of equal weight.
2195 	 *
2196 	 * Explicitly co-ordinating this measurement would be expensive, but
2197 	 * fortunately the sum of each cpu's contribution forms a usable
2198 	 * lower-bound on the true value.
2199 	 *
2200 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
2201 	 * (and the sum represents the true value) or they overlap and we are
2202 	 * understating by the aggregate of their overlap.
2203 	 *
2204 	 * Extending this to N cpus, for a given overlap, the maximum amount we
2205 	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
2206 	 * cpus that overlap for this interval and w_i is the interval width.
2207 	 *
2208 	 * On a small machine, the first term is well-bounded, which bounds the
2209 	 * total error since w_i is a subset of the period.  Whereas on a
2210 	 * larger machine, while this first term can be larger, if w_i is of
2211 	 * consequential size then n_i*w_i is guaranteed to quickly converge to
2212 	 * our upper bound of 1-cpu.
2213 	 */
2214 	runnable_avg = atomic_read(&tg->runnable_avg);
2215 	if (runnable_avg < NICE_0_LOAD) {
2216 		se->avg.load_avg_contrib *= runnable_avg;
2217 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2218 	}
2219 }
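
/*
 * Illustrative sketch, not kernel code: the <1 cpu correction above with
 * a percentage standing in for the NICE_0_LOAD fixed point.  A group
 * whose hierarchy is runnable only 25% of the time ends up contributing
 * roughly a quarter of the load it would contribute if fully runnable.
 */
static unsigned long toy_group_entity_contrib(unsigned long tg_shares,
					      unsigned long cfs_rq_load,
					      unsigned long tg_load,
					      unsigned int runnable_pct)
{
	unsigned long contrib = tg_shares * cfs_rq_load / (tg_load + 1);

	if (runnable_pct < 100)		/* consuming less than one cpu */
		contrib = contrib * runnable_pct / 100;

	return contrib;
}
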
2220 #else
2221 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2222 						 int force_update) {}
2223 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2224 						  struct cfs_rq *cfs_rq) {}
2225 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2226 #endif
2227 
2228 static inline void __update_task_entity_contrib(struct sched_entity *se)
2229 {
2230 	u32 contrib;
2231 
2232 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2233 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2234 	contrib /= (se->avg.runnable_avg_period + 1);
2235 	se->avg.load_avg_contrib = scale_load(contrib);
2236 }
2237 
2238 /* Compute the current contribution to load_avg by se, return any delta */
2239 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2240 {
2241 	long old_contrib = se->avg.load_avg_contrib;
2242 
2243 	if (entity_is_task(se)) {
2244 		__update_task_entity_contrib(se);
2245 	} else {
2246 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2247 		__update_group_entity_contrib(se);
2248 	}
2249 
2250 	return se->avg.load_avg_contrib - old_contrib;
2251 }
2252 
2253 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2254 						 long load_contrib)
2255 {
2256 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
2257 		cfs_rq->blocked_load_avg -= load_contrib;
2258 	else
2259 		cfs_rq->blocked_load_avg = 0;
2260 }
2261 
2262 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2263 
2264 /* Update a sched_entity's runnable average */
2265 static inline void update_entity_load_avg(struct sched_entity *se,
2266 					  int update_cfs_rq)
2267 {
2268 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2269 	long contrib_delta;
2270 	u64 now;
2271 
2272 	/*
2273 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
2274 	 * case they are the parent of a throttled hierarchy.
2275 	 */
2276 	if (entity_is_task(se))
2277 		now = cfs_rq_clock_task(cfs_rq);
2278 	else
2279 		now = cfs_rq_clock_task(group_cfs_rq(se));
2280 
2281 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2282 		return;
2283 
2284 	contrib_delta = __update_entity_load_avg_contrib(se);
2285 
2286 	if (!update_cfs_rq)
2287 		return;
2288 
2289 	if (se->on_rq)
2290 		cfs_rq->runnable_load_avg += contrib_delta;
2291 	else
2292 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2293 }
2294 
2295 /*
2296  * Decay the load contributed by all blocked children and account this so that
2297  * their contribution may be appropriately discounted when they wake up.
2298  */
2299 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2300 {
2301 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2302 	u64 decays;
2303 
2304 	decays = now - cfs_rq->last_decay;
2305 	if (!decays && !force_update)
2306 		return;
2307 
2308 	if (atomic_long_read(&cfs_rq->removed_load)) {
2309 		unsigned long removed_load;
2310 		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2311 		subtract_blocked_load_contrib(cfs_rq, removed_load);
2312 	}
2313 
2314 	if (decays) {
2315 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2316 						      decays);
2317 		atomic64_add(decays, &cfs_rq->decay_counter);
2318 		cfs_rq->last_decay = now;
2319 	}
2320 
2321 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2322 }
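
/*
 * Illustrative sketch, not kernel code: the 2^20 trick used above and in
 * enqueue_entity_load_avg() below.  The decay machinery works in ~1ms
 * periods and 1ms is roughly 2^20 ns, so a shift converts cheaply between
 * the nanosecond task clock and a count of decay periods.
 */
static unsigned long long toy_ns_to_decay_periods(unsigned long long ns)
{
	return ns >> 20;		/* ~1ms granularity */
}

static unsigned long long toy_decay_periods_to_ns(unsigned long long periods)
{
	return periods << 20;		/* ~periods milliseconds, in ns */
}
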
2323 
2324 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2325 {
2326 	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2327 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
2328 }
2329 
2330 /* Add the load generated by se into cfs_rq's child load-average */
2331 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2332 						  struct sched_entity *se,
2333 						  int wakeup)
2334 {
2335 	/*
2336 	 * We track migrations using entity decay_count <= 0; on a wake-up
2337 	 * migration we use a negative decay count to track the remote decays
2338 	 * accumulated while sleeping.
2339 	 *
2340 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0; they
2341 	 * are seen by enqueue_entity_load_avg() as a migration with an already
2342 	 * constructed load_avg_contrib.
2343 	 */
2344 	if (unlikely(se->avg.decay_count <= 0)) {
2345 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2346 		if (se->avg.decay_count) {
2347 			/*
2348 			 * In a wake-up migration we have to approximate the
2349 			 * time sleeping.  This is because we can't synchronize
2350 			 * clock_task between the two cpus, and it is not
2351 			 * guaranteed to be read-safe.  Instead, we can
2352 			 * approximate this using our carried decays, which are
2353 			 * explicitly atomically readable.
2354 			 */
2355 			se->avg.last_runnable_update -= (-se->avg.decay_count)
2356 							<< 20;
2357 			update_entity_load_avg(se, 0);
2358 			/* Indicate that we're now synchronized and on-rq */
2359 			se->avg.decay_count = 0;
2360 		}
2361 		wakeup = 0;
2362 	} else {
2363 		__synchronize_entity_decay(se);
2364 	}
2365 
2366 	/* migrated tasks did not contribute to our blocked load */
2367 	if (wakeup) {
2368 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2369 		update_entity_load_avg(se, 0);
2370 	}
2371 
2372 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2373 	/* we force update consideration on load-balancer moves */
2374 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2375 }
2376 
2377 /*
2378  * Remove se's load from this cfs_rq child load-average; if the entity is
2379  * transitioning to a blocked state we track its projected decay using
2380  * blocked_load_avg.
2381  */
2382 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2383 						  struct sched_entity *se,
2384 						  int sleep)
2385 {
2386 	update_entity_load_avg(se, 1);
2387 	/* we force update consideration on load-balancer moves */
2388 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
2389 
2390 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2391 	if (sleep) {
2392 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2393 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2394 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2395 }
2396 
2397 /*
2398  * Update the rq's load with the elapsed running time before entering
2399  * idle. if the last scheduled task is not a CFS task, idle_enter will
2400  * idle. If the last scheduled task is not a CFS task, idle_enter will
2401  */
2402 void idle_enter_fair(struct rq *this_rq)
2403 {
2404 	update_rq_runnable_avg(this_rq, 1);
2405 }
2406 
2407 /*
2408  * Update the rq's load with the elapsed idle time before a task is
2409  * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
2410  * be the only way to update the runnable statistic.
2411  */
2412 void idle_exit_fair(struct rq *this_rq)
2413 {
2414 	update_rq_runnable_avg(this_rq, 0);
2415 }
2416 
2417 #else
2418 static inline void update_entity_load_avg(struct sched_entity *se,
2419 					  int update_cfs_rq) {}
2420 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2421 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2422 					   struct sched_entity *se,
2423 					   int wakeup) {}
2424 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2425 					   struct sched_entity *se,
2426 					   int sleep) {}
2427 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 					      int force_update) {}
2429 #endif
2430 
2431 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432 {
2433 #ifdef CONFIG_SCHEDSTATS
2434 	struct task_struct *tsk = NULL;
2435 
2436 	if (entity_is_task(se))
2437 		tsk = task_of(se);
2438 
2439 	if (se->statistics.sleep_start) {
2440 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2441 
2442 		if ((s64)delta < 0)
2443 			delta = 0;
2444 
2445 		if (unlikely(delta > se->statistics.sleep_max))
2446 			se->statistics.sleep_max = delta;
2447 
2448 		se->statistics.sleep_start = 0;
2449 		se->statistics.sum_sleep_runtime += delta;
2450 
2451 		if (tsk) {
2452 			account_scheduler_latency(tsk, delta >> 10, 1);
2453 			trace_sched_stat_sleep(tsk, delta);
2454 		}
2455 	}
2456 	if (se->statistics.block_start) {
2457 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2458 
2459 		if ((s64)delta < 0)
2460 			delta = 0;
2461 
2462 		if (unlikely(delta > se->statistics.block_max))
2463 			se->statistics.block_max = delta;
2464 
2465 		se->statistics.block_start = 0;
2466 		se->statistics.sum_sleep_runtime += delta;
2467 
2468 		if (tsk) {
2469 			if (tsk->in_iowait) {
2470 				se->statistics.iowait_sum += delta;
2471 				se->statistics.iowait_count++;
2472 				trace_sched_stat_iowait(tsk, delta);
2473 			}
2474 
2475 			trace_sched_stat_blocked(tsk, delta);
2476 
2477 			/*
2478 			 * Blocking time is in units of nanosecs, so shift by
2479 			 * 20 to get a milliseconds-range estimation of the
2480 			 * amount of time that the task spent sleeping:
2481 			 */
2482 			if (unlikely(prof_on == SLEEP_PROFILING)) {
2483 				profile_hits(SLEEP_PROFILING,
2484 						(void *)get_wchan(tsk),
2485 						delta >> 20);
2486 			}
2487 			account_scheduler_latency(tsk, delta >> 10, 0);
2488 		}
2489 	}
2490 #endif
2491 }
2492 
2493 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2494 {
2495 #ifdef CONFIG_SCHED_DEBUG
2496 	s64 d = se->vruntime - cfs_rq->min_vruntime;
2497 
2498 	if (d < 0)
2499 		d = -d;
2500 
2501 	if (d > 3*sysctl_sched_latency)
2502 		schedstat_inc(cfs_rq, nr_spread_over);
2503 #endif
2504 }
2505 
2506 static void
2507 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2508 {
2509 	u64 vruntime = cfs_rq->min_vruntime;
2510 
2511 	/*
2512 	 * The 'current' period is already promised to the current tasks;
2513 	 * however, the extra weight of the new task will slow them down a
2514 	 * little, so place the new task such that it fits in the slot that
2515 	 * stays open at the end.
2516 	 */
2517 	if (initial && sched_feat(START_DEBIT))
2518 		vruntime += sched_vslice(cfs_rq, se);
2519 
2520 	/* sleeps up to a single latency don't count. */
2521 	if (!initial) {
2522 		unsigned long thresh = sysctl_sched_latency;
2523 
2524 		/*
2525 		 * Halve their sleep time's effect, to allow
2526 		 * for a gentler effect of sleepers:
2527 		 */
2528 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
2529 			thresh >>= 1;
2530 
2531 		vruntime -= thresh;
2532 	}
2533 
2534 	/* ensure we never gain time by being placed backwards. */
2535 	se->vruntime = max_vruntime(se->vruntime, vruntime);
2536 }
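
/*
 * Illustrative sketch, not kernel code (and ignoring vruntime wrap): the
 * sleeper credit applied by place_entity() above.  With
 * GENTLE_FAIR_SLEEPERS and a 6ms sched_latency a waking task is placed at
 * most ~3ms of vruntime behind min_vruntime, and never earlier than where
 * it already was.
 */
static unsigned long long toy_place_sleeper(unsigned long long min_vruntime,
					    unsigned long long se_vruntime)
{
	unsigned long long thresh = 6000000ULL >> 1;	/* gentle: half of 6ms */
	unsigned long long target = min_vruntime - thresh;

	return target > se_vruntime ? target : se_vruntime;
}
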
2537 
2538 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2539 
2540 static void
2541 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2542 {
2543 	/*
2544 	 * Update the normalized vruntime before updating min_vruntime
2545 	 * by calling update_curr().
2546 	 */
2547 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2548 		se->vruntime += cfs_rq->min_vruntime;
2549 
2550 	/*
2551 	 * Update run-time statistics of the 'current'.
2552 	 */
2553 	update_curr(cfs_rq);
2554 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2555 	account_entity_enqueue(cfs_rq, se);
2556 	update_cfs_shares(cfs_rq);
2557 
2558 	if (flags & ENQUEUE_WAKEUP) {
2559 		place_entity(cfs_rq, se, 0);
2560 		enqueue_sleeper(cfs_rq, se);
2561 	}
2562 
2563 	update_stats_enqueue(cfs_rq, se);
2564 	check_spread(cfs_rq, se);
2565 	if (se != cfs_rq->curr)
2566 		__enqueue_entity(cfs_rq, se);
2567 	se->on_rq = 1;
2568 
2569 	if (cfs_rq->nr_running == 1) {
2570 		list_add_leaf_cfs_rq(cfs_rq);
2571 		check_enqueue_throttle(cfs_rq);
2572 	}
2573 }
2574 
2575 static void __clear_buddies_last(struct sched_entity *se)
2576 {
2577 	for_each_sched_entity(se) {
2578 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 		if (cfs_rq->last == se)
2580 			cfs_rq->last = NULL;
2581 		else
2582 			break;
2583 	}
2584 }
2585 
2586 static void __clear_buddies_next(struct sched_entity *se)
2587 {
2588 	for_each_sched_entity(se) {
2589 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 		if (cfs_rq->next == se)
2591 			cfs_rq->next = NULL;
2592 		else
2593 			break;
2594 	}
2595 }
2596 
2597 static void __clear_buddies_skip(struct sched_entity *se)
2598 {
2599 	for_each_sched_entity(se) {
2600 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 		if (cfs_rq->skip == se)
2602 			cfs_rq->skip = NULL;
2603 		else
2604 			break;
2605 	}
2606 }
2607 
2608 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2609 {
2610 	if (cfs_rq->last == se)
2611 		__clear_buddies_last(se);
2612 
2613 	if (cfs_rq->next == se)
2614 		__clear_buddies_next(se);
2615 
2616 	if (cfs_rq->skip == se)
2617 		__clear_buddies_skip(se);
2618 }
2619 
2620 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2621 
2622 static void
2623 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2624 {
2625 	/*
2626 	 * Update run-time statistics of the 'current'.
2627 	 */
2628 	update_curr(cfs_rq);
2629 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2630 
2631 	update_stats_dequeue(cfs_rq, se);
2632 	if (flags & DEQUEUE_SLEEP) {
2633 #ifdef CONFIG_SCHEDSTATS
2634 		if (entity_is_task(se)) {
2635 			struct task_struct *tsk = task_of(se);
2636 
2637 			if (tsk->state & TASK_INTERRUPTIBLE)
2638 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
2639 			if (tsk->state & TASK_UNINTERRUPTIBLE)
2640 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
2641 		}
2642 #endif
2643 	}
2644 
2645 	clear_buddies(cfs_rq, se);
2646 
2647 	if (se != cfs_rq->curr)
2648 		__dequeue_entity(cfs_rq, se);
2649 	se->on_rq = 0;
2650 	account_entity_dequeue(cfs_rq, se);
2651 
2652 	/*
2653 	 * Normalize the entity after updating the min_vruntime because the
2654 	 * update can refer to the ->curr item and we need to reflect this
2655 	 * movement in our normalized position.
2656 	 */
2657 	if (!(flags & DEQUEUE_SLEEP))
2658 		se->vruntime -= cfs_rq->min_vruntime;
2659 
2660 	/* return excess runtime on last dequeue */
2661 	return_cfs_rq_runtime(cfs_rq);
2662 
2663 	update_min_vruntime(cfs_rq);
2664 	update_cfs_shares(cfs_rq);
2665 }
2666 
2667 /*
2668  * Preempt the current task with a newly woken task if needed:
2669  */
2670 static void
2671 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2672 {
2673 	unsigned long ideal_runtime, delta_exec;
2674 	struct sched_entity *se;
2675 	s64 delta;
2676 
2677 	ideal_runtime = sched_slice(cfs_rq, curr);
2678 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2679 	if (delta_exec > ideal_runtime) {
2680 		resched_task(rq_of(cfs_rq)->curr);
2681 		/*
2682 		 * The current task ran long enough, ensure it doesn't get
2683 		 * re-elected due to buddy favours.
2684 		 */
2685 		clear_buddies(cfs_rq, curr);
2686 		return;
2687 	}
2688 
2689 	/*
2690 	 * Ensure that a task that missed wakeup preemption by a
2691 	 * narrow margin doesn't have to wait for a full slice.
2692 	 * This also mitigates buddy induced latencies under load.
2693 	 */
2694 	if (delta_exec < sysctl_sched_min_granularity)
2695 		return;
2696 
2697 	se = __pick_first_entity(cfs_rq);
2698 	delta = curr->vruntime - se->vruntime;
2699 
2700 	if (delta < 0)
2701 		return;
2702 
2703 	if (delta > ideal_runtime)
2704 		resched_task(rq_of(cfs_rq)->curr);
2705 }
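
/*
 * Illustrative sketch, not kernel code: the two tests check_preempt_tick()
 * above applies.  The running task is preempted either once it exceeds its
 * ideal slice, or once it has run at least the minimum granularity and its
 * vruntime leads the leftmost task by more than a slice.
 */
static int toy_tick_preempt(unsigned long long delta_exec,
			    unsigned long long ideal_runtime,
			    long long vruntime_lead,
			    unsigned long long min_granularity)
{
	if (delta_exec > ideal_runtime)
		return 1;
	if (delta_exec < min_granularity)
		return 0;

	return vruntime_lead > (long long)ideal_runtime;
}
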
2706 
2707 static void
2708 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2709 {
2710 	/* 'current' is not kept within the tree. */
2711 	if (se->on_rq) {
2712 		/*
2713 		 * Any task has to be enqueued before it gets to execute on
2714 		 * a CPU. So account for the time it spent waiting on the
2715 		 * runqueue.
2716 		 */
2717 		update_stats_wait_end(cfs_rq, se);
2718 		__dequeue_entity(cfs_rq, se);
2719 	}
2720 
2721 	update_stats_curr_start(cfs_rq, se);
2722 	cfs_rq->curr = se;
2723 #ifdef CONFIG_SCHEDSTATS
2724 	/*
2725 	 * Track our maximum slice length, if the CPU's load is at
2726 	 * least twice that of our own weight (i.e. don't track it
2727 	 * when there are only lesser-weight tasks around):
2728 	 */
2729 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2730 		se->statistics.slice_max = max(se->statistics.slice_max,
2731 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
2732 	}
2733 #endif
2734 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
2735 }
2736 
2737 static int
2738 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2739 
2740 /*
2741  * Pick the next process, keeping these things in mind, in this order:
2742  * 1) keep things fair between processes/task groups
2743  * 2) pick the "next" process, since someone really wants that to run
2744  * 3) pick the "last" process, for cache locality
2745  * 4) do not run the "skip" process, if something else is available
2746  */
2747 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2748 {
2749 	struct sched_entity *se = __pick_first_entity(cfs_rq);
2750 	struct sched_entity *left = se;
2751 
2752 	/*
2753 	 * Avoid running the skip buddy, if running something else can
2754 	 * be done without getting too unfair.
2755 	 */
2756 	if (cfs_rq->skip == se) {
2757 		struct sched_entity *second = __pick_next_entity(se);
2758 		if (second && wakeup_preempt_entity(second, left) < 1)
2759 			se = second;
2760 	}
2761 
2762 	/*
2763 	 * Prefer last buddy, try to return the CPU to a preempted task.
2764 	 */
2765 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2766 		se = cfs_rq->last;
2767 
2768 	/*
2769 	 * Someone really wants this to run. If it's not unfair, run it.
2770 	 */
2771 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2772 		se = cfs_rq->next;
2773 
2774 	clear_buddies(cfs_rq, se);
2775 
2776 	return se;
2777 }
2778 
2779 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 
2781 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782 {
2783 	/*
2784 	 * If still on the runqueue then deactivate_task()
2785 	 * was not called and update_curr() has to be done:
2786 	 */
2787 	if (prev->on_rq)
2788 		update_curr(cfs_rq);
2789 
2790 	/* throttle cfs_rqs exceeding runtime */
2791 	check_cfs_rq_runtime(cfs_rq);
2792 
2793 	check_spread(cfs_rq, prev);
2794 	if (prev->on_rq) {
2795 		update_stats_wait_start(cfs_rq, prev);
2796 		/* Put 'current' back into the tree. */
2797 		__enqueue_entity(cfs_rq, prev);
2798 		/* in !on_rq case, update occurred at dequeue */
2799 		update_entity_load_avg(prev, 1);
2800 	}
2801 	cfs_rq->curr = NULL;
2802 }
2803 
2804 static void
2805 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2806 {
2807 	/*
2808 	 * Update run-time statistics of the 'current'.
2809 	 */
2810 	update_curr(cfs_rq);
2811 
2812 	/*
2813 	 * Ensure that runnable average is periodically updated.
2814 	 */
2815 	update_entity_load_avg(curr, 1);
2816 	update_cfs_rq_blocked_load(cfs_rq, 1);
2817 	update_cfs_shares(cfs_rq);
2818 
2819 #ifdef CONFIG_SCHED_HRTICK
2820 	/*
2821 	 * queued ticks are scheduled to match the slice, so don't bother
2822 	 * validating it and just reschedule.
2823 	 */
2824 	if (queued) {
2825 		resched_task(rq_of(cfs_rq)->curr);
2826 		return;
2827 	}
2828 	/*
2829 	 * don't let the period tick interfere with the hrtick preemption
2830 	 */
2831 	if (!sched_feat(DOUBLE_TICK) &&
2832 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2833 		return;
2834 #endif
2835 
2836 	if (cfs_rq->nr_running > 1)
2837 		check_preempt_tick(cfs_rq, curr);
2838 }
2839 
2840 
2841 /**************************************************
2842  * CFS bandwidth control machinery
2843  */
2844 
2845 #ifdef CONFIG_CFS_BANDWIDTH
2846 
2847 #ifdef HAVE_JUMP_LABEL
2848 static struct static_key __cfs_bandwidth_used;
2849 
2850 static inline bool cfs_bandwidth_used(void)
2851 {
2852 	return static_key_false(&__cfs_bandwidth_used);
2853 }
2854 
2855 void cfs_bandwidth_usage_inc(void)
2856 {
2857 	static_key_slow_inc(&__cfs_bandwidth_used);
2858 }
2859 
2860 void cfs_bandwidth_usage_dec(void)
2861 {
2862 	static_key_slow_dec(&__cfs_bandwidth_used);
2863 }
2864 #else /* HAVE_JUMP_LABEL */
2865 static bool cfs_bandwidth_used(void)
2866 {
2867 	return true;
2868 }
2869 
2870 void cfs_bandwidth_usage_inc(void) {}
2871 void cfs_bandwidth_usage_dec(void) {}
2872 #endif /* HAVE_JUMP_LABEL */
2873 
2874 /*
2875  * default period for cfs group bandwidth.
2876  * default: 0.1s, units: nanoseconds
2877  */
2878 static inline u64 default_cfs_period(void)
2879 {
2880 	return 100000000ULL;
2881 }
2882 
2883 static inline u64 sched_cfs_bandwidth_slice(void)
2884 {
2885 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2886 }
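
/*
 * Illustrative sketch, not kernel code: how a group's quota drains in
 * slices.  Assuming the default 5ms sysctl_sched_cfs_bandwidth_slice, a
 * group given 25ms of quota per 100ms period can satisfy at most five
 * full slice requests from its cfs_rqs in that period before
 * assign_cfs_rq_runtime() starts coming back empty.
 */
static unsigned long toy_full_slices_per_period(unsigned long long quota_ns,
						unsigned long long slice_ns)
{
	return (unsigned long)(quota_ns / slice_ns);
}
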
2887 
2888 /*
2889  * Replenish runtime according to assigned quota and update expiration time.
2890  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2891  * additional synchronization around rq->lock.
2892  *
2893  * requires cfs_b->lock
2894  */
2895 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2896 {
2897 	u64 now;
2898 
2899 	if (cfs_b->quota == RUNTIME_INF)
2900 		return;
2901 
2902 	now = sched_clock_cpu(smp_processor_id());
2903 	cfs_b->runtime = cfs_b->quota;
2904 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2905 }
2906 
2907 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2908 {
2909 	return &tg->cfs_bandwidth;
2910 }
2911 
2912 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2913 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2914 {
2915 	if (unlikely(cfs_rq->throttle_count))
2916 		return cfs_rq->throttled_clock_task;
2917 
2918 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2919 }
2920 
2921 /* returns 0 on failure to allocate runtime */
2922 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2923 {
2924 	struct task_group *tg = cfs_rq->tg;
2925 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2926 	u64 amount = 0, min_amount, expires;
2927 
2928 	/* note: this is a positive sum as runtime_remaining <= 0 */
2929 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2930 
2931 	raw_spin_lock(&cfs_b->lock);
2932 	if (cfs_b->quota == RUNTIME_INF)
2933 		amount = min_amount;
2934 	else {
2935 		/*
2936 		 * If the bandwidth pool has become inactive, then at least one
2937 		 * period must have elapsed since the last consumption.
2938 		 * Refresh the global state and ensure the bandwidth timer becomes
2939 		 * active.
2940 		 */
2941 		if (!cfs_b->timer_active) {
2942 			__refill_cfs_bandwidth_runtime(cfs_b);
2943 			__start_cfs_bandwidth(cfs_b);
2944 		}
2945 
2946 		if (cfs_b->runtime > 0) {
2947 			amount = min(cfs_b->runtime, min_amount);
2948 			cfs_b->runtime -= amount;
2949 			cfs_b->idle = 0;
2950 		}
2951 	}
2952 	expires = cfs_b->runtime_expires;
2953 	raw_spin_unlock(&cfs_b->lock);
2954 
2955 	cfs_rq->runtime_remaining += amount;
2956 	/*
2957 	 * we may have advanced our local expiration to account for allowed
2958 	 * spread between our sched_clock and the one on which runtime was
2959 	 * issued.
2960 	 */
2961 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2962 		cfs_rq->runtime_expires = expires;
2963 
2964 	return cfs_rq->runtime_remaining > 0;
2965 }
2966 
2967 /*
2968  * Note: This depends on the synchronization provided by sched_clock and the
2969  * fact that rq->clock snapshots this value.
2970  */
2971 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2972 {
2973 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2974 
2975 	/* if the deadline is ahead of our clock, nothing to do */
2976 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2977 		return;
2978 
2979 	if (cfs_rq->runtime_remaining < 0)
2980 		return;
2981 
2982 	/*
2983 	 * If the local deadline has passed we have to consider the
2984 	 * possibility that our sched_clock is 'fast' and the global deadline
2985 	 * has not truly expired.
2986 	 *
2987 	 * Fortunately we can determine whether this is the case by checking
2988 	 * whether the global deadline has advanced.
2989 	 */
2990 
2991 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2992 		/* extend local deadline, drift is bounded above by 2 ticks */
2993 		cfs_rq->runtime_expires += TICK_NSEC;
2994 	} else {
2995 		/* global deadline is ahead, expiration has passed */
2996 		cfs_rq->runtime_remaining = 0;
2997 	}
2998 }
2999 
3000 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3001 {
3002 	/* dock delta_exec before expiring quota (as it could span periods) */
3003 	cfs_rq->runtime_remaining -= delta_exec;
3004 	expire_cfs_rq_runtime(cfs_rq);
3005 
3006 	if (likely(cfs_rq->runtime_remaining > 0))
3007 		return;
3008 
3009 	/*
3010 	 * if we're unable to extend our runtime we resched so that the active
3011 	 * hierarchy can be throttled
3012 	 */
3013 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3014 		resched_task(rq_of(cfs_rq)->curr);
3015 }
3016 
3017 static __always_inline
3018 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3019 {
3020 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3021 		return;
3022 
3023 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3024 }
3025 
3026 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3027 {
3028 	return cfs_bandwidth_used() && cfs_rq->throttled;
3029 }
3030 
3031 /* check whether cfs_rq, or any parent, is throttled */
3032 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3033 {
3034 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3035 }
3036 
3037 /*
3038  * Ensure that neither of the group entities corresponding to src_cpu or
3039  * dest_cpu are members of a throttled hierarchy when performing group
3040  * load-balance operations.
3041  */
3042 static inline int throttled_lb_pair(struct task_group *tg,
3043 				    int src_cpu, int dest_cpu)
3044 {
3045 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3046 
3047 	src_cfs_rq = tg->cfs_rq[src_cpu];
3048 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3049 
3050 	return throttled_hierarchy(src_cfs_rq) ||
3051 	       throttled_hierarchy(dest_cfs_rq);
3052 }
3053 
3054 /* updated child weight may affect parent so we have to do this bottom up */
3055 static int tg_unthrottle_up(struct task_group *tg, void *data)
3056 {
3057 	struct rq *rq = data;
3058 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3059 
3060 	cfs_rq->throttle_count--;
3061 #ifdef CONFIG_SMP
3062 	if (!cfs_rq->throttle_count) {
3063 		/* adjust cfs_rq_clock_task() */
3064 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3065 					     cfs_rq->throttled_clock_task;
3066 	}
3067 #endif
3068 
3069 	return 0;
3070 }
3071 
3072 static int tg_throttle_down(struct task_group *tg, void *data)
3073 {
3074 	struct rq *rq = data;
3075 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3076 
3077 	/* group is entering throttled state, stop time */
3078 	if (!cfs_rq->throttle_count)
3079 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3080 	cfs_rq->throttle_count++;
3081 
3082 	return 0;
3083 }
3084 
3085 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3086 {
3087 	struct rq *rq = rq_of(cfs_rq);
3088 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3089 	struct sched_entity *se;
3090 	long task_delta, dequeue = 1;
3091 
3092 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3093 
3094 	/* freeze hierarchy runnable averages while throttled */
3095 	rcu_read_lock();
3096 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3097 	rcu_read_unlock();
3098 
3099 	task_delta = cfs_rq->h_nr_running;
3100 	for_each_sched_entity(se) {
3101 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3102 		/* throttled entity or throttle-on-deactivate */
3103 		if (!se->on_rq)
3104 			break;
3105 
3106 		if (dequeue)
3107 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3108 		qcfs_rq->h_nr_running -= task_delta;
3109 
3110 		if (qcfs_rq->load.weight)
3111 			dequeue = 0;
3112 	}
3113 
3114 	if (!se)
3115 		rq->nr_running -= task_delta;
3116 
3117 	cfs_rq->throttled = 1;
3118 	cfs_rq->throttled_clock = rq_clock(rq);
3119 	raw_spin_lock(&cfs_b->lock);
3120 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3121 	if (!cfs_b->timer_active)
3122 		__start_cfs_bandwidth(cfs_b);
3123 	raw_spin_unlock(&cfs_b->lock);
3124 }
3125 
3126 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3127 {
3128 	struct rq *rq = rq_of(cfs_rq);
3129 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3130 	struct sched_entity *se;
3131 	int enqueue = 1;
3132 	long task_delta;
3133 
3134 	se = cfs_rq->tg->se[cpu_of(rq)];
3135 
3136 	cfs_rq->throttled = 0;
3137 
3138 	update_rq_clock(rq);
3139 
3140 	raw_spin_lock(&cfs_b->lock);
3141 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3142 	list_del_rcu(&cfs_rq->throttled_list);
3143 	raw_spin_unlock(&cfs_b->lock);
3144 
3145 	/* update hierarchical throttle state */
3146 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3147 
3148 	if (!cfs_rq->load.weight)
3149 		return;
3150 
3151 	task_delta = cfs_rq->h_nr_running;
3152 	for_each_sched_entity(se) {
3153 		if (se->on_rq)
3154 			enqueue = 0;
3155 
3156 		cfs_rq = cfs_rq_of(se);
3157 		if (enqueue)
3158 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3159 		cfs_rq->h_nr_running += task_delta;
3160 
3161 		if (cfs_rq_throttled(cfs_rq))
3162 			break;
3163 	}
3164 
3165 	if (!se)
3166 		rq->nr_running += task_delta;
3167 
3168 	/* determine whether we need to wake up a potentially idle cpu */
3169 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3170 		resched_task(rq->curr);
3171 }
3172 
3173 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3174 		u64 remaining, u64 expires)
3175 {
3176 	struct cfs_rq *cfs_rq;
3177 	u64 runtime = remaining;
3178 
3179 	rcu_read_lock();
3180 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3181 				throttled_list) {
3182 		struct rq *rq = rq_of(cfs_rq);
3183 
3184 		raw_spin_lock(&rq->lock);
3185 		if (!cfs_rq_throttled(cfs_rq))
3186 			goto next;
3187 
3188 		runtime = -cfs_rq->runtime_remaining + 1;
3189 		if (runtime > remaining)
3190 			runtime = remaining;
3191 		remaining -= runtime;
3192 
3193 		cfs_rq->runtime_remaining += runtime;
3194 		cfs_rq->runtime_expires = expires;
3195 
3196 		/* we check whether we're throttled above */
3197 		if (cfs_rq->runtime_remaining > 0)
3198 			unthrottle_cfs_rq(cfs_rq);
3199 
3200 next:
3201 		raw_spin_unlock(&rq->lock);
3202 
3203 		if (!remaining)
3204 			break;
3205 	}
3206 	rcu_read_unlock();
3207 
3208 	return remaining;
3209 }
3210 
3211 /*
3212  * Responsible for refilling a task_group's bandwidth and unthrottling its
3213  * cfs_rqs as appropriate. If there has been no activity within the last
3214  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3215  * used to track this state.
3216  */
3217 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3218 {
3219 	u64 runtime, runtime_expires;
3220 	int idle = 1, throttled;
3221 
3222 	raw_spin_lock(&cfs_b->lock);
3223 	/* no need to continue the timer with no bandwidth constraint */
3224 	if (cfs_b->quota == RUNTIME_INF)
3225 		goto out_unlock;
3226 
3227 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3228 	/* idle depends on !throttled (for the case of a large deficit) */
3229 	idle = cfs_b->idle && !throttled;
3230 	cfs_b->nr_periods += overrun;
3231 
3232 	/* if we're going inactive then everything else can be deferred */
3233 	if (idle)
3234 		goto out_unlock;
3235 
3236 	/*
3237 	 * if we have relooped after returning idle once, we need to update our
3238 	 * status as actually running, so that other cpus doing
3239 	 * __start_cfs_bandwidth will stop trying to cancel us.
3240 	 */
3241 	cfs_b->timer_active = 1;
3242 
3243 	__refill_cfs_bandwidth_runtime(cfs_b);
3244 
3245 	if (!throttled) {
3246 		/* mark as potentially idle for the upcoming period */
3247 		cfs_b->idle = 1;
3248 		goto out_unlock;
3249 	}
3250 
3251 	/* account preceding periods in which throttling occurred */
3252 	cfs_b->nr_throttled += overrun;
3253 
3254 	/*
3255 	 * There are throttled entities so we must first use the new bandwidth
3256 	 * to unthrottle them before making it generally available.  This
3257 	 * ensures that all existing debts will be paid before a new cfs_rq is
3258 	 * allowed to run.
3259 	 */
3260 	runtime = cfs_b->runtime;
3261 	runtime_expires = cfs_b->runtime_expires;
3262 	cfs_b->runtime = 0;
3263 
3264 	/*
3265 	 * This check is repeated as we are holding onto the new bandwidth
3266 	 * while we unthrottle.  This can potentially race with an unthrottled
3267 	 * group trying to acquire new bandwidth from the global pool.
3268 	 */
3269 	while (throttled && runtime > 0) {
3270 		raw_spin_unlock(&cfs_b->lock);
3271 		/* we can't nest cfs_b->lock while distributing bandwidth */
3272 		runtime = distribute_cfs_runtime(cfs_b, runtime,
3273 						 runtime_expires);
3274 		raw_spin_lock(&cfs_b->lock);
3275 
3276 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3277 	}
3278 
3279 	/* return (any) remaining runtime */
3280 	cfs_b->runtime = runtime;
3281 	/*
3282 	 * While we are ensured activity in the period following an
3283 	 * unthrottle, this also covers the case in which the new bandwidth is
3284 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
3285 	 * timer to remain active while there are any throttled entities.)
3286 	 */
3287 	cfs_b->idle = 0;
3288 out_unlock:
3289 	if (idle)
3290 		cfs_b->timer_active = 0;
3291 	raw_spin_unlock(&cfs_b->lock);
3292 
3293 	return idle;
3294 }
3295 
3296 /* a cfs_rq won't donate quota below this amount */
3297 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3298 /* minimum remaining period time to redistribute slack quota */
3299 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3300 /* how long we wait to gather additional slack before distributing */
3301 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
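
/*
 * Illustrative sketch, not kernel code: the slack rule enforced by
 * __return_cfs_rq_runtime() below using the constants above.  A cfs_rq
 * going idle with 3ms of local runtime left donates 2ms back to the
 * global pool and keeps min_cfs_rq_runtime (1ms) for a quick re-wake;
 * with 1ms or less it donates nothing.
 */
static long long toy_slack_runtime(long long runtime_remaining_ns)
{
	const long long min_keep_ns = 1000000LL;	/* min_cfs_rq_runtime */
	long long slack = runtime_remaining_ns - min_keep_ns;

	return slack > 0 ? slack : 0;
}
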
3302 
3303 /*
3304  * Are we near the end of the current quota period?
3305  *
3306  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3307  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3308  * migrate_hrtimers, base is never cleared, so we are fine.
3309  */
3310 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3311 {
3312 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3313 	u64 remaining;
3314 
3315 	/* if the call-back is running a quota refresh is already occurring */
3316 	if (hrtimer_callback_running(refresh_timer))
3317 		return 1;
3318 
3319 	/* is a quota refresh about to occur? */
3320 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3321 	if (remaining < min_expire)
3322 		return 1;
3323 
3324 	return 0;
3325 }
3326 
3327 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3328 {
3329 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3330 
3331 	/* if there's a quota refresh soon don't bother with slack */
3332 	if (runtime_refresh_within(cfs_b, min_left))
3333 		return;
3334 
3335 	start_bandwidth_timer(&cfs_b->slack_timer,
3336 				ns_to_ktime(cfs_bandwidth_slack_period));
3337 }
3338 
3339 /* we know any runtime found here is valid as update_curr() precedes return */
3340 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3341 {
3342 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3343 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3344 
3345 	if (slack_runtime <= 0)
3346 		return;
3347 
3348 	raw_spin_lock(&cfs_b->lock);
3349 	if (cfs_b->quota != RUNTIME_INF &&
3350 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3351 		cfs_b->runtime += slack_runtime;
3352 
3353 		/* we are under rq->lock, defer unthrottling using a timer */
3354 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3355 		    !list_empty(&cfs_b->throttled_cfs_rq))
3356 			start_cfs_slack_bandwidth(cfs_b);
3357 	}
3358 	raw_spin_unlock(&cfs_b->lock);
3359 
3360 	/* even if it's not valid for return we don't want to try again */
3361 	cfs_rq->runtime_remaining -= slack_runtime;
3362 }
3363 
3364 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3365 {
3366 	if (!cfs_bandwidth_used())
3367 		return;
3368 
3369 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3370 		return;
3371 
3372 	__return_cfs_rq_runtime(cfs_rq);
3373 }
3374 
3375 /*
3376  * This is done with a timer (instead of inline with bandwidth return) since
3377  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3378  */
3379 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3380 {
3381 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3382 	u64 expires;
3383 
3384 	/* confirm we're still not at a refresh boundary */
3385 	raw_spin_lock(&cfs_b->lock);
3386 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3387 		raw_spin_unlock(&cfs_b->lock);
3388 		return;
3389 	}
3390 
3391 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3392 		runtime = cfs_b->runtime;
3393 		cfs_b->runtime = 0;
3394 	}
3395 	expires = cfs_b->runtime_expires;
3396 	raw_spin_unlock(&cfs_b->lock);
3397 
3398 	if (!runtime)
3399 		return;
3400 
3401 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3402 
3403 	raw_spin_lock(&cfs_b->lock);
3404 	if (expires == cfs_b->runtime_expires)
3405 		cfs_b->runtime = runtime;
3406 	raw_spin_unlock(&cfs_b->lock);
3407 }
3408 
3409 /*
3410  * When a group wakes up we want to make sure that its quota is not already
3411  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3412  * runtime, as update_curr() throttling cannot trigger until it's on-rq.
3413  */
3414 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3415 {
3416 	if (!cfs_bandwidth_used())
3417 		return;
3418 
3419 	/* an active group must be handled by the update_curr()->put() path */
3420 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3421 		return;
3422 
3423 	/* ensure the group is not already throttled */
3424 	if (cfs_rq_throttled(cfs_rq))
3425 		return;
3426 
3427 	/* update runtime allocation */
3428 	account_cfs_rq_runtime(cfs_rq, 0);
3429 	if (cfs_rq->runtime_remaining <= 0)
3430 		throttle_cfs_rq(cfs_rq);
3431 }
3432 
3433 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3434 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435 {
3436 	if (!cfs_bandwidth_used())
3437 		return;
3438 
3439 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 		return;
3441 
3442 	/*
3443 	 * it's possible for a throttled entity to be forced into a running
3444 	 * state (e.g. set_curr_task); in this case we're finished.
3445 	 */
3446 	if (cfs_rq_throttled(cfs_rq))
3447 		return;
3448 
3449 	throttle_cfs_rq(cfs_rq);
3450 }
3451 
3452 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3453 {
3454 	struct cfs_bandwidth *cfs_b =
3455 		container_of(timer, struct cfs_bandwidth, slack_timer);
3456 	do_sched_cfs_slack_timer(cfs_b);
3457 
3458 	return HRTIMER_NORESTART;
3459 }
3460 
3461 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3462 {
3463 	struct cfs_bandwidth *cfs_b =
3464 		container_of(timer, struct cfs_bandwidth, period_timer);
3465 	ktime_t now;
3466 	int overrun;
3467 	int idle = 0;
3468 
3469 	for (;;) {
3470 		now = hrtimer_cb_get_time(timer);
3471 		overrun = hrtimer_forward(timer, now, cfs_b->period);
3472 
3473 		if (!overrun)
3474 			break;
3475 
3476 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
3477 	}
3478 
3479 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3480 }
3481 
3482 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3483 {
3484 	raw_spin_lock_init(&cfs_b->lock);
3485 	cfs_b->runtime = 0;
3486 	cfs_b->quota = RUNTIME_INF;
3487 	cfs_b->period = ns_to_ktime(default_cfs_period());
3488 
3489 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3490 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3491 	cfs_b->period_timer.function = sched_cfs_period_timer;
3492 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3493 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
3494 }
3495 
3496 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3497 {
3498 	cfs_rq->runtime_enabled = 0;
3499 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
3500 }
3501 
3502 /* requires cfs_b->lock, may release to reprogram timer */
3503 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3504 {
3505 	/*
3506 	 * The timer may be active because we're trying to set a new bandwidth
3507 	 * period or because we're racing with the tear-down path
3508 	 * (timer_active==0 becomes visible before the hrtimer call-back
3509 	 * terminates).  In either case we ensure that it's re-programmed
3510 	 */
3511 	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3512 	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3513 		/* bounce the lock to allow do_sched_cfs_period_timer to run */
3514 		raw_spin_unlock(&cfs_b->lock);
3515 		cpu_relax();
3516 		raw_spin_lock(&cfs_b->lock);
3517 		/* if someone else restarted the timer then we're done */
3518 		if (cfs_b->timer_active)
3519 			return;
3520 	}
3521 
3522 	cfs_b->timer_active = 1;
3523 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3524 }
3525 
3526 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3527 {
3528 	hrtimer_cancel(&cfs_b->period_timer);
3529 	hrtimer_cancel(&cfs_b->slack_timer);
3530 }
3531 
3532 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3533 {
3534 	struct cfs_rq *cfs_rq;
3535 
3536 	for_each_leaf_cfs_rq(rq, cfs_rq) {
3537 		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3538 
3539 		if (!cfs_rq->runtime_enabled)
3540 			continue;
3541 
3542 		/*
3543 		 * clock_task is not advancing so we just need to make sure
3544 		 * there's some valid quota amount
3545 		 */
3546 		cfs_rq->runtime_remaining = cfs_b->quota;
3547 		if (cfs_rq_throttled(cfs_rq))
3548 			unthrottle_cfs_rq(cfs_rq);
3549 	}
3550 }
3551 
3552 #else /* CONFIG_CFS_BANDWIDTH */
3553 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3554 {
3555 	return rq_clock_task(rq_of(cfs_rq));
3556 }
3557 
3558 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3560 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 
3563 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3564 {
3565 	return 0;
3566 }
3567 
3568 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3569 {
3570 	return 0;
3571 }
3572 
3573 static inline int throttled_lb_pair(struct task_group *tg,
3574 				    int src_cpu, int dest_cpu)
3575 {
3576 	return 0;
3577 }
3578 
3579 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3580 
3581 #ifdef CONFIG_FAIR_GROUP_SCHED
3582 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3583 #endif
3584 
3585 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3586 {
3587 	return NULL;
3588 }
3589 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3590 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3591 
3592 #endif /* CONFIG_CFS_BANDWIDTH */
3593 
3594 /**************************************************
3595  * CFS operations on tasks:
3596  */
3597 
3598 #ifdef CONFIG_SCHED_HRTICK
3599 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3600 {
3601 	struct sched_entity *se = &p->se;
3602 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3603 
3604 	WARN_ON(task_rq(p) != rq);
3605 
3606 	if (cfs_rq->nr_running > 1) {
3607 		u64 slice = sched_slice(cfs_rq, se);
3608 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3609 		s64 delta = slice - ran;
3610 
3611 		if (delta < 0) {
3612 			if (rq->curr == p)
3613 				resched_task(p);
3614 			return;
3615 		}
3616 
3617 		/*
3618 		 * Don't schedule slices shorter than 10000ns, that just
3619 		 * doesn't make sense. Rely on vruntime for fairness.
3620 		 */
3621 		if (rq->curr != p)
3622 			delta = max_t(s64, 10000LL, delta);
3623 
3624 		hrtick_start(rq, delta);
3625 	}
3626 }
3627 
3628 /*
3629  * called from enqueue/dequeue and updates the hrtick when the
3630  * current task is from our class and nr_running is low enough
3631  * to matter.
3632  */
3633 static void hrtick_update(struct rq *rq)
3634 {
3635 	struct task_struct *curr = rq->curr;
3636 
3637 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3638 		return;
3639 
3640 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3641 		hrtick_start_fair(rq, curr);
3642 }
3643 #else /* !CONFIG_SCHED_HRTICK */
3644 static inline void
3645 hrtick_start_fair(struct rq *rq, struct task_struct *p)
3646 {
3647 }
3648 
3649 static inline void hrtick_update(struct rq *rq)
3650 {
3651 }
3652 #endif
3653 
3654 /*
3655  * The enqueue_task method is called before nr_running is
3656  * increased. Here we update the fair scheduling stats and
3657  * then put the task into the rbtree:
3658  */
3659 static void
3660 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3661 {
3662 	struct cfs_rq *cfs_rq;
3663 	struct sched_entity *se = &p->se;
3664 
3665 	for_each_sched_entity(se) {
3666 		if (se->on_rq)
3667 			break;
3668 		cfs_rq = cfs_rq_of(se);
3669 		enqueue_entity(cfs_rq, se, flags);
3670 
3671 		/*
3672 		 * end evaluation on encountering a throttled cfs_rq
3673 		 *
3674 		 * note: in the case of encountering a throttled cfs_rq we will
3675 		 * post the final h_nr_running increment below.
3676 		 */
3677 		if (cfs_rq_throttled(cfs_rq))
3678 			break;
3679 		cfs_rq->h_nr_running++;
3680 
3681 		flags = ENQUEUE_WAKEUP;
3682 	}
3683 
3684 	for_each_sched_entity(se) {
3685 		cfs_rq = cfs_rq_of(se);
3686 		cfs_rq->h_nr_running++;
3687 
3688 		if (cfs_rq_throttled(cfs_rq))
3689 			break;
3690 
3691 		update_cfs_shares(cfs_rq);
3692 		update_entity_load_avg(se, 1);
3693 	}
3694 
3695 	if (!se) {
3696 		update_rq_runnable_avg(rq, rq->nr_running);
3697 		inc_nr_running(rq);
3698 	}
3699 	hrtick_update(rq);
3700 }
3701 
3702 static void set_next_buddy(struct sched_entity *se);
3703 
3704 /*
3705  * The dequeue_task method is called before nr_running is
3706  * decreased. We remove the task from the rbtree and
3707  * update the fair scheduling stats:
3708  */
3709 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3710 {
3711 	struct cfs_rq *cfs_rq;
3712 	struct sched_entity *se = &p->se;
3713 	int task_sleep = flags & DEQUEUE_SLEEP;
3714 
3715 	for_each_sched_entity(se) {
3716 		cfs_rq = cfs_rq_of(se);
3717 		dequeue_entity(cfs_rq, se, flags);
3718 
3719 		/*
3720 		 * end evaluation on encountering a throttled cfs_rq
3721 		 *
3722 		 * note: in the case of encountering a throttled cfs_rq we will
3723 		 * post the final h_nr_running decrement below.
3724 		 */
3725 		if (cfs_rq_throttled(cfs_rq))
3726 			break;
3727 		cfs_rq->h_nr_running--;
3728 
3729 		/* Don't dequeue parent if it has other entities besides us */
3730 		if (cfs_rq->load.weight) {
3731 			/*
3732 			 * Bias pick_next to pick a task from this cfs_rq, as
3733 			 * p is sleeping when it is within its sched_slice.
3734 			 */
3735 			if (task_sleep && parent_entity(se))
3736 				set_next_buddy(parent_entity(se));
3737 
3738 			/* avoid re-evaluating load for this entity */
3739 			se = parent_entity(se);
3740 			break;
3741 		}
3742 		flags |= DEQUEUE_SLEEP;
3743 	}
3744 
3745 	for_each_sched_entity(se) {
3746 		cfs_rq = cfs_rq_of(se);
3747 		cfs_rq->h_nr_running--;
3748 
3749 		if (cfs_rq_throttled(cfs_rq))
3750 			break;
3751 
3752 		update_cfs_shares(cfs_rq);
3753 		update_entity_load_avg(se, 1);
3754 	}
3755 
3756 	if (!se) {
3757 		dec_nr_running(rq);
3758 		update_rq_runnable_avg(rq, 1);
3759 	}
3760 	hrtick_update(rq);
3761 }
3762 
3763 #ifdef CONFIG_SMP
3764 /* Used instead of source_load when we know the type == 0 */
3765 static unsigned long weighted_cpuload(const int cpu)
3766 {
3767 	return cpu_rq(cpu)->cfs.runnable_load_avg;
3768 }
3769 
3770 /*
3771  * Return a low guess at the load of a migration-source cpu weighted
3772  * according to the scheduling class and "nice" value.
3773  *
3774  * We want to under-estimate the load of migration sources, to
3775  * balance conservatively.
3776  */
3777 static unsigned long source_load(int cpu, int type)
3778 {
3779 	struct rq *rq = cpu_rq(cpu);
3780 	unsigned long total = weighted_cpuload(cpu);
3781 
3782 	if (type == 0 || !sched_feat(LB_BIAS))
3783 		return total;
3784 
3785 	return min(rq->cpu_load[type-1], total);
3786 }
3787 
3788 /*
3789  * Return a high guess at the load of a migration-target cpu weighted
3790  * according to the scheduling class and "nice" value.
3791  */
3792 static unsigned long target_load(int cpu, int type)
3793 {
3794 	struct rq *rq = cpu_rq(cpu);
3795 	unsigned long total = weighted_cpuload(cpu);
3796 
3797 	if (type == 0 || !sched_feat(LB_BIAS))
3798 		return total;
3799 
3800 	return max(rq->cpu_load[type-1], total);
3801 }
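/*
 * Illustration of the LB_BIAS effect (made-up numbers): if
 * weighted_cpuload(cpu) == 2048 and rq->cpu_load[type-1] == 1536,
 * source_load() returns 1536 (the min, under-estimating the source)
 * while target_load() returns 2048 (the max, over-estimating the
 * target).  Both biases make migration look less attractive, which is
 * the conservative behaviour we want.
 */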
3802 
3803 static unsigned long power_of(int cpu)
3804 {
3805 	return cpu_rq(cpu)->cpu_power;
3806 }
3807 
3808 static unsigned long cpu_avg_load_per_task(int cpu)
3809 {
3810 	struct rq *rq = cpu_rq(cpu);
3811 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3812 	unsigned long load_avg = rq->cfs.runnable_load_avg;
3813 
3814 	if (nr_running)
3815 		return load_avg / nr_running;
3816 
3817 	return 0;
3818 }
3819 
3820 static void record_wakee(struct task_struct *p)
3821 {
3822 	/*
3823 	 * Rough decay (wiping) for cost saving; don't worry
3824 	 * about the exact boundary, a really active task won't
3825 	 * care about the occasional loss.
3826 	 */
3827 	if (jiffies > current->wakee_flip_decay_ts + HZ) {
3828 		current->wakee_flips = 0;
3829 		current->wakee_flip_decay_ts = jiffies;
3830 	}
3831 
3832 	if (current->last_wakee != p) {
3833 		current->last_wakee = p;
3834 		current->wakee_flips++;
3835 	}
3836 }
3837 
3838 static void task_waking_fair(struct task_struct *p)
3839 {
3840 	struct sched_entity *se = &p->se;
3841 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3842 	u64 min_vruntime;
3843 
3844 #ifndef CONFIG_64BIT
3845 	u64 min_vruntime_copy;
3846 
3847 	do {
3848 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
3849 		smp_rmb();
3850 		min_vruntime = cfs_rq->min_vruntime;
3851 	} while (min_vruntime != min_vruntime_copy);
3852 #else
3853 	min_vruntime = cfs_rq->min_vruntime;
3854 #endif
3855 
3856 	se->vruntime -= min_vruntime;
3857 	record_wakee(p);
3858 }
3859 
3860 #ifdef CONFIG_FAIR_GROUP_SCHED
3861 /*
3862  * effective_load() calculates the load change as seen from the root_task_group
3863  *
3864  * Adding load to a group doesn't make a group heavier, but can cause movement
3865  * of group shares between cpus. Assuming the shares were perfectly aligned one
3866  * can calculate the shift in shares.
3867  *
3868  * Calculate the effective load difference if @wl is added (subtracted) to @tg
3869  * on this @cpu and results in a total addition (subtraction) of @wg to the
3870  * total group weight.
3871  *
3872  * Given a runqueue weight distribution (rw_i) we can compute a shares
3873  * distribution (s_i) using:
3874  *
3875  *   s_i = rw_i / \Sum rw_j						(1)
3876  *
3877  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3878  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3879  * shares distribution (s_i):
3880  *
3881  *   rw_i = {   2,   4,   1,   0 }
3882  *   s_i  = { 2/7, 4/7, 1/7,   0 }
3883  *
3884  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3885  * task used to run on and the CPU the waker is running on), we need to
3886  * compute the effect of waking a task on either CPU and, in case of a sync
3887  * wakeup, compute the effect of the current task going to sleep.
3888  *
3889  * So for a change of @wl to the local @cpu with an overall group weight change
3890  * of @wg we can compute the new shares distribution (s'_i) using:
3891  *
3892  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
3893  *
3894  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3895  * differences in waking a task to CPU 0. The additional task changes the
3896  * weight and shares distributions like:
3897  *
3898  *   rw'_i = {   3,   4,   1,   0 }
3899  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3900  *
3901  * We can then compute the difference in effective weight by using:
3902  *
3903  *   dw_i = S * (s'_i - s_i)						(3)
3904  *
3905  * Where 'S' is the group weight as seen by its parent.
3906  *
3907  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3908  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3909  * 4/7) times the weight of the group.
3910  */
3911 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3912 {
3913 	struct sched_entity *se = tg->se[cpu];
3914 
3915 	if (!tg->parent)	/* the trivial, non-cgroup case */
3916 		return wl;
3917 
3918 	for_each_sched_entity(se) {
3919 		long w, W;
3920 
3921 		tg = se->my_q->tg;
3922 
3923 		/*
3924 		 * W = @wg + \Sum rw_j
3925 		 */
3926 		W = wg + calc_tg_weight(tg, se->my_q);
3927 
3928 		/*
3929 		 * w = rw_i + @wl
3930 		 */
3931 		w = se->my_q->load.weight + wl;
3932 
3933 		/*
3934 		 * wl = S * s'_i; see (2)
3935 		 */
3936 		if (W > 0 && w < W)
3937 			wl = (w * tg->shares) / W;
3938 		else
3939 			wl = tg->shares;
3940 
3941 		/*
3942 		 * Per the above, wl is the new se->load.weight value; since
3943 		 * those are clipped to [MIN_SHARES, ...) do so now. See
3944 		 * calc_cfs_shares().
3945 		 */
3946 		if (wl < MIN_SHARES)
3947 			wl = MIN_SHARES;
3948 
3949 		/*
3950 		 * wl = dw_i = S * (s'_i - s_i); see (3)
3951 		 */
3952 		wl -= se->load.weight;
3953 
3954 		/*
3955 		 * Recursively apply this logic to all parent groups to compute
3956 		 * the final effective load change on the root group. Since
3957 		 * only the @tg group gets extra weight, all parent groups can
3958 		 * only redistribute existing shares. @wl is the shift in shares
3959 		 * resulting from this level per the above.
3960 		 */
3961 		wg = 0;
3962 	}
3963 
3964 	return wl;
3965 }
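/*
 * Continuing the 4-CPU example from the comment above, and assuming a
 * hypothetical tg->shares of 1024 for the group: waking one extra task
 * onto CPU 0 gives dw_0 = 1024 * (3/8 - 2/7) = 1024 * 5/56 ~= +91
 * effective weight as seen from the root, while CPU 1 sees
 * dw_1 = 1024 * (4/8 - 4/7) = -1024 * 4/56 ~= -73.
 */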
3966 #else
3967 
3968 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3969 {
3970 	return wl;
3971 }
3972 
3973 #endif
3974 
3975 static int wake_wide(struct task_struct *p)
3976 {
3977 	int factor = this_cpu_read(sd_llc_size);
3978 
3979 	/*
3980 	 * wakee_flips measures the switching frequency: a high value can mean
3981 	 * many different wakees or very rapid switching. Using the LLC size as
3982 	 * the factor automatically scales this, so a bigger node leads to more pull.
3983 	 */
3984 	if (p->wakee_flips > factor) {
3985 		/*
3986 		 * The wakee is somewhat hot and needs a certain amount of
3987 		 * cpu resources; if the waker is far hotter still, prefer
3988 		 * to leave the wakee alone.
3989 		 */
3990 		if (current->wakee_flips > (factor * p->wakee_flips))
3991 			return 1;
3992 	}
3993 
3994 	return 0;
3995 }
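/*
 * Example with assumed values: on an LLC of 4 cpus, factor == 4.  A
 * waker that has flipped between wakees 50 times recently, waking a
 * task p with wakee_flips == 10, satisfies both 10 > 4 and
 * 50 > 4 * 10, so wake_wide() returns 1 and the affine fast path is
 * skipped.
 */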
3996 
3997 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3998 {
3999 	s64 this_load, load;
4000 	int idx, this_cpu, prev_cpu;
4001 	unsigned long tl_per_task;
4002 	struct task_group *tg;
4003 	unsigned long weight;
4004 	int balanced;
4005 
4006 	/*
4007 	 * If we wake multiple tasks be careful to not bounce
4008 	 * ourselves around too much.
4009 	 */
4010 	if (wake_wide(p))
4011 		return 0;
4012 
4013 	idx	  = sd->wake_idx;
4014 	this_cpu  = smp_processor_id();
4015 	prev_cpu  = task_cpu(p);
4016 	load	  = source_load(prev_cpu, idx);
4017 	this_load = target_load(this_cpu, idx);
4018 
4019 	/*
4020 	 * If sync wakeup then subtract the (maximum possible)
4021 	 * effect of the currently running task from the load
4022 	 * of the current CPU:
4023 	 */
4024 	if (sync) {
4025 		tg = task_group(current);
4026 		weight = current->se.load.weight;
4027 
4028 		this_load += effective_load(tg, this_cpu, -weight, -weight);
4029 		load += effective_load(tg, prev_cpu, 0, -weight);
4030 	}
4031 
4032 	tg = task_group(p);
4033 	weight = p->se.load.weight;
4034 
4035 	/*
4036 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4037 	 * due to the sync wakeup above having dropped this_load to 0, we'll
4038 	 * always see an imbalance, but there's really nothing we can do
4039 	 * about that, so we accept it.
4040 	 *
4041 	 * Otherwise check if either cpus are near enough in load to allow this
4042 	 * task to be woken on this_cpu.
4043 	 */
4044 	if (this_load > 0) {
4045 		s64 this_eff_load, prev_eff_load;
4046 
4047 		this_eff_load = 100;
4048 		this_eff_load *= power_of(prev_cpu);
4049 		this_eff_load *= this_load +
4050 			effective_load(tg, this_cpu, weight, weight);
4051 
4052 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4053 		prev_eff_load *= power_of(this_cpu);
4054 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4055 
4056 		balanced = this_eff_load <= prev_eff_load;
4057 	} else
4058 		balanced = true;
4059 
4060 	/*
4061 	 * If the currently running task will sleep within
4062 	 * a reasonable amount of time then attract this newly
4063 	 * woken task:
4064 	 */
4065 	if (sync && balanced)
4066 		return 1;
4067 
4068 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4069 	tl_per_task = cpu_avg_load_per_task(this_cpu);
4070 
4071 	if (balanced ||
4072 	    (this_load <= load &&
4073 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4074 		/*
4075 		 * This domain has SD_WAKE_AFFINE and
4076 		 * p is cache cold in this domain, and
4077 		 * there is no bad imbalance.
4078 		 */
4079 		schedstat_inc(sd, ttwu_move_affine);
4080 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
4081 
4082 		return 1;
4083 	}
4084 	return 0;
4085 }
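/*
 * Rough numeric sketch of the balance check above (assumed values):
 * with imbalance_pct == 125 the prev_cpu side is weighted by
 * 100 + (125 - 100) / 2 == 112 against 100 for this_cpu.  If both
 * cpus have equal power and the adjusted loads work out to 1000 for
 * this_cpu and 950 for prev_cpu, then 100 * 1000 <= 112 * 950, so
 * balanced is true: this_cpu may be roughly 12% busier than prev_cpu
 * and still attract the wakeup.
 */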
4086 
4087 /*
4088  * find_idlest_group finds and returns the least busy CPU group within the
4089  * domain.
4090  */
4091 static struct sched_group *
4092 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4093 		  int this_cpu, int sd_flag)
4094 {
4095 	struct sched_group *idlest = NULL, *group = sd->groups;
4096 	unsigned long min_load = ULONG_MAX, this_load = 0;
4097 	int load_idx = sd->forkexec_idx;
4098 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4099 
4100 	if (sd_flag & SD_BALANCE_WAKE)
4101 		load_idx = sd->wake_idx;
4102 
4103 	do {
4104 		unsigned long load, avg_load;
4105 		int local_group;
4106 		int i;
4107 
4108 		/* Skip over this group if it has no CPUs allowed */
4109 		if (!cpumask_intersects(sched_group_cpus(group),
4110 					tsk_cpus_allowed(p)))
4111 			continue;
4112 
4113 		local_group = cpumask_test_cpu(this_cpu,
4114 					       sched_group_cpus(group));
4115 
4116 		/* Tally up the load of all CPUs in the group */
4117 		avg_load = 0;
4118 
4119 		for_each_cpu(i, sched_group_cpus(group)) {
4120 			/* Bias balancing toward cpus of our domain */
4121 			if (local_group)
4122 				load = source_load(i, load_idx);
4123 			else
4124 				load = target_load(i, load_idx);
4125 
4126 			avg_load += load;
4127 		}
4128 
4129 		/* Adjust by relative CPU power of the group */
4130 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
4131 
4132 		if (local_group) {
4133 			this_load = avg_load;
4134 		} else if (avg_load < min_load) {
4135 			min_load = avg_load;
4136 			idlest = group;
4137 		}
4138 	} while (group = group->next, group != sd->groups);
4139 
4140 	if (!idlest || 100*this_load < imbalance*min_load)
4141 		return NULL;
4142 	return idlest;
4143 }
4144 
4145 /*
4146  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4147  */
4148 static int
4149 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4150 {
4151 	unsigned long load, min_load = ULONG_MAX;
4152 	int idlest = -1;
4153 	int i;
4154 
4155 	/* Traverse only the allowed CPUs */
4156 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4157 		load = weighted_cpuload(i);
4158 
4159 		if (load < min_load || (load == min_load && i == this_cpu)) {
4160 			min_load = load;
4161 			idlest = i;
4162 		}
4163 	}
4164 
4165 	return idlest;
4166 }
4167 
4168 /*
4169  * Try and locate an idle CPU in the sched_domain.
4170  */
4171 static int select_idle_sibling(struct task_struct *p, int target)
4172 {
4173 	struct sched_domain *sd;
4174 	struct sched_group *sg;
4175 	int i = task_cpu(p);
4176 
4177 	if (idle_cpu(target))
4178 		return target;
4179 
4180 	/*
4181 	 * If the previous cpu is cache affine and idle, don't be stupid.
4182 	 */
4183 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4184 		return i;
4185 
4186 	/*
4187 	 * Otherwise, iterate the domains and find an eligible idle cpu.
4188 	 */
4189 	sd = rcu_dereference(per_cpu(sd_llc, target));
4190 	for_each_lower_domain(sd) {
4191 		sg = sd->groups;
4192 		do {
4193 			if (!cpumask_intersects(sched_group_cpus(sg),
4194 						tsk_cpus_allowed(p)))
4195 				goto next;
4196 
4197 			for_each_cpu(i, sched_group_cpus(sg)) {
4198 				if (i == target || !idle_cpu(i))
4199 					goto next;
4200 			}
4201 
4202 			target = cpumask_first_and(sched_group_cpus(sg),
4203 					tsk_cpus_allowed(p));
4204 			goto done;
4205 next:
4206 			sg = sg->next;
4207 		} while (sg != sd->groups);
4208 	}
4209 done:
4210 	return target;
4211 }
4212 
4213 /*
4214  * select_task_rq_fair: balance the task p across the domains that have the
4215  * 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, SD_BALANCE_FORK
4216  * and SD_BALANCE_EXEC.
4217  *
4218  * Balance, i.e. select the least loaded group.
4219  *
4220  * Returns the target CPU number, or the same CPU if no balancing is needed.
4221  *
4222  * preempt must be disabled.
4223  */
4224 static int
4225 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4226 {
4227 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4228 	int cpu = smp_processor_id();
4229 	int new_cpu = cpu;
4230 	int want_affine = 0;
4231 	int sync = wake_flags & WF_SYNC;
4232 
4233 	if (p->nr_cpus_allowed == 1)
4234 		return prev_cpu;
4235 
4236 	if (sd_flag & SD_BALANCE_WAKE) {
4237 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4238 			want_affine = 1;
4239 		new_cpu = prev_cpu;
4240 	}
4241 
4242 	rcu_read_lock();
4243 	for_each_domain(cpu, tmp) {
4244 		if (!(tmp->flags & SD_LOAD_BALANCE))
4245 			continue;
4246 
4247 		/*
4248 		 * If both cpu and prev_cpu are part of this domain,
4249 		 * cpu is a valid SD_WAKE_AFFINE target.
4250 		 */
4251 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4252 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4253 			affine_sd = tmp;
4254 			break;
4255 		}
4256 
4257 		if (tmp->flags & sd_flag)
4258 			sd = tmp;
4259 	}
4260 
4261 	if (affine_sd) {
4262 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4263 			prev_cpu = cpu;
4264 
4265 		new_cpu = select_idle_sibling(p, prev_cpu);
4266 		goto unlock;
4267 	}
4268 
4269 	while (sd) {
4270 		struct sched_group *group;
4271 		int weight;
4272 
4273 		if (!(sd->flags & sd_flag)) {
4274 			sd = sd->child;
4275 			continue;
4276 		}
4277 
4278 		group = find_idlest_group(sd, p, cpu, sd_flag);
4279 		if (!group) {
4280 			sd = sd->child;
4281 			continue;
4282 		}
4283 
4284 		new_cpu = find_idlest_cpu(group, p, cpu);
4285 		if (new_cpu == -1 || new_cpu == cpu) {
4286 			/* Now try balancing at a lower domain level of cpu */
4287 			sd = sd->child;
4288 			continue;
4289 		}
4290 
4291 		/* Now try balancing at a lower domain level of new_cpu */
4292 		cpu = new_cpu;
4293 		weight = sd->span_weight;
4294 		sd = NULL;
4295 		for_each_domain(cpu, tmp) {
4296 			if (weight <= tmp->span_weight)
4297 				break;
4298 			if (tmp->flags & sd_flag)
4299 				sd = tmp;
4300 		}
4301 		/* while loop will break here if sd == NULL */
4302 	}
4303 unlock:
4304 	rcu_read_unlock();
4305 
4306 	return new_cpu;
4307 }
4308 
4309 /*
4310  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4311  * cfs_rq_of(p) references at time of call are still valid and identify the
4312  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
4313  * other assumptions, including the state of rq->lock, should be made.
4314  */
4315 static void
4316 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4317 {
4318 	struct sched_entity *se = &p->se;
4319 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4320 
4321 	/*
4322 	 * Load tracking: accumulate removed load so that it can be processed
4323 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
4324 	 * to blocked load iff they have a positive decay-count.  It can never
4325 	 * be negative here since on-rq tasks have decay-count == 0.
4326 	 */
4327 	if (se->avg.decay_count) {
4328 		se->avg.decay_count = -__synchronize_entity_decay(se);
4329 		atomic_long_add(se->avg.load_avg_contrib,
4330 						&cfs_rq->removed_load);
4331 	}
4332 }
4333 #endif /* CONFIG_SMP */
4334 
4335 static unsigned long
4336 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4337 {
4338 	unsigned long gran = sysctl_sched_wakeup_granularity;
4339 
4340 	/*
4341 	 * Since it's curr that is running now, convert the gran from
4342 	 * real-time to virtual-time in its units.
4343 	 *
4344 	 * By using 'se' instead of 'curr' we penalize light tasks, so
4345 	 * they get preempted easier. That is, if 'se' < 'curr' then
4346 	 * the resulting gran will be larger, therefore penalizing the
4347 	 * lighter; if, on the other hand, 'se' > 'curr' then the resulting
4348 	 * gran will be smaller, again penalizing the lighter task.
4349 	 *
4350 	 * This is especially important for buddies when the leftmost
4351 	 * task is higher priority than the buddy.
4352 	 */
4353 	return calc_delta_fair(gran, se);
4354 }
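/*
 * Example with assumed weights: with a 1ms wakeup granularity, a
 * nice 0 'se' (load.weight == 1024) keeps gran at 1ms of virtual time,
 * while a nice 5 'se' (load.weight == 335) gets roughly
 * 1ms * 1024 / 335 ~= 3ms, so the lighter task must lead curr by a
 * larger vruntime gap before wakeup_preempt_entity() lets it preempt.
 */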
4355 
4356 /*
4357  * Should 'se' preempt 'curr'.
4358  *
4359  *             |s1
4360  *        |s2
4361  *   |s3
4362  *         g
4363  *      |<--->|c
4364  *
4365  *  w(c, s1) = -1
4366  *  w(c, s2) =  0
4367  *  w(c, s3) =  1
4368  *
4369  */
4370 static int
4371 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4372 {
4373 	s64 gran, vdiff = curr->vruntime - se->vruntime;
4374 
4375 	if (vdiff <= 0)
4376 		return -1;
4377 
4378 	gran = wakeup_gran(curr, se);
4379 	if (vdiff > gran)
4380 		return 1;
4381 
4382 	return 0;
4383 }
4384 
4385 static void set_last_buddy(struct sched_entity *se)
4386 {
4387 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4388 		return;
4389 
4390 	for_each_sched_entity(se)
4391 		cfs_rq_of(se)->last = se;
4392 }
4393 
4394 static void set_next_buddy(struct sched_entity *se)
4395 {
4396 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4397 		return;
4398 
4399 	for_each_sched_entity(se)
4400 		cfs_rq_of(se)->next = se;
4401 }
4402 
4403 static void set_skip_buddy(struct sched_entity *se)
4404 {
4405 	for_each_sched_entity(se)
4406 		cfs_rq_of(se)->skip = se;
4407 }
4408 
4409 /*
4410  * Preempt the current task with a newly woken task if needed:
4411  */
4412 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4413 {
4414 	struct task_struct *curr = rq->curr;
4415 	struct sched_entity *se = &curr->se, *pse = &p->se;
4416 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4417 	int scale = cfs_rq->nr_running >= sched_nr_latency;
4418 	int next_buddy_marked = 0;
4419 
4420 	if (unlikely(se == pse))
4421 		return;
4422 
4423 	/*
4424 	 * This is possible from callers such as move_task(), in which we
4425 	 * unconditionally check_preempt_curr() after an enqueue (which may have
4426 	 * lead to a throttle).  This both saves work and prevents false
4427 	 * next-buddy nomination below.
4428 	 */
4429 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4430 		return;
4431 
4432 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4433 		set_next_buddy(pse);
4434 		next_buddy_marked = 1;
4435 	}
4436 
4437 	/*
4438 	 * We can come here with TIF_NEED_RESCHED already set from new task
4439 	 * wake up path.
4440 	 *
4441 	 * Note: this also catches the edge-case of curr being in a throttled
4442 	 * group (e.g. via set_curr_task), since update_curr() (in the
4443 	 * enqueue of curr) will have resulted in resched being set.  This
4444 	 * prevents us from potentially nominating it as a false LAST_BUDDY
4445 	 * below.
4446 	 */
4447 	if (test_tsk_need_resched(curr))
4448 		return;
4449 
4450 	/* Idle tasks are by definition preempted by non-idle tasks. */
4451 	if (unlikely(curr->policy == SCHED_IDLE) &&
4452 	    likely(p->policy != SCHED_IDLE))
4453 		goto preempt;
4454 
4455 	/*
4456 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
4457 	 * is driven by the tick):
4458 	 */
4459 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4460 		return;
4461 
4462 	find_matching_se(&se, &pse);
4463 	update_curr(cfs_rq_of(se));
4464 	BUG_ON(!pse);
4465 	if (wakeup_preempt_entity(se, pse) == 1) {
4466 		/*
4467 		 * Bias pick_next to pick the sched entity that is
4468 		 * triggering this preemption.
4469 		 */
4470 		if (!next_buddy_marked)
4471 			set_next_buddy(pse);
4472 		goto preempt;
4473 	}
4474 
4475 	return;
4476 
4477 preempt:
4478 	resched_task(curr);
4479 	/*
4480 	 * Only set the backward buddy when the current task is still
4481 	 * on the rq. This can happen when a wakeup gets interleaved
4482 	 * with schedule on the ->pre_schedule() or idle_balance()
4483 	 * point, either of which can drop the rq lock.
4484 	 *
4485 	 * Also, during early boot the idle thread is in the fair class,
4486 	 * for obvious reasons it's a bad idea to schedule back to it.
4487 	 */
4488 	if (unlikely(!se->on_rq || curr == rq->idle))
4489 		return;
4490 
4491 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4492 		set_last_buddy(se);
4493 }
4494 
4495 static struct task_struct *pick_next_task_fair(struct rq *rq)
4496 {
4497 	struct task_struct *p;
4498 	struct cfs_rq *cfs_rq = &rq->cfs;
4499 	struct sched_entity *se;
4500 
4501 	if (!cfs_rq->nr_running)
4502 		return NULL;
4503 
4504 	do {
4505 		se = pick_next_entity(cfs_rq);
4506 		set_next_entity(cfs_rq, se);
4507 		cfs_rq = group_cfs_rq(se);
4508 	} while (cfs_rq);
4509 
4510 	p = task_of(se);
4511 	if (hrtick_enabled(rq))
4512 		hrtick_start_fair(rq, p);
4513 
4514 	return p;
4515 }
4516 
4517 /*
4518  * Account for a descheduled task:
4519  */
4520 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4521 {
4522 	struct sched_entity *se = &prev->se;
4523 	struct cfs_rq *cfs_rq;
4524 
4525 	for_each_sched_entity(se) {
4526 		cfs_rq = cfs_rq_of(se);
4527 		put_prev_entity(cfs_rq, se);
4528 	}
4529 }
4530 
4531 /*
4532  * sched_yield() is very simple
4533  *
4534  * The magic of dealing with the ->skip buddy is in pick_next_entity.
4535  */
4536 static void yield_task_fair(struct rq *rq)
4537 {
4538 	struct task_struct *curr = rq->curr;
4539 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4540 	struct sched_entity *se = &curr->se;
4541 
4542 	/*
4543 	 * Are we the only task in the tree?
4544 	 */
4545 	if (unlikely(rq->nr_running == 1))
4546 		return;
4547 
4548 	clear_buddies(cfs_rq, se);
4549 
4550 	if (curr->policy != SCHED_BATCH) {
4551 		update_rq_clock(rq);
4552 		/*
4553 		 * Update run-time statistics of the 'current'.
4554 		 */
4555 		update_curr(cfs_rq);
4556 		/*
4557 		 * Tell update_rq_clock() that we've just updated,
4558 		 * so we don't do microscopic update in schedule()
4559 		 * and double the fastpath cost.
4560 		 */
4561 		rq->skip_clock_update = 1;
4562 	}
4563 
4564 	set_skip_buddy(se);
4565 }
4566 
4567 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4568 {
4569 	struct sched_entity *se = &p->se;
4570 
4571 	/* throttled hierarchies are not runnable */
4572 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4573 		return false;
4574 
4575 	/* Tell the scheduler that we'd really like pse to run next. */
4576 	set_next_buddy(se);
4577 
4578 	yield_task_fair(rq);
4579 
4580 	return true;
4581 }
4582 
4583 #ifdef CONFIG_SMP
4584 /**************************************************
4585  * Fair scheduling class load-balancing methods.
4586  *
4587  * BASICS
4588  *
4589  * The purpose of load-balancing is to achieve the same basic fairness the
4590  * per-cpu scheduler provides, namely provide a proportional amount of compute
4591  * time to each task. This is expressed in the following equation:
4592  *
4593  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
4594  *
4595  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
4596  * W_i,0 is defined as:
4597  *
4598  *   W_i,0 = \Sum_j w_i,j                                             (2)
4599  *
4600  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
4601  * is derived from the nice value as per prio_to_weight[].
4602  *
4603  * The weight average is an exponential decay average of the instantaneous
4604  * weight:
4605  *
4606  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
4607  *
4608  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
4609  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4610  * can also include other factors [XXX].
4611  *
4612  * To achieve this balance we define a measure of imbalance which follows
4613  * directly from (1):
4614  *
4615  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
4616  *
4617  * We then move tasks around to minimize the imbalance. In the continuous
4618  * function space it is obvious this converges; in the discrete case we get
4619  * a few fun cases generally called infeasible weight scenarios.
4620  *
4621  * [XXX expand on:
4622  *     - infeasible weights;
4623  *     - local vs global optima in the discrete case. ]
4624  *
4625  *
4626  * SCHED DOMAINS
4627  *
4628  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
4629  * for all i,j solution, we create a tree of cpus that follows the hardware
4630  * topology where each level pairs two lower groups (or better). This results
4631  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
4632  * tree to only the first of the previous level and we decrease the frequency
4633  * of load-balance at each level inversely proportional to the number of cpus in
4634  * the groups.
4635  *
4636  * This yields:
4637  *
4638  *     log_2 n     1     n
4639  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
4640  *     i = 0      2^i   2^i
4641  *                               `- size of each group
4642  *         |         |     `- number of cpus doing load-balance
4643  *         |         `- freq
4644  *         `- sum over all levels
4645  *
4646  * Coupled with a limit on how many tasks we can migrate every balance pass,
4647  * this makes (5) the runtime complexity of the balancer.
4648  *
4649  * An important property here is that each CPU is still (indirectly) connected
4650  * to every other cpu in at most O(log n) steps:
4651  *
4652  * The adjacency matrix of the resulting graph is given by:
4653  *
4654  *             log_2 n
4655  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
4656  *             k = 0
4657  *
4658  * And you'll find that:
4659  *
4660  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
4661  *
4662  * Showing there's indeed a path between every cpu in at most O(log n) steps.
4663  * The task movement gives a factor of O(m), giving a convergence complexity
4664  * of:
4665  *
4666  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
4667  *
4668  *
4669  * WORK CONSERVING
4670  *
4671  * In order to avoid CPUs going idle while there's still work to do, new idle
4672  * balancing is more aggressive and has the newly idle cpu iterate up the domain
4673  * tree itself instead of relying on other CPUs to bring it work.
4674  *
4675  * This adds some complexity to both (5) and (8) but it reduces the total idle
4676  * time.
4677  *
4678  * [XXX more?]
4679  *
4680  *
4681  * CGROUPS
4682  *
4683  * Cgroups make a horror show out of (2), instead of a simple sum we get:
4684  *
4685  *                                s_k,i
4686  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
4687  *                                 S_k
4688  *
4689  * Where
4690  *
4691  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
4692  *
4693  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
4694  *
4695  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
4696  * property.
4697  *
4698  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
4699  *      rewrite all of this once again.]
4700  */
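/*
 * A small worked instance of (4) and (5), with illustrative numbers:
 * take two cpus of equal power with W_0/P_0 == 3 and W_1/P_1 == 1;
 * then avg(W/P) == 2 and imb_0,1 == max{2, 3} - min{2, 1} == 2, which
 * the balancer reduces by pulling weight from cpu 0 to cpu 1.  For
 * n == 8 cpus the per-level terms of (5) are 8 + 4 + 2 + 1 == 15,
 * i.e. roughly 2n and thus O(n).
 */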
4701 
4702 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4703 
4704 enum fbq_type { regular, remote, all };
4705 
4706 #define LBF_ALL_PINNED	0x01
4707 #define LBF_NEED_BREAK	0x02
4708 #define LBF_DST_PINNED  0x04
4709 #define LBF_SOME_PINNED	0x08
4710 
4711 struct lb_env {
4712 	struct sched_domain	*sd;
4713 
4714 	struct rq		*src_rq;
4715 	int			src_cpu;
4716 
4717 	int			dst_cpu;
4718 	struct rq		*dst_rq;
4719 
4720 	struct cpumask		*dst_grpmask;
4721 	int			new_dst_cpu;
4722 	enum cpu_idle_type	idle;
4723 	long			imbalance;
4724 	/* The set of CPUs under consideration for load-balancing */
4725 	struct cpumask		*cpus;
4726 
4727 	unsigned int		flags;
4728 
4729 	unsigned int		loop;
4730 	unsigned int		loop_break;
4731 	unsigned int		loop_max;
4732 
4733 	enum fbq_type		fbq_type;
4734 };
4735 
4736 /*
4737  * move_task - move a task from one runqueue to another runqueue.
4738  * Both runqueues must be locked.
4739  */
4740 static void move_task(struct task_struct *p, struct lb_env *env)
4741 {
4742 	deactivate_task(env->src_rq, p, 0);
4743 	set_task_cpu(p, env->dst_cpu);
4744 	activate_task(env->dst_rq, p, 0);
4745 	check_preempt_curr(env->dst_rq, p, 0);
4746 }
4747 
4748 /*
4749  * Is this task likely cache-hot:
4750  */
4751 static int
4752 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
4753 {
4754 	s64 delta;
4755 
4756 	if (p->sched_class != &fair_sched_class)
4757 		return 0;
4758 
4759 	if (unlikely(p->policy == SCHED_IDLE))
4760 		return 0;
4761 
4762 	/*
4763 	 * Buddy candidates are cache hot:
4764 	 */
4765 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4766 			(&p->se == cfs_rq_of(&p->se)->next ||
4767 			 &p->se == cfs_rq_of(&p->se)->last))
4768 		return 1;
4769 
4770 	if (sysctl_sched_migration_cost == -1)
4771 		return 1;
4772 	if (sysctl_sched_migration_cost == 0)
4773 		return 0;
4774 
4775 	delta = now - p->se.exec_start;
4776 
4777 	return delta < (s64)sysctl_sched_migration_cost;
4778 }
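/*
 * Example: with sysctl_sched_migration_cost at its usual 0.5ms, a fair
 * task whose exec_start lies within the last 500us of 'now' is treated
 * as cache hot; setting the sysctl to 0 makes every task look cold and
 * setting it to -1 makes every task look hot.
 */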
4779 
4780 #ifdef CONFIG_NUMA_BALANCING
4781 /* Returns true if the destination node has incurred more faults */
4782 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783 {
4784 	int src_nid, dst_nid;
4785 
4786 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4787 	    !(env->sd->flags & SD_NUMA)) {
4788 		return false;
4789 	}
4790 
4791 	src_nid = cpu_to_node(env->src_cpu);
4792 	dst_nid = cpu_to_node(env->dst_cpu);
4793 
4794 	if (src_nid == dst_nid)
4795 		return false;
4796 
4797 	/* Always encourage migration to the preferred node. */
4798 	if (dst_nid == p->numa_preferred_nid)
4799 		return true;
4800 
4801 	/* If both task and group weight improve, this move is a winner. */
4802 	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4803 	    group_weight(p, dst_nid) > group_weight(p, src_nid))
4804 		return true;
4805 
4806 	return false;
4807 }
4808 
4809 
4810 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4811 {
4812 	int src_nid, dst_nid;
4813 
4814 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 		return false;
4816 
4817 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4818 		return false;
4819 
4820 	src_nid = cpu_to_node(env->src_cpu);
4821 	dst_nid = cpu_to_node(env->dst_cpu);
4822 
4823 	if (src_nid == dst_nid)
4824 		return false;
4825 
4826 	/* Migrating away from the preferred node is always bad. */
4827 	if (src_nid == p->numa_preferred_nid)
4828 		return true;
4829 
4830 	/* If either task or group weight get worse, don't do it. */
4831 	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4832 	    group_weight(p, dst_nid) < group_weight(p, src_nid))
4833 		return true;
4834 
4835 	return false;
4836 }
4837 
4838 #else
4839 static inline bool migrate_improves_locality(struct task_struct *p,
4840 					     struct lb_env *env)
4841 {
4842 	return false;
4843 }
4844 
4845 static inline bool migrate_degrades_locality(struct task_struct *p,
4846 					     struct lb_env *env)
4847 {
4848 	return false;
4849 }
4850 #endif
4851 
4852 /*
4853  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
4854  */
4855 static
4856 int can_migrate_task(struct task_struct *p, struct lb_env *env)
4857 {
4858 	int tsk_cache_hot = 0;
4859 	/*
4860 	 * We do not migrate tasks that:
4861 	 * 1) are in a throttled hierarchy (throttled_lb_pair), or
4862 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
4863 	 * 3) are running (obviously), or
4864 	 * 4) are cache-hot on their current CPU.
4865 	 */
4866 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4867 		return 0;
4868 
4869 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
4870 		int cpu;
4871 
4872 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
4873 
4874 		env->flags |= LBF_SOME_PINNED;
4875 
4876 		/*
4877 		 * Remember if this task can be migrated to any other cpu in
4878 		 * our sched_group. We may want to revisit it if we couldn't
4879 		 * meet load balance goals by pulling other tasks on src_cpu.
4880 		 *
4881 		 * Also avoid computing new_dst_cpu if we have already computed
4882 		 * one in current iteration.
4883 		 */
4884 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4885 			return 0;
4886 
4887 		/* Avoid re-selecting dst_cpu via env's cpus */
4888 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4889 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4890 				env->flags |= LBF_DST_PINNED;
4891 				env->new_dst_cpu = cpu;
4892 				break;
4893 			}
4894 		}
4895 
4896 		return 0;
4897 	}
4898 
4899 	/* Record that we found at least one task that could run on dst_cpu */
4900 	env->flags &= ~LBF_ALL_PINNED;
4901 
4902 	if (task_running(env->src_rq, p)) {
4903 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
4904 		return 0;
4905 	}
4906 
4907 	/*
4908 	 * Aggressive migration if:
4909 	 * 1) the destination numa node is preferred, or
4910 	 * 2) task is cache cold, or
4911 	 * 3) too many balance attempts have failed.
4912 	 */
4913 	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4914 	if (!tsk_cache_hot)
4915 		tsk_cache_hot = migrate_degrades_locality(p, env);
4916 
4917 	if (migrate_improves_locality(p, env)) {
4918 #ifdef CONFIG_SCHEDSTATS
4919 		if (tsk_cache_hot) {
4920 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4921 			schedstat_inc(p, se.statistics.nr_forced_migrations);
4922 		}
4923 #endif
4924 		return 1;
4925 	}
4926 
4927 	if (!tsk_cache_hot ||
4928 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4929 
4930 		if (tsk_cache_hot) {
4931 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4932 			schedstat_inc(p, se.statistics.nr_forced_migrations);
4933 		}
4934 
4935 		return 1;
4936 	}
4937 
4938 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4939 	return 0;
4940 }
4941 
4942 /*
4943  * move_one_task tries to move exactly one task from busiest to this_rq, as
4944  * part of active balancing operations within "domain".
4945  * Returns 1 if successful and 0 otherwise.
4946  *
4947  * Called with both runqueues locked.
4948  */
4949 static int move_one_task(struct lb_env *env)
4950 {
4951 	struct task_struct *p, *n;
4952 
4953 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4954 		if (!can_migrate_task(p, env))
4955 			continue;
4956 
4957 		move_task(p, env);
4958 		/*
4959 		 * Right now, this is only the second place move_task()
4960 		 * is called, so we can safely collect move_task()
4961 		 * stats here rather than inside move_task().
4962 		 */
4963 		schedstat_inc(env->sd, lb_gained[env->idle]);
4964 		return 1;
4965 	}
4966 	return 0;
4967 }
4968 
4969 static const unsigned int sched_nr_migrate_break = 32;
4970 
4971 /*
4972  * move_tasks tries to move up to imbalance weighted load from busiest to
4973  * this_rq, as part of a balancing operation within domain "sd".
4974  * Returns 1 if successful and 0 otherwise.
4975  *
4976  * Called with both runqueues locked.
4977  */
4978 static int move_tasks(struct lb_env *env)
4979 {
4980 	struct list_head *tasks = &env->src_rq->cfs_tasks;
4981 	struct task_struct *p;
4982 	unsigned long load;
4983 	int pulled = 0;
4984 
4985 	if (env->imbalance <= 0)
4986 		return 0;
4987 
4988 	while (!list_empty(tasks)) {
4989 		p = list_first_entry(tasks, struct task_struct, se.group_node);
4990 
4991 		env->loop++;
4992 		/* We've more or less seen every task there is, call it quits */
4993 		if (env->loop > env->loop_max)
4994 			break;
4995 
4996 		/* take a breather every nr_migrate tasks */
4997 		if (env->loop > env->loop_break) {
4998 			env->loop_break += sched_nr_migrate_break;
4999 			env->flags |= LBF_NEED_BREAK;
5000 			break;
5001 		}
5002 
5003 		if (!can_migrate_task(p, env))
5004 			goto next;
5005 
5006 		load = task_h_load(p);
5007 
5008 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5009 			goto next;
5010 
5011 		if ((load / 2) > env->imbalance)
5012 			goto next;
5013 
5014 		move_task(p, env);
5015 		pulled++;
5016 		env->imbalance -= load;
5017 
5018 #ifdef CONFIG_PREEMPT
5019 		/*
5020 		 * NEWIDLE balancing is a source of latency, so preemptible
5021 		 * kernels will stop after the first task is pulled to minimize
5022 		 * the critical section.
5023 		 */
5024 		if (env->idle == CPU_NEWLY_IDLE)
5025 			break;
5026 #endif
5027 
5028 		/*
5029 		 * We only want to steal up to the prescribed amount of
5030 		 * weighted load.
5031 		 */
5032 		if (env->imbalance <= 0)
5033 			break;
5034 
5035 		continue;
5036 next:
5037 		list_move_tail(&p->se.group_node, tasks);
5038 	}
5039 
5040 	/*
5041 	 * Right now, this is one of only two places move_task() is called,
5042 	 * so we can safely collect move_task() stats here rather than
5043 	 * inside move_task().
5044 	 */
5045 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
5046 
5047 	return pulled;
5048 }
5049 
5050 #ifdef CONFIG_FAIR_GROUP_SCHED
5051 /*
5052  * update tg->load_weight by folding this cpu's load_avg
5053  */
5054 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5055 {
5056 	struct sched_entity *se = tg->se[cpu];
5057 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5058 
5059 	/* throttled entities do not contribute to load */
5060 	if (throttled_hierarchy(cfs_rq))
5061 		return;
5062 
5063 	update_cfs_rq_blocked_load(cfs_rq, 1);
5064 
5065 	if (se) {
5066 		update_entity_load_avg(se, 1);
5067 		/*
5068 		 * We pivot on our runnable average having decayed to zero for
5069 		 * list removal.  This generally implies that all our children
5070 		 * have also been removed (modulo rounding error or bandwidth
5071 		 * control); however, such cases are rare and we can fix these
5072 		 * at enqueue.
5073 		 *
5074 		 * TODO: fix up out-of-order children on enqueue.
5075 		 */
5076 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5077 			list_del_leaf_cfs_rq(cfs_rq);
5078 	} else {
5079 		struct rq *rq = rq_of(cfs_rq);
5080 		update_rq_runnable_avg(rq, rq->nr_running);
5081 	}
5082 }
5083 
5084 static void update_blocked_averages(int cpu)
5085 {
5086 	struct rq *rq = cpu_rq(cpu);
5087 	struct cfs_rq *cfs_rq;
5088 	unsigned long flags;
5089 
5090 	raw_spin_lock_irqsave(&rq->lock, flags);
5091 	update_rq_clock(rq);
5092 	/*
5093 	 * Iterates the task_group tree in a bottom up fashion, see
5094 	 * list_add_leaf_cfs_rq() for details.
5095 	 */
5096 	for_each_leaf_cfs_rq(rq, cfs_rq) {
5097 		/*
5098 		 * Note: We may want to consider periodically releasing
5099 		 * rq->lock around these updates so that creating many task
5100 		 * groups does not result in continually extending hold time.
5101 		 */
5102 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5103 	}
5104 
5105 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5106 }
5107 
5108 /*
5109  * Compute the hierarchical load factor for cfs_rq and all its ancestors.
5110  * This needs to be done in a top-down fashion because the load of a child
5111  * group is a fraction of its parent's load.
5112  */
5113 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5114 {
5115 	struct rq *rq = rq_of(cfs_rq);
5116 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5117 	unsigned long now = jiffies;
5118 	unsigned long load;
5119 
5120 	if (cfs_rq->last_h_load_update == now)
5121 		return;
5122 
5123 	cfs_rq->h_load_next = NULL;
5124 	for_each_sched_entity(se) {
5125 		cfs_rq = cfs_rq_of(se);
5126 		cfs_rq->h_load_next = se;
5127 		if (cfs_rq->last_h_load_update == now)
5128 			break;
5129 	}
5130 
5131 	if (!se) {
5132 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
5133 		cfs_rq->last_h_load_update = now;
5134 	}
5135 
5136 	while ((se = cfs_rq->h_load_next) != NULL) {
5137 		load = cfs_rq->h_load;
5138 		load = div64_ul(load * se->avg.load_avg_contrib,
5139 				cfs_rq->runnable_load_avg + 1);
5140 		cfs_rq = group_cfs_rq(se);
5141 		cfs_rq->h_load = load;
5142 		cfs_rq->last_h_load_update = now;
5143 	}
5144 }
5145 
5146 static unsigned long task_h_load(struct task_struct *p)
5147 {
5148 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
5149 
5150 	update_cfs_rq_h_load(cfs_rq);
5151 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
5152 			cfs_rq->runnable_load_avg + 1);
5153 }
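/*
 * Worked example with made-up averages: the root cfs_rq has
 * runnable_load_avg == 2048 and a group entity contributing 1024, so
 * the group's cfs_rq gets h_load ~= 2048 * 1024 / 2049 ~= 1023.  A
 * task contributing 512 on that group cfs_rq (whose own
 * runnable_load_avg is 1024) then reports
 * task_h_load() ~= 512 * 1023 / 1025 ~= 511, i.e. roughly a quarter of
 * the root's load, as expected.
 */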
5154 #else
5155 static inline void update_blocked_averages(int cpu)
5156 {
5157 }
5158 
5159 static unsigned long task_h_load(struct task_struct *p)
5160 {
5161 	return p->se.avg.load_avg_contrib;
5162 }
5163 #endif
5164 
5165 /********** Helpers for find_busiest_group ************************/
5166 /*
5167  * sg_lb_stats - stats of a sched_group required for load_balancing
5168  */
5169 struct sg_lb_stats {
5170 	unsigned long avg_load; /* Avg load across the CPUs of the group */
5171 	unsigned long group_load; /* Total load over the CPUs of the group */
5172 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5173 	unsigned long load_per_task;
5174 	unsigned long group_power;
5175 	unsigned int sum_nr_running; /* Nr tasks running in the group */
5176 	unsigned int group_capacity;
5177 	unsigned int idle_cpus;
5178 	unsigned int group_weight;
5179 	int group_imb; /* Is there an imbalance in the group ? */
5180 	int group_has_capacity; /* Is there extra capacity in the group? */
5181 #ifdef CONFIG_NUMA_BALANCING
5182 	unsigned int nr_numa_running;
5183 	unsigned int nr_preferred_running;
5184 #endif
5185 };
5186 
5187 /*
5188  * sd_lb_stats - Structure to store the statistics of a sched_domain
5189  *		 during load balancing.
5190  */
5191 struct sd_lb_stats {
5192 	struct sched_group *busiest;	/* Busiest group in this sd */
5193 	struct sched_group *local;	/* Local group in this sd */
5194 	unsigned long total_load;	/* Total load of all groups in sd */
5195 	unsigned long total_pwr;	/* Total power of all groups in sd */
5196 	unsigned long avg_load;	/* Average load across all groups in sd */
5197 
5198 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5199 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
5200 };
5201 
5202 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5203 {
5204 	/*
5205 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
5206 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
5207 	 * We must however clear busiest_stat::avg_load because
5208 	 * update_sd_pick_busiest() reads this before assignment.
5209 	 */
5210 	*sds = (struct sd_lb_stats){
5211 		.busiest = NULL,
5212 		.local = NULL,
5213 		.total_load = 0UL,
5214 		.total_pwr = 0UL,
5215 		.busiest_stat = {
5216 			.avg_load = 0UL,
5217 		},
5218 	};
5219 }
5220 
5221 /**
5222  * get_sd_load_idx - Obtain the load index for a given sched domain.
5223  * @sd: The sched_domain whose load_idx is to be obtained.
5224  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
5225  *
5226  * Return: The load index.
5227  */
5228 static inline int get_sd_load_idx(struct sched_domain *sd,
5229 					enum cpu_idle_type idle)
5230 {
5231 	int load_idx;
5232 
5233 	switch (idle) {
5234 	case CPU_NOT_IDLE:
5235 		load_idx = sd->busy_idx;
5236 		break;
5237 
5238 	case CPU_NEWLY_IDLE:
5239 		load_idx = sd->newidle_idx;
5240 		break;
5241 	default:
5242 		load_idx = sd->idle_idx;
5243 		break;
5244 	}
5245 
5246 	return load_idx;
5247 }
5248 
5249 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
5250 {
5251 	return SCHED_POWER_SCALE;
5252 }
5253 
5254 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
5255 {
5256 	return default_scale_freq_power(sd, cpu);
5257 }
5258 
5259 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5260 {
5261 	unsigned long weight = sd->span_weight;
5262 	unsigned long smt_gain = sd->smt_gain;
5263 
5264 	smt_gain /= weight;
5265 
5266 	return smt_gain;
5267 }
5268 
5269 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
5270 {
5271 	return default_scale_smt_power(sd, cpu);
5272 }
5273 
5274 static unsigned long scale_rt_power(int cpu)
5275 {
5276 	struct rq *rq = cpu_rq(cpu);
5277 	u64 total, available, age_stamp, avg;
5278 
5279 	/*
5280 	 * Since we're reading these variables without serialization, make sure
5281 	 * we read them once before doing sanity checks on them.
5282 	 */
5283 	age_stamp = ACCESS_ONCE(rq->age_stamp);
5284 	avg = ACCESS_ONCE(rq->rt_avg);
5285 
5286 	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
5287 
5288 	if (unlikely(total < avg)) {
5289 		/* Ensures that power won't end up being negative */
5290 		available = 0;
5291 	} else {
5292 		available = total - avg;
5293 	}
5294 
5295 	if (unlikely((s64)total < SCHED_POWER_SCALE))
5296 		total = SCHED_POWER_SCALE;
5297 
5298 	total >>= SCHED_POWER_SHIFT;
5299 
5300 	return div_u64(available, total);
5301 }
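/*
 * Illustration with assumed averages: if total works out to 1s worth
 * of time and rt/irq activity accounts for avg == 250ms of it,
 * available == 750ms and the result is about
 * 0.75 * SCHED_POWER_SCALE ~= 768, i.e. roughly 75% of nominal power
 * is left for CFS tasks.
 */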
5302 
5303 static void update_cpu_power(struct sched_domain *sd, int cpu)
5304 {
5305 	unsigned long weight = sd->span_weight;
5306 	unsigned long power = SCHED_POWER_SCALE;
5307 	struct sched_group *sdg = sd->groups;
5308 
5309 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
5310 		if (sched_feat(ARCH_POWER))
5311 			power *= arch_scale_smt_power(sd, cpu);
5312 		else
5313 			power *= default_scale_smt_power(sd, cpu);
5314 
5315 		power >>= SCHED_POWER_SHIFT;
5316 	}
5317 
5318 	sdg->sgp->power_orig = power;
5319 
5320 	if (sched_feat(ARCH_POWER))
5321 		power *= arch_scale_freq_power(sd, cpu);
5322 	else
5323 		power *= default_scale_freq_power(sd, cpu);
5324 
5325 	power >>= SCHED_POWER_SHIFT;
5326 
5327 	power *= scale_rt_power(cpu);
5328 	power >>= SCHED_POWER_SHIFT;
5329 
5330 	if (!power)
5331 		power = 1;
5332 
5333 	cpu_rq(cpu)->cpu_power = power;
5334 	sdg->sgp->power = power;
5335 }
5336 
5337 void update_group_power(struct sched_domain *sd, int cpu)
5338 {
5339 	struct sched_domain *child = sd->child;
5340 	struct sched_group *group, *sdg = sd->groups;
5341 	unsigned long power, power_orig;
5342 	unsigned long interval;
5343 
5344 	interval = msecs_to_jiffies(sd->balance_interval);
5345 	interval = clamp(interval, 1UL, max_load_balance_interval);
5346 	sdg->sgp->next_update = jiffies + interval;
5347 
5348 	if (!child) {
5349 		update_cpu_power(sd, cpu);
5350 		return;
5351 	}
5352 
5353 	power_orig = power = 0;
5354 
5355 	if (child->flags & SD_OVERLAP) {
5356 		/*
5357 		 * SD_OVERLAP domains cannot assume that child groups
5358 		 * span the current group.
5359 		 */
5360 
5361 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
5362 			struct sched_group_power *sgp;
5363 			struct rq *rq = cpu_rq(cpu);
5364 
5365 			/*
5366 			 * build_sched_domains() -> init_sched_groups_power()
5367 			 * gets here before we've attached the domains to the
5368 			 * runqueues.
5369 			 *
5370 			 * Use power_of(), which is set irrespective of domains
5371 			 * in update_cpu_power().
5372 			 *
5373 			 * This avoids power/power_orig from being 0 and
5374 			 * causing divide-by-zero issues on boot.
5375 			 *
5376 			 * Runtime updates will correct power_orig.
5377 			 */
5378 			if (unlikely(!rq->sd)) {
5379 				power_orig += power_of(cpu);
5380 				power += power_of(cpu);
5381 				continue;
5382 			}
5383 
5384 			sgp = rq->sd->groups->sgp;
5385 			power_orig += sgp->power_orig;
5386 			power += sgp->power;
5387 		}
5388 	} else  {
5389 		/*
5390 		 * !SD_OVERLAP domains can assume that child groups
5391 		 * span the current group.
5392 		 */
5393 
5394 		group = child->groups;
5395 		do {
5396 			power_orig += group->sgp->power_orig;
5397 			power += group->sgp->power;
5398 			group = group->next;
5399 		} while (group != child->groups);
5400 	}
5401 
5402 	sdg->sgp->power_orig = power_orig;
5403 	sdg->sgp->power = power;
5404 }
5405 
5406 /*
5407  * Try and fix up capacity for tiny siblings, this is needed when
5408  * things like SD_ASYM_PACKING need f_b_g to select another sibling
5409  * which on its own isn't powerful enough.
5410  *
5411  * See update_sd_pick_busiest() and check_asym_packing().
5412  */
5413 static inline int
5414 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5415 {
5416 	/*
5417 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
5418 	 */
5419 	if (!(sd->flags & SD_SHARE_CPUPOWER))
5420 		return 0;
5421 
5422 	/*
5423 	 * If ~90% of the cpu_power is still there, we're good.
5424 	 */
5425 	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
5426 		return 1;
5427 
5428 	return 0;
5429 }
5430 
5431 /*
5432  * Group imbalance indicates (and tries to solve) the problem where balancing
5433  * groups is inadequate due to tsk_cpus_allowed() constraints.
5434  *
5435  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
5436  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5437  * Something like:
5438  *
5439  * 	{ 0 1 2 3 } { 4 5 6 7 }
5440  * 	        *     * * *
5441  *
5442  * If we were to balance group-wise we'd place two tasks in the first group and
5443  * two tasks in the second group. Clearly this is undesired as it will overload
5444  * cpu 3 and leave one of the cpus in the second group unused.
5445  *
5446  * The current solution to this issue is detecting the skew in the first group
5447  * by noticing the lower domain failed to reach balance and had difficulty
5448  * moving tasks due to affinity constraints.
5449  *
5450  * When this is detected, the group becomes a candidate for busiest; see
5451  * update_sd_pick_busiest(). calculate_imbalance() and
5452  * find_busiest_group() avoid some of the usual balance conditions to allow it
5453  * to create an effective group imbalance.
5454  *
5455  * This is a somewhat tricky proposition since the next run might not find the
5456  * group imbalance and decide the groups need to be balanced again. A most
5457  * subtle and fragile situation.
5458  */
5459 
5460 static inline int sg_imbalanced(struct sched_group *group)
5461 {
5462 	return group->sgp->imbalance;
5463 }
5464 
5465 /*
5466  * Compute the group capacity.
5467  *
5468  * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5469  * first dividing out the smt factor to compute the actual number of cores,
5470  * and then limiting the group's capacity to that.
5471  */
5472 static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
5473 {
5474 	unsigned int capacity, smt, cpus;
5475 	unsigned int power, power_orig;
5476 
5477 	power = group->sgp->power;
5478 	power_orig = group->sgp->power_orig;
5479 	cpus = group->group_weight;
5480 
5481 	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
5482 	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
5483 	capacity = cpus / smt; /* cores */
5484 
5485 	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5486 	if (!capacity)
5487 		capacity = fix_small_capacity(env->sd, group);
5488 
5489 	return capacity;
5490 }
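/*
 * Example with the usual SMT-2 numbers (illustrative): a group of two
 * sibling threads has power_orig == 1178 and group_weight == 2, so
 * smt == DIV_ROUND_UP(1024 * 2, 1178) == 2 and capacity == 1 core.
 * With power reduced to 1100 by rt pressure, DIV_ROUND_CLOSEST(1100,
 * 1024) == 1 still leaves capacity == 1; only power below 512 rounds
 * to 0 and falls back to fix_small_capacity().
 */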
5491 
5492 /**
5493  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5494  * @env: The load balancing environment.
5495  * @group: sched_group whose statistics are to be updated.
5496  * @load_idx: Load index of sched_domain of this_cpu for load calc.
5497  * @local_group: Does group contain this_cpu.
5498  * @sgs: variable to hold the statistics for this group.
5499  */
5500 static inline void update_sg_lb_stats(struct lb_env *env,
5501 			struct sched_group *group, int load_idx,
5502 			int local_group, struct sg_lb_stats *sgs)
5503 {
5504 	unsigned long load;
5505 	int i;
5506 
5507 	memset(sgs, 0, sizeof(*sgs));
5508 
5509 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5510 		struct rq *rq = cpu_rq(i);
5511 
5512 		/* Bias balancing toward cpus of our domain */
5513 		if (local_group)
5514 			load = target_load(i, load_idx);
5515 		else
5516 			load = source_load(i, load_idx);
5517 
5518 		sgs->group_load += load;
5519 		sgs->sum_nr_running += rq->nr_running;
5520 #ifdef CONFIG_NUMA_BALANCING
5521 		sgs->nr_numa_running += rq->nr_numa_running;
5522 		sgs->nr_preferred_running += rq->nr_preferred_running;
5523 #endif
5524 		sgs->sum_weighted_load += weighted_cpuload(i);
5525 		if (idle_cpu(i))
5526 			sgs->idle_cpus++;
5527 	}
5528 
5529 	/* Adjust by relative CPU power of the group */
5530 	sgs->group_power = group->sgp->power;
5531 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
5532 
5533 	if (sgs->sum_nr_running)
5534 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5535 
5536 	sgs->group_weight = group->group_weight;
5537 
5538 	sgs->group_imb = sg_imbalanced(group);
5539 	sgs->group_capacity = sg_capacity(env, group);
5540 
5541 	if (sgs->group_capacity > sgs->sum_nr_running)
5542 		sgs->group_has_capacity = 1;
5543 }
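
/*
 * Illustrative sketch (hypothetical numbers, assuming SCHED_POWER_SCALE ==
 * 1024): with group_load == 3072 and group_power == 2048 (two cpus at full
 * power), avg_load == 3072 * 1024 / 2048 == 1536, i.e. the group load
 * normalized to one unit of cpu power so groups of different sizes compare.
 */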
5544 
5545 /**
5546  * update_sd_pick_busiest - return 1 on busiest group
5547  * @env: The load balancing environment.
5548  * @sds: sched_domain statistics
5549  * @sg: sched_group candidate to be checked for being the busiest
5550  * @sgs: sched_group statistics
5551  *
5552  * Determine if @sg is a busier group than the previously selected
5553  * busiest group.
5554  *
5555  * Return: %true if @sg is a busier group than the previously selected
5556  * busiest group. %false otherwise.
5557  */
5558 static bool update_sd_pick_busiest(struct lb_env *env,
5559 				   struct sd_lb_stats *sds,
5560 				   struct sched_group *sg,
5561 				   struct sg_lb_stats *sgs)
5562 {
5563 	if (sgs->avg_load <= sds->busiest_stat.avg_load)
5564 		return false;
5565 
5566 	if (sgs->sum_nr_running > sgs->group_capacity)
5567 		return true;
5568 
5569 	if (sgs->group_imb)
5570 		return true;
5571 
5572 	/*
5573 	 * ASYM_PACKING needs to move all the work to the lowest
5574 	 * numbered CPUs in the group, therefore mark all groups
5575 	 * higher than ourselves as busy.
5576 	 */
5577 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5578 	    env->dst_cpu < group_first_cpu(sg)) {
5579 		if (!sds->busiest)
5580 			return true;
5581 
5582 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5583 			return true;
5584 	}
5585 
5586 	return false;
5587 }
5588 
5589 #ifdef CONFIG_NUMA_BALANCING
5590 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5591 {
5592 	if (sgs->sum_nr_running > sgs->nr_numa_running)
5593 		return regular;
5594 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
5595 		return remote;
5596 	return all;
5597 }
5598 
5599 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5600 {
5601 	if (rq->nr_running > rq->nr_numa_running)
5602 		return regular;
5603 	if (rq->nr_running > rq->nr_preferred_running)
5604 		return remote;
5605 	return all;
5606 }
5607 #else
5608 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5609 {
5610 	return all;
5611 }
5612 
5613 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5614 {
5615 	return regular;
5616 }
5617 #endif /* CONFIG_NUMA_BALANCING */
5618 
5619 /**
5620  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5621  * @env: The load balancing environment.
5622  * @sds: variable to hold the statistics for this sched_domain.
5623  */
5624 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
5625 {
5626 	struct sched_domain *child = env->sd->child;
5627 	struct sched_group *sg = env->sd->groups;
5628 	struct sg_lb_stats tmp_sgs;
5629 	int load_idx, prefer_sibling = 0;
5630 
5631 	if (child && child->flags & SD_PREFER_SIBLING)
5632 		prefer_sibling = 1;
5633 
5634 	load_idx = get_sd_load_idx(env->sd, env->idle);
5635 
5636 	do {
5637 		struct sg_lb_stats *sgs = &tmp_sgs;
5638 		int local_group;
5639 
5640 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
5641 		if (local_group) {
5642 			sds->local = sg;
5643 			sgs = &sds->local_stat;
5644 
5645 			if (env->idle != CPU_NEWLY_IDLE ||
5646 			    time_after_eq(jiffies, sg->sgp->next_update))
5647 				update_group_power(env->sd, env->dst_cpu);
5648 		}
5649 
5650 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
5651 
5652 		if (local_group)
5653 			goto next_group;
5654 
5655 		/*
5656 		 * In case the child domain prefers tasks go to siblings
5657 		 * first, lower the sg capacity to one so that we'll try
5658 		 * and move all the excess tasks away. We lower the capacity
5659 		 * of a group only if the local group has the capacity to fit
5660 		 * these excess tasks, i.e. nr_running < group_capacity. The
5661 		 * extra check prevents the case where you always pull from the
5662 		 * heaviest group when it is already under-utilized (possible
5663 		 * when a large-weight task outweighs the tasks on the system).
5664 		 */
5665 		if (prefer_sibling && sds->local &&
5666 		    sds->local_stat.group_has_capacity)
5667 			sgs->group_capacity = min(sgs->group_capacity, 1U);
5668 
5669 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5670 			sds->busiest = sg;
5671 			sds->busiest_stat = *sgs;
5672 		}
5673 
5674 next_group:
5675 		/* Now, start updating sd_lb_stats */
5676 		sds->total_load += sgs->group_load;
5677 		sds->total_pwr += sgs->group_power;
5678 
5679 		sg = sg->next;
5680 	} while (sg != env->sd->groups);
5681 
5682 	if (env->sd->flags & SD_NUMA)
5683 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
5684 }
5685 
5686 /**
5687  * check_asym_packing - Check to see if the group is packed into the
5688  *			sched domain.
5689  *
5690  * This is primarily intended to be used at the sibling level.  Some
5691  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
5692  * case of POWER7, it can move to lower SMT modes only when higher
5693  * threads are idle.  When in lower SMT modes, the threads will
5694  * perform better since they share fewer core resources.  Hence when we
5695  * have idle threads, we want them to be the higher ones.
5696  *
5697  * This packing function is run on idle threads.  It checks to see if
5698  * the busiest CPU in this domain (core in the P7 case) has a higher
5699  * CPU number than the packing function is being run on.  Here we are
5700  * assuming lower CPU number will be equivalent to lower a SMT thread
5701  * number.
5702  *
5703  * Return: 1 when packing is required and a task should be moved to
5704  * this CPU.  The amount of the imbalance is returned in env->imbalance.
5705  *
5706  * @env: The load balancing environment.
5707  * @sds: Statistics of the sched_domain which is to be packed
5708  */
5709 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
5710 {
5711 	int busiest_cpu;
5712 
5713 	if (!(env->sd->flags & SD_ASYM_PACKING))
5714 		return 0;
5715 
5716 	if (!sds->busiest)
5717 		return 0;
5718 
5719 	busiest_cpu = group_first_cpu(sds->busiest);
5720 	if (env->dst_cpu > busiest_cpu)
5721 		return 0;
5722 
5723 	env->imbalance = DIV_ROUND_CLOSEST(
5724 		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
5725 		SCHED_POWER_SCALE);
5726 
5727 	return 1;
5728 }
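
/*
 * Illustrative sketch (hypothetical numbers, assuming SCHED_POWER_SCALE ==
 * 1024): with busiest_stat.avg_load == 1536 and group_power == 2048 the
 * resulting env->imbalance is DIV_ROUND_CLOSEST(1536 * 2048, 1024) == 3072,
 * i.e. the busiest group's load rescaled back from power-normalized units.
 */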
5729 
5730 /**
5731  * fix_small_imbalance - Calculate the minor imbalance that exists
5732  *			amongst the groups of a sched_domain, during
5733  *			load balancing.
5734  * @env: The load balancing environment.
5735  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
5736  */
5737 static inline
5738 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5739 {
5740 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
5741 	unsigned int imbn = 2;
5742 	unsigned long scaled_busy_load_per_task;
5743 	struct sg_lb_stats *local, *busiest;
5744 
5745 	local = &sds->local_stat;
5746 	busiest = &sds->busiest_stat;
5747 
5748 	if (!local->sum_nr_running)
5749 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
5750 	else if (busiest->load_per_task > local->load_per_task)
5751 		imbn = 1;
5752 
5753 	scaled_busy_load_per_task =
5754 		(busiest->load_per_task * SCHED_POWER_SCALE) /
5755 		busiest->group_power;
5756 
5757 	if (busiest->avg_load + scaled_busy_load_per_task >=
5758 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
5759 		env->imbalance = busiest->load_per_task;
5760 		return;
5761 	}
5762 
5763 	/*
5764 	 * OK, we don't have enough imbalance to justify moving tasks,
5765 	 * however we may be able to increase total CPU power used by
5766 	 * moving them.
5767 	 */
5768 
5769 	pwr_now += busiest->group_power *
5770 			min(busiest->load_per_task, busiest->avg_load);
5771 	pwr_now += local->group_power *
5772 			min(local->load_per_task, local->avg_load);
5773 	pwr_now /= SCHED_POWER_SCALE;
5774 
5775 	/* Amount of load we'd subtract */
5776 	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
5777 		busiest->group_power;
5778 	if (busiest->avg_load > tmp) {
5779 		pwr_move += busiest->group_power *
5780 			    min(busiest->load_per_task,
5781 				busiest->avg_load - tmp);
5782 	}
5783 
5784 	/* Amount of load we'd add */
5785 	if (busiest->avg_load * busiest->group_power <
5786 	    busiest->load_per_task * SCHED_POWER_SCALE) {
5787 		tmp = (busiest->avg_load * busiest->group_power) /
5788 		      local->group_power;
5789 	} else {
5790 		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
5791 		      local->group_power;
5792 	}
5793 	pwr_move += local->group_power *
5794 		    min(local->load_per_task, local->avg_load + tmp);
5795 	pwr_move /= SCHED_POWER_SCALE;
5796 
5797 	/* Move if we gain throughput */
5798 	if (pwr_move > pwr_now)
5799 		env->imbalance = busiest->load_per_task;
5800 }
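
/*
 * Illustrative sketch (hypothetical numbers, assuming SCHED_POWER_SCALE ==
 * 1024): with busiest->load_per_task == 1024, busiest->group_power == 1024
 * (so scaled_busy_load_per_task == 1024), busiest->avg_load == 2048,
 * local->avg_load == 1024 and imbn == 2, the early test 2048 + 1024 >=
 * 1024 + 2 * 1024 holds, so env->imbalance is set to one task's worth of
 * load (1024) and we return without the power computation.
 */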
5801 
5802 /**
5803  * calculate_imbalance - Calculate the amount of imbalance present within the
5804  *			 groups of a given sched_domain during load balance.
5805  * @env: load balance environment
5806  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
5807  */
5808 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5809 {
5810 	unsigned long max_pull, load_above_capacity = ~0UL;
5811 	struct sg_lb_stats *local, *busiest;
5812 
5813 	local = &sds->local_stat;
5814 	busiest = &sds->busiest_stat;
5815 
5816 	if (busiest->group_imb) {
5817 		/*
5818 		 * In the group_imb case we cannot rely on group-wide averages
5819 		 * to ensure cpu-load equilibrium; look at wider averages instead. XXX
5820 		 */
5821 		busiest->load_per_task =
5822 			min(busiest->load_per_task, sds->avg_load);
5823 	}
5824 
5825 	/*
5826 	 * In the presence of smp nice balancing, certain scenarios can have
5827 	 * max load less than avg load (as we skip the groups at or below
5828 	 * their cpu_power while calculating max_load).
5829 	 */
5830 	if (busiest->avg_load <= sds->avg_load ||
5831 	    local->avg_load >= sds->avg_load) {
5832 		env->imbalance = 0;
5833 		return fix_small_imbalance(env, sds);
5834 	}
5835 
5836 	if (!busiest->group_imb) {
5837 		/*
5838 		 * Don't want to pull so many tasks that a group would go idle.
5839 		 * Except of course for the group_imb case, since then we might
5840 		 * have to drop below capacity to reach cpu-load equilibrium.
5841 		 */
5842 		load_above_capacity =
5843 			(busiest->sum_nr_running - busiest->group_capacity);
5844 
5845 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
5846 		load_above_capacity /= busiest->group_power;
5847 	}
5848 
5849 	/*
5850 	 * We're trying to get all the cpus to the average_load, so we don't
5851 	 * want to push ourselves above the average load, nor do we wish to
5852 	 * reduce the max loaded cpu below the average load. At the same time,
5853 	 * we also don't want to reduce the group load below the group capacity
5854 	 * (so that we can implement power-savings policies etc). Thus we look
5855 	 * for the minimum possible imbalance.
5856 	 */
5857 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
5858 
5859 	/* How much load to actually move to equalise the imbalance */
5860 	env->imbalance = min(
5861 		max_pull * busiest->group_power,
5862 		(sds->avg_load - local->avg_load) * local->group_power
5863 	) / SCHED_POWER_SCALE;
5864 
5865 	/*
5866 	 * If env->imbalance is less than the average load per runnable task,
5867 	 * there is no guarantee that any tasks will be moved, so let
5868 	 * fix_small_imbalance() consider bumping it to force at least one
5869 	 * task to be moved.
5870 	 */
5871 	if (env->imbalance < busiest->load_per_task)
5872 		return fix_small_imbalance(env, sds);
5873 }
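
/*
 * Illustrative sketch (hypothetical numbers, assuming SCHED_POWER_SCALE ==
 * 1024, both group powers at 1024 and load_above_capacity not binding):
 * with sds->avg_load == 1024, busiest->avg_load == 1536 and local->avg_load
 * == 512, max_pull == 512 and env->imbalance == min(512 * 1024, 512 * 1024)
 * / 1024 == 512: just enough to pull the busiest group down to, and push
 * the local group up towards, the domain average.
 */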
5874 
5875 /******* find_busiest_group() helpers end here *********************/
5876 
5877 /**
5878  * find_busiest_group - Returns the busiest group within the sched_domain
5879  * if there is an imbalance. If there isn't an imbalance, and
5880  * the user has opted for power-savings, it returns a group whose
5881  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
5882  * such a group exists.
5883  *
5884  * Also calculates the amount of weighted load which should be moved
5885  * to restore balance.
5886  *
5887  * @env: The load balancing environment.
5888  *
5889  * Return:	- The busiest group if imbalance exists.
5890  *		- If no imbalance and user has opted for power-savings balance,
5891  *		   return the least loaded group whose CPUs can be
5892  *		   put to idle by rebalancing its tasks onto our group.
5893  */
5894 static struct sched_group *find_busiest_group(struct lb_env *env)
5895 {
5896 	struct sg_lb_stats *local, *busiest;
5897 	struct sd_lb_stats sds;
5898 
5899 	init_sd_lb_stats(&sds);
5900 
5901 	/*
5902 	 * Compute the various statistics relevant for load balancing at
5903 	 * this level.
5904 	 */
5905 	update_sd_lb_stats(env, &sds);
5906 	local = &sds.local_stat;
5907 	busiest = &sds.busiest_stat;
5908 
5909 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
5910 	    check_asym_packing(env, &sds))
5911 		return sds.busiest;
5912 
5913 	/* There is no busy sibling group to pull tasks from */
5914 	if (!sds.busiest || busiest->sum_nr_running == 0)
5915 		goto out_balanced;
5916 
5917 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
5918 
5919 	/*
5920 	 * If the busiest group is imbalanced the below checks don't
5921 	 * work because they assume all things are equal, which typically
5922 	 * isn't true due to cpus_allowed constraints and the like.
5923 	 */
5924 	if (busiest->group_imb)
5925 		goto force_balance;
5926 
5927 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
5928 	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
5929 	    !busiest->group_has_capacity)
5930 		goto force_balance;
5931 
5932 	/*
5933 	 * If the local group is more busy than the selected busiest group
5934 	 * don't try and pull any tasks.
5935 	 */
5936 	if (local->avg_load >= busiest->avg_load)
5937 		goto out_balanced;
5938 
5939 	/*
5940 	 * Don't pull any tasks if this group is already above the domain
5941 	 * average load.
5942 	 */
5943 	if (local->avg_load >= sds.avg_load)
5944 		goto out_balanced;
5945 
5946 	if (env->idle == CPU_IDLE) {
5947 		/*
5948 		 * This cpu is idle. If the busiest group doesn't have
5949 		 * more tasks than the number of available cpus, and there
5950 		 * is no imbalance between this and the busiest group with
5951 		 * respect to idle cpus, it is balanced.
5952 		 */
5953 		if ((local->idle_cpus < busiest->idle_cpus) &&
5954 		    busiest->sum_nr_running <= busiest->group_weight)
5955 			goto out_balanced;
5956 	} else {
5957 		/*
5958 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
5959 		 * imbalance_pct to be conservative.
5960 		 */
5961 		if (100 * busiest->avg_load <=
5962 				env->sd->imbalance_pct * local->avg_load)
5963 			goto out_balanced;
5964 	}
5965 
5966 force_balance:
5967 	/* Looks like there is an imbalance. Compute it */
5968 	calculate_imbalance(env, &sds);
5969 	return sds.busiest;
5970 
5971 out_balanced:
5972 	env->imbalance = 0;
5973 	return NULL;
5974 }
5975 
5976 /*
5977  * find_busiest_queue - find the busiest runqueue among the cpus in group.
5978  */
5979 static struct rq *find_busiest_queue(struct lb_env *env,
5980 				     struct sched_group *group)
5981 {
5982 	struct rq *busiest = NULL, *rq;
5983 	unsigned long busiest_load = 0, busiest_power = 1;
5984 	int i;
5985 
5986 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5987 		unsigned long power, capacity, wl;
5988 		enum fbq_type rt;
5989 
5990 		rq = cpu_rq(i);
5991 		rt = fbq_classify_rq(rq);
5992 
5993 		/*
5994 		 * We classify groups/runqueues into three groups:
5995 		 *  - regular: there are !numa tasks
5996 		 *  - remote:  there are numa tasks that run on the 'wrong' node
5997 		 *  - all:     there is no distinction
5998 		 *
5999 		 * In order to avoid migrating ideally placed numa tasks,
6000 		 * ignore those when there are better options.
6001 		 *
6002 		 * If we ignore the actual busiest queue to migrate another
6003 		 * task, the next balance pass can still reduce the busiest
6004 		 * queue by moving tasks around inside the node.
6005 		 *
6006 		 * If we cannot move enough load due to this classification
6007 		 * the next pass will adjust the group classification and
6008 		 * allow migration of more tasks.
6009 		 *
6010 		 * Both cases only affect the total convergence complexity.
6011 		 */
6012 		if (rt > env->fbq_type)
6013 			continue;
6014 
6015 		power = power_of(i);
6016 		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
6017 		if (!capacity)
6018 			capacity = fix_small_capacity(env->sd, group);
6019 
6020 		wl = weighted_cpuload(i);
6021 
6022 		/*
6023 		 * When comparing with imbalance, use weighted_cpuload()
6024 		 * which is not scaled with the cpu power.
6025 		 */
6026 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
6027 			continue;
6028 
6029 		/*
6030 		 * For the load comparisons with the other cpu's, consider
6031 		 * the weighted_cpuload() scaled with the cpu power, so that
6032 		 * the load can be moved away from the cpu that is potentially
6033 		 * running at a lower capacity.
6034 		 *
6035 		 * Thus we're looking for max(wl_i / power_i); crosswise
6036 		 * multiplication to rid ourselves of the division works out
6037 		 * to: wl_i * power_j > wl_j * power_i, where j is our
6038 		 * previous maximum.
6039 		 */
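		/*
		 * Illustrative sketch (hypothetical numbers): a candidate with
		 * wl == 2048 on power == 1024 against a previous maximum of
		 * wl == 3072 on power == 2048 compares 2048 * 2048 against
		 * 3072 * 1024, i.e. 2.0 vs 1.5 load per unit of power, so the
		 * candidate becomes the new busiest.
		 */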
6040 		if (wl * busiest_power > busiest_load * power) {
6041 			busiest_load = wl;
6042 			busiest_power = power;
6043 			busiest = rq;
6044 		}
6045 	}
6046 
6047 	return busiest;
6048 }
6049 
6050 /*
6051  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
6052  * any value works so long as it is large enough.
6053  */
6054 #define MAX_PINNED_INTERVAL	512
6055 
6056 /* Working cpumask for load_balance and load_balance_newidle. */
6057 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6058 
6059 static int need_active_balance(struct lb_env *env)
6060 {
6061 	struct sched_domain *sd = env->sd;
6062 
6063 	if (env->idle == CPU_NEWLY_IDLE) {
6064 
6065 		/*
6066 		 * ASYM_PACKING needs to force migrate tasks from busy but
6067 		 * higher numbered CPUs in order to pack all tasks in the
6068 		 * lowest numbered CPUs.
6069 		 */
6070 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6071 			return 1;
6072 	}
6073 
6074 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6075 }
6076 
6077 static int active_load_balance_cpu_stop(void *data);
6078 
6079 static int should_we_balance(struct lb_env *env)
6080 {
6081 	struct sched_group *sg = env->sd->groups;
6082 	struct cpumask *sg_cpus, *sg_mask;
6083 	int cpu, balance_cpu = -1;
6084 
6085 	/*
6086 	 * In the newly idle case, we will allow all the cpus
6087 	 * to do the newly idle load balance.
6088 	 */
6089 	if (env->idle == CPU_NEWLY_IDLE)
6090 		return 1;
6091 
6092 	sg_cpus = sched_group_cpus(sg);
6093 	sg_mask = sched_group_mask(sg);
6094 	/* Try to find first idle cpu */
6095 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6096 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6097 			continue;
6098 
6099 		balance_cpu = cpu;
6100 		break;
6101 	}
6102 
6103 	if (balance_cpu == -1)
6104 		balance_cpu = group_balance_cpu(sg);
6105 
6106 	/*
6107 	 * First idle cpu or the first cpu (busiest) in this sched group
6108 	 * is eligible for doing load balancing at this and above domains.
6109 	 */
6110 	return balance_cpu == env->dst_cpu;
6111 }
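
/*
 * Illustrative sketch: for a group spanning cpus { 4 5 6 7 } with cpu 6 as
 * the first idle cpu, only the instance running with env->dst_cpu == 6
 * proceeds; the others bail out and leave balancing at this level (and
 * above) to cpu 6.
 */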
6112 
6113 /*
6114  * Check this_cpu to ensure it is balanced within domain. Attempt to move
6115  * tasks if there is an imbalance.
6116  */
6117 static int load_balance(int this_cpu, struct rq *this_rq,
6118 			struct sched_domain *sd, enum cpu_idle_type idle,
6119 			int *continue_balancing)
6120 {
6121 	int ld_moved, cur_ld_moved, active_balance = 0;
6122 	struct sched_domain *sd_parent = sd->parent;
6123 	struct sched_group *group;
6124 	struct rq *busiest;
6125 	unsigned long flags;
6126 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
6127 
6128 	struct lb_env env = {
6129 		.sd		= sd,
6130 		.dst_cpu	= this_cpu,
6131 		.dst_rq		= this_rq,
6132 		.dst_grpmask    = sched_group_cpus(sd->groups),
6133 		.idle		= idle,
6134 		.loop_break	= sched_nr_migrate_break,
6135 		.cpus		= cpus,
6136 		.fbq_type	= all,
6137 	};
6138 
6139 	/*
6140 	 * For NEWLY_IDLE load_balancing, we don't need to consider
6141 	 * other cpus in our group
6142 	 */
6143 	if (idle == CPU_NEWLY_IDLE)
6144 		env.dst_grpmask = NULL;
6145 
6146 	cpumask_copy(cpus, cpu_active_mask);
6147 
6148 	schedstat_inc(sd, lb_count[idle]);
6149 
6150 redo:
6151 	if (!should_we_balance(&env)) {
6152 		*continue_balancing = 0;
6153 		goto out_balanced;
6154 	}
6155 
6156 	group = find_busiest_group(&env);
6157 	if (!group) {
6158 		schedstat_inc(sd, lb_nobusyg[idle]);
6159 		goto out_balanced;
6160 	}
6161 
6162 	busiest = find_busiest_queue(&env, group);
6163 	if (!busiest) {
6164 		schedstat_inc(sd, lb_nobusyq[idle]);
6165 		goto out_balanced;
6166 	}
6167 
6168 	BUG_ON(busiest == env.dst_rq);
6169 
6170 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6171 
6172 	ld_moved = 0;
6173 	if (busiest->nr_running > 1) {
6174 		/*
6175 		 * Attempt to move tasks. If find_busiest_group has found
6176 		 * an imbalance but busiest->nr_running <= 1, the group is
6177 		 * still unbalanced. ld_moved simply stays zero, so it is
6178 		 * correctly treated as an imbalance.
6179 		 */
6180 		env.flags |= LBF_ALL_PINNED;
6181 		env.src_cpu   = busiest->cpu;
6182 		env.src_rq    = busiest;
6183 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
6184 
6185 more_balance:
6186 		local_irq_save(flags);
6187 		double_rq_lock(env.dst_rq, busiest);
6188 
6189 		/*
6190 		 * cur_ld_moved - load moved in current iteration
6191 		 * ld_moved     - cumulative load moved across iterations
6192 		 */
6193 		cur_ld_moved = move_tasks(&env);
6194 		ld_moved += cur_ld_moved;
6195 		double_rq_unlock(env.dst_rq, busiest);
6196 		local_irq_restore(flags);
6197 
6198 		/*
6199 		 * some other cpu did the load balance for us.
6200 		 */
6201 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
6202 			resched_cpu(env.dst_cpu);
6203 
6204 		if (env.flags & LBF_NEED_BREAK) {
6205 			env.flags &= ~LBF_NEED_BREAK;
6206 			goto more_balance;
6207 		}
6208 
6209 		/*
6210 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
6211 		 * us and move them to an alternate dst_cpu in our sched_group
6212 		 * where they can run. The upper limit on how many times we
6213 		 * iterate on the same src_cpu depends on the number of cpus in our
6214 		 * sched_group.
6215 		 *
6216 		 * This changes load balance semantics a bit on who can move
6217 		 * load to a given_cpu. In addition to the given_cpu itself
6218 		 * (or an ilb_cpu acting on its behalf where given_cpu is
6219 		 * nohz-idle), we now have balance_cpu in a position to move
6220 		 * load to given_cpu. In rare situations, this may cause
6221 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
6222 		 * _independently_ and at the _same_ time to move some load to
6223 		 * given_cpu) causing excess load to be moved to given_cpu.
6224 		 * This however should not happen often in practice and
6225 		 * moreover subsequent load balance cycles should correct the
6226 		 * excess load moved.
6227 		 */
6228 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6229 
6230 			/* Prevent re-selecting dst_cpu via env's cpus */
6231 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
6232 
6233 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
6234 			env.dst_cpu	 = env.new_dst_cpu;
6235 			env.flags	&= ~LBF_DST_PINNED;
6236 			env.loop	 = 0;
6237 			env.loop_break	 = sched_nr_migrate_break;
6238 
6239 			/*
6240 			 * Go back to "more_balance" rather than "redo" since we
6241 			 * need to continue with same src_cpu.
6242 			 */
6243 			goto more_balance;
6244 		}
6245 
6246 		/*
6247 		 * We failed to reach balance because of affinity.
6248 		 */
6249 		if (sd_parent) {
6250 			int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6251 
6252 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6253 				*group_imbalance = 1;
6254 			} else if (*group_imbalance)
6255 				*group_imbalance = 0;
6256 		}
6257 
6258 		/* All tasks on this runqueue were pinned by CPU affinity */
6259 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
6260 			cpumask_clear_cpu(cpu_of(busiest), cpus);
6261 			if (!cpumask_empty(cpus)) {
6262 				env.loop = 0;
6263 				env.loop_break = sched_nr_migrate_break;
6264 				goto redo;
6265 			}
6266 			goto out_balanced;
6267 		}
6268 	}
6269 
6270 	if (!ld_moved) {
6271 		schedstat_inc(sd, lb_failed[idle]);
6272 		/*
6273 		 * Increment the failure counter only on periodic balance.
6274 		 * We do not want newidle balance, which can be very
6275 		 * frequent, to pollute the failure counter and cause
6276 		 * excessive cache_hot migrations and active balances.
6277 		 */
6278 		if (idle != CPU_NEWLY_IDLE)
6279 			sd->nr_balance_failed++;
6280 
6281 		if (need_active_balance(&env)) {
6282 			raw_spin_lock_irqsave(&busiest->lock, flags);
6283 
6284 			/* don't kick the active_load_balance_cpu_stop,
6285 			 * if the curr task on busiest cpu can't be
6286 			 * moved to this_cpu
6287 			 */
6288 			if (!cpumask_test_cpu(this_cpu,
6289 					tsk_cpus_allowed(busiest->curr))) {
6290 				raw_spin_unlock_irqrestore(&busiest->lock,
6291 							    flags);
6292 				env.flags |= LBF_ALL_PINNED;
6293 				goto out_one_pinned;
6294 			}
6295 
6296 			/*
6297 			 * ->active_balance synchronizes accesses to
6298 			 * ->active_balance_work.  Once set, it's cleared
6299 			 * only after active load balance is finished.
6300 			 */
6301 			if (!busiest->active_balance) {
6302 				busiest->active_balance = 1;
6303 				busiest->push_cpu = this_cpu;
6304 				active_balance = 1;
6305 			}
6306 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
6307 
6308 			if (active_balance) {
6309 				stop_one_cpu_nowait(cpu_of(busiest),
6310 					active_load_balance_cpu_stop, busiest,
6311 					&busiest->active_balance_work);
6312 			}
6313 
6314 			/*
6315 			 * We've kicked active balancing, reset the failure
6316 			 * counter.
6317 			 */
6318 			sd->nr_balance_failed = sd->cache_nice_tries+1;
6319 		}
6320 	} else
6321 		sd->nr_balance_failed = 0;
6322 
6323 	if (likely(!active_balance)) {
6324 		/* We were unbalanced, so reset the balancing interval */
6325 		sd->balance_interval = sd->min_interval;
6326 	} else {
6327 		/*
6328 		 * If we've begun active balancing, start to back off. This
6329 		 * case may not be covered by the all_pinned logic if there
6330 		 * is only 1 task on the busy runqueue (because we don't call
6331 		 * move_tasks).
6332 		 */
6333 		if (sd->balance_interval < sd->max_interval)
6334 			sd->balance_interval *= 2;
6335 	}
6336 
6337 	goto out;
6338 
6339 out_balanced:
6340 	schedstat_inc(sd, lb_balanced[idle]);
6341 
6342 	sd->nr_balance_failed = 0;
6343 
6344 out_one_pinned:
6345 	/* tune up the balancing interval */
6346 	if (((env.flags & LBF_ALL_PINNED) &&
6347 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
6348 			(sd->balance_interval < sd->max_interval))
6349 		sd->balance_interval *= 2;
6350 
6351 	ld_moved = 0;
6352 out:
6353 	return ld_moved;
6354 }
6355 
6356 /*
6357  * idle_balance is called by schedule() if this_cpu is about to become
6358  * idle. Attempts to pull tasks from other CPUs.
6359  */
6360 void idle_balance(int this_cpu, struct rq *this_rq)
6361 {
6362 	struct sched_domain *sd;
6363 	int pulled_task = 0;
6364 	unsigned long next_balance = jiffies + HZ;
6365 	u64 curr_cost = 0;
6366 
6367 	this_rq->idle_stamp = rq_clock(this_rq);
6368 
6369 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 		return;
6371 
6372 	/*
6373 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
6374 	 */
6375 	raw_spin_unlock(&this_rq->lock);
6376 
6377 	update_blocked_averages(this_cpu);
6378 	rcu_read_lock();
6379 	for_each_domain(this_cpu, sd) {
6380 		unsigned long interval;
6381 		int continue_balancing = 1;
6382 		u64 t0, domain_cost;
6383 
6384 		if (!(sd->flags & SD_LOAD_BALANCE))
6385 			continue;
6386 
6387 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6388 			break;
6389 
6390 		if (sd->flags & SD_BALANCE_NEWIDLE) {
6391 			t0 = sched_clock_cpu(this_cpu);
6392 
6393 			/* If we've pulled tasks over stop searching: */
6394 			pulled_task = load_balance(this_cpu, this_rq,
6395 						   sd, CPU_NEWLY_IDLE,
6396 						   &continue_balancing);
6397 
6398 			domain_cost = sched_clock_cpu(this_cpu) - t0;
6399 			if (domain_cost > sd->max_newidle_lb_cost)
6400 				sd->max_newidle_lb_cost = domain_cost;
6401 
6402 			curr_cost += domain_cost;
6403 		}
6404 
6405 		interval = msecs_to_jiffies(sd->balance_interval);
6406 		if (time_after(next_balance, sd->last_balance + interval))
6407 			next_balance = sd->last_balance + interval;
6408 		if (pulled_task) {
6409 			this_rq->idle_stamp = 0;
6410 			break;
6411 		}
6412 	}
6413 	rcu_read_unlock();
6414 
6415 	raw_spin_lock(&this_rq->lock);
6416 
6417 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 		/*
6419 		 * We are going idle. next_balance may be set based on
6420 		 * a busy processor. So reset next_balance.
6421 		 */
6422 		this_rq->next_balance = next_balance;
6423 	}
6424 
6425 	if (curr_cost > this_rq->max_idle_balance_cost)
6426 		this_rq->max_idle_balance_cost = curr_cost;
6427 }
6428 
6429 /*
6430  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
6431  * running tasks off the busiest CPU onto idle CPUs. It requires at
6432  * least 1 task to be running on each physical CPU where possible, and
6433  * avoids physical / logical imbalances.
6434  */
6435 static int active_load_balance_cpu_stop(void *data)
6436 {
6437 	struct rq *busiest_rq = data;
6438 	int busiest_cpu = cpu_of(busiest_rq);
6439 	int target_cpu = busiest_rq->push_cpu;
6440 	struct rq *target_rq = cpu_rq(target_cpu);
6441 	struct sched_domain *sd;
6442 
6443 	raw_spin_lock_irq(&busiest_rq->lock);
6444 
6445 	/* make sure the requested cpu hasn't gone down in the meantime */
6446 	if (unlikely(busiest_cpu != smp_processor_id() ||
6447 		     !busiest_rq->active_balance))
6448 		goto out_unlock;
6449 
6450 	/* Is there any task to move? */
6451 	if (busiest_rq->nr_running <= 1)
6452 		goto out_unlock;
6453 
6454 	/*
6455 	 * This condition is "impossible"; if it occurs
6456 	 * we need to fix it. Originally reported by
6457 	 * Bjorn Helgaas on a 128-cpu setup.
6458 	 */
6459 	BUG_ON(busiest_rq == target_rq);
6460 
6461 	/* move a task from busiest_rq to target_rq */
6462 	double_lock_balance(busiest_rq, target_rq);
6463 
6464 	/* Search for an sd spanning us and the target CPU. */
6465 	rcu_read_lock();
6466 	for_each_domain(target_cpu, sd) {
6467 		if ((sd->flags & SD_LOAD_BALANCE) &&
6468 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6469 				break;
6470 	}
6471 
6472 	if (likely(sd)) {
6473 		struct lb_env env = {
6474 			.sd		= sd,
6475 			.dst_cpu	= target_cpu,
6476 			.dst_rq		= target_rq,
6477 			.src_cpu	= busiest_rq->cpu,
6478 			.src_rq		= busiest_rq,
6479 			.idle		= CPU_IDLE,
6480 		};
6481 
6482 		schedstat_inc(sd, alb_count);
6483 
6484 		if (move_one_task(&env))
6485 			schedstat_inc(sd, alb_pushed);
6486 		else
6487 			schedstat_inc(sd, alb_failed);
6488 	}
6489 	rcu_read_unlock();
6490 	double_unlock_balance(busiest_rq, target_rq);
6491 out_unlock:
6492 	busiest_rq->active_balance = 0;
6493 	raw_spin_unlock_irq(&busiest_rq->lock);
6494 	return 0;
6495 }
6496 
6497 #ifdef CONFIG_NO_HZ_COMMON
6498 /*
6499  * idle load balancing details
6500  * - When one of the busy CPUs notices that idle rebalancing may be
6501  *   needed, it will kick the idle load balancer, which then does idle
6502  *   load balancing for all the idle CPUs.
6503  */
6504 static struct {
6505 	cpumask_var_t idle_cpus_mask;
6506 	atomic_t nr_cpus;
6507 	unsigned long next_balance;     /* in jiffy units */
6508 } nohz ____cacheline_aligned;
6509 
6510 static inline int find_new_ilb(void)
6511 {
6512 	int ilb = cpumask_first(nohz.idle_cpus_mask);
6513 
6514 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
6515 		return ilb;
6516 
6517 	return nr_cpu_ids;
6518 }
6519 
6520 /*
6521  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
6522  * first idle CPU in nohz.idle_cpus_mask (if there is one), as found by
6523  * find_new_ilb().
6524  */
6525 static void nohz_balancer_kick(void)
6526 {
6527 	int ilb_cpu;
6528 
6529 	nohz.next_balance++;
6530 
6531 	ilb_cpu = find_new_ilb();
6532 
6533 	if (ilb_cpu >= nr_cpu_ids)
6534 		return;
6535 
6536 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6537 		return;
6538 	/*
6539 	 * Use smp_send_reschedule() instead of resched_cpu().
6540 	 * This way we generate a sched IPI on the target cpu which
6541 	 * is idle. And the softirq performing nohz idle load balance
6542 	 * will be run before returning from the IPI.
6543 	 */
6544 	smp_send_reschedule(ilb_cpu);
6545 	return;
6546 }
6547 
6548 static inline void nohz_balance_exit_idle(int cpu)
6549 {
6550 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6551 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6552 		atomic_dec(&nohz.nr_cpus);
6553 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6554 	}
6555 }
6556 
6557 static inline void set_cpu_sd_state_busy(void)
6558 {
6559 	struct sched_domain *sd;
6560 	int cpu = smp_processor_id();
6561 
6562 	rcu_read_lock();
6563 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6564 
6565 	if (!sd || !sd->nohz_idle)
6566 		goto unlock;
6567 	sd->nohz_idle = 0;
6568 
6569 	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
6570 unlock:
6571 	rcu_read_unlock();
6572 }
6573 
6574 void set_cpu_sd_state_idle(void)
6575 {
6576 	struct sched_domain *sd;
6577 	int cpu = smp_processor_id();
6578 
6579 	rcu_read_lock();
6580 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6581 
6582 	if (!sd || sd->nohz_idle)
6583 		goto unlock;
6584 	sd->nohz_idle = 1;
6585 
6586 	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
6587 unlock:
6588 	rcu_read_unlock();
6589 }
6590 
6591 /*
6592  * This routine will record that the cpu is going idle with tick stopped.
6593  * This info will be used in performing idle load balancing in the future.
6594  */
6595 void nohz_balance_enter_idle(int cpu)
6596 {
6597 	/*
6598 	 * If this cpu is going down, then nothing needs to be done.
6599 	 */
6600 	if (!cpu_active(cpu))
6601 		return;
6602 
6603 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6604 		return;
6605 
6606 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6607 	atomic_inc(&nohz.nr_cpus);
6608 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6609 }
6610 
6611 static int sched_ilb_notifier(struct notifier_block *nfb,
6612 					unsigned long action, void *hcpu)
6613 {
6614 	switch (action & ~CPU_TASKS_FROZEN) {
6615 	case CPU_DYING:
6616 		nohz_balance_exit_idle(smp_processor_id());
6617 		return NOTIFY_OK;
6618 	default:
6619 		return NOTIFY_DONE;
6620 	}
6621 }
6622 #endif
6623 
6624 static DEFINE_SPINLOCK(balancing);
6625 
6626 /*
6627  * Scale the max load_balance interval with the number of CPUs in the system.
6628  * This trades load-balance latency on larger machines for less cross talk.
6629  */
6630 void update_max_interval(void)
6631 {
6632 	max_load_balance_interval = HZ*num_online_cpus()/10;
6633 }
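
/*
 * Illustrative sketch (hypothetical numbers): with HZ == 1000 and 8 online
 * cpus, max_load_balance_interval == 1000 * 8 / 10 == 800 jiffies, so the
 * per-domain interval clamped in rebalance_domains() never exceeds ~800ms.
 */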
6634 
6635 /*
6636  * It checks each scheduling domain to see if it is due to be balanced,
6637  * and initiates a balancing operation if so.
6638  *
6639  * Balancing parameters are set up in init_sched_domains.
6640  */
6641 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6642 {
6643 	int continue_balancing = 1;
6644 	int cpu = rq->cpu;
6645 	unsigned long interval;
6646 	struct sched_domain *sd;
6647 	/* Earliest time when we have to do rebalance again */
6648 	unsigned long next_balance = jiffies + 60*HZ;
6649 	int update_next_balance = 0;
6650 	int need_serialize, need_decay = 0;
6651 	u64 max_cost = 0;
6652 
6653 	update_blocked_averages(cpu);
6654 
6655 	rcu_read_lock();
6656 	for_each_domain(cpu, sd) {
6657 		/*
6658 		 * Decay the newidle max times here because this is a regular
6659 		 * visit to all the domains. Decay ~1% per second.
6660 		 */
6661 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6662 			sd->max_newidle_lb_cost =
6663 				(sd->max_newidle_lb_cost * 253) / 256;
6664 			sd->next_decay_max_lb_cost = jiffies + HZ;
6665 			need_decay = 1;
6666 		}
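		/*
		 * Illustrative sketch: 253/256 is ~0.988, so a
		 * max_newidle_lb_cost of e.g. 100000ns decays to ~98800ns
		 * after one second and halves in roughly a minute.
		 */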
6667 		max_cost += sd->max_newidle_lb_cost;
6668 
6669 		if (!(sd->flags & SD_LOAD_BALANCE))
6670 			continue;
6671 
6672 		/*
6673 		 * Stop the load balance at this level. There is another
6674 		 * CPU in our sched group which is doing load balancing more
6675 		 * actively.
6676 		 */
6677 		if (!continue_balancing) {
6678 			if (need_decay)
6679 				continue;
6680 			break;
6681 		}
6682 
6683 		interval = sd->balance_interval;
6684 		if (idle != CPU_IDLE)
6685 			interval *= sd->busy_factor;
6686 
6687 		/* scale ms to jiffies */
6688 		interval = msecs_to_jiffies(interval);
6689 		interval = clamp(interval, 1UL, max_load_balance_interval);
6690 
6691 		need_serialize = sd->flags & SD_SERIALIZE;
6692 
6693 		if (need_serialize) {
6694 			if (!spin_trylock(&balancing))
6695 				goto out;
6696 		}
6697 
6698 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
6699 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
6700 				/*
6701 				 * The LBF_DST_PINNED logic could have changed
6702 				 * env->dst_cpu, so we can't know our idle
6703 				 * state even if we migrated tasks. Update it.
6704 				 */
6705 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
6706 			}
6707 			sd->last_balance = jiffies;
6708 		}
6709 		if (need_serialize)
6710 			spin_unlock(&balancing);
6711 out:
6712 		if (time_after(next_balance, sd->last_balance + interval)) {
6713 			next_balance = sd->last_balance + interval;
6714 			update_next_balance = 1;
6715 		}
6716 	}
6717 	if (need_decay) {
6718 		/*
6719 		 * Ensure the rq-wide value also decays but keep it at a
6720 		 * reasonable floor to avoid funnies with rq->avg_idle.
6721 		 */
6722 		rq->max_idle_balance_cost =
6723 			max((u64)sysctl_sched_migration_cost, max_cost);
6724 	}
6725 	rcu_read_unlock();
6726 
6727 	/*
6728 	 * next_balance will be updated only when there is a need.
6729 	 * When the cpu is attached to null domain for ex, it will not be
6730 	 * updated.
6731 	 */
6732 	if (likely(update_next_balance))
6733 		rq->next_balance = next_balance;
6734 }
6735 
6736 #ifdef CONFIG_NO_HZ_COMMON
6737 /*
6738  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6739  * rebalancing for all the cpus for whom scheduler ticks are stopped.
6740  */
6741 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6742 {
6743 	int this_cpu = this_rq->cpu;
6744 	struct rq *rq;
6745 	int balance_cpu;
6746 
6747 	if (idle != CPU_IDLE ||
6748 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
6749 		goto end;
6750 
6751 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
6752 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
6753 			continue;
6754 
6755 		/*
6756 		 * If this cpu gets work to do, stop the load balancing
6757 		 * work being done for other cpus. Next load
6758 		 * balancing owner will pick it up.
6759 		 */
6760 		if (need_resched())
6761 			break;
6762 
6763 		rq = cpu_rq(balance_cpu);
6764 
6765 		raw_spin_lock_irq(&rq->lock);
6766 		update_rq_clock(rq);
6767 		update_idle_cpu_load(rq);
6768 		raw_spin_unlock_irq(&rq->lock);
6769 
6770 		rebalance_domains(rq, CPU_IDLE);
6771 
6772 		if (time_after(this_rq->next_balance, rq->next_balance))
6773 			this_rq->next_balance = rq->next_balance;
6774 	}
6775 	nohz.next_balance = this_rq->next_balance;
6776 end:
6777 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
6778 }
6779 
6780 /*
6781  * Current heuristic for kicking the idle load balancer in the presence
6782  * of an idle cpu in the system.
6783  *   - This rq has more than one task.
6784  *   - At any scheduler domain level, this cpu's scheduler group has multiple
6785  *     busy cpus exceeding the group's power.
6786  *   - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
6787  *     domain span are idle.
6788  */
6789 static inline int nohz_kick_needed(struct rq *rq)
6790 {
6791 	unsigned long now = jiffies;
6792 	struct sched_domain *sd;
6793 	struct sched_group_power *sgp;
6794 	int nr_busy, cpu = rq->cpu;
6795 
6796 	if (unlikely(rq->idle_balance))
6797 		return 0;
6798 
6799 	/*
6800 	 * We may recently have been in ticked or tickless idle mode. At the first
6801 	 * busy tick after returning from idle, we will update the busy stats.
6802 	 */
6803 	set_cpu_sd_state_busy();
6804 	nohz_balance_exit_idle(cpu);
6805 
6806 	/*
6807 	 * None are in tickless mode and hence no need for NOHZ idle load
6808 	 * balancing.
6809 	 */
6810 	if (likely(!atomic_read(&nohz.nr_cpus)))
6811 		return 0;
6812 
6813 	if (time_before(now, nohz.next_balance))
6814 		return 0;
6815 
6816 	if (rq->nr_running >= 2)
6817 		goto need_kick;
6818 
6819 	rcu_read_lock();
6820 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6821 
6822 	if (sd) {
6823 		sgp = sd->groups->sgp;
6824 		nr_busy = atomic_read(&sgp->nr_busy_cpus);
6825 
6826 		if (nr_busy > 1)
6827 			goto need_kick_unlock;
6828 	}
6829 
6830 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
6831 
6832 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6833 				  sched_domain_span(sd)) < cpu))
6834 		goto need_kick_unlock;
6835 
6836 	rcu_read_unlock();
6837 	return 0;
6838 
6839 need_kick_unlock:
6840 	rcu_read_unlock();
6841 need_kick:
6842 	return 1;
6843 }
6844 #else
6845 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6846 #endif
6847 
6848 /*
6849  * run_rebalance_domains is triggered when needed from the scheduler tick.
6850  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
6851  */
6852 static void run_rebalance_domains(struct softirq_action *h)
6853 {
6854 	struct rq *this_rq = this_rq();
6855 	enum cpu_idle_type idle = this_rq->idle_balance ?
6856 						CPU_IDLE : CPU_NOT_IDLE;
6857 
6858 	rebalance_domains(this_rq, idle);
6859 
6860 	/*
6861 	 * If this cpu has a pending nohz_balance_kick, then do the
6862 	 * balancing on behalf of the other idle cpus whose ticks are
6863 	 * stopped.
6864 	 */
6865 	nohz_idle_balance(this_rq, idle);
6866 }
6867 
6868 static inline int on_null_domain(struct rq *rq)
6869 {
6870 	return !rcu_dereference_sched(rq->sd);
6871 }
6872 
6873 /*
6874  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6875  */
6876 void trigger_load_balance(struct rq *rq)
6877 {
6878 	/* Don't need to rebalance while attached to NULL domain */
6879 	if (unlikely(on_null_domain(rq)))
6880 		return;
6881 
6882 	if (time_after_eq(jiffies, rq->next_balance))
6883 		raise_softirq(SCHED_SOFTIRQ);
6884 #ifdef CONFIG_NO_HZ_COMMON
6885 	if (nohz_kick_needed(rq))
6886 		nohz_balancer_kick();
6887 #endif
6888 }
6889 
6890 static void rq_online_fair(struct rq *rq)
6891 {
6892 	update_sysctl();
6893 }
6894 
6895 static void rq_offline_fair(struct rq *rq)
6896 {
6897 	update_sysctl();
6898 
6899 	/* Ensure any throttled groups are reachable by pick_next_task */
6900 	unthrottle_offline_cfs_rqs(rq);
6901 }
6902 
6903 #endif /* CONFIG_SMP */
6904 
6905 /*
6906  * scheduler tick hitting a task of our scheduling class:
6907  */
6908 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
6909 {
6910 	struct cfs_rq *cfs_rq;
6911 	struct sched_entity *se = &curr->se;
6912 
6913 	for_each_sched_entity(se) {
6914 		cfs_rq = cfs_rq_of(se);
6915 		entity_tick(cfs_rq, se, queued);
6916 	}
6917 
6918 	if (numabalancing_enabled)
6919 		task_tick_numa(rq, curr);
6920 
6921 	update_rq_runnable_avg(rq, 1);
6922 }
6923 
6924 /*
6925  * called on fork with the child task as argument from the parent's context
6926  *  - child not yet on the tasklist
6927  *  - preemption disabled
6928  */
6929 static void task_fork_fair(struct task_struct *p)
6930 {
6931 	struct cfs_rq *cfs_rq;
6932 	struct sched_entity *se = &p->se, *curr;
6933 	int this_cpu = smp_processor_id();
6934 	struct rq *rq = this_rq();
6935 	unsigned long flags;
6936 
6937 	raw_spin_lock_irqsave(&rq->lock, flags);
6938 
6939 	update_rq_clock(rq);
6940 
6941 	cfs_rq = task_cfs_rq(current);
6942 	curr = cfs_rq->curr;
6943 
6944 	/*
6945 	 * Not only the cpu but also the task_group of the parent might have
6946 	 * been changed after parent->se.parent,cfs_rq were copied to
6947 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
6948 	 * of child point to valid ones.
6949 	 */
6950 	rcu_read_lock();
6951 	__set_task_cpu(p, this_cpu);
6952 	rcu_read_unlock();
6953 
6954 	update_curr(cfs_rq);
6955 
6956 	if (curr)
6957 		se->vruntime = curr->vruntime;
6958 	place_entity(cfs_rq, se, 1);
6959 
6960 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
6961 		/*
6962 		 * Upon rescheduling, sched_class::put_prev_task() will place
6963 		 * 'current' within the tree based on its new key value.
6964 		 */
6965 		swap(curr->vruntime, se->vruntime);
6966 		resched_task(rq->curr);
6967 	}
6968 
6969 	se->vruntime -= cfs_rq->min_vruntime;
6970 
6971 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6972 }
6973 
6974 /*
6975  * Priority of the task has changed. Check to see if we preempt
6976  * the current task.
6977  */
6978 static void
6979 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
6980 {
6981 	if (!p->se.on_rq)
6982 		return;
6983 
6984 	/*
6985 	 * Reschedule if we are currently running on this runqueue and
6986 	 * our priority decreased, or if we are not currently running on
6987 	 * this runqueue and our priority is higher than the current's
6988 	 */
6989 	if (rq->curr == p) {
6990 		if (p->prio > oldprio)
6991 			resched_task(rq->curr);
6992 	} else
6993 		check_preempt_curr(rq, p, 0);
6994 }
6995 
6996 static void switched_from_fair(struct rq *rq, struct task_struct *p)
6997 {
6998 	struct sched_entity *se = &p->se;
6999 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
7000 
7001 	/*
7002 	 * Ensure the task's vruntime is normalized, so that when its
7003 	 * switched back to the fair class the enqueue_entity(.flags=0) will
7004 	 * do the right thing.
7005 	 *
7006 	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
7007 	 * have normalized the vruntime, if it was !on_rq, then only when
7008 	 * the task is sleeping will it still have non-normalized vruntime.
7009 	 */
7010 	if (!se->on_rq && p->state != TASK_RUNNING) {
7011 		/*
7012 		 * Fix up our vruntime so that the current sleep doesn't
7013 		 * cause 'unlimited' sleep bonus.
7014 		 */
7015 		place_entity(cfs_rq, se, 0);
7016 		se->vruntime -= cfs_rq->min_vruntime;
7017 	}
7018 
7019 #ifdef CONFIG_SMP
7020 	/*
7021 	 * Remove our load contribution when we leave sched_fair
7022 	 * and ensure we don't carry in an old decay_count if we
7023 	 * switch back.
7024 	 */
7025 	if (se->avg.decay_count) {
7026 		__synchronize_entity_decay(se);
7027 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
7028 	}
7029 #endif
7030 }
7031 
7032 /*
7033  * We switched to the sched_fair class.
7034  */
7035 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7036 {
7037 	if (!p->se.on_rq)
7038 		return;
7039 
7040 	/*
7041 	 * We were most likely switched from sched_rt, so
7042 	 * kick off the schedule if running, otherwise just see
7043 	 * if we can still preempt the current task.
7044 	 */
7045 	if (rq->curr == p)
7046 		resched_task(rq->curr);
7047 	else
7048 		check_preempt_curr(rq, p, 0);
7049 }
7050 
7051 /* Account for a task changing its policy or group.
7052  *
7053  * This routine is mostly called to set cfs_rq->curr field when a task
7054  * migrates between groups/classes.
7055  */
7056 static void set_curr_task_fair(struct rq *rq)
7057 {
7058 	struct sched_entity *se = &rq->curr->se;
7059 
7060 	for_each_sched_entity(se) {
7061 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
7062 
7063 		set_next_entity(cfs_rq, se);
7064 		/* ensure bandwidth has been allocated on our new cfs_rq */
7065 		account_cfs_rq_runtime(cfs_rq, 0);
7066 	}
7067 }
7068 
7069 void init_cfs_rq(struct cfs_rq *cfs_rq)
7070 {
7071 	cfs_rq->tasks_timeline = RB_ROOT;
7072 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7073 #ifndef CONFIG_64BIT
7074 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7075 #endif
7076 #ifdef CONFIG_SMP
7077 	atomic64_set(&cfs_rq->decay_counter, 1);
7078 	atomic_long_set(&cfs_rq->removed_load, 0);
7079 #endif
7080 }
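
/*
 * Note: seeding min_vruntime at -(1LL << 20), i.e. about 1ms short of the
 * u64 wrap point, appears intended to exercise the signed vruntime
 * wraparound comparisons early rather than only after very long uptimes.
 */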
7081 
7082 #ifdef CONFIG_FAIR_GROUP_SCHED
7083 static void task_move_group_fair(struct task_struct *p, int on_rq)
7084 {
7085 	struct cfs_rq *cfs_rq;
7086 	/*
7087 	 * If the task was not on the rq at the time of this cgroup movement
7088 	 * it must have been asleep; sleeping tasks keep their ->vruntime
7089 	 * absolute on their old rq until wakeup (needed for the fair sleeper
7090 	 * bonus in place_entity()).
7091 	 *
7092 	 * If it was on the rq, we've just 'preempted' it, which does convert
7093 	 * ->vruntime to a relative base.
7094 	 *
7095 	 * Make sure both cases convert their relative position when migrating
7096 	 * to another cgroup's rq. This does somewhat interfere with the
7097 	 * fair sleeper stuff for the first placement, but who cares.
7098 	 */
7099 	/*
7100 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
7101 	 * But there are some cases where it has already been normalized:
7102 	 *
7103 	 * - Moving a forked child which is waiting to be woken up by
7104 	 *   wake_up_new_task().
7105 	 * - Moving a task which has been woken up by try_to_wake_up() and
7106 	 *   is waiting to actually be woken up by sched_ttwu_pending().
7107 	 *
7108 	 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 	 */
7111 	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7112 		on_rq = 1;
7113 
7114 	if (!on_rq)
7115 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
7116 	set_task_rq(p, task_cpu(p));
7117 	if (!on_rq) {
7118 		cfs_rq = cfs_rq_of(&p->se);
7119 		p->se.vruntime += cfs_rq->min_vruntime;
7120 #ifdef CONFIG_SMP
7121 		/*
7122 		 * migrate_task_rq_fair() will have removed our previous
7123 		 * contribution, but we must synchronize for ongoing future
7124 		 * decay.
7125 		 */
7126 		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
7128 #endif
7129 	}
7130 }
7131 
7132 void free_fair_sched_group(struct task_group *tg)
7133 {
7134 	int i;
7135 
7136 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7137 
7138 	for_each_possible_cpu(i) {
7139 		if (tg->cfs_rq)
7140 			kfree(tg->cfs_rq[i]);
7141 		if (tg->se)
7142 			kfree(tg->se[i]);
7143 	}
7144 
7145 	kfree(tg->cfs_rq);
7146 	kfree(tg->se);
7147 }
7148 
7149 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7150 {
7151 	struct cfs_rq *cfs_rq;
7152 	struct sched_entity *se;
7153 	int i;
7154 
7155 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7156 	if (!tg->cfs_rq)
7157 		goto err;
7158 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7159 	if (!tg->se)
7160 		goto err;
7161 
7162 	tg->shares = NICE_0_LOAD;
7163 
7164 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7165 
7166 	for_each_possible_cpu(i) {
7167 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7168 				      GFP_KERNEL, cpu_to_node(i));
7169 		if (!cfs_rq)
7170 			goto err;
7171 
7172 		se = kzalloc_node(sizeof(struct sched_entity),
7173 				  GFP_KERNEL, cpu_to_node(i));
7174 		if (!se)
7175 			goto err_free_rq;
7176 
7177 		init_cfs_rq(cfs_rq);
7178 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7179 	}
7180 
7181 	return 1;
7182 
7183 err_free_rq:
7184 	kfree(cfs_rq);
7185 err:
7186 	return 0;
7187 }
7188 
7189 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7190 {
7191 	struct rq *rq = cpu_rq(cpu);
7192 	unsigned long flags;
7193 
7194 	/*
7195 	 * Only empty task groups can be destroyed; so we can speculatively
7196 	 * check on_list without danger of it being re-added.
7197 	 */
7198 	if (!tg->cfs_rq[cpu]->on_list)
7199 		return;
7200 
7201 	raw_spin_lock_irqsave(&rq->lock, flags);
7202 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7203 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7204 }
7205 
7206 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7207 			struct sched_entity *se, int cpu,
7208 			struct sched_entity *parent)
7209 {
7210 	struct rq *rq = cpu_rq(cpu);
7211 
7212 	cfs_rq->tg = tg;
7213 	cfs_rq->rq = rq;
7214 	init_cfs_rq_runtime(cfs_rq);
7215 
7216 	tg->cfs_rq[cpu] = cfs_rq;
7217 	tg->se[cpu] = se;
7218 
7219 	/* se could be NULL for root_task_group */
7220 	if (!se)
7221 		return;
7222 
7223 	if (!parent)
7224 		se->cfs_rq = &rq->cfs;
7225 	else
7226 		se->cfs_rq = parent->my_q;
7227 
7228 	se->my_q = cfs_rq;
7229 	/* guarantee group entities always have weight */
7230 	update_load_set(&se->load, NICE_0_LOAD);
7231 	se->parent = parent;
7232 }
7233 
7234 static DEFINE_MUTEX(shares_mutex);
7235 
7236 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7237 {
7238 	int i;
7239 	unsigned long flags;
7240 
7241 	/*
7242 	 * We can't change the weight of the root cgroup.
7243 	 */
7244 	if (!tg->se[0])
7245 		return -EINVAL;
7246 
7247 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7248 
7249 	mutex_lock(&shares_mutex);
7250 	if (tg->shares == shares)
7251 		goto done;
7252 
7253 	tg->shares = shares;
7254 	for_each_possible_cpu(i) {
7255 		struct rq *rq = cpu_rq(i);
7256 		struct sched_entity *se;
7257 
7258 		se = tg->se[i];
7259 		/* Propagate contribution to hierarchy */
7260 		raw_spin_lock_irqsave(&rq->lock, flags);
7261 
7262 		/* Possible calls to update_curr() need rq clock */
7263 		update_rq_clock(rq);
7264 		for_each_sched_entity(se)
7265 			update_cfs_shares(group_cfs_rq(se));
7266 		raw_spin_unlock_irqrestore(&rq->lock, flags);
7267 	}
7268 
7269 done:
7270 	mutex_unlock(&shares_mutex);
7271 	return 0;
7272 }
7273 #else /* CONFIG_FAIR_GROUP_SCHED */
7274 
7275 void free_fair_sched_group(struct task_group *tg) { }
7276 
7277 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7278 {
7279 	return 1;
7280 }
7281 
7282 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7283 
7284 #endif /* CONFIG_FAIR_GROUP_SCHED */
7285 
7286 
7287 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7288 {
7289 	struct sched_entity *se = &task->se;
7290 	unsigned int rr_interval = 0;
7291 
7292 	/*
7293 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
7294 	 * idle runqueue:
7295 	 */
7296 	if (rq->cfs.load.weight)
7297 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7298 
7299 	return rr_interval;
7300 }
7301 
7302 /*
7303  * All the scheduling class methods:
7304  */
7305 const struct sched_class fair_sched_class = {
7306 	.next			= &idle_sched_class,
7307 	.enqueue_task		= enqueue_task_fair,
7308 	.dequeue_task		= dequeue_task_fair,
7309 	.yield_task		= yield_task_fair,
7310 	.yield_to_task		= yield_to_task_fair,
7311 
7312 	.check_preempt_curr	= check_preempt_wakeup,
7313 
7314 	.pick_next_task		= pick_next_task_fair,
7315 	.put_prev_task		= put_prev_task_fair,
7316 
7317 #ifdef CONFIG_SMP
7318 	.select_task_rq		= select_task_rq_fair,
7319 	.migrate_task_rq	= migrate_task_rq_fair,
7320 
7321 	.rq_online		= rq_online_fair,
7322 	.rq_offline		= rq_offline_fair,
7323 
7324 	.task_waking		= task_waking_fair,
7325 #endif
7326 
7327 	.set_curr_task          = set_curr_task_fair,
7328 	.task_tick		= task_tick_fair,
7329 	.task_fork		= task_fork_fair,
7330 
7331 	.prio_changed		= prio_changed_fair,
7332 	.switched_from		= switched_from_fair,
7333 	.switched_to		= switched_to_fair,
7334 
7335 	.get_rr_interval	= get_rr_interval_fair,
7336 
7337 #ifdef CONFIG_FAIR_GROUP_SCHED
7338 	.task_move_group	= task_move_group_fair,
7339 #endif
7340 };
7341 
7342 #ifdef CONFIG_SCHED_DEBUG
7343 void print_cfs_stats(struct seq_file *m, int cpu)
7344 {
7345 	struct cfs_rq *cfs_rq;
7346 
7347 	rcu_read_lock();
7348 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7349 		print_cfs_rq(m, cpu, cfs_rq);
7350 	rcu_read_unlock();
7351 }
7352 #endif
7353 
7354 __init void init_sched_fair_class(void)
7355 {
7356 #ifdef CONFIG_SMP
7357 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7358 
7359 #ifdef CONFIG_NO_HZ_COMMON
7360 	nohz.next_balance = jiffies;
7361 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7362 	cpu_notifier(sched_ilb_notifier, 0);
7363 #endif
7364 #endif /* SMP */
7365 
7366 }
7367