xref: /openbmc/linux/kernel/sched/fair.c (revision d894fc60)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 
34 #include <trace/events/sched.h>
35 
36 #include "sched.h"
37 
38 /*
39  * Targeted preemption latency for CPU-bound tasks:
40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41  *
42  * NOTE: this latency value is not the same as the concept of
43  * 'timeslice length' - timeslices in CFS are of variable length
44  * and have no persistent notion of a timeslice as in traditional,
45  * time-slice based scheduling.
46  *
47  * (to see the precise effective timeslice length of your workload,
48  *  run vmstat and monitor the context-switches (cs) field)
49  */
50 unsigned int sysctl_sched_latency = 6000000ULL;
51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
52 
53 /*
54  * The initial- and re-scaling of tunables is configurable
55  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
56  *
57  * Options are:
58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
60  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
61  */
62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 	= SCHED_TUNABLESCALING_LOG;
64 
65 /*
66  * Minimal preemption granularity for CPU-bound tasks:
67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68  */
69 unsigned int sysctl_sched_min_granularity = 750000ULL;
70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
71 
72 /*
73  * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity.
74  */
75 static unsigned int sched_nr_latency = 8;
76 
77 /*
78  * After fork, child runs first. If set to 0 (default) then
79  * parent will (try to) run first.
80  */
81 unsigned int sysctl_sched_child_runs_first __read_mostly;
82 
83 /*
84  * SCHED_OTHER wake-up granularity.
85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86  *
87  * This option delays the preemption effects of decoupled workloads
88  * and reduces their over-scheduling. Synchronous workloads will still
89  * have immediate wakeup/sleep latencies.
90  */
91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 
94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95 
96 /*
97  * The exponential sliding window over which load is averaged for shares
98  * distribution.
99  * (default: 10msec)
100  */
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 
103 #ifdef CONFIG_CFS_BANDWIDTH
104 /*
105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106  * each time a cfs_rq requests quota.
107  *
108  * Note: in the case that the slice exceeds the runtime remaining (either due
109  * to consumption or the quota being specified to be smaller than the slice)
110  * we will always only issue the remaining available time.
111  *
112  * default: 5 msec, units: microseconds
113  */
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
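/*
 * For example, a group with a 20ms quota per period and the default 5ms
 * slice refills its cfs_rqs in at most four 5ms chunks per period; if only
 * 3ms of quota remains, just that remaining 3ms is handed out, as noted
 * above.
 */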
115 #endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119 	lw->weight += inc;
120 	lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125 	lw->weight -= dec;
126 	lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131 	lw->weight = w;
132 	lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static int get_update_sysctl_factor(void)
145 {
146 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
147 	unsigned int factor;
148 
149 	switch (sysctl_sched_tunable_scaling) {
150 	case SCHED_TUNABLESCALING_NONE:
151 		factor = 1;
152 		break;
153 	case SCHED_TUNABLESCALING_LINEAR:
154 		factor = cpus;
155 		break;
156 	case SCHED_TUNABLESCALING_LOG:
157 	default:
158 		factor = 1 + ilog2(cpus);
159 		break;
160 	}
161 
162 	return factor;
163 }
164 
165 static void update_sysctl(void)
166 {
167 	unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170 	(sysctl_##name = (factor) * normalized_sysctl_##name)
171 	SET_SYSCTL(sched_min_granularity);
172 	SET_SYSCTL(sched_latency);
173 	SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
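/*
 * A worked example, assuming the defaults above and 8 online CPUs: with
 * SCHED_TUNABLESCALING_LOG the factor is 1 + ilog2(8) == 4, so
 * sysctl_sched_latency becomes 4 * 6ms == 24ms, sched_min_granularity
 * 4 * 0.75ms == 3ms and sched_wakeup_granularity 4 * 1ms == 4ms.  Since
 * num_online_cpus() is clamped to 8, larger machines use the same factor.
 */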
176 
177 void sched_init_granularity(void)
178 {
179 	update_sysctl();
180 }
181 
182 #define WMULT_CONST	(~0U)
183 #define WMULT_SHIFT	32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187 	unsigned long w;
188 
189 	if (likely(lw->inv_weight))
190 		return;
191 
192 	w = scale_load_down(lw->weight);
193 
194 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195 		lw->inv_weight = 1;
196 	else if (unlikely(!w))
197 		lw->inv_weight = WMULT_CONST;
198 	else
199 		lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw is an entry of prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216 	u64 fact = scale_load_down(weight);
217 	int shift = WMULT_SHIFT;
218 
219 	__update_inv_weight(lw);
220 
221 	if (unlikely(fact >> 32)) {
222 		while (fact >> 32) {
223 			fact >>= 1;
224 			shift--;
225 		}
226 	}
227 
228 	/* hint to use a 32x32->64 mul */
229 	fact = (u64)(u32)fact * lw->inv_weight;
230 
231 	while (fact >> 32) {
232 		fact >>= 1;
233 		shift--;
234 	}
235 
236 	return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
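/*
 * A worked example of the fixed-point math above, treating scale_load_down()
 * as an identity and taking a nice-0 weight of 1024 against a runqueue
 * load.weight of 3072 (three nice-0 tasks), with delta_exec == 6000000ns:
 *
 *   inv_weight = WMULT_CONST / 3072       ~= 1398101
 *   fact       = 1024 * 1398101           ~= 1431655424   (fits in 32 bits)
 *   result     = (6000000 * fact) >> 32   ~= 2000000ns
 *
 * i.e. the same 2ms that delta_exec * weight / lw->weight == 6ms / 3 gives
 * directly, without needing a 64-bit division.
 */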
238 
239 
240 const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
248 /* cpu runqueue to which this cfs_rq is attached */
249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250 {
251 	return cfs_rq->rq;
252 }
253 
254 /* An entity is a task if it doesn't "own" a runqueue */
255 #define entity_is_task(se)	(!se->my_q)
256 
257 static inline struct task_struct *task_of(struct sched_entity *se)
258 {
259 #ifdef CONFIG_SCHED_DEBUG
260 	WARN_ON_ONCE(!entity_is_task(se));
261 #endif
262 	return container_of(se, struct task_struct, se);
263 }
264 
265 /* Walk up scheduling entities hierarchy */
266 #define for_each_sched_entity(se) \
267 		for (; se; se = se->parent)
268 
269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270 {
271 	return p->se.cfs_rq;
272 }
273 
274 /* runqueue on which this entity is (to be) queued */
275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276 {
277 	return se->cfs_rq;
278 }
279 
280 /* runqueue "owned" by this group */
281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 {
283 	return grp->my_q;
284 }
285 
286 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
287 				       int force_update);
288 
289 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290 {
291 	if (!cfs_rq->on_list) {
292 		/*
293 		 * Ensure we either appear before our parent (if already
294 		 * enqueued) or force our parent to appear after us when it is
295 		 * enqueued.  The fact that we always enqueue bottom-up
296 		 * reduces this to two cases.
297 		 */
298 		if (cfs_rq->tg->parent &&
299 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
300 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		} else {
303 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
304 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
305 		}
306 
307 		cfs_rq->on_list = 1;
308 		/* We should have no load, but we need to update last_decay. */
309 		update_cfs_rq_blocked_load(cfs_rq, 0);
310 	}
311 }
312 
313 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
314 {
315 	if (cfs_rq->on_list) {
316 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
317 		cfs_rq->on_list = 0;
318 	}
319 }
320 
321 /* Iterate through all leaf cfs_rq's on a runqueue */
322 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
323 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
324 
325 /* Do the two (enqueued) entities belong to the same group? */
326 static inline struct cfs_rq *
327 is_same_group(struct sched_entity *se, struct sched_entity *pse)
328 {
329 	if (se->cfs_rq == pse->cfs_rq)
330 		return se->cfs_rq;
331 
332 	return NULL;
333 }
334 
335 static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 {
337 	return se->parent;
338 }
339 
340 static void
341 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
342 {
343 	int se_depth, pse_depth;
344 
345 	/*
346 	 * A preemption test can be made between sibling entities that are in
347 	 * the same cfs_rq, i.e. that have a common parent. Walk up the
348 	 * hierarchy of both tasks until we find ancestors that are siblings
349 	 * of a common parent.
350 	 */
351 
352 	/* First walk up until both entities are at same depth */
353 	se_depth = (*se)->depth;
354 	pse_depth = (*pse)->depth;
355 
356 	while (se_depth > pse_depth) {
357 		se_depth--;
358 		*se = parent_entity(*se);
359 	}
360 
361 	while (pse_depth > se_depth) {
362 		pse_depth--;
363 		*pse = parent_entity(*pse);
364 	}
365 
366 	while (!is_same_group(*se, *pse)) {
367 		*se = parent_entity(*se);
368 		*pse = parent_entity(*pse);
369 	}
370 }
371 
372 #else	/* !CONFIG_FAIR_GROUP_SCHED */
373 
374 static inline struct task_struct *task_of(struct sched_entity *se)
375 {
376 	return container_of(se, struct task_struct, se);
377 }
378 
379 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
380 {
381 	return container_of(cfs_rq, struct rq, cfs);
382 }
383 
384 #define entity_is_task(se)	1
385 
386 #define for_each_sched_entity(se) \
387 		for (; se; se = NULL)
388 
389 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
390 {
391 	return &task_rq(p)->cfs;
392 }
393 
394 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
395 {
396 	struct task_struct *p = task_of(se);
397 	struct rq *rq = task_rq(p);
398 
399 	return &rq->cfs;
400 }
401 
402 /* runqueue "owned" by this group */
403 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
404 {
405 	return NULL;
406 }
407 
408 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
409 {
410 }
411 
412 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
413 {
414 }
415 
416 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
417 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
418 
419 static inline struct sched_entity *parent_entity(struct sched_entity *se)
420 {
421 	return NULL;
422 }
423 
424 static inline void
425 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
426 {
427 }
428 
429 #endif	/* CONFIG_FAIR_GROUP_SCHED */
430 
431 static __always_inline
432 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
433 
434 /**************************************************************
435  * Scheduling class tree data structure manipulation methods:
436  */
437 
438 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
439 {
440 	s64 delta = (s64)(vruntime - max_vruntime);
441 	if (delta > 0)
442 		max_vruntime = vruntime;
443 
444 	return max_vruntime;
445 }
446 
447 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
448 {
449 	s64 delta = (s64)(vruntime - min_vruntime);
450 	if (delta < 0)
451 		min_vruntime = vruntime;
452 
453 	return min_vruntime;
454 }
455 
456 static inline int entity_before(struct sched_entity *a,
457 				struct sched_entity *b)
458 {
459 	return (s64)(a->vruntime - b->vruntime) < 0;
460 }
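/*
 * The signed subtraction above keeps the ordering correct across u64
 * wrap-around: e.g. with b->vruntime == ULLONG_MAX - 100 and
 * a->vruntime == 200 just after the wrap, (s64)(a->vruntime - b->vruntime)
 * == 301 > 0, so a is still ordered after b, as long as the two values stay
 * within 2^63 of each other.
 */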
461 
462 static void update_min_vruntime(struct cfs_rq *cfs_rq)
463 {
464 	u64 vruntime = cfs_rq->min_vruntime;
465 
466 	if (cfs_rq->curr)
467 		vruntime = cfs_rq->curr->vruntime;
468 
469 	if (cfs_rq->rb_leftmost) {
470 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
471 						   struct sched_entity,
472 						   run_node);
473 
474 		if (!cfs_rq->curr)
475 			vruntime = se->vruntime;
476 		else
477 			vruntime = min_vruntime(vruntime, se->vruntime);
478 	}
479 
480 	/* ensure we never gain time by being placed backwards. */
481 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
482 #ifndef CONFIG_64BIT
483 	smp_wmb();
484 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
485 #endif
486 }
487 
488 /*
489  * Enqueue an entity into the rb-tree:
490  */
491 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
492 {
493 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
494 	struct rb_node *parent = NULL;
495 	struct sched_entity *entry;
496 	int leftmost = 1;
497 
498 	/*
499 	 * Find the right place in the rbtree:
500 	 */
501 	while (*link) {
502 		parent = *link;
503 		entry = rb_entry(parent, struct sched_entity, run_node);
504 		/*
505 		 * We don't care about collisions. Nodes with
506 		 * the same key stay together.
507 		 */
508 		if (entity_before(se, entry)) {
509 			link = &parent->rb_left;
510 		} else {
511 			link = &parent->rb_right;
512 			leftmost = 0;
513 		}
514 	}
515 
516 	/*
517 	 * Maintain a cache of leftmost tree entries (it is frequently
518 	 * used):
519 	 */
520 	if (leftmost)
521 		cfs_rq->rb_leftmost = &se->run_node;
522 
523 	rb_link_node(&se->run_node, parent, link);
524 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
525 }
526 
527 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
528 {
529 	if (cfs_rq->rb_leftmost == &se->run_node) {
530 		struct rb_node *next_node;
531 
532 		next_node = rb_next(&se->run_node);
533 		cfs_rq->rb_leftmost = next_node;
534 	}
535 
536 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
537 }
538 
539 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
540 {
541 	struct rb_node *left = cfs_rq->rb_leftmost;
542 
543 	if (!left)
544 		return NULL;
545 
546 	return rb_entry(left, struct sched_entity, run_node);
547 }
548 
549 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
550 {
551 	struct rb_node *next = rb_next(&se->run_node);
552 
553 	if (!next)
554 		return NULL;
555 
556 	return rb_entry(next, struct sched_entity, run_node);
557 }
558 
559 #ifdef CONFIG_SCHED_DEBUG
560 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
561 {
562 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
563 
564 	if (!last)
565 		return NULL;
566 
567 	return rb_entry(last, struct sched_entity, run_node);
568 }
569 
570 /**************************************************************
571  * Scheduling class statistics methods:
572  */
573 
574 int sched_proc_update_handler(struct ctl_table *table, int write,
575 		void __user *buffer, size_t *lenp,
576 		loff_t *ppos)
577 {
578 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 	int factor = get_update_sysctl_factor();
580 
581 	if (ret || !write)
582 		return ret;
583 
584 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
585 					sysctl_sched_min_granularity);
586 
587 #define WRT_SYSCTL(name) \
588 	(normalized_sysctl_##name = sysctl_##name / (factor))
589 	WRT_SYSCTL(sched_min_granularity);
590 	WRT_SYSCTL(sched_latency);
591 	WRT_SYSCTL(sched_wakeup_granularity);
592 #undef WRT_SYSCTL
593 
594 	return 0;
595 }
596 #endif
597 
598 /*
599  * delta /= w
600  */
601 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
602 {
603 	if (unlikely(se->load.weight != NICE_0_LOAD))
604 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
605 
606 	return delta;
607 }
608 
609 /*
610  * The idea is to set a period in which each task runs once.
611  *
612  * When there are too many tasks (sched_nr_latency) we have to stretch
613  * this period because otherwise the slices get too small.
614  *
615  * p = (nr <= nl) ? l : l*nr/nl
616  */
617 static u64 __sched_period(unsigned long nr_running)
618 {
619 	u64 period = sysctl_sched_latency;
620 	unsigned long nr_latency = sched_nr_latency;
621 
622 	if (unlikely(nr_running > nr_latency)) {
623 		period = sysctl_sched_min_granularity;
624 		period *= nr_running;
625 	}
626 
627 	return period;
628 }
629 
630 /*
631  * We calculate the wall-time slice from the period by taking a part
632  * proportional to the weight.
633  *
634  * s = p*P[w/rw]
635  */
636 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
637 {
638 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
639 
640 	for_each_sched_entity(se) {
641 		struct load_weight *load;
642 		struct load_weight lw;
643 
644 		cfs_rq = cfs_rq_of(se);
645 		load = &cfs_rq->load;
646 
647 		if (unlikely(!se->on_rq)) {
648 			lw = cfs_rq->load;
649 
650 			update_load_add(&lw, se->load.weight);
651 			load = &lw;
652 		}
653 		slice = __calc_delta(slice, se->load.weight, load);
654 	}
655 	return slice;
656 }
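/*
 * A worked example with the default tunables: sched_nr_latency == 8, so up
 * to 8 runnable tasks share the 6ms latency period; with e.g. 16 tasks the
 * period stretches to 16 * 0.75ms == 12ms.  For three runnable nice-0
 * tasks the period stays at 6ms and each task's wall-time slice is
 * 6ms * 1024/3072 == 2ms.
 */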
657 
658 /*
659  * We calculate the vruntime slice of a to-be-inserted task.
660  *
661  * vs = s/w
662  */
663 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
664 {
665 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
666 }
667 
668 #ifdef CONFIG_SMP
669 static int select_idle_sibling(struct task_struct *p, int cpu);
670 static unsigned long task_h_load(struct task_struct *p);
671 
672 static inline void __update_task_entity_contrib(struct sched_entity *se);
673 
674 /* Give a new task initial runnable averages so its load is not underestimated early on */
675 void init_task_runnable_average(struct task_struct *p)
676 {
677 	u32 slice;
678 
679 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
680 	p->se.avg.runnable_avg_sum = slice;
681 	p->se.avg.runnable_avg_period = slice;
682 	__update_task_entity_contrib(&p->se);
683 }
684 #else
685 void init_task_runnable_average(struct task_struct *p)
686 {
687 }
688 #endif
689 
690 /*
691  * Update the current task's runtime statistics.
692  */
693 static void update_curr(struct cfs_rq *cfs_rq)
694 {
695 	struct sched_entity *curr = cfs_rq->curr;
696 	u64 now = rq_clock_task(rq_of(cfs_rq));
697 	u64 delta_exec;
698 
699 	if (unlikely(!curr))
700 		return;
701 
702 	delta_exec = now - curr->exec_start;
703 	if (unlikely((s64)delta_exec <= 0))
704 		return;
705 
706 	curr->exec_start = now;
707 
708 	schedstat_set(curr->statistics.exec_max,
709 		      max(delta_exec, curr->statistics.exec_max));
710 
711 	curr->sum_exec_runtime += delta_exec;
712 	schedstat_add(cfs_rq, exec_clock, delta_exec);
713 
714 	curr->vruntime += calc_delta_fair(delta_exec, curr);
715 	update_min_vruntime(cfs_rq);
716 
717 	if (entity_is_task(curr)) {
718 		struct task_struct *curtask = task_of(curr);
719 
720 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
721 		cpuacct_charge(curtask, delta_exec);
722 		account_group_exec_runtime(curtask, delta_exec);
723 	}
724 
725 	account_cfs_rq_runtime(cfs_rq, delta_exec);
726 }
727 
728 static void update_curr_fair(struct rq *rq)
729 {
730 	update_curr(cfs_rq_of(&rq->curr->se));
731 }
732 
733 static inline void
734 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
735 {
736 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
737 }
738 
739 /*
740  * Task is being enqueued - update stats:
741  */
742 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
743 {
744 	/*
745 	 * Are we enqueueing a waiting task? (for current tasks
746 	 * a dequeue/enqueue event is a NOP)
747 	 */
748 	if (se != cfs_rq->curr)
749 		update_stats_wait_start(cfs_rq, se);
750 }
751 
752 static void
753 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
754 {
755 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
756 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
757 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
758 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
759 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
760 #ifdef CONFIG_SCHEDSTATS
761 	if (entity_is_task(se)) {
762 		trace_sched_stat_wait(task_of(se),
763 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
764 	}
765 #endif
766 	schedstat_set(se->statistics.wait_start, 0);
767 }
768 
769 static inline void
770 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 {
772 	/*
773 	 * Mark the end of the wait period if dequeueing a
774 	 * waiting task:
775 	 */
776 	if (se != cfs_rq->curr)
777 		update_stats_wait_end(cfs_rq, se);
778 }
779 
780 /*
781  * We are picking a new current task - update its stats:
782  */
783 static inline void
784 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
785 {
786 	/*
787 	 * We are starting a new run period:
788 	 */
789 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
790 }
791 
792 /**************************************************
793  * Scheduling class queueing methods:
794  */
795 
796 #ifdef CONFIG_NUMA_BALANCING
797 /*
798  * Approximate time to scan a whole NUMA task's address space, in ms. The
799  * task scan period is calculated based on the task's virtual memory size and
800  * numa_balancing_scan_size.
801  */
802 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
803 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
804 
805 /* Portion of address space to scan in MB */
806 unsigned int sysctl_numa_balancing_scan_size = 256;
807 
808 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
809 unsigned int sysctl_numa_balancing_scan_delay = 1000;
810 
811 static unsigned int task_nr_scan_windows(struct task_struct *p)
812 {
813 	unsigned long rss = 0;
814 	unsigned long nr_scan_pages;
815 
816 	/*
817 	 * Calculations are based on RSS, as non-present and empty pages are
818 	 * skipped by the PTE scanner and NUMA hinting faults should be trapped
819 	 * based on resident pages.
820 	 */
821 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
822 	rss = get_mm_rss(p->mm);
823 	if (!rss)
824 		rss = nr_scan_pages;
825 
826 	rss = round_up(rss, nr_scan_pages);
827 	return rss / nr_scan_pages;
828 }
829 
830 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
831 #define MAX_SCAN_WINDOW 2560
832 
833 static unsigned int task_scan_min(struct task_struct *p)
834 {
835 	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
836 	unsigned int scan, floor;
837 	unsigned int windows = 1;
838 
839 	if (scan_size < MAX_SCAN_WINDOW)
840 		windows = MAX_SCAN_WINDOW / scan_size;
841 	floor = 1000 / windows;
842 
843 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
844 	return max_t(unsigned int, floor, scan);
845 }
846 
847 static unsigned int task_scan_max(struct task_struct *p)
848 {
849 	unsigned int smin = task_scan_min(p);
850 	unsigned int smax;
851 
852 	/* Watch for min being lower than max due to floor calculations */
853 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
854 	return max(smin, smax);
855 }
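/*
 * A worked example with the defaults above, assuming 4KB pages: a scan
 * window is 256MB == 65536 pages, so a task with 1GB of RSS spans 4
 * windows.  MAX_SCAN_WINDOW / 256 == 10 windows/sec gives a floor of
 * 1000/10 == 100ms, so task_scan_min() returns max(100ms, 1000ms / 4) ==
 * 250ms and task_scan_max() returns 60000ms / 4 == 15000ms.
 */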
856 
857 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
858 {
859 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
860 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
861 }
862 
863 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
864 {
865 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
866 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
867 }
868 
869 struct numa_group {
870 	atomic_t refcount;
871 
872 	spinlock_t lock; /* nr_tasks, tasks */
873 	int nr_tasks;
874 	pid_t gid;
875 
876 	struct rcu_head rcu;
877 	nodemask_t active_nodes;
878 	unsigned long total_faults;
879 	/*
880 	 * Faults_cpu is used to decide whether memory should move
881 	 * towards the CPU. As a consequence, these stats are weighted
882 	 * more by CPU use than by memory faults.
883 	 */
884 	unsigned long *faults_cpu;
885 	unsigned long faults[0];
886 };
887 
888 /* Shared or private faults. */
889 #define NR_NUMA_HINT_FAULT_TYPES 2
890 
891 /* Memory and CPU locality */
892 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
893 
894 /* Averaged statistics, and temporary buffers. */
895 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
896 
897 pid_t task_numa_group_id(struct task_struct *p)
898 {
899 	return p->numa_group ? p->numa_group->gid : 0;
900 }
901 
902 /*
903  * The averaged statistics, shared & private, memory & cpu,
904  * occupy the first half of the array. The second half of the
905  * array is for current counters, which are averaged into the
906  * first set by task_numa_placement.
907  */
908 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
909 {
910 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
911 }
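/*
 * For example, with nr_node_ids == 2 the index above works out to
 * 2 * (s * 2 + nid) + priv: each stats class s occupies four consecutive
 * slots (node 0 priv 0/1, then node 1 priv 0/1), which is how the averaged
 * classes end up in the first half of the array and the *BUF snapshot
 * classes in the second half, as described above.
 */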
912 
913 static inline unsigned long task_faults(struct task_struct *p, int nid)
914 {
915 	if (!p->numa_faults)
916 		return 0;
917 
918 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
919 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
920 }
921 
922 static inline unsigned long group_faults(struct task_struct *p, int nid)
923 {
924 	if (!p->numa_group)
925 		return 0;
926 
927 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
928 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
929 }
930 
931 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
932 {
933 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
934 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
935 }
936 
937 /* Handle placement on systems where not all nodes are directly connected. */
938 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
939 					int maxdist, bool task)
940 {
941 	unsigned long score = 0;
942 	int node;
943 
944 	/*
945 	 * All nodes are directly connected, and the same distance
946 	 * from each other. No need for fancy placement algorithms.
947 	 */
948 	if (sched_numa_topology_type == NUMA_DIRECT)
949 		return 0;
950 
951 	/*
952 	 * This code is called for each node, introducing N^2 complexity,
953 	 * which should be ok given the number of nodes rarely exceeds 8.
954 	 */
955 	for_each_online_node(node) {
956 		unsigned long faults;
957 		int dist = node_distance(nid, node);
958 
959 		/*
960 		 * The furthest away nodes in the system are not interesting
961 		 * for placement; nid was already counted.
962 		 */
963 		if (dist == sched_max_numa_distance || node == nid)
964 			continue;
965 
966 		/*
967 		 * On systems with a backplane NUMA topology, compare groups
968 		 * of nodes, and move tasks towards the group with the most
969 		 * memory accesses. When comparing two nodes at distance
970 		 * "hoplimit", only nodes closer by than "hoplimit" are part
971 		 * of each group. Skip other nodes.
972 		 */
973 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
974 					dist > maxdist)
975 			continue;
976 
977 		/* Add up the faults from nearby nodes. */
978 		if (task)
979 			faults = task_faults(p, node);
980 		else
981 			faults = group_faults(p, node);
982 
983 		/*
984 		 * On systems with a glueless mesh NUMA topology, there are
985 		 * no fixed "groups of nodes". Instead, nodes that are not
986 		 * directly connected bounce traffic through intermediate
987 		 * nodes; a numa_group can occupy any set of nodes.
988 		 * The further away a node is, the less the faults count.
989 		 * This seems to result in good task placement.
990 		 */
991 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
992 			faults *= (sched_max_numa_distance - dist);
993 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
994 		}
995 
996 		score += faults;
997 	}
998 
999 	return score;
1000 }
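/*
 * A numerical sketch of the glueless-mesh scaling above, taking
 * sched_max_numa_distance == 40 and LOCAL_DISTANCE == 10 as example values:
 * faults on a node at distance 20 from nid are scaled by
 * (40 - 20) / (40 - 10) == 2/3, while a node at distance 30 contributes
 * only 1/3 of its faults, so nearer nodes dominate the score.
 */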
1001 
1002 /*
1003  * These return the fraction of accesses done by a particular task, or
1004  * task group, on a particular numa node.  The group weight is given a
1005  * larger multiplier, in order to group tasks together that are almost
1006  * evenly spread out between numa nodes.
1007  */
1008 static inline unsigned long task_weight(struct task_struct *p, int nid,
1009 					int dist)
1010 {
1011 	unsigned long faults, total_faults;
1012 
1013 	if (!p->numa_faults)
1014 		return 0;
1015 
1016 	total_faults = p->total_numa_faults;
1017 
1018 	if (!total_faults)
1019 		return 0;
1020 
1021 	faults = task_faults(p, nid);
1022 	faults += score_nearby_nodes(p, nid, dist, true);
1023 
1024 	return 1000 * faults / total_faults;
1025 }
1026 
1027 static inline unsigned long group_weight(struct task_struct *p, int nid,
1028 					 int dist)
1029 {
1030 	unsigned long faults, total_faults;
1031 
1032 	if (!p->numa_group)
1033 		return 0;
1034 
1035 	total_faults = p->numa_group->total_faults;
1036 
1037 	if (!total_faults)
1038 		return 0;
1039 
1040 	faults = group_faults(p, nid);
1041 	faults += score_nearby_nodes(p, nid, dist, false);
1042 
1043 	return 1000 * faults / total_faults;
1044 }
1045 
1046 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1047 				int src_nid, int dst_cpu)
1048 {
1049 	struct numa_group *ng = p->numa_group;
1050 	int dst_nid = cpu_to_node(dst_cpu);
1051 	int last_cpupid, this_cpupid;
1052 
1053 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1054 
1055 	/*
1056 	 * Multi-stage node selection is used in conjunction with a periodic
1057 	 * migration fault to build a temporal task<->page relation. By using
1058 	 * a two-stage filter we remove short/unlikely relations.
1059 	 *
1060 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1061 	 * a task's usage of a particular page (n_p) per total usage of this
1062 	 * page (n_t) (in a given time-span) to a probability.
1063 	 *
1064 	 * Our periodic faults will sample this probability and getting the
1065 	 * same result twice in a row, given these samples are fully
1066 	 * independent, is then given by P(p)^2, provided our sample period
1067 	 * is sufficiently short compared to the usage pattern.
1068 	 *
1069 	 * This quadratic squishes small probabilities, making it less likely we
1070 	 * act on an unlikely task<->page relation.
1071 	 */
1072 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1073 	if (!cpupid_pid_unset(last_cpupid) &&
1074 				cpupid_to_nid(last_cpupid) != dst_nid)
1075 		return false;
1076 
1077 	/* Always allow migrate on private faults */
1078 	if (cpupid_match_pid(p, last_cpupid))
1079 		return true;
1080 
1081 	/* A shared fault, but p->numa_group has not been set up yet. */
1082 	if (!ng)
1083 		return true;
1084 
1085 	/*
1086 	 * Do not migrate if the destination is not a node that
1087 	 * is actively used by this numa group.
1088 	 */
1089 	if (!node_isset(dst_nid, ng->active_nodes))
1090 		return false;
1091 
1092 	/*
1093 	 * Source is a node that is not actively used by this
1094 	 * numa group, while the destination is. Migrate.
1095 	 */
1096 	if (!node_isset(src_nid, ng->active_nodes))
1097 		return true;
1098 
1099 	/*
1100 	 * Both source and destination are nodes in active
1101 	 * use by this numa group. Maximize memory bandwidth
1102 	 * by migrating from more heavily used groups, to less
1103 	 * heavily used ones, spreading the load around.
1104 	 * Use a 1/4 hysteresis to avoid spurious page movement.
1105 	 */
1106 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1107 }
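/*
 * Example of the 1/4 hysteresis above: with 400 group faults recorded on
 * src_nid, the page only migrates when dst_nid accounts for fewer than
 * 400 * 3/4 == 300 group faults; at 350 it stays put, at 250 it moves to
 * the less heavily used node.
 */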
1108 
1109 static unsigned long weighted_cpuload(const int cpu);
1110 static unsigned long source_load(int cpu, int type);
1111 static unsigned long target_load(int cpu, int type);
1112 static unsigned long capacity_of(int cpu);
1113 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1114 
1115 /* Cached statistics for all CPUs within a node */
1116 struct numa_stats {
1117 	unsigned long nr_running;
1118 	unsigned long load;
1119 
1120 	/* Total compute capacity of CPUs on a node */
1121 	unsigned long compute_capacity;
1122 
1123 	/* Approximate capacity in terms of runnable tasks on a node */
1124 	unsigned long task_capacity;
1125 	int has_free_capacity;
1126 };
1127 
1128 /*
1129  * XXX borrowed from update_sg_lb_stats
1130  */
1131 static void update_numa_stats(struct numa_stats *ns, int nid)
1132 {
1133 	int smt, cpu, cpus = 0;
1134 	unsigned long capacity;
1135 
1136 	memset(ns, 0, sizeof(*ns));
1137 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1138 		struct rq *rq = cpu_rq(cpu);
1139 
1140 		ns->nr_running += rq->nr_running;
1141 		ns->load += weighted_cpuload(cpu);
1142 		ns->compute_capacity += capacity_of(cpu);
1143 
1144 		cpus++;
1145 	}
1146 
1147 	/*
1148 	 * If we raced with hotplug and there are no CPUs left in our mask
1149 	 * the @ns structure is NULL'ed and task_numa_compare() will
1150 	 * not find this node attractive.
1151 	 *
1152 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1153 	 * imbalance and bail there.
1154 	 */
1155 	if (!cpus)
1156 		return;
1157 
1158 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1159 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1160 	capacity = cpus / smt; /* cores */
1161 
1162 	ns->task_capacity = min_t(unsigned, capacity,
1163 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1164 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1165 }
1166 
1167 struct task_numa_env {
1168 	struct task_struct *p;
1169 
1170 	int src_cpu, src_nid;
1171 	int dst_cpu, dst_nid;
1172 
1173 	struct numa_stats src_stats, dst_stats;
1174 
1175 	int imbalance_pct;
1176 	int dist;
1177 
1178 	struct task_struct *best_task;
1179 	long best_imp;
1180 	int best_cpu;
1181 };
1182 
1183 static void task_numa_assign(struct task_numa_env *env,
1184 			     struct task_struct *p, long imp)
1185 {
1186 	if (env->best_task)
1187 		put_task_struct(env->best_task);
1188 	if (p)
1189 		get_task_struct(p);
1190 
1191 	env->best_task = p;
1192 	env->best_imp = imp;
1193 	env->best_cpu = env->dst_cpu;
1194 }
1195 
1196 static bool load_too_imbalanced(long src_load, long dst_load,
1197 				struct task_numa_env *env)
1198 {
1199 	long imb, old_imb;
1200 	long orig_src_load, orig_dst_load;
1201 	long src_capacity, dst_capacity;
1202 
1203 	/*
1204 	 * The load is corrected for the CPU capacity available on each node.
1205 	 *
1206 	 * src_load        dst_load
1207 	 * ------------ vs ---------
1208 	 * src_capacity    dst_capacity
1209 	 */
1210 	src_capacity = env->src_stats.compute_capacity;
1211 	dst_capacity = env->dst_stats.compute_capacity;
1212 
1213 	/* We care about the slope of the imbalance, not the direction. */
1214 	if (dst_load < src_load)
1215 		swap(dst_load, src_load);
1216 
1217 	/* Is the difference below the threshold? */
1218 	imb = dst_load * src_capacity * 100 -
1219 	      src_load * dst_capacity * env->imbalance_pct;
1220 	if (imb <= 0)
1221 		return false;
1222 
1223 	/*
1224 	 * The imbalance is above the allowed threshold.
1225 	 * Compare it with the old imbalance.
1226 	 */
1227 	orig_src_load = env->src_stats.load;
1228 	orig_dst_load = env->dst_stats.load;
1229 
1230 	if (orig_dst_load < orig_src_load)
1231 		swap(orig_dst_load, orig_src_load);
1232 
1233 	old_imb = orig_dst_load * src_capacity * 100 -
1234 		  orig_src_load * dst_capacity * env->imbalance_pct;
1235 
1236 	/* Would this change make things worse? */
1237 	return (imb > old_imb);
1238 }
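/*
 * A numerical sketch, assuming equal compute capacities of 1024 and the
 * imbalance_pct of 112 used by task_numa_migrate() below: src_load == 1000
 * and dst_load == 1200 give
 *   imb = 1200 * 1024 * 100 - 1000 * 1024 * 112 = 1024 * 8000 > 0,
 * so the move exceeds the threshold and is rejected unless the pre-move
 * imbalance (old_imb) was already at least as large.
 */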
1239 
1240 /*
1241  * This checks if the overall compute and NUMA accesses of the system would
1242  * be improved if the source tasks was migrated to the target dst_cpu taking
1243  * into account that it might be best if task running on the dst_cpu should
1244  * be exchanged with the source task
1245  */
1246 static void task_numa_compare(struct task_numa_env *env,
1247 			      long taskimp, long groupimp)
1248 {
1249 	struct rq *src_rq = cpu_rq(env->src_cpu);
1250 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1251 	struct task_struct *cur;
1252 	long src_load, dst_load;
1253 	long load;
1254 	long imp = env->p->numa_group ? groupimp : taskimp;
1255 	long moveimp = imp;
1256 	int dist = env->dist;
1257 
1258 	rcu_read_lock();
1259 
1260 	raw_spin_lock_irq(&dst_rq->lock);
1261 	cur = dst_rq->curr;
1262 	/*
1263 	 * No need to move the exiting task, and this ensures that ->curr
1264 	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1265 	 * is safe under RCU read lock.
1266 	 * Note that rcu_read_lock() itself can't protect from the final
1267 	 * put_task_struct() after the last schedule().
1268 	 */
1269 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1270 		cur = NULL;
1271 	raw_spin_unlock_irq(&dst_rq->lock);
1272 
1273 	/*
1274 	 * Because we have preemption enabled we can get migrated around and
1275 	 * end up trying to select ourselves (current == env->p) as a swap candidate.
1276 	 */
1277 	if (cur == env->p)
1278 		goto unlock;
1279 
1280 	/*
1281 	 * "imp" is the fault differential for the source task between the
1282 	 * source and destination node. Calculate the total differential for
1283 	 * the source task and potential destination task. The more negative
1284 	 * the value is, the more remote accesses would be expected to
1285 	 * be incurred if the tasks were swapped.
1286 	 */
1287 	if (cur) {
1288 		/* Skip this swap candidate if it cannot move to the source cpu */
1289 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1290 			goto unlock;
1291 
1292 		/*
1293 		 * If dst and source tasks are in the same NUMA group, or not
1294 		 * in any group then look only at task weights.
1295 		 */
1296 		if (cur->numa_group == env->p->numa_group) {
1297 			imp = taskimp + task_weight(cur, env->src_nid, dist) -
1298 			      task_weight(cur, env->dst_nid, dist);
1299 			/*
1300 			 * Add some hysteresis to prevent swapping the
1301 			 * tasks within a group over tiny differences.
1302 			 */
1303 			if (cur->numa_group)
1304 				imp -= imp/16;
1305 		} else {
1306 			/*
1307 			 * Compare the group weights. If a task is all by
1308 			 * itself (not part of a group), use the task weight
1309 			 * instead.
1310 			 */
1311 			if (cur->numa_group)
1312 				imp += group_weight(cur, env->src_nid, dist) -
1313 				       group_weight(cur, env->dst_nid, dist);
1314 			else
1315 				imp += task_weight(cur, env->src_nid, dist) -
1316 				       task_weight(cur, env->dst_nid, dist);
1317 		}
1318 	}
1319 
1320 	if (imp <= env->best_imp && moveimp <= env->best_imp)
1321 		goto unlock;
1322 
1323 	if (!cur) {
1324 		/* Is there capacity at our destination? */
1325 		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1326 		    !env->dst_stats.has_free_capacity)
1327 			goto unlock;
1328 
1329 		goto balance;
1330 	}
1331 
1332 	/* Balance doesn't matter much if we're running a task per cpu */
1333 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
1334 			dst_rq->nr_running == 1)
1335 		goto assign;
1336 
1337 	/*
1338 	 * In the overloaded case, try and keep the load balanced.
1339 	 */
1340 balance:
1341 	load = task_h_load(env->p);
1342 	dst_load = env->dst_stats.load + load;
1343 	src_load = env->src_stats.load - load;
1344 
1345 	if (moveimp > imp && moveimp > env->best_imp) {
1346 		/*
1347 		 * If the improvement from just moving env->p alone is
1348 		 * better than swapping tasks around, check if a move is
1349 		 * possible. Store a slightly smaller score than moveimp,
1350 		 * so an actually idle CPU will win.
1351 		 */
1352 		if (!load_too_imbalanced(src_load, dst_load, env)) {
1353 			imp = moveimp - 1;
1354 			cur = NULL;
1355 			goto assign;
1356 		}
1357 	}
1358 
1359 	if (imp <= env->best_imp)
1360 		goto unlock;
1361 
1362 	if (cur) {
1363 		load = task_h_load(cur);
1364 		dst_load -= load;
1365 		src_load += load;
1366 	}
1367 
1368 	if (load_too_imbalanced(src_load, dst_load, env))
1369 		goto unlock;
1370 
1371 	/*
1372 	 * One idle CPU per node is evaluated for a task numa move.
1373 	 * Call select_idle_sibling to maybe find a better one.
1374 	 */
1375 	if (!cur)
1376 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1377 
1378 assign:
1379 	task_numa_assign(env, cur, imp);
1380 unlock:
1381 	rcu_read_unlock();
1382 }
1383 
1384 static void task_numa_find_cpu(struct task_numa_env *env,
1385 				long taskimp, long groupimp)
1386 {
1387 	int cpu;
1388 
1389 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1390 		/* Skip this CPU if the source task cannot migrate */
1391 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1392 			continue;
1393 
1394 		env->dst_cpu = cpu;
1395 		task_numa_compare(env, taskimp, groupimp);
1396 	}
1397 }
1398 
1399 static int task_numa_migrate(struct task_struct *p)
1400 {
1401 	struct task_numa_env env = {
1402 		.p = p,
1403 
1404 		.src_cpu = task_cpu(p),
1405 		.src_nid = task_node(p),
1406 
1407 		.imbalance_pct = 112,
1408 
1409 		.best_task = NULL,
1410 		.best_imp = 0,
1411 		.best_cpu = -1
1412 	};
1413 	struct sched_domain *sd;
1414 	unsigned long taskweight, groupweight;
1415 	int nid, ret, dist;
1416 	long taskimp, groupimp;
1417 
1418 	/*
1419 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1420 	 * imbalance and would be the first to start moving tasks about.
1421 	 *
1422 	 * And we want to avoid any moving of tasks about, as that would create
1423 	 * random movement of tasks -- counter to the numa conditions we're trying
1424 	 * to satisfy here.
1425 	 */
1426 	rcu_read_lock();
1427 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1428 	if (sd)
1429 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1430 	rcu_read_unlock();
1431 
1432 	/*
1433 	 * Cpusets can break the scheduler domain tree into smaller
1434 	 * balance domains, some of which do not cross NUMA boundaries.
1435 	 * Tasks that are "trapped" in such domains cannot be migrated
1436 	 * elsewhere, so there is no point in (re)trying.
1437 	 */
1438 	if (unlikely(!sd)) {
1439 		p->numa_preferred_nid = task_node(p);
1440 		return -EINVAL;
1441 	}
1442 
1443 	env.dst_nid = p->numa_preferred_nid;
1444 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1445 	taskweight = task_weight(p, env.src_nid, dist);
1446 	groupweight = group_weight(p, env.src_nid, dist);
1447 	update_numa_stats(&env.src_stats, env.src_nid);
1448 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1449 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1450 	update_numa_stats(&env.dst_stats, env.dst_nid);
1451 
1452 	/* Try to find a spot on the preferred nid. */
1453 	task_numa_find_cpu(&env, taskimp, groupimp);
1454 
1455 	/*
1456 	 * Look at other nodes in these cases:
1457 	 * - there is no space available on the preferred_nid
1458 	 * - the task is part of a numa_group that is interleaved across
1459 	 *   multiple NUMA nodes; in order to better consolidate the group,
1460 	 *   we need to check other locations.
1461 	 */
1462 	if (env.best_cpu == -1 || (p->numa_group &&
1463 			nodes_weight(p->numa_group->active_nodes) > 1)) {
1464 		for_each_online_node(nid) {
1465 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1466 				continue;
1467 
1468 			dist = node_distance(env.src_nid, env.dst_nid);
1469 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1470 						dist != env.dist) {
1471 				taskweight = task_weight(p, env.src_nid, dist);
1472 				groupweight = group_weight(p, env.src_nid, dist);
1473 			}
1474 
1475 			/* Only consider nodes where both task and groups benefit */
1476 			taskimp = task_weight(p, nid, dist) - taskweight;
1477 			groupimp = group_weight(p, nid, dist) - groupweight;
1478 			if (taskimp < 0 && groupimp < 0)
1479 				continue;
1480 
1481 			env.dist = dist;
1482 			env.dst_nid = nid;
1483 			update_numa_stats(&env.dst_stats, env.dst_nid);
1484 			task_numa_find_cpu(&env, taskimp, groupimp);
1485 		}
1486 	}
1487 
1488 	/*
1489 	 * If the task is part of a workload that spans multiple NUMA nodes,
1490 	 * and is migrating into one of the workload's active nodes, remember
1491 	 * this node as the task's preferred numa node, so the workload can
1492 	 * settle down.
1493 	 * A task that migrated to a second choice node will be better off
1494 	 * trying for a better one later. Do not set the preferred node here.
1495 	 */
1496 	if (p->numa_group) {
1497 		if (env.best_cpu == -1)
1498 			nid = env.src_nid;
1499 		else
1500 			nid = env.dst_nid;
1501 
1502 		if (node_isset(nid, p->numa_group->active_nodes))
1503 			sched_setnuma(p, env.dst_nid);
1504 	}
1505 
1506 	/* No better CPU than the current one was found. */
1507 	if (env.best_cpu == -1)
1508 		return -EAGAIN;
1509 
1510 	/*
1511 	 * Reset the scan period if the task is being rescheduled on an
1512 	 * alternative node to recheck if the task is now properly placed.
1513 	 */
1514 	p->numa_scan_period = task_scan_min(p);
1515 
1516 	if (env.best_task == NULL) {
1517 		ret = migrate_task_to(p, env.best_cpu);
1518 		if (ret != 0)
1519 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1520 		return ret;
1521 	}
1522 
1523 	ret = migrate_swap(p, env.best_task);
1524 	if (ret != 0)
1525 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1526 	put_task_struct(env.best_task);
1527 	return ret;
1528 }
1529 
1530 /* Attempt to migrate a task to a CPU on the preferred node. */
1531 static void numa_migrate_preferred(struct task_struct *p)
1532 {
1533 	unsigned long interval = HZ;
1534 
1535 	/* This task has no NUMA fault statistics yet */
1536 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1537 		return;
1538 
1539 	/* Periodically retry migrating the task to the preferred node */
1540 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1541 	p->numa_migrate_retry = jiffies + interval;
1542 
1543 	/* Success if task is already running on preferred CPU */
1544 	if (task_node(p) == p->numa_preferred_nid)
1545 		return;
1546 
1547 	/* Otherwise, try migrate to a CPU on the preferred node */
1548 	task_numa_migrate(p);
1549 }
1550 
1551 /*
1552  * Find the nodes on which the workload is actively running. We do this by
1553  * tracking the nodes from which NUMA hinting faults are triggered. This can
1554  * be different from the set of nodes where the workload's memory is currently
1555  * located.
1556  *
1557  * The bitmask is used to make smarter decisions on when to do NUMA page
1558  * migrations. To prevent flip-flopping and excessive page migrations, nodes
1559  * are added when they cause over 6/16 of the maximum number of faults, but
1560  * only removed when they drop below 3/16.
1561  */
1562 static void update_numa_active_node_mask(struct numa_group *numa_group)
1563 {
1564 	unsigned long faults, max_faults = 0;
1565 	int nid;
1566 
1567 	for_each_online_node(nid) {
1568 		faults = group_faults_cpu(numa_group, nid);
1569 		if (faults > max_faults)
1570 			max_faults = faults;
1571 	}
1572 
1573 	for_each_online_node(nid) {
1574 		faults = group_faults_cpu(numa_group, nid);
1575 		if (!node_isset(nid, numa_group->active_nodes)) {
1576 			if (faults > max_faults * 6 / 16)
1577 				node_set(nid, numa_group->active_nodes);
1578 		} else if (faults < max_faults * 3 / 16)
1579 			node_clear(nid, numa_group->active_nodes);
1580 	}
1581 }
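/*
 * For example, if the busiest node in the group shows max_faults == 1600,
 * a node joins the active mask once it exceeds 1600 * 6/16 == 600 faults
 * and is only cleared again when it drops below 1600 * 3/16 == 300, so a
 * node hovering around 450 faults keeps whichever state it already had.
 */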
1582 
1583 /*
1584  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1585  * increments. The more local the fault statistics are, the higher the scan
1586  * period will be for the next scan window. If local/(local+remote) ratio is
1587  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1588  * the scan period will decrease. Aim for 70% local accesses.
1589  */
1590 #define NUMA_PERIOD_SLOTS 10
1591 #define NUMA_PERIOD_THRESHOLD 7
1592 
1593 /*
1594  * Increase the scan period (slow down scanning) if the majority of
1595  * our memory is already on our local node, or if the majority of
1596  * the page accesses are shared with other processes.
1597  * Otherwise, decrease the scan period.
1598  */
1599 static void update_task_scan_period(struct task_struct *p,
1600 			unsigned long shared, unsigned long private)
1601 {
1602 	unsigned int period_slot;
1603 	int ratio;
1604 	int diff;
1605 
1606 	unsigned long remote = p->numa_faults_locality[0];
1607 	unsigned long local = p->numa_faults_locality[1];
1608 
1609 	/*
1610 	 * If there were no recorded hinting faults then either the task is
1611 	 * completely idle or all activity is in areas that are not of interest
1612 	 * to automatic numa balancing. Scan slower.
1613 	 */
1614 	if (local + shared == 0) {
1615 		p->numa_scan_period = min(p->numa_scan_period_max,
1616 			p->numa_scan_period << 1);
1617 
1618 		p->mm->numa_next_scan = jiffies +
1619 			msecs_to_jiffies(p->numa_scan_period);
1620 
1621 		return;
1622 	}
1623 
1624 	/*
1625 	 * Prepare to scale scan period relative to the current period.
1626 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1627 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1628 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1629 	 */
1630 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1631 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1632 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1633 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1634 		if (!slot)
1635 			slot = 1;
1636 		diff = slot * period_slot;
1637 	} else {
1638 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1639 
1640 		/*
1641 		 * Scale scan rate increases based on sharing. There is an
1642 		 * inverse relationship between the degree of sharing and
1643 		 * the adjustment made to the scanning period. Broadly
1644 		 * speaking the intent is that there is little point
1645 		 * scanning faster if shared accesses dominate as it may
1646 		 * simply bounce migrations uselessly
1647 		 */
1648 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1649 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1650 	}
1651 
1652 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1653 			task_scan_min(p), task_scan_max(p));
1654 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1655 }
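/*
 * A worked example: with numa_scan_period == 1000ms, period_slot == 100ms.
 * A 70% local fault ratio hits NUMA_PERIOD_THRESHOLD, giving slot == 1 and
 * diff == +100ms (scan slower); a 30% local ratio gives
 * diff == -(7 - 3) * 100ms == -400ms, which is then scaled by the private
 * share of faults before speeding the scanner up, and the result is
 * clamped to [task_scan_min(), task_scan_max()].
 */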
1656 
1657 /*
1658  * Get the fraction of time the task has been running since the last
1659  * NUMA placement cycle. The scheduler keeps similar statistics, but
1660  * decays those on a 32ms period, which is orders of magnitude off
1661  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1662  * stats only if the task is so new there are no NUMA statistics yet.
1663  */
1664 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1665 {
1666 	u64 runtime, delta, now;
1667 	/* Use the start of this time slice to avoid calculations. */
1668 	now = p->se.exec_start;
1669 	runtime = p->se.sum_exec_runtime;
1670 
1671 	if (p->last_task_numa_placement) {
1672 		delta = runtime - p->last_sum_exec_runtime;
1673 		*period = now - p->last_task_numa_placement;
1674 	} else {
1675 		delta = p->se.avg.runnable_avg_sum;
1676 		*period = p->se.avg.runnable_avg_period;
1677 	}
1678 
1679 	p->last_sum_exec_runtime = runtime;
1680 	p->last_task_numa_placement = now;
1681 
1682 	return delta;
1683 }
1684 
1685 /*
1686  * Determine the preferred nid for a task in a numa_group. This needs to
1687  * be done in a way that produces consistent results with group_weight,
1688  * otherwise workloads might not converge.
1689  */
1690 static int preferred_group_nid(struct task_struct *p, int nid)
1691 {
1692 	nodemask_t nodes;
1693 	int dist;
1694 
1695 	/* Direct connections between all NUMA nodes. */
1696 	if (sched_numa_topology_type == NUMA_DIRECT)
1697 		return nid;
1698 
1699 	/*
1700 	 * On a system with glueless mesh NUMA topology, group_weight
1701 	 * scores nodes according to the number of NUMA hinting faults on
1702 	 * both the node itself, and on nearby nodes.
1703 	 */
1704 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1705 		unsigned long score, max_score = 0;
1706 		int node, max_node = nid;
1707 
1708 		dist = sched_max_numa_distance;
1709 
1710 		for_each_online_node(node) {
1711 			score = group_weight(p, node, dist);
1712 			if (score > max_score) {
1713 				max_score = score;
1714 				max_node = node;
1715 			}
1716 		}
1717 		return max_node;
1718 	}
1719 
1720 	/*
1721 	 * Finding the preferred nid in a system with NUMA backplane
1722 	 * interconnect topology is more involved. The goal is to locate
1723 	 * tasks from numa_groups near each other in the system, and
1724 	 * untangle workloads from different sides of the system. This requires
1725 	 * searching down the hierarchy of node groups, recursively searching
1726 	 * inside the highest scoring group of nodes. The nodemask tricks
1727 	 * keep the complexity of the search down.
1728 	 */
1729 	nodes = node_online_map;
1730 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1731 		unsigned long max_faults = 0;
1732 		nodemask_t max_group = NODE_MASK_NONE;
1733 		int a, b;
1734 
1735 		/* Are there nodes at this distance from each other? */
1736 		if (!find_numa_distance(dist))
1737 			continue;
1738 
1739 		for_each_node_mask(a, nodes) {
1740 			unsigned long faults = 0;
1741 			nodemask_t this_group;
1742 			nodes_clear(this_group);
1743 
1744 			/* Sum group's NUMA faults; includes a==b case. */
1745 			for_each_node_mask(b, nodes) {
1746 				if (node_distance(a, b) < dist) {
1747 					faults += group_faults(p, b);
1748 					node_set(b, this_group);
1749 					node_clear(b, nodes);
1750 				}
1751 			}
1752 
1753 			/* Remember the top group. */
1754 			if (faults > max_faults) {
1755 				max_faults = faults;
1756 				max_group = this_group;
1757 				/*
1758 				 * subtle: at the smallest distance there is
1759 				 * just one node left in each "group", the
1760 				 * winner is the preferred nid.
1761 				 */
1762 				nid = a;
1763 			}
1764 		}
1765 		/* Next round, evaluate the nodes within max_group. */
1766 		nodes = max_group;
1767 	}
1768 	return nid;
1769 }
1770 
1771 static void task_numa_placement(struct task_struct *p)
1772 {
1773 	int seq, nid, max_nid = -1, max_group_nid = -1;
1774 	unsigned long max_faults = 0, max_group_faults = 0;
1775 	unsigned long fault_types[2] = { 0, 0 };
1776 	unsigned long total_faults;
1777 	u64 runtime, period;
1778 	spinlock_t *group_lock = NULL;
1779 
1780 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1781 	if (p->numa_scan_seq == seq)
1782 		return;
1783 	p->numa_scan_seq = seq;
1784 	p->numa_scan_period_max = task_scan_max(p);
1785 
1786 	total_faults = p->numa_faults_locality[0] +
1787 		       p->numa_faults_locality[1];
1788 	runtime = numa_get_avg_runtime(p, &period);
1789 
1790 	/* If the task is part of a group prevent parallel updates to group stats */
1791 	if (p->numa_group) {
1792 		group_lock = &p->numa_group->lock;
1793 		spin_lock_irq(group_lock);
1794 	}
1795 
1796 	/* Find the node with the highest number of faults */
1797 	for_each_online_node(nid) {
1798 		/* Keep track of the offsets in numa_faults array */
1799 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1800 		unsigned long faults = 0, group_faults = 0;
1801 		int priv;
1802 
1803 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1804 			long diff, f_diff, f_weight;
1805 
1806 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1807 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1808 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1809 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1810 
1811 			/* Decay existing window, copy faults since last scan */
1812 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1813 			fault_types[priv] += p->numa_faults[membuf_idx];
1814 			p->numa_faults[membuf_idx] = 0;
1815 
1816 			/*
1817 			 * Normalize the faults_from, so all tasks in a group
1818 			 * count according to CPU use, instead of by the raw
1819 			 * number of faults. Tasks with little runtime have
1820 			 * little over-all impact on throughput, and thus their
1821 			 * faults are less important.
1822 			 */
1823 			f_weight = div64_u64(runtime << 16, period + 1);
1824 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1825 				   (total_faults + 1);
1826 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1827 			p->numa_faults[cpubuf_idx] = 0;
1828 
1829 			p->numa_faults[mem_idx] += diff;
1830 			p->numa_faults[cpu_idx] += f_diff;
1831 			faults += p->numa_faults[mem_idx];
1832 			p->total_numa_faults += diff;
1833 			if (p->numa_group) {
1834 				/*
1835 				 * safe because we can only change our own group
1836 				 *
1837 				 * mem_idx represents the offset for a given
1838 				 * nid and priv in a specific region because it
1839 				 * is at the beginning of the numa_faults array.
1840 				 */
1841 				p->numa_group->faults[mem_idx] += diff;
1842 				p->numa_group->faults_cpu[mem_idx] += f_diff;
1843 				p->numa_group->total_faults += diff;
1844 				group_faults += p->numa_group->faults[mem_idx];
1845 			}
1846 		}
1847 
1848 		if (faults > max_faults) {
1849 			max_faults = faults;
1850 			max_nid = nid;
1851 		}
1852 
1853 		if (group_faults > max_group_faults) {
1854 			max_group_faults = group_faults;
1855 			max_group_nid = nid;
1856 		}
1857 	}
1858 
1859 	update_task_scan_period(p, fault_types[0], fault_types[1]);
1860 
1861 	if (p->numa_group) {
1862 		update_numa_active_node_mask(p->numa_group);
1863 		spin_unlock_irq(group_lock);
1864 		max_nid = preferred_group_nid(p, max_group_nid);
1865 	}
1866 
1867 	if (max_faults) {
1868 		/* Set the new preferred node */
1869 		if (max_nid != p->numa_preferred_nid)
1870 			sched_setnuma(p, max_nid);
1871 
1872 		if (task_node(p) != p->numa_preferred_nid)
1873 			numa_migrate_preferred(p);
1874 	}
1875 }
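
/*
 * The per-node totals above form an exponentially decayed window: each scan
 * halves the old total and adds the faults buffered since the last scan
 * (diff = buf - old/2).  A minimal standalone sketch, assuming a made-up
 * steady rate of 100 faults per scan; userspace demo code, not part of
 * fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>

int main(void)
{
	long stored = 0;	/* plays the role of numa_faults[mem_idx] */
	long buf = 100;		/* faults recorded since the previous scan */
	int scan;

	for (scan = 1; scan <= 10; scan++) {
		long diff = buf - stored / 2;

		stored += diff;	/* i.e. stored = stored/2 + buf */
		printf("scan %2d: decayed total = %ld\n", scan, stored);
	}
	/* A steady rate of F faults per scan converges towards 2*F (here 200) */
	return 0;
}
#endif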
1876 
1877 static inline int get_numa_group(struct numa_group *grp)
1878 {
1879 	return atomic_inc_not_zero(&grp->refcount);
1880 }
1881 
1882 static inline void put_numa_group(struct numa_group *grp)
1883 {
1884 	if (atomic_dec_and_test(&grp->refcount))
1885 		kfree_rcu(grp, rcu);
1886 }
1887 
1888 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1889 			int *priv)
1890 {
1891 	struct numa_group *grp, *my_grp;
1892 	struct task_struct *tsk;
1893 	bool join = false;
1894 	int cpu = cpupid_to_cpu(cpupid);
1895 	int i;
1896 
1897 	if (unlikely(!p->numa_group)) {
1898 		unsigned int size = sizeof(struct numa_group) +
1899 				    4*nr_node_ids*sizeof(unsigned long);
1900 
1901 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1902 		if (!grp)
1903 			return;
1904 
1905 		atomic_set(&grp->refcount, 1);
1906 		spin_lock_init(&grp->lock);
1907 		grp->gid = p->pid;
1908 		/* Second half of the array tracks nids where faults happen */
1909 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1910 						nr_node_ids;
1911 
1912 		node_set(task_node(current), grp->active_nodes);
1913 
1914 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1915 			grp->faults[i] = p->numa_faults[i];
1916 
1917 		grp->total_faults = p->total_numa_faults;
1918 
1919 		grp->nr_tasks++;
1920 		rcu_assign_pointer(p->numa_group, grp);
1921 	}
1922 
1923 	rcu_read_lock();
1924 	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1925 
1926 	if (!cpupid_match_pid(tsk, cpupid))
1927 		goto no_join;
1928 
1929 	grp = rcu_dereference(tsk->numa_group);
1930 	if (!grp)
1931 		goto no_join;
1932 
1933 	my_grp = p->numa_group;
1934 	if (grp == my_grp)
1935 		goto no_join;
1936 
1937 	/*
1938 	 * Only join the other group if it's bigger; if we're the bigger group,
1939 	 * the other task will join us.
1940 	 */
1941 	if (my_grp->nr_tasks > grp->nr_tasks)
1942 		goto no_join;
1943 
1944 	/*
1945 	 * Tie-break on the grp address.
1946 	 */
1947 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1948 		goto no_join;
1949 
1950 	/* Always join threads in the same process. */
1951 	if (tsk->mm == current->mm)
1952 		join = true;
1953 
1954 	/* Simple filter to avoid false positives due to PID collisions */
1955 	if (flags & TNF_SHARED)
1956 		join = true;
1957 
1958 	/* Update priv based on whether false sharing was detected */
1959 	*priv = !join;
1960 
1961 	if (join && !get_numa_group(grp))
1962 		goto no_join;
1963 
1964 	rcu_read_unlock();
1965 
1966 	if (!join)
1967 		return;
1968 
1969 	BUG_ON(irqs_disabled());
1970 	double_lock_irq(&my_grp->lock, &grp->lock);
1971 
1972 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1973 		my_grp->faults[i] -= p->numa_faults[i];
1974 		grp->faults[i] += p->numa_faults[i];
1975 	}
1976 	my_grp->total_faults -= p->total_numa_faults;
1977 	grp->total_faults += p->total_numa_faults;
1978 
1979 	my_grp->nr_tasks--;
1980 	grp->nr_tasks++;
1981 
1982 	spin_unlock(&my_grp->lock);
1983 	spin_unlock_irq(&grp->lock);
1984 
1985 	rcu_assign_pointer(p->numa_group, grp);
1986 
1987 	put_numa_group(my_grp);
1988 	return;
1989 
1990 no_join:
1991 	rcu_read_unlock();
1992 	return;
1993 }
1994 
1995 void task_numa_free(struct task_struct *p)
1996 {
1997 	struct numa_group *grp = p->numa_group;
1998 	void *numa_faults = p->numa_faults;
1999 	unsigned long flags;
2000 	int i;
2001 
2002 	if (grp) {
2003 		spin_lock_irqsave(&grp->lock, flags);
2004 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2005 			grp->faults[i] -= p->numa_faults[i];
2006 		grp->total_faults -= p->total_numa_faults;
2007 
2008 		grp->nr_tasks--;
2009 		spin_unlock_irqrestore(&grp->lock, flags);
2010 		RCU_INIT_POINTER(p->numa_group, NULL);
2011 		put_numa_group(grp);
2012 	}
2013 
2014 	p->numa_faults = NULL;
2015 	kfree(numa_faults);
2016 }
2017 
2018 /*
2019  * Got a PROT_NONE fault for a page on @node.
2020  */
2021 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2022 {
2023 	struct task_struct *p = current;
2024 	bool migrated = flags & TNF_MIGRATED;
2025 	int cpu_node = task_node(current);
2026 	int local = !!(flags & TNF_FAULT_LOCAL);
2027 	int priv;
2028 
2029 	if (!numabalancing_enabled)
2030 		return;
2031 
2032 	/* for example, ksmd faulting in a user's mm */
2033 	if (!p->mm)
2034 		return;
2035 
2036 	/* Allocate buffer to track faults on a per-node basis */
2037 	if (unlikely(!p->numa_faults)) {
2038 		int size = sizeof(*p->numa_faults) *
2039 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2040 
2041 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2042 		if (!p->numa_faults)
2043 			return;
2044 
2045 		p->total_numa_faults = 0;
2046 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2047 	}
2048 
2049 	/*
2050 	 * First accesses are treated as private, otherwise consider accesses
2051 	 * to be private if the accessing pid has not changed
2052 	 */
2053 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2054 		priv = 1;
2055 	} else {
2056 		priv = cpupid_match_pid(p, last_cpupid);
2057 		if (!priv && !(flags & TNF_NO_GROUP))
2058 			task_numa_group(p, last_cpupid, flags, &priv);
2059 	}
2060 
2061 	/*
2062 	 * If a workload spans multiple NUMA nodes, a shared fault that
2063 	 * occurs wholly within the set of nodes that the workload is
2064 	 * actively using should be counted as local. This allows the
2065 	 * scan rate to slow down when a workload has settled down.
2066 	 */
2067 	if (!priv && !local && p->numa_group &&
2068 			node_isset(cpu_node, p->numa_group->active_nodes) &&
2069 			node_isset(mem_node, p->numa_group->active_nodes))
2070 		local = 1;
2071 
2072 	task_numa_placement(p);
2073 
2074 	/*
2075 	 * Retry task to preferred node migration periodically, in case it
2076 	 * previously failed, or the scheduler moved us.
2077 	 */
2078 	if (time_after(jiffies, p->numa_migrate_retry))
2079 		numa_migrate_preferred(p);
2080 
2081 	if (migrated)
2082 		p->numa_pages_migrated += pages;
2083 
2084 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2085 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2086 	p->numa_faults_locality[local] += pages;
2087 }
2088 
2089 static void reset_ptenuma_scan(struct task_struct *p)
2090 {
2091 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
2092 	p->mm->numa_scan_offset = 0;
2093 }
2094 
2095 /*
2096  * The expensive part of numa migration is done from task_work context.
2097  * Triggered from task_tick_numa().
2098  */
2099 void task_numa_work(struct callback_head *work)
2100 {
2101 	unsigned long migrate, next_scan, now = jiffies;
2102 	struct task_struct *p = current;
2103 	struct mm_struct *mm = p->mm;
2104 	struct vm_area_struct *vma;
2105 	unsigned long start, end;
2106 	unsigned long nr_pte_updates = 0;
2107 	long pages;
2108 
2109 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2110 
2111 	work->next = work; /* protect against double add */
2112 	/*
2113 	 * Who cares about NUMA placement when they're dying.
2114 	 *
2115 	 * NOTE: make sure not to dereference p->mm before this check,
2116 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2117 	 * without p->mm even though we still had it when we enqueued this
2118 	 * work.
2119 	 */
2120 	if (p->flags & PF_EXITING)
2121 		return;
2122 
2123 	if (!mm->numa_next_scan) {
2124 		mm->numa_next_scan = now +
2125 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2126 	}
2127 
2128 	/*
2129 	 * Enforce maximal scan/migration frequency..
2130 	 */
2131 	migrate = mm->numa_next_scan;
2132 	if (time_before(now, migrate))
2133 		return;
2134 
2135 	if (p->numa_scan_period == 0) {
2136 		p->numa_scan_period_max = task_scan_max(p);
2137 		p->numa_scan_period = task_scan_min(p);
2138 	}
2139 
2140 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2141 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2142 		return;
2143 
2144 	/*
2145 	 * Delay this task enough that another task of this mm will likely win
2146 	 * the next time around.
2147 	 */
2148 	p->node_stamp += 2 * TICK_NSEC;
2149 
2150 	start = mm->numa_scan_offset;
2151 	pages = sysctl_numa_balancing_scan_size;
2152 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2153 	if (!pages)
2154 		return;
2155 
2156 	down_read(&mm->mmap_sem);
2157 	vma = find_vma(mm, start);
2158 	if (!vma) {
2159 		reset_ptenuma_scan(p);
2160 		start = 0;
2161 		vma = mm->mmap;
2162 	}
2163 	for (; vma; vma = vma->vm_next) {
2164 		if (!vma_migratable(vma) || !vma_policy_mof(vma))
2165 			continue;
2166 
2167 		/*
2168 		 * Shared library pages mapped by multiple processes are not
2169 		 * migrated as it is expected they are cache replicated. Avoid
2170 		 * hinting faults in read-only file-backed mappings or the vdso
2171 		 * as migrating the pages will be of marginal benefit.
2172 		 */
2173 		if (!vma->vm_mm ||
2174 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2175 			continue;
2176 
2177 		/*
2178 		 * Skip inaccessible VMAs to avoid any confusion between
2179 		 * PROT_NONE and NUMA hinting ptes
2180 		 */
2181 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2182 			continue;
2183 
2184 		do {
2185 			start = max(start, vma->vm_start);
2186 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2187 			end = min(end, vma->vm_end);
2188 			nr_pte_updates += change_prot_numa(vma, start, end);
2189 
2190 			/*
2191 			 * Scan sysctl_numa_balancing_scan_size but ensure that
2192 			 * at least one PTE is updated so that unused virtual
2193 			 * address space is quickly skipped.
2194 			 */
2195 			if (nr_pte_updates)
2196 				pages -= (end - start) >> PAGE_SHIFT;
2197 
2198 			start = end;
2199 			if (pages <= 0)
2200 				goto out;
2201 
2202 			cond_resched();
2203 		} while (end != vma->vm_end);
2204 	}
2205 
2206 out:
2207 	/*
2208 	 * It is possible to reach the end of the VMA list but the last few
2209 	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
2210 	 * would find the !migratable VMA on the next scan but not reset the
2211 	 * scanner to the start so check it now.
2212 	 */
2213 	if (vma)
2214 		mm->numa_scan_offset = start;
2215 	else
2216 		reset_ptenuma_scan(p);
2217 	up_read(&mm->mmap_sem);
2218 }
2219 
2220 /*
2221  * Drive the periodic memory faults..
2222  */
2223 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2224 {
2225 	struct callback_head *work = &curr->numa_work;
2226 	u64 period, now;
2227 
2228 	/*
2229 	 * We don't care about NUMA placement if we don't have memory.
2230 	 */
2231 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2232 		return;
2233 
2234 	/*
2235 	 * Using runtime rather than walltime has the dual advantage that
2236 	 * we (mostly) drive the selection from busy threads and that the
2237 	 * task needs to have done some actual work before we bother with
2238 	 * NUMA placement.
2239 	 */
2240 	now = curr->se.sum_exec_runtime;
2241 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2242 
2243 	if (now - curr->node_stamp > period) {
2244 		if (!curr->node_stamp)
2245 			curr->numa_scan_period = task_scan_min(curr);
2246 		curr->node_stamp += period;
2247 
2248 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2249 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2250 			task_work_add(curr, work, true);
2251 		}
2252 	}
2253 }
2254 #else
2255 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2256 {
2257 }
2258 
2259 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2260 {
2261 }
2262 
2263 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2264 {
2265 }
2266 #endif /* CONFIG_NUMA_BALANCING */
2267 
2268 static void
2269 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2270 {
2271 	update_load_add(&cfs_rq->load, se->load.weight);
2272 	if (!parent_entity(se))
2273 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2274 #ifdef CONFIG_SMP
2275 	if (entity_is_task(se)) {
2276 		struct rq *rq = rq_of(cfs_rq);
2277 
2278 		account_numa_enqueue(rq, task_of(se));
2279 		list_add(&se->group_node, &rq->cfs_tasks);
2280 	}
2281 #endif
2282 	cfs_rq->nr_running++;
2283 }
2284 
2285 static void
2286 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2287 {
2288 	update_load_sub(&cfs_rq->load, se->load.weight);
2289 	if (!parent_entity(se))
2290 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2291 	if (entity_is_task(se)) {
2292 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2293 		list_del_init(&se->group_node);
2294 	}
2295 	cfs_rq->nr_running--;
2296 }
2297 
2298 #ifdef CONFIG_FAIR_GROUP_SCHED
2299 # ifdef CONFIG_SMP
2300 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2301 {
2302 	long tg_weight;
2303 
2304 	/*
2305 	 * Use this CPU's actual weight instead of the last load_contribution
2306 	 * to gain a more accurate current total weight. See
2307 	 * update_cfs_rq_load_contribution().
2308 	 */
2309 	tg_weight = atomic_long_read(&tg->load_avg);
2310 	tg_weight -= cfs_rq->tg_load_contrib;
2311 	tg_weight += cfs_rq->load.weight;
2312 
2313 	return tg_weight;
2314 }
2315 
2316 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2317 {
2318 	long tg_weight, load, shares;
2319 
2320 	tg_weight = calc_tg_weight(tg, cfs_rq);
2321 	load = cfs_rq->load.weight;
2322 
2323 	shares = (tg->shares * load);
2324 	if (tg_weight)
2325 		shares /= tg_weight;
2326 
2327 	if (shares < MIN_SHARES)
2328 		shares = MIN_SHARES;
2329 	if (shares > tg->shares)
2330 		shares = tg->shares;
2331 
2332 	return shares;
2333 }
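
/*
 * A minimal worked example of the split above, assuming a made-up group with
 * tg->shares = 1024 whose runqueues on two cpus carry loads 3072 and 1024
 * (DEMO_MIN_SHARES stands in for MIN_SHARES).  Userspace demo code, not part
 * of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>

#define DEMO_MIN_SHARES 2

static long demo_calc_shares(long tg_shares, long cpu_load, long tg_weight)
{
	long shares = tg_shares * cpu_load;

	if (tg_weight)
		shares /= tg_weight;
	if (shares < DEMO_MIN_SHARES)
		shares = DEMO_MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	long tg_shares = 1024, loads[2] = { 3072, 1024 };
	long tg_weight = loads[0] + loads[1];
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		printf("cpu%d: group entity weight %ld of %ld\n", cpu,
		       demo_calc_shares(tg_shares, loads[cpu], tg_weight),
		       tg_shares);
	/* Prints 768 and 256: the group's 1024 shares split 3:1 across cpus */
	return 0;
}
#endif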
2334 # else /* CONFIG_SMP */
2335 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2336 {
2337 	return tg->shares;
2338 }
2339 # endif /* CONFIG_SMP */
2340 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2341 			    unsigned long weight)
2342 {
2343 	if (se->on_rq) {
2344 		/* commit outstanding execution time */
2345 		if (cfs_rq->curr == se)
2346 			update_curr(cfs_rq);
2347 		account_entity_dequeue(cfs_rq, se);
2348 	}
2349 
2350 	update_load_set(&se->load, weight);
2351 
2352 	if (se->on_rq)
2353 		account_entity_enqueue(cfs_rq, se);
2354 }
2355 
2356 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2357 
2358 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2359 {
2360 	struct task_group *tg;
2361 	struct sched_entity *se;
2362 	long shares;
2363 
2364 	tg = cfs_rq->tg;
2365 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2366 	if (!se || throttled_hierarchy(cfs_rq))
2367 		return;
2368 #ifndef CONFIG_SMP
2369 	if (likely(se->load.weight == tg->shares))
2370 		return;
2371 #endif
2372 	shares = calc_cfs_shares(cfs_rq, tg);
2373 
2374 	reweight_entity(cfs_rq_of(se), se, shares);
2375 }
2376 #else /* CONFIG_FAIR_GROUP_SCHED */
2377 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2378 {
2379 }
2380 #endif /* CONFIG_FAIR_GROUP_SCHED */
2381 
2382 #ifdef CONFIG_SMP
2383 /*
2384  * We choose a half-life close to 1 scheduling period.
2385  * Note: The tables below are dependent on this value.
2386  */
2387 #define LOAD_AVG_PERIOD 32
2388 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2389 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
2390 
2391 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2392 static const u32 runnable_avg_yN_inv[] = {
2393 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2394 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2395 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2396 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2397 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2398 	0x85aac367, 0x82cd8698,
2399 };
2400 
2401 /*
2402  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2403  * over-estimates when re-combining.
2404  */
2405 static const u32 runnable_avg_yN_sum[] = {
2406 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2407 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2408 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2409 };
2410 
2411 /*
2412  * Approximate:
2413  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2414  */
2415 static __always_inline u64 decay_load(u64 val, u64 n)
2416 {
2417 	unsigned int local_n;
2418 
2419 	if (!n)
2420 		return val;
2421 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2422 		return 0;
2423 
2424 	/* after bounds checking we can collapse to 32-bit */
2425 	local_n = n;
2426 
2427 	/*
2428 	 * As y^PERIOD = 1/2, we can combine
2429 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2430 	 * with a look-up table which covers y^n (n < PERIOD),
2431 	 *
2432 	 * to achieve constant-time decay_load.
2433 	 */
2434 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2435 		val >>= local_n / LOAD_AVG_PERIOD;
2436 		local_n %= LOAD_AVG_PERIOD;
2437 	}
2438 
2439 	val *= runnable_avg_yN_inv[local_n];
2440 	/* We don't use SRR here since we always want to round down. */
2441 	return val >> 32;
2442 }
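
/*
 * A minimal standalone sketch of the shift-and-multiply above, assuming only
 * that runnable_avg_yN_inv[k] stores roughly 2^32 * y^k with y = 0.5^(1/32):
 * full periods are handled by the right shift, the remainder by one fixed-
 * point multiply.  Userspace demo code (build with -lm), not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define PERIOD 32

static uint32_t inv(unsigned int k)
{
	/* ~2^32 * y^k, the quantity runnable_avg_yN_inv[k] stores */
	return (uint32_t)(0xffffffffu * pow(0.5, (double)k / PERIOD));
}

static uint64_t demo_decay(uint64_t val, unsigned int n)
{
	val >>= n / PERIOD;		/* each full period halves the value */
	return (val * inv(n % PERIOD)) >> 32;
}

int main(void)
{
	unsigned int n;

	/* Close, up to rounding, to the table: 0xffffffff, 0xb504f333, 0x82cd8698 */
	printf("inv(0)=%#x inv(16)=%#x inv(31)=%#x\n",
	       (unsigned)inv(0), (unsigned)inv(16), (unsigned)inv(31));
	for (n = 0; n <= 96; n += 16)
		printf("decay_load(47742, %2u) ~= %llu\n", n,
		       (unsigned long long)demo_decay(47742, n));
	return 0;
}
#endif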
2443 
2444 /*
2445  * For updates fully spanning n periods, the contribution to runnable
2446  * average will be: \Sum 1024*y^n
2447  *
2448  * We can compute this reasonably efficiently by combining:
2449  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2450  */
2451 static u32 __compute_runnable_contrib(u64 n)
2452 {
2453 	u32 contrib = 0;
2454 
2455 	if (likely(n <= LOAD_AVG_PERIOD))
2456 		return runnable_avg_yN_sum[n];
2457 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2458 		return LOAD_AVG_MAX;
2459 
2460 	/* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
2461 	do {
2462 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2463 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2464 
2465 		n -= LOAD_AVG_PERIOD;
2466 	} while (n > LOAD_AVG_PERIOD);
2467 
2468 	contrib = decay_load(contrib, n);
2469 	return contrib + runnable_avg_yN_sum[n];
2470 }
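
/*
 * A minimal standalone sketch of the series the table encodes, assuming only
 * y = 0.5^(1/32): n fully runnable 1024us periods contribute the partial
 * geometric sum 1024*(y + y^2 + ... + y^n), which is what the function above
 * reads out of runnable_avg_yN_sum[] and clamps at LOAD_AVG_MAX once the
 * series has effectively converged.  Userspace demo code (build with -lm),
 * not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>
#include <math.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	unsigned int k, sum = 0;

	for (k = 1; k <= 32; k++) {
		sum += (unsigned int)(1024.0 * pow(y, k));	/* per-term floor */
		if (k <= 4 || k == 32)
			printf("n = %2u: sum ~= %u\n", k, sum);
	}
	/* First table entries are 1002, 1982, 2941, 3880; the n=32 entry 23371.
	 * An endless run saturates near 1024/(1-y) ~= 47.8k; the kernel clamps
	 * a little lower, at LOAD_AVG_MAX = 47742, because its tables store
	 * floored integers. */
	printf("analytic limit ~= %.0f\n", 1024.0 / (1.0 - y));
	return 0;
}
#endif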
2471 
2472 /*
2473  * We can represent the historical contribution to runnable average as the
2474  * coefficients of a geometric series.  To do this we sub-divide our runnable
2475  * history into segments of approximately 1ms (1024us); label the segment that
2476  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2477  *
2478  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2479  *      p0            p1           p2
2480  *     (now)       (~1ms ago)  (~2ms ago)
2481  *
2482  * Let u_i denote the fraction of p_i that the entity was runnable.
2483  *
2484  * We then designate the fractions u_i as our co-efficients, yielding the
2485  * We then designate the fractions u_i as our coefficients, yielding the
2486  * following representation of historical load:
2487  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2488  * We choose y based on the width of a reasonable scheduling period, fixing:
2489  *   y^32 = 0.5
2490  *
2491  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2492  * approximately half as much as the contribution to load within the last ms
2493  * (u_0).
2494  *
2495  * When a period "rolls over" and we have new u_0`, multiplying the previous
2496  * sum again by y is sufficient to update:
2497  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2498  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2499  */
2500 static __always_inline int __update_entity_runnable_avg(u64 now,
2501 							struct sched_avg *sa,
2502 							int runnable)
2503 {
2504 	u64 delta, periods;
2505 	u32 runnable_contrib;
2506 	int delta_w, decayed = 0;
2507 
2508 	delta = now - sa->last_runnable_update;
2509 	/*
2510 	 * This should only happen when time goes backwards, which it
2511 	 * unfortunately does during sched clock init when we swap over to TSC.
2512 	 */
2513 	if ((s64)delta < 0) {
2514 		sa->last_runnable_update = now;
2515 		return 0;
2516 	}
2517 
2518 	/*
2519 	 * Use 1024ns as the unit of measurement since it's a reasonable
2520 	 * approximation of 1us and fast to compute.
2521 	 */
2522 	delta >>= 10;
2523 	if (!delta)
2524 		return 0;
2525 	sa->last_runnable_update = now;
2526 
2527 	/* delta_w is the amount already accumulated against our next period */
2528 	delta_w = sa->runnable_avg_period % 1024;
2529 	if (delta + delta_w >= 1024) {
2530 		/* period roll-over */
2531 		decayed = 1;
2532 
2533 		/*
2534 		 * Now that we know we're crossing a period boundary, figure
2535 		 * out how much from delta we need to complete the current
2536 		 * period and accrue it.
2537 		 */
2538 		delta_w = 1024 - delta_w;
2539 		if (runnable)
2540 			sa->runnable_avg_sum += delta_w;
2541 		sa->runnable_avg_period += delta_w;
2542 
2543 		delta -= delta_w;
2544 
2545 		/* Figure out how many additional periods this update spans */
2546 		periods = delta / 1024;
2547 		delta %= 1024;
2548 
2549 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2550 						  periods + 1);
2551 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
2552 						     periods + 1);
2553 
2554 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2555 		runnable_contrib = __compute_runnable_contrib(periods);
2556 		if (runnable)
2557 			sa->runnable_avg_sum += runnable_contrib;
2558 		sa->runnable_avg_period += runnable_contrib;
2559 	}
2560 
2561 	/* Remainder of delta accrued against u_0` */
2562 	if (runnable)
2563 		sa->runnable_avg_sum += delta;
2564 	sa->runnable_avg_period += delta;
2565 
2566 	return decayed;
2567 }
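
/*
 * Leaving aside the partial-period bookkeeping above, every elapsed 1024us
 * period decays both accumulators by y and adds the microseconds the entity
 * was runnable, so runnable_avg_sum / runnable_avg_period tracks the recent
 * runnable fraction.  A minimal standalone sketch, assuming a made-up
 * pattern that alternates fully runnable and fully idle periods; userspace
 * demo code (build with -lm), not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>
#include <math.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);
	double sum = 0.0, period = 0.0;
	int i;

	for (i = 1; i <= 200; i++) {		/* ~200ms worth of periods */
		double runnable_us = (i & 1) ? 1024.0 : 0.0;

		sum = sum * y + runnable_us;
		period = period * y + 1024.0;
		if (i <= 3 || i % 50 == 0)
			printf("after %3d periods: avg = %.3f\n",
			       i, sum / period);
	}
	/* Once history builds up the ratio hovers around 0.5, the task's
	 * recent runnable fraction */
	return 0;
}
#endif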
2568 
2569 /* Synchronize an entity's decay with its parenting cfs_rq. */
2570 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2571 {
2572 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2573 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
2574 
2575 	decays -= se->avg.decay_count;
2576 	se->avg.decay_count = 0;
2577 	if (!decays)
2578 		return 0;
2579 
2580 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2581 
2582 	return decays;
2583 }
2584 
2585 #ifdef CONFIG_FAIR_GROUP_SCHED
2586 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2587 						 int force_update)
2588 {
2589 	struct task_group *tg = cfs_rq->tg;
2590 	long tg_contrib;
2591 
2592 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2593 	tg_contrib -= cfs_rq->tg_load_contrib;
2594 
2595 	if (!tg_contrib)
2596 		return;
2597 
2598 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2599 		atomic_long_add(tg_contrib, &tg->load_avg);
2600 		cfs_rq->tg_load_contrib += tg_contrib;
2601 	}
2602 }
2603 
2604 /*
2605  * Aggregate cfs_rq runnable averages into an equivalent task_group
2606  * representation for computing load contributions.
2607  */
2608 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2609 						  struct cfs_rq *cfs_rq)
2610 {
2611 	struct task_group *tg = cfs_rq->tg;
2612 	long contrib;
2613 
2614 	/* The fraction of a cpu used by this cfs_rq */
2615 	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2616 			  sa->runnable_avg_period + 1);
2617 	contrib -= cfs_rq->tg_runnable_contrib;
2618 
2619 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2620 		atomic_add(contrib, &tg->runnable_avg);
2621 		cfs_rq->tg_runnable_contrib += contrib;
2622 	}
2623 }
2624 
2625 static inline void __update_group_entity_contrib(struct sched_entity *se)
2626 {
2627 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2628 	struct task_group *tg = cfs_rq->tg;
2629 	int runnable_avg;
2630 
2631 	u64 contrib;
2632 
2633 	contrib = cfs_rq->tg_load_contrib * tg->shares;
2634 	se->avg.load_avg_contrib = div_u64(contrib,
2635 				     atomic_long_read(&tg->load_avg) + 1);
2636 
2637 	/*
2638 	 * For group entities we need to compute a correction term in the case
2639 	 * that they are consuming <1 cpu so that we would contribute the same
2640 	 * load as a task of equal weight.
2641 	 *
2642 	 * Explicitly co-ordinating this measurement would be expensive, but
2643 	 * fortunately the sum of each cpu's contribution forms a usable
2644 	 * lower-bound on the true value.
2645 	 *
2646 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
2647 	 * (and the sum represents the true value) or they overlap and we are
2648 	 * understating by the aggregate of their overlap.
2649 	 *
2650 	 * Extending this to N cpus, for a given overlap, the maximum amount we
2651 	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
2652 	 * cpus that overlap for this interval and w_i is the interval width.
2653 	 *
2654 	 * On a small machine, the first term is well-bounded, which bounds the
2655 	 * total error since w_i is a subset of the period.  Whereas on a
2656 	 * larger machine, while this first term can be larger, if w_i is of
2657 	 * consequential size then n_i*w_i is guaranteed to quickly converge to
2658 	 * our upper bound of 1-cpu.
2659 	 */
2660 	runnable_avg = atomic_read(&tg->runnable_avg);
2661 	if (runnable_avg < NICE_0_LOAD) {
2662 		se->avg.load_avg_contrib *= runnable_avg;
2663 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2664 	}
2665 }
2666 
2667 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2668 {
2669 	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2670 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
2671 }
2672 #else /* CONFIG_FAIR_GROUP_SCHED */
2673 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2674 						 int force_update) {}
2675 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2676 						  struct cfs_rq *cfs_rq) {}
2677 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2678 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2679 #endif /* CONFIG_FAIR_GROUP_SCHED */
2680 
2681 static inline void __update_task_entity_contrib(struct sched_entity *se)
2682 {
2683 	u32 contrib;
2684 
2685 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2686 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2687 	contrib /= (se->avg.runnable_avg_period + 1);
2688 	se->avg.load_avg_contrib = scale_load(contrib);
2689 }
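
/*
 * A minimal worked example of the ratio above, assuming a nice-0 task
 * (weight 1024) and made-up averages at roughly half of LOAD_AVG_MAX.
 * Userspace demo code, not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>

int main(void)
{
	unsigned int weight = 1024;			/* nice-0 load weight */
	unsigned int runnable_avg_sum = 23871;		/* ~50% runnable */
	unsigned int runnable_avg_period = 47742;	/* fully built-up window */
	unsigned int contrib;

	contrib = runnable_avg_sum * weight / (runnable_avg_period + 1);
	printf("load_avg_contrib ~= %u (about half the task's weight)\n",
	       contrib);
	return 0;
}
#endif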
2690 
2691 /* Compute the current contribution to load_avg by se, return any delta */
2692 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2693 {
2694 	long old_contrib = se->avg.load_avg_contrib;
2695 
2696 	if (entity_is_task(se)) {
2697 		__update_task_entity_contrib(se);
2698 	} else {
2699 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2700 		__update_group_entity_contrib(se);
2701 	}
2702 
2703 	return se->avg.load_avg_contrib - old_contrib;
2704 }
2705 
2706 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2707 						 long load_contrib)
2708 {
2709 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
2710 		cfs_rq->blocked_load_avg -= load_contrib;
2711 	else
2712 		cfs_rq->blocked_load_avg = 0;
2713 }
2714 
2715 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2716 
2717 /* Update a sched_entity's runnable average */
2718 static inline void update_entity_load_avg(struct sched_entity *se,
2719 					  int update_cfs_rq)
2720 {
2721 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2722 	long contrib_delta;
2723 	u64 now;
2724 
2725 	/*
2726 	 * For a group entity we need to use its own cfs_rq_clock_task() in
2727 	 * case it is the parent of a throttled hierarchy.
2728 	 */
2729 	if (entity_is_task(se))
2730 		now = cfs_rq_clock_task(cfs_rq);
2731 	else
2732 		now = cfs_rq_clock_task(group_cfs_rq(se));
2733 
2734 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2735 		return;
2736 
2737 	contrib_delta = __update_entity_load_avg_contrib(se);
2738 
2739 	if (!update_cfs_rq)
2740 		return;
2741 
2742 	if (se->on_rq)
2743 		cfs_rq->runnable_load_avg += contrib_delta;
2744 	else
2745 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2746 }
2747 
2748 /*
2749  * Decay the load contributed by all blocked children and account this so that
2750  * their contribution may be appropriately discounted when they wake up.
2751  */
2752 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2753 {
2754 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2755 	u64 decays;
2756 
2757 	decays = now - cfs_rq->last_decay;
2758 	if (!decays && !force_update)
2759 		return;
2760 
2761 	if (atomic_long_read(&cfs_rq->removed_load)) {
2762 		unsigned long removed_load;
2763 		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2764 		subtract_blocked_load_contrib(cfs_rq, removed_load);
2765 	}
2766 
2767 	if (decays) {
2768 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2769 						      decays);
2770 		atomic64_add(decays, &cfs_rq->decay_counter);
2771 		cfs_rq->last_decay = now;
2772 	}
2773 
2774 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2775 }
2776 
2777 /* Add the load generated by se into cfs_rq's child load-average */
2778 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2779 						  struct sched_entity *se,
2780 						  int wakeup)
2781 {
2782 	/*
2783 	 * We track migrations using entity decay_count <= 0, on a wake-up
2784 	 * migration we use a negative decay count to track the remote decays
2785 	 * accumulated while sleeping.
2786 	 *
2787 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2788 	 * are seen by enqueue_entity_load_avg() as a migration with an already
2789 	 * constructed load_avg_contrib.
2790 	 */
2791 	if (unlikely(se->avg.decay_count <= 0)) {
2792 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2793 		if (se->avg.decay_count) {
2794 			/*
2795 			 * In a wake-up migration we have to approximate the
2796 			 * time sleeping.  This is because we can't synchronize
2797 			 * clock_task between the two cpus, and it is not
2798 			 * guaranteed to be read-safe.  Instead, we can
2799 			 * approximate this using our carried decays, which are
2800 			 * explicitly atomically readable.
2801 			 */
2802 			se->avg.last_runnable_update -= (-se->avg.decay_count)
2803 							<< 20;
2804 			update_entity_load_avg(se, 0);
2805 			/* Indicate that we're now synchronized and on-rq */
2806 			se->avg.decay_count = 0;
2807 		}
2808 		wakeup = 0;
2809 	} else {
2810 		__synchronize_entity_decay(se);
2811 	}
2812 
2813 	/* migrated tasks did not contribute to our blocked load */
2814 	if (wakeup) {
2815 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2816 		update_entity_load_avg(se, 0);
2817 	}
2818 
2819 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2820 	/* we force update consideration on load-balancer moves */
2821 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2822 }
2823 
2824 /*
2825  * Remove se's load from this cfs_rq child load-average. If the entity is
2826  * transitioning to a blocked state we track its projected decay using
2827  * blocked_load_avg.
2828  */
2829 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2830 						  struct sched_entity *se,
2831 						  int sleep)
2832 {
2833 	update_entity_load_avg(se, 1);
2834 	/* we force update consideration on load-balancer moves */
2835 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
2836 
2837 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2838 	if (sleep) {
2839 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2840 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2841 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2842 }
2843 
2844 /*
2845  * Update the rq's load with the elapsed running time before entering
2846  * idle. If the last scheduled task is not a CFS task, idle_enter will
2847  * be the only way to update the runnable statistic.
2848  */
2849 void idle_enter_fair(struct rq *this_rq)
2850 {
2851 	update_rq_runnable_avg(this_rq, 1);
2852 }
2853 
2854 /*
2855  * Update the rq's load with the elapsed idle time before a task is
2856  * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
2857  * be the only way to update the runnable statistic.
2858  */
2859 void idle_exit_fair(struct rq *this_rq)
2860 {
2861 	update_rq_runnable_avg(this_rq, 0);
2862 }
2863 
2864 static int idle_balance(struct rq *this_rq);
2865 
2866 #else /* CONFIG_SMP */
2867 
2868 static inline void update_entity_load_avg(struct sched_entity *se,
2869 					  int update_cfs_rq) {}
2870 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2871 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2872 					   struct sched_entity *se,
2873 					   int wakeup) {}
2874 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2875 					   struct sched_entity *se,
2876 					   int sleep) {}
2877 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2878 					      int force_update) {}
2879 
2880 static inline int idle_balance(struct rq *rq)
2881 {
2882 	return 0;
2883 }
2884 
2885 #endif /* CONFIG_SMP */
2886 
2887 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2888 {
2889 #ifdef CONFIG_SCHEDSTATS
2890 	struct task_struct *tsk = NULL;
2891 
2892 	if (entity_is_task(se))
2893 		tsk = task_of(se);
2894 
2895 	if (se->statistics.sleep_start) {
2896 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2897 
2898 		if ((s64)delta < 0)
2899 			delta = 0;
2900 
2901 		if (unlikely(delta > se->statistics.sleep_max))
2902 			se->statistics.sleep_max = delta;
2903 
2904 		se->statistics.sleep_start = 0;
2905 		se->statistics.sum_sleep_runtime += delta;
2906 
2907 		if (tsk) {
2908 			account_scheduler_latency(tsk, delta >> 10, 1);
2909 			trace_sched_stat_sleep(tsk, delta);
2910 		}
2911 	}
2912 	if (se->statistics.block_start) {
2913 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2914 
2915 		if ((s64)delta < 0)
2916 			delta = 0;
2917 
2918 		if (unlikely(delta > se->statistics.block_max))
2919 			se->statistics.block_max = delta;
2920 
2921 		se->statistics.block_start = 0;
2922 		se->statistics.sum_sleep_runtime += delta;
2923 
2924 		if (tsk) {
2925 			if (tsk->in_iowait) {
2926 				se->statistics.iowait_sum += delta;
2927 				se->statistics.iowait_count++;
2928 				trace_sched_stat_iowait(tsk, delta);
2929 			}
2930 
2931 			trace_sched_stat_blocked(tsk, delta);
2932 
2933 			/*
2934 			 * Blocking time is in units of nanosecs, so shift by
2935 			 * 20 to get a milliseconds-range estimation of the
2936 			 * amount of time that the task spent sleeping:
2937 			 */
2938 			if (unlikely(prof_on == SLEEP_PROFILING)) {
2939 				profile_hits(SLEEP_PROFILING,
2940 						(void *)get_wchan(tsk),
2941 						delta >> 20);
2942 			}
2943 			account_scheduler_latency(tsk, delta >> 10, 0);
2944 		}
2945 	}
2946 #endif
2947 }
2948 
2949 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2950 {
2951 #ifdef CONFIG_SCHED_DEBUG
2952 	s64 d = se->vruntime - cfs_rq->min_vruntime;
2953 
2954 	if (d < 0)
2955 		d = -d;
2956 
2957 	if (d > 3*sysctl_sched_latency)
2958 		schedstat_inc(cfs_rq, nr_spread_over);
2959 #endif
2960 }
2961 
2962 static void
2963 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2964 {
2965 	u64 vruntime = cfs_rq->min_vruntime;
2966 
2967 	/*
2968 	 * The 'current' period is already promised to the current tasks,
2969 	 * however the extra weight of the new task will slow them down a
2970 	 * little, place the new task so that it fits in the slot that
2971 	 * stays open at the end.
2972 	 */
2973 	if (initial && sched_feat(START_DEBIT))
2974 		vruntime += sched_vslice(cfs_rq, se);
2975 
2976 	/* sleeps up to a single latency don't count. */
2977 	if (!initial) {
2978 		unsigned long thresh = sysctl_sched_latency;
2979 
2980 		/*
2981 		 * Halve their sleep time's effect, to allow
2982 		 * for a gentler effect of sleepers:
2983 		 */
2984 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
2985 			thresh >>= 1;
2986 
2987 		vruntime -= thresh;
2988 	}
2989 
2990 	/* ensure we never gain time by being placed backwards. */
2991 	se->vruntime = max_vruntime(se->vruntime, vruntime);
2992 }
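
/*
 * A minimal worked example of the placement above, assuming made-up values
 * (min_vruntime = 100ms, vslice = 3ms, sysctl_sched_latency = 6ms, all in
 * nanoseconds) and the GENTLE_FAIR_SLEEPERS halving.  Userspace demo code,
 * not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>

static unsigned long long demo_place(unsigned long long se_vruntime,
				     unsigned long long min_vruntime,
				     int initial)
{
	unsigned long long vruntime = min_vruntime;
	unsigned long long vslice = 3000000ULL;		/* made-up sched_vslice() */
	unsigned long long latency = 6000000ULL;	/* sysctl_sched_latency */

	if (initial)
		vruntime += vslice;		/* START_DEBIT: charge one slice */
	else
		vruntime -= latency / 2;	/* gentle sleeper credit */

	/* never gain time by being placed backwards */
	return vruntime > se_vruntime ? vruntime : se_vruntime;
}

int main(void)
{
	unsigned long long min_vruntime = 100000000ULL;

	printf("forked child  -> %llu\n", demo_place(0, min_vruntime, 1));
	printf("long sleeper  -> %llu\n", demo_place(1000000ULL, min_vruntime, 0));
	printf("short sleeper -> %llu\n", demo_place(99500000ULL, min_vruntime, 0));
	return 0;
}
#endif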
2993 
2994 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2995 
2996 static void
2997 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2998 {
2999 	/*
3000 	 * Update the normalized vruntime before updating min_vruntime
3001 	 * through calling update_curr().
3002 	 */
3003 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3004 		se->vruntime += cfs_rq->min_vruntime;
3005 
3006 	/*
3007 	 * Update run-time statistics of the 'current'.
3008 	 */
3009 	update_curr(cfs_rq);
3010 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
3011 	account_entity_enqueue(cfs_rq, se);
3012 	update_cfs_shares(cfs_rq);
3013 
3014 	if (flags & ENQUEUE_WAKEUP) {
3015 		place_entity(cfs_rq, se, 0);
3016 		enqueue_sleeper(cfs_rq, se);
3017 	}
3018 
3019 	update_stats_enqueue(cfs_rq, se);
3020 	check_spread(cfs_rq, se);
3021 	if (se != cfs_rq->curr)
3022 		__enqueue_entity(cfs_rq, se);
3023 	se->on_rq = 1;
3024 
3025 	if (cfs_rq->nr_running == 1) {
3026 		list_add_leaf_cfs_rq(cfs_rq);
3027 		check_enqueue_throttle(cfs_rq);
3028 	}
3029 }
3030 
3031 static void __clear_buddies_last(struct sched_entity *se)
3032 {
3033 	for_each_sched_entity(se) {
3034 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3035 		if (cfs_rq->last != se)
3036 			break;
3037 
3038 		cfs_rq->last = NULL;
3039 	}
3040 }
3041 
3042 static void __clear_buddies_next(struct sched_entity *se)
3043 {
3044 	for_each_sched_entity(se) {
3045 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3046 		if (cfs_rq->next != se)
3047 			break;
3048 
3049 		cfs_rq->next = NULL;
3050 	}
3051 }
3052 
3053 static void __clear_buddies_skip(struct sched_entity *se)
3054 {
3055 	for_each_sched_entity(se) {
3056 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3057 		if (cfs_rq->skip != se)
3058 			break;
3059 
3060 		cfs_rq->skip = NULL;
3061 	}
3062 }
3063 
3064 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3065 {
3066 	if (cfs_rq->last == se)
3067 		__clear_buddies_last(se);
3068 
3069 	if (cfs_rq->next == se)
3070 		__clear_buddies_next(se);
3071 
3072 	if (cfs_rq->skip == se)
3073 		__clear_buddies_skip(se);
3074 }
3075 
3076 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3077 
3078 static void
3079 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3080 {
3081 	/*
3082 	 * Update run-time statistics of the 'current'.
3083 	 */
3084 	update_curr(cfs_rq);
3085 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
3086 
3087 	update_stats_dequeue(cfs_rq, se);
3088 	if (flags & DEQUEUE_SLEEP) {
3089 #ifdef CONFIG_SCHEDSTATS
3090 		if (entity_is_task(se)) {
3091 			struct task_struct *tsk = task_of(se);
3092 
3093 			if (tsk->state & TASK_INTERRUPTIBLE)
3094 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3095 			if (tsk->state & TASK_UNINTERRUPTIBLE)
3096 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3097 		}
3098 #endif
3099 	}
3100 
3101 	clear_buddies(cfs_rq, se);
3102 
3103 	if (se != cfs_rq->curr)
3104 		__dequeue_entity(cfs_rq, se);
3105 	se->on_rq = 0;
3106 	account_entity_dequeue(cfs_rq, se);
3107 
3108 	/*
3109 	 * Normalize the entity after updating the min_vruntime because the
3110 	 * update can refer to the ->curr item and we need to reflect this
3111 	 * movement in our normalized position.
3112 	 */
3113 	if (!(flags & DEQUEUE_SLEEP))
3114 		se->vruntime -= cfs_rq->min_vruntime;
3115 
3116 	/* return excess runtime on last dequeue */
3117 	return_cfs_rq_runtime(cfs_rq);
3118 
3119 	update_min_vruntime(cfs_rq);
3120 	update_cfs_shares(cfs_rq);
3121 }
3122 
3123 /*
3124  * Preempt the current task with a newly woken task if needed:
3125  */
3126 static void
3127 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3128 {
3129 	unsigned long ideal_runtime, delta_exec;
3130 	struct sched_entity *se;
3131 	s64 delta;
3132 
3133 	ideal_runtime = sched_slice(cfs_rq, curr);
3134 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3135 	if (delta_exec > ideal_runtime) {
3136 		resched_curr(rq_of(cfs_rq));
3137 		/*
3138 		 * The current task ran long enough, ensure it doesn't get
3139 		 * re-elected due to buddy favours.
3140 		 */
3141 		clear_buddies(cfs_rq, curr);
3142 		return;
3143 	}
3144 
3145 	/*
3146 	 * Ensure that a task that missed wakeup preemption by a
3147 	 * narrow margin doesn't have to wait for a full slice.
3148 	 * This also mitigates buddy induced latencies under load.
3149 	 */
3150 	if (delta_exec < sysctl_sched_min_granularity)
3151 		return;
3152 
3153 	se = __pick_first_entity(cfs_rq);
3154 	delta = curr->vruntime - se->vruntime;
3155 
3156 	if (delta < 0)
3157 		return;
3158 
3159 	if (delta > ideal_runtime)
3160 		resched_curr(rq_of(cfs_rq));
3161 }
3162 
3163 static void
3164 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3165 {
3166 	/* 'current' is not kept within the tree. */
3167 	if (se->on_rq) {
3168 		/*
3169 		 * Any task has to be enqueued before it gets to execute on
3170 		 * a CPU. So account for the time it spent waiting on the
3171 		 * runqueue.
3172 		 */
3173 		update_stats_wait_end(cfs_rq, se);
3174 		__dequeue_entity(cfs_rq, se);
3175 	}
3176 
3177 	update_stats_curr_start(cfs_rq, se);
3178 	cfs_rq->curr = se;
3179 #ifdef CONFIG_SCHEDSTATS
3180 	/*
3181 	 * Track our maximum slice length, if the CPU's load is at
3182 	 * least twice that of our own weight (i.e. don't track it
3183 	 * when there are only lesser-weight tasks around):
3184 	 */
3185 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3186 		se->statistics.slice_max = max(se->statistics.slice_max,
3187 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3188 	}
3189 #endif
3190 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3191 }
3192 
3193 static int
3194 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3195 
3196 /*
3197  * Pick the next process, keeping these things in mind, in this order:
3198  * 1) keep things fair between processes/task groups
3199  * 2) pick the "next" process, since someone really wants that to run
3200  * 3) pick the "last" process, for cache locality
3201  * 4) do not run the "skip" process, if something else is available
3202  */
3203 static struct sched_entity *
3204 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3205 {
3206 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3207 	struct sched_entity *se;
3208 
3209 	/*
3210 	 * If curr is set we have to see if its left of the leftmost entity
3211 	 * still in the tree, provided there was anything in the tree at all.
3212 	 */
3213 	if (!left || (curr && entity_before(curr, left)))
3214 		left = curr;
3215 
3216 	se = left; /* ideally we run the leftmost entity */
3217 
3218 	/*
3219 	 * Avoid running the skip buddy, if running something else can
3220 	 * be done without getting too unfair.
3221 	 */
3222 	if (cfs_rq->skip == se) {
3223 		struct sched_entity *second;
3224 
3225 		if (se == curr) {
3226 			second = __pick_first_entity(cfs_rq);
3227 		} else {
3228 			second = __pick_next_entity(se);
3229 			if (!second || (curr && entity_before(curr, second)))
3230 				second = curr;
3231 		}
3232 
3233 		if (second && wakeup_preempt_entity(second, left) < 1)
3234 			se = second;
3235 	}
3236 
3237 	/*
3238 	 * Prefer last buddy, try to return the CPU to a preempted task.
3239 	 */
3240 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3241 		se = cfs_rq->last;
3242 
3243 	/*
3244 	 * Someone really wants this to run. If it's not unfair, run it.
3245 	 */
3246 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3247 		se = cfs_rq->next;
3248 
3249 	clear_buddies(cfs_rq, se);
3250 
3251 	return se;
3252 }
3253 
3254 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3255 
3256 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3257 {
3258 	/*
3259 	 * If still on the runqueue then deactivate_task()
3260 	 * was not called and update_curr() has to be done:
3261 	 */
3262 	if (prev->on_rq)
3263 		update_curr(cfs_rq);
3264 
3265 	/* throttle cfs_rqs exceeding runtime */
3266 	check_cfs_rq_runtime(cfs_rq);
3267 
3268 	check_spread(cfs_rq, prev);
3269 	if (prev->on_rq) {
3270 		update_stats_wait_start(cfs_rq, prev);
3271 		/* Put 'current' back into the tree. */
3272 		__enqueue_entity(cfs_rq, prev);
3273 		/* in !on_rq case, update occurred at dequeue */
3274 		update_entity_load_avg(prev, 1);
3275 	}
3276 	cfs_rq->curr = NULL;
3277 }
3278 
3279 static void
3280 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3281 {
3282 	/*
3283 	 * Update run-time statistics of the 'current'.
3284 	 */
3285 	update_curr(cfs_rq);
3286 
3287 	/*
3288 	 * Ensure that runnable average is periodically updated.
3289 	 */
3290 	update_entity_load_avg(curr, 1);
3291 	update_cfs_rq_blocked_load(cfs_rq, 1);
3292 	update_cfs_shares(cfs_rq);
3293 
3294 #ifdef CONFIG_SCHED_HRTICK
3295 	/*
3296 	 * queued ticks are scheduled to match the slice, so don't bother
3297 	 * validating it and just reschedule.
3298 	 */
3299 	if (queued) {
3300 		resched_curr(rq_of(cfs_rq));
3301 		return;
3302 	}
3303 	/*
3304 	 * don't let the period tick interfere with the hrtick preemption
3305 	 */
3306 	if (!sched_feat(DOUBLE_TICK) &&
3307 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3308 		return;
3309 #endif
3310 
3311 	if (cfs_rq->nr_running > 1)
3312 		check_preempt_tick(cfs_rq, curr);
3313 }
3314 
3315 
3316 /**************************************************
3317  * CFS bandwidth control machinery
3318  */
3319 
3320 #ifdef CONFIG_CFS_BANDWIDTH
3321 
3322 #ifdef HAVE_JUMP_LABEL
3323 static struct static_key __cfs_bandwidth_used;
3324 
3325 static inline bool cfs_bandwidth_used(void)
3326 {
3327 	return static_key_false(&__cfs_bandwidth_used);
3328 }
3329 
3330 void cfs_bandwidth_usage_inc(void)
3331 {
3332 	static_key_slow_inc(&__cfs_bandwidth_used);
3333 }
3334 
3335 void cfs_bandwidth_usage_dec(void)
3336 {
3337 	static_key_slow_dec(&__cfs_bandwidth_used);
3338 }
3339 #else /* HAVE_JUMP_LABEL */
3340 static bool cfs_bandwidth_used(void)
3341 {
3342 	return true;
3343 }
3344 
3345 void cfs_bandwidth_usage_inc(void) {}
3346 void cfs_bandwidth_usage_dec(void) {}
3347 #endif /* HAVE_JUMP_LABEL */
3348 
3349 /*
3350  * default period for cfs group bandwidth.
3351  * default: 0.1s, units: nanoseconds
3352  */
3353 static inline u64 default_cfs_period(void)
3354 {
3355 	return 100000000ULL;
3356 }
3357 
3358 static inline u64 sched_cfs_bandwidth_slice(void)
3359 {
3360 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3361 }
3362 
3363 /*
3364  * Replenish runtime according to assigned quota and update expiration time.
3365  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3366  * additional synchronization around rq->lock.
3367  *
3368  * requires cfs_b->lock
3369  */
3370 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3371 {
3372 	u64 now;
3373 
3374 	if (cfs_b->quota == RUNTIME_INF)
3375 		return;
3376 
3377 	now = sched_clock_cpu(smp_processor_id());
3378 	cfs_b->runtime = cfs_b->quota;
3379 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3380 }
3381 
3382 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3383 {
3384 	return &tg->cfs_bandwidth;
3385 }
3386 
3387 /* rq_clock_task() normalized against any time this cfs_rq has spent throttled */
3388 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3389 {
3390 	if (unlikely(cfs_rq->throttle_count))
3391 		return cfs_rq->throttled_clock_task;
3392 
3393 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3394 }
3395 
3396 /* returns 0 on failure to allocate runtime */
3397 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3398 {
3399 	struct task_group *tg = cfs_rq->tg;
3400 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3401 	u64 amount = 0, min_amount, expires;
3402 
3403 	/* note: this is a positive sum as runtime_remaining <= 0 */
3404 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3405 
3406 	raw_spin_lock(&cfs_b->lock);
3407 	if (cfs_b->quota == RUNTIME_INF)
3408 		amount = min_amount;
3409 	else {
3410 		/*
3411 		 * If the bandwidth pool has become inactive, then at least one
3412 		 * period must have elapsed since the last consumption.
3413 		 * Refresh the global state and ensure the bandwidth timer becomes
3414 		 * active.
3415 		 */
3416 		if (!cfs_b->timer_active) {
3417 			__refill_cfs_bandwidth_runtime(cfs_b);
3418 			__start_cfs_bandwidth(cfs_b, false);
3419 		}
3420 
3421 		if (cfs_b->runtime > 0) {
3422 			amount = min(cfs_b->runtime, min_amount);
3423 			cfs_b->runtime -= amount;
3424 			cfs_b->idle = 0;
3425 		}
3426 	}
3427 	expires = cfs_b->runtime_expires;
3428 	raw_spin_unlock(&cfs_b->lock);
3429 
3430 	cfs_rq->runtime_remaining += amount;
3431 	/*
3432 	 * we may have advanced our local expiration to account for allowed
3433 	 * spread between our sched_clock and the one on which runtime was
3434 	 * issued.
3435 	 */
3436 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3437 		cfs_rq->runtime_expires = expires;
3438 
3439 	return cfs_rq->runtime_remaining > 0;
3440 }
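
/*
 * A minimal worked example of the top-up above, assuming a made-up 5ms
 * bandwidth slice, a 20ms global pool and a local cfs_rq that is 0.3ms in
 * debt, all in nanoseconds.  Userspace demo code, not part of fair.c.
 */
#if 0 /* illustrative userspace sketch, kept out of the build */
#include <stdio.h>

int main(void)
{
	long long slice = 5000000LL;	/* sched_cfs_bandwidth_slice() */
	long long pool = 20000000LL;	/* global cfs_b->runtime after refill */
	long long remaining = -300000LL;	/* local overrun, so <= 0 */
	long long min_amount, amount;

	/* positive request: one slice plus whatever was already overrun */
	min_amount = slice - remaining;
	amount = pool < min_amount ? pool : min_amount;
	pool -= amount;
	remaining += amount;	/* local runqueue is back to a full slice */

	printf("granted %lld ns, local runtime %lld ns, pool left %lld ns\n",
	       amount, remaining, pool);
	return 0;
}
#endif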
3441 
3442 /*
3443  * Note: This depends on the synchronization provided by sched_clock and the
3444  * fact that rq->clock snapshots this value.
3445  */
3446 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3447 {
3448 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3449 
3450 	/* if the deadline is ahead of our clock, nothing to do */
3451 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3452 		return;
3453 
3454 	if (cfs_rq->runtime_remaining < 0)
3455 		return;
3456 
3457 	/*
3458 	 * If the local deadline has passed we have to consider the
3459 	 * possibility that our sched_clock is 'fast' and the global deadline
3460 	 * has not truly expired.
3461 	 *
3462 	 * Fortunately we can determine whether this is the case by checking
3463 	 * whether the global deadline has advanced. It is valid to compare
3464 	 * cfs_b->runtime_expires without any locks since we only care about
3465 	 * exact equality, so a partial write will still work.
3466 	 */
3467 
3468 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3469 		/* extend local deadline, drift is bounded above by 2 ticks */
3470 		cfs_rq->runtime_expires += TICK_NSEC;
3471 	} else {
3472 		/* global deadline is ahead, expiration has passed */
3473 		cfs_rq->runtime_remaining = 0;
3474 	}
3475 }
3476 
3477 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3478 {
3479 	/* dock delta_exec before expiring quota (as it could span periods) */
3480 	cfs_rq->runtime_remaining -= delta_exec;
3481 	expire_cfs_rq_runtime(cfs_rq);
3482 
3483 	if (likely(cfs_rq->runtime_remaining > 0))
3484 		return;
3485 
3486 	/*
3487 	 * if we're unable to extend our runtime we resched so that the active
3488 	 * hierarchy can be throttled
3489 	 */
3490 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3491 		resched_curr(rq_of(cfs_rq));
3492 }
3493 
3494 static __always_inline
3495 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3496 {
3497 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3498 		return;
3499 
3500 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3501 }
3502 
3503 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3504 {
3505 	return cfs_bandwidth_used() && cfs_rq->throttled;
3506 }
3507 
3508 /* check whether cfs_rq, or any parent, is throttled */
3509 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3510 {
3511 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3512 }
3513 
3514 /*
3515  * Ensure that neither of the group entities corresponding to src_cpu or
3516  * dest_cpu are members of a throttled hierarchy when performing group
3517  * load-balance operations.
3518  */
3519 static inline int throttled_lb_pair(struct task_group *tg,
3520 				    int src_cpu, int dest_cpu)
3521 {
3522 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3523 
3524 	src_cfs_rq = tg->cfs_rq[src_cpu];
3525 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3526 
3527 	return throttled_hierarchy(src_cfs_rq) ||
3528 	       throttled_hierarchy(dest_cfs_rq);
3529 }
3530 
3531 /* updated child weight may affect parent so we have to do this bottom up */
3532 static int tg_unthrottle_up(struct task_group *tg, void *data)
3533 {
3534 	struct rq *rq = data;
3535 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3536 
3537 	cfs_rq->throttle_count--;
3538 #ifdef CONFIG_SMP
3539 	if (!cfs_rq->throttle_count) {
3540 		/* adjust cfs_rq_clock_task() */
3541 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3542 					     cfs_rq->throttled_clock_task;
3543 	}
3544 #endif
3545 
3546 	return 0;
3547 }
3548 
3549 static int tg_throttle_down(struct task_group *tg, void *data)
3550 {
3551 	struct rq *rq = data;
3552 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3553 
3554 	/* group is entering throttled state, stop time */
3555 	if (!cfs_rq->throttle_count)
3556 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3557 	cfs_rq->throttle_count++;
3558 
3559 	return 0;
3560 }
3561 
3562 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3563 {
3564 	struct rq *rq = rq_of(cfs_rq);
3565 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3566 	struct sched_entity *se;
3567 	long task_delta, dequeue = 1;
3568 
3569 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3570 
3571 	/* freeze hierarchy runnable averages while throttled */
3572 	rcu_read_lock();
3573 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3574 	rcu_read_unlock();
3575 
3576 	task_delta = cfs_rq->h_nr_running;
3577 	for_each_sched_entity(se) {
3578 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3579 		/* throttled entity or throttle-on-deactivate */
3580 		if (!se->on_rq)
3581 			break;
3582 
3583 		if (dequeue)
3584 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3585 		qcfs_rq->h_nr_running -= task_delta;
3586 
3587 		if (qcfs_rq->load.weight)
3588 			dequeue = 0;
3589 	}
3590 
3591 	if (!se)
3592 		sub_nr_running(rq, task_delta);
3593 
3594 	cfs_rq->throttled = 1;
3595 	cfs_rq->throttled_clock = rq_clock(rq);
3596 	raw_spin_lock(&cfs_b->lock);
3597 	/*
3598 	 * Add to the _head_ of the list, so that an already-started
3599 	 * distribute_cfs_runtime will not see us
3600 	 */
3601 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3602 	if (!cfs_b->timer_active)
3603 		__start_cfs_bandwidth(cfs_b, false);
3604 	raw_spin_unlock(&cfs_b->lock);
3605 }
3606 
3607 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3608 {
3609 	struct rq *rq = rq_of(cfs_rq);
3610 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3611 	struct sched_entity *se;
3612 	int enqueue = 1;
3613 	long task_delta;
3614 
3615 	se = cfs_rq->tg->se[cpu_of(rq)];
3616 
3617 	cfs_rq->throttled = 0;
3618 
3619 	update_rq_clock(rq);
3620 
3621 	raw_spin_lock(&cfs_b->lock);
3622 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3623 	list_del_rcu(&cfs_rq->throttled_list);
3624 	raw_spin_unlock(&cfs_b->lock);
3625 
3626 	/* update hierarchical throttle state */
3627 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3628 
3629 	if (!cfs_rq->load.weight)
3630 		return;
3631 
3632 	task_delta = cfs_rq->h_nr_running;
3633 	for_each_sched_entity(se) {
3634 		if (se->on_rq)
3635 			enqueue = 0;
3636 
3637 		cfs_rq = cfs_rq_of(se);
3638 		if (enqueue)
3639 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3640 		cfs_rq->h_nr_running += task_delta;
3641 
3642 		if (cfs_rq_throttled(cfs_rq))
3643 			break;
3644 	}
3645 
3646 	if (!se)
3647 		add_nr_running(rq, task_delta);
3648 
3649 	/* determine whether we need to wake up a potentially idle cpu */
3650 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3651 		resched_curr(rq);
3652 }
3653 
3654 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3655 		u64 remaining, u64 expires)
3656 {
3657 	struct cfs_rq *cfs_rq;
3658 	u64 runtime;
3659 	u64 starting_runtime = remaining;
3660 
3661 	rcu_read_lock();
3662 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3663 				throttled_list) {
3664 		struct rq *rq = rq_of(cfs_rq);
3665 
3666 		raw_spin_lock(&rq->lock);
3667 		if (!cfs_rq_throttled(cfs_rq))
3668 			goto next;
3669 
3670 		runtime = -cfs_rq->runtime_remaining + 1;
3671 		if (runtime > remaining)
3672 			runtime = remaining;
3673 		remaining -= runtime;
3674 
3675 		cfs_rq->runtime_remaining += runtime;
3676 		cfs_rq->runtime_expires = expires;
3677 
3678 		/* we check whether we're throttled above */
3679 		if (cfs_rq->runtime_remaining > 0)
3680 			unthrottle_cfs_rq(cfs_rq);
3681 
3682 next:
3683 		raw_spin_unlock(&rq->lock);
3684 
3685 		if (!remaining)
3686 			break;
3687 	}
3688 	rcu_read_unlock();
3689 
3690 	return starting_runtime - remaining;
3691 }
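
/*
 * Illustration of the loop above, with hypothetical numbers: suppose three
 * throttled cfs_rqs have runtime_remaining of -3ms, -1ms and -2ms, and the
 * caller passes in remaining = 4ms.  Each rq is topped up to just +1ns,
 * capped by what is left in the pool:
 *
 *   rq0 needs 3ms+1ns -> gets 3ms+1ns, remaining ~1ms   (unthrottled)
 *   rq1 needs 1ms+1ns -> gets the ~1ms left, stays <= 0 (still throttled)
 *   remaining == 0                                      (loop breaks)
 *
 * rq2 is never visited and must wait for the next period refill or a later
 * slack distribution; the return value is the 4ms actually handed out.
 */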
3692 
3693 /*
3694  * Responsible for refilling a task_group's bandwidth and unthrottling its
3695  * cfs_rqs as appropriate. If there has been no activity within the last
3696  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3697  * used to track this state.
3698  */
3699 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3700 {
3701 	u64 runtime, runtime_expires;
3702 	int throttled;
3703 
3704 	/* no need to continue the timer with no bandwidth constraint */
3705 	if (cfs_b->quota == RUNTIME_INF)
3706 		goto out_deactivate;
3707 
3708 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3709 	cfs_b->nr_periods += overrun;
3710 
3711 	/*
3712 	 * idle depends on !throttled (for the case of a large deficit), and if
3713 	 * we're going inactive then everything else can be deferred
3714 	 */
3715 	if (cfs_b->idle && !throttled)
3716 		goto out_deactivate;
3717 
3718 	/*
3719 	 * if we have relooped after returning idle once, we need to update our
3720 	 * status as actually running, so that other cpus doing
3721 	 * __start_cfs_bandwidth will stop trying to cancel us.
3722 	 */
3723 	cfs_b->timer_active = 1;
3724 
3725 	__refill_cfs_bandwidth_runtime(cfs_b);
3726 
3727 	if (!throttled) {
3728 		/* mark as potentially idle for the upcoming period */
3729 		cfs_b->idle = 1;
3730 		return 0;
3731 	}
3732 
3733 	/* account preceding periods in which throttling occurred */
3734 	cfs_b->nr_throttled += overrun;
3735 
3736 	runtime_expires = cfs_b->runtime_expires;
3737 
3738 	/*
3739 	 * This check is repeated as we are holding onto the new bandwidth while
3740 	 * we unthrottle. This can potentially race with an unthrottled group
3741 	 * trying to acquire new bandwidth from the global pool. This can result
3742 	 * in us over-using our runtime if it is all used during this loop, but
3743 	 * only by limited amounts in that extreme case.
3744 	 */
3745 	while (throttled && cfs_b->runtime > 0) {
3746 		runtime = cfs_b->runtime;
3747 		raw_spin_unlock(&cfs_b->lock);
3748 		/* we can't nest cfs_b->lock while distributing bandwidth */
3749 		runtime = distribute_cfs_runtime(cfs_b, runtime,
3750 						 runtime_expires);
3751 		raw_spin_lock(&cfs_b->lock);
3752 
3753 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3754 
3755 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3756 	}
3757 
3758 	/*
3759 	 * While we are ensured activity in the period following an
3760 	 * unthrottle, this also covers the case in which the new bandwidth is
3761 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
3762 	 * timer to remain active while there are any throttled entities.)
3763 	 */
3764 	cfs_b->idle = 0;
3765 
3766 	return 0;
3767 
3768 out_deactivate:
3769 	cfs_b->timer_active = 0;
3770 	return 1;
3771 }
3772 
3773 /* a cfs_rq won't donate quota below this amount */
3774 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3775 /* minimum remaining period time to redistribute slack quota */
3776 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3777 /* how long we wait to gather additional slack before distributing */
3778 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3779 
3780 /*
3781  * Are we near the end of the current quota period?
3782  *
3783  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3784  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3785  * migrate_hrtimers, base is never cleared, so we are fine.
3786  */
3787 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3788 {
3789 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3790 	u64 remaining;
3791 
3792 	/* if the call-back is running, a quota refresh is already occurring */
3793 	if (hrtimer_callback_running(refresh_timer))
3794 		return 1;
3795 
3796 	/* is a quota refresh about to occur? */
3797 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3798 	if (remaining < min_expire)
3799 		return 1;
3800 
3801 	return 0;
3802 }
3803 
3804 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3805 {
3806 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3807 
3808 	/* if there's a quota refresh soon don't bother with slack */
3809 	if (runtime_refresh_within(cfs_b, min_left))
3810 		return;
3811 
3812 	start_bandwidth_timer(&cfs_b->slack_timer,
3813 				ns_to_ktime(cfs_bandwidth_slack_period));
3814 }
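
/*
 * With the constants above this works out as follows: min_left is
 * 5ms + 2ms = 7ms.  If the period timer will fire within 7ms we skip the
 * slack timer entirely; otherwise it is armed 5ms out, which still leaves
 * at least min_bandwidth_expiration (2ms) of the period for the returned
 * quota to be redistributed before the next refresh.
 */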
3815 
3816 /* we know any runtime found here is valid as update_curr() precedes return */
3817 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3818 {
3819 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3820 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3821 
3822 	if (slack_runtime <= 0)
3823 		return;
3824 
3825 	raw_spin_lock(&cfs_b->lock);
3826 	if (cfs_b->quota != RUNTIME_INF &&
3827 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3828 		cfs_b->runtime += slack_runtime;
3829 
3830 		/* we are under rq->lock, defer unthrottling using a timer */
3831 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3832 		    !list_empty(&cfs_b->throttled_cfs_rq))
3833 			start_cfs_slack_bandwidth(cfs_b);
3834 	}
3835 	raw_spin_unlock(&cfs_b->lock);
3836 
3837 	/* even if it's not valid for return we don't want to try again */
3838 	cfs_rq->runtime_remaining -= slack_runtime;
3839 }
3840 
3841 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3842 {
3843 	if (!cfs_bandwidth_used())
3844 		return;
3845 
3846 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3847 		return;
3848 
3849 	__return_cfs_rq_runtime(cfs_rq);
3850 }
3851 
3852 /*
3853  * This is done with a timer (instead of inline with bandwidth return) since
3854  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3855  */
3856 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3857 {
3858 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3859 	u64 expires;
3860 
3861 	/* confirm we're still not at a refresh boundary */
3862 	raw_spin_lock(&cfs_b->lock);
3863 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3864 		raw_spin_unlock(&cfs_b->lock);
3865 		return;
3866 	}
3867 
3868 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3869 		runtime = cfs_b->runtime;
3870 
3871 	expires = cfs_b->runtime_expires;
3872 	raw_spin_unlock(&cfs_b->lock);
3873 
3874 	if (!runtime)
3875 		return;
3876 
3877 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3878 
3879 	raw_spin_lock(&cfs_b->lock);
3880 	if (expires == cfs_b->runtime_expires)
3881 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3882 	raw_spin_unlock(&cfs_b->lock);
3883 }
3884 
3885 /*
3886  * When a group wakes up we want to make sure that its quota is not already
3887  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3888  * runtime, as update_curr() throttling cannot trigger until it's on-rq.
3889  */
3890 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3891 {
3892 	if (!cfs_bandwidth_used())
3893 		return;
3894 
3895 	/* an active group must be handled by the update_curr()->put() path */
3896 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3897 		return;
3898 
3899 	/* ensure the group is not already throttled */
3900 	if (cfs_rq_throttled(cfs_rq))
3901 		return;
3902 
3903 	/* update runtime allocation */
3904 	account_cfs_rq_runtime(cfs_rq, 0);
3905 	if (cfs_rq->runtime_remaining <= 0)
3906 		throttle_cfs_rq(cfs_rq);
3907 }
3908 
3909 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3910 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3911 {
3912 	if (!cfs_bandwidth_used())
3913 		return false;
3914 
3915 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3916 		return false;
3917 
3918 	/*
3919 	 * it's possible for a throttled entity to be forced into a running
3920 	 * state (e.g. set_curr_task); in this case we're finished.
3921 	 */
3922 	if (cfs_rq_throttled(cfs_rq))
3923 		return true;
3924 
3925 	throttle_cfs_rq(cfs_rq);
3926 	return true;
3927 }
3928 
3929 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3930 {
3931 	struct cfs_bandwidth *cfs_b =
3932 		container_of(timer, struct cfs_bandwidth, slack_timer);
3933 	do_sched_cfs_slack_timer(cfs_b);
3934 
3935 	return HRTIMER_NORESTART;
3936 }
3937 
3938 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3939 {
3940 	struct cfs_bandwidth *cfs_b =
3941 		container_of(timer, struct cfs_bandwidth, period_timer);
3942 	ktime_t now;
3943 	int overrun;
3944 	int idle = 0;
3945 
3946 	raw_spin_lock(&cfs_b->lock);
3947 	for (;;) {
3948 		now = hrtimer_cb_get_time(timer);
3949 		overrun = hrtimer_forward(timer, now, cfs_b->period);
3950 
3951 		if (!overrun)
3952 			break;
3953 
3954 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
3955 	}
3956 	raw_spin_unlock(&cfs_b->lock);
3957 
3958 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3959 }
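
/*
 * A sketch of how the loop above behaves, assuming a hypothetical 100ms
 * period: if the callback runs roughly 2.5 periods late, the first
 * hrtimer_forward() pushes the expiry past 'now' and returns overrun == 3,
 * so all missed periods are accounted in a single
 * do_sched_cfs_period_timer() call; the second pass then returns 0 and the
 * loop exits.  The timer is restarted unless that call reported the group
 * as idle.
 */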
3960 
3961 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3962 {
3963 	raw_spin_lock_init(&cfs_b->lock);
3964 	cfs_b->runtime = 0;
3965 	cfs_b->quota = RUNTIME_INF;
3966 	cfs_b->period = ns_to_ktime(default_cfs_period());
3967 
3968 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3969 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3970 	cfs_b->period_timer.function = sched_cfs_period_timer;
3971 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3972 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
3973 }
3974 
3975 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3976 {
3977 	cfs_rq->runtime_enabled = 0;
3978 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
3979 }
3980 
3981 /* requires cfs_b->lock, may release to reprogram timer */
3982 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3983 {
3984 	/*
3985 	 * The timer may be active because we're trying to set a new bandwidth
3986 	 * period or because we're racing with the tear-down path
3987 	 * (timer_active==0 becomes visible before the hrtimer call-back
3988 	 * terminates).  In either case we ensure that it's re-programmed.
3989 	 */
3990 	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3991 	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3992 		/* bounce the lock to allow do_sched_cfs_period_timer to run */
3993 		raw_spin_unlock(&cfs_b->lock);
3994 		cpu_relax();
3995 		raw_spin_lock(&cfs_b->lock);
3996 		/* if someone else restarted the timer then we're done */
3997 		if (!force && cfs_b->timer_active)
3998 			return;
3999 	}
4000 
4001 	cfs_b->timer_active = 1;
4002 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
4003 }
4004 
4005 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4006 {
4007 	/* init_cfs_bandwidth() was not called */
4008 	if (!cfs_b->throttled_cfs_rq.next)
4009 		return;
4010 
4011 	hrtimer_cancel(&cfs_b->period_timer);
4012 	hrtimer_cancel(&cfs_b->slack_timer);
4013 }
4014 
4015 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4016 {
4017 	struct cfs_rq *cfs_rq;
4018 
4019 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4020 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4021 
4022 		raw_spin_lock(&cfs_b->lock);
4023 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4024 		raw_spin_unlock(&cfs_b->lock);
4025 	}
4026 }
4027 
4028 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4029 {
4030 	struct cfs_rq *cfs_rq;
4031 
4032 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4033 		if (!cfs_rq->runtime_enabled)
4034 			continue;
4035 
4036 		/*
4037 		 * clock_task is not advancing so we just need to make sure
4038 		 * there's some valid quota amount
4039 		 */
4040 		cfs_rq->runtime_remaining = 1;
4041 		/*
4042 		 * Offline rq is schedulable till cpu is completely disabled
4043 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4044 		 */
4045 		cfs_rq->runtime_enabled = 0;
4046 
4047 		if (cfs_rq_throttled(cfs_rq))
4048 			unthrottle_cfs_rq(cfs_rq);
4049 	}
4050 }
4051 
4052 #else /* CONFIG_CFS_BANDWIDTH */
4053 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4054 {
4055 	return rq_clock_task(rq_of(cfs_rq));
4056 }
4057 
4058 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4059 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4060 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4061 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4062 
4063 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4064 {
4065 	return 0;
4066 }
4067 
4068 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4069 {
4070 	return 0;
4071 }
4072 
4073 static inline int throttled_lb_pair(struct task_group *tg,
4074 				    int src_cpu, int dest_cpu)
4075 {
4076 	return 0;
4077 }
4078 
4079 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4080 
4081 #ifdef CONFIG_FAIR_GROUP_SCHED
4082 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4083 #endif
4084 
4085 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4086 {
4087 	return NULL;
4088 }
4089 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4090 static inline void update_runtime_enabled(struct rq *rq) {}
4091 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4092 
4093 #endif /* CONFIG_CFS_BANDWIDTH */
4094 
4095 /**************************************************
4096  * CFS operations on tasks:
4097  */
4098 
4099 #ifdef CONFIG_SCHED_HRTICK
4100 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4101 {
4102 	struct sched_entity *se = &p->se;
4103 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4104 
4105 	WARN_ON(task_rq(p) != rq);
4106 
4107 	if (cfs_rq->nr_running > 1) {
4108 		u64 slice = sched_slice(cfs_rq, se);
4109 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4110 		s64 delta = slice - ran;
4111 
4112 		if (delta < 0) {
4113 			if (rq->curr == p)
4114 				resched_curr(rq);
4115 			return;
4116 		}
4117 		hrtick_start(rq, delta);
4118 	}
4119 }
4120 
4121 /*
4122  * called from enqueue/dequeue and updates the hrtick when the
4123  * current task is from our class and nr_running is low enough
4124  * to matter.
4125  */
4126 static void hrtick_update(struct rq *rq)
4127 {
4128 	struct task_struct *curr = rq->curr;
4129 
4130 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4131 		return;
4132 
4133 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4134 		hrtick_start_fair(rq, curr);
4135 }
4136 #else /* !CONFIG_SCHED_HRTICK */
4137 static inline void
4138 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4139 {
4140 }
4141 
4142 static inline void hrtick_update(struct rq *rq)
4143 {
4144 }
4145 #endif
4146 
4147 /*
4148  * The enqueue_task method is called before nr_running is
4149  * increased. Here we update the fair scheduling stats and
4150  * then put the task into the rbtree:
4151  */
4152 static void
4153 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4154 {
4155 	struct cfs_rq *cfs_rq;
4156 	struct sched_entity *se = &p->se;
4157 
4158 	for_each_sched_entity(se) {
4159 		if (se->on_rq)
4160 			break;
4161 		cfs_rq = cfs_rq_of(se);
4162 		enqueue_entity(cfs_rq, se, flags);
4163 
4164 		/*
4165 		 * end evaluation on encountering a throttled cfs_rq
4166 		 *
4167 		 * note: in the case of encountering a throttled cfs_rq we will
4168 		 * post the final h_nr_running increment below.
4169 		 */
4170 		if (cfs_rq_throttled(cfs_rq))
4171 			break;
4172 		cfs_rq->h_nr_running++;
4173 
4174 		flags = ENQUEUE_WAKEUP;
4175 	}
4176 
4177 	for_each_sched_entity(se) {
4178 		cfs_rq = cfs_rq_of(se);
4179 		cfs_rq->h_nr_running++;
4180 
4181 		if (cfs_rq_throttled(cfs_rq))
4182 			break;
4183 
4184 		update_cfs_shares(cfs_rq);
4185 		update_entity_load_avg(se, 1);
4186 	}
4187 
4188 	if (!se) {
4189 		update_rq_runnable_avg(rq, rq->nr_running);
4190 		add_nr_running(rq, 1);
4191 	}
4192 	hrtick_update(rq);
4193 }
4194 
4195 static void set_next_buddy(struct sched_entity *se);
4196 
4197 /*
4198  * The dequeue_task method is called before nr_running is
4199  * decreased. We remove the task from the rbtree and
4200  * update the fair scheduling stats:
4201  */
4202 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4203 {
4204 	struct cfs_rq *cfs_rq;
4205 	struct sched_entity *se = &p->se;
4206 	int task_sleep = flags & DEQUEUE_SLEEP;
4207 
4208 	for_each_sched_entity(se) {
4209 		cfs_rq = cfs_rq_of(se);
4210 		dequeue_entity(cfs_rq, se, flags);
4211 
4212 		/*
4213 		 * end evaluation on encountering a throttled cfs_rq
4214 		 *
4215 		 * note: in the case of encountering a throttled cfs_rq we will
4216 		 * post the final h_nr_running decrement below.
4217 		 */
4218 		if (cfs_rq_throttled(cfs_rq))
4219 			break;
4220 		cfs_rq->h_nr_running--;
4221 
4222 		/* Don't dequeue parent if it has other entities besides us */
4223 		if (cfs_rq->load.weight) {
4224 			/*
4225 			 * Bias pick_next to pick a task from this cfs_rq, as
4226 			 * p is sleeping when it is within its sched_slice.
4227 			 */
4228 			if (task_sleep && parent_entity(se))
4229 				set_next_buddy(parent_entity(se));
4230 
4231 			/* avoid re-evaluating load for this entity */
4232 			se = parent_entity(se);
4233 			break;
4234 		}
4235 		flags |= DEQUEUE_SLEEP;
4236 	}
4237 
4238 	for_each_sched_entity(se) {
4239 		cfs_rq = cfs_rq_of(se);
4240 		cfs_rq->h_nr_running--;
4241 
4242 		if (cfs_rq_throttled(cfs_rq))
4243 			break;
4244 
4245 		update_cfs_shares(cfs_rq);
4246 		update_entity_load_avg(se, 1);
4247 	}
4248 
4249 	if (!se) {
4250 		sub_nr_running(rq, 1);
4251 		update_rq_runnable_avg(rq, 1);
4252 	}
4253 	hrtick_update(rq);
4254 }
4255 
4256 #ifdef CONFIG_SMP
4257 /* Used instead of source_load when we know the type == 0 */
4258 static unsigned long weighted_cpuload(const int cpu)
4259 {
4260 	return cpu_rq(cpu)->cfs.runnable_load_avg;
4261 }
4262 
4263 /*
4264  * Return a low guess at the load of a migration-source cpu weighted
4265  * according to the scheduling class and "nice" value.
4266  *
4267  * We want to under-estimate the load of migration sources, to
4268  * balance conservatively.
4269  */
4270 static unsigned long source_load(int cpu, int type)
4271 {
4272 	struct rq *rq = cpu_rq(cpu);
4273 	unsigned long total = weighted_cpuload(cpu);
4274 
4275 	if (type == 0 || !sched_feat(LB_BIAS))
4276 		return total;
4277 
4278 	return min(rq->cpu_load[type-1], total);
4279 }
4280 
4281 /*
4282  * Return a high guess at the load of a migration-target cpu weighted
4283  * according to the scheduling class and "nice" value.
4284  */
4285 static unsigned long target_load(int cpu, int type)
4286 {
4287 	struct rq *rq = cpu_rq(cpu);
4288 	unsigned long total = weighted_cpuload(cpu);
4289 
4290 	if (type == 0 || !sched_feat(LB_BIAS))
4291 		return total;
4292 
4293 	return max(rq->cpu_load[type-1], total);
4294 }
4295 
4296 static unsigned long capacity_of(int cpu)
4297 {
4298 	return cpu_rq(cpu)->cpu_capacity;
4299 }
4300 
4301 static unsigned long cpu_avg_load_per_task(int cpu)
4302 {
4303 	struct rq *rq = cpu_rq(cpu);
4304 	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4305 	unsigned long load_avg = rq->cfs.runnable_load_avg;
4306 
4307 	if (nr_running)
4308 		return load_avg / nr_running;
4309 
4310 	return 0;
4311 }
4312 
4313 static void record_wakee(struct task_struct *p)
4314 {
4315 	/*
4316 	 * Rough decay (wiping) to keep the cost down; don't worry
4317 	 * about the boundary, a really active task won't care
4318 	 * about the loss.
4319 	 */
4320 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4321 		current->wakee_flips >>= 1;
4322 		current->wakee_flip_decay_ts = jiffies;
4323 	}
4324 
4325 	if (current->last_wakee != p) {
4326 		current->last_wakee = p;
4327 		current->wakee_flips++;
4328 	}
4329 }
4330 
4331 static void task_waking_fair(struct task_struct *p)
4332 {
4333 	struct sched_entity *se = &p->se;
4334 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4335 	u64 min_vruntime;
4336 
4337 #ifndef CONFIG_64BIT
4338 	u64 min_vruntime_copy;
4339 
4340 	do {
4341 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
4342 		smp_rmb();
4343 		min_vruntime = cfs_rq->min_vruntime;
4344 	} while (min_vruntime != min_vruntime_copy);
4345 #else
4346 	min_vruntime = cfs_rq->min_vruntime;
4347 #endif
4348 
4349 	se->vruntime -= min_vruntime;
4350 	record_wakee(p);
4351 }
4352 
4353 #ifdef CONFIG_FAIR_GROUP_SCHED
4354 /*
4355  * effective_load() calculates the load change as seen from the root_task_group
4356  *
4357  * Adding load to a group doesn't make a group heavier, but can cause movement
4358  * of group shares between cpus. Assuming the shares were perfectly aligned one
4359  * can calculate the shift in shares.
4360  *
4361  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4362  * on this @cpu and results in a total addition (subtraction) of @wg to the
4363  * total group weight.
4364  *
4365  * Given a runqueue weight distribution (rw_i) we can compute a shares
4366  * distribution (s_i) using:
4367  *
4368  *   s_i = rw_i / \Sum rw_j						(1)
4369  *
4370  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4371  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4372  * shares distribution (s_i):
4373  *
4374  *   rw_i = {   2,   4,   1,   0 }
4375  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4376  *
4377  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4378  * task used to run on and the CPU the waker is running on), we need to
4379  * compute the effect of waking a task on either CPU and, in case of a sync
4380  * wakeup, compute the effect of the current task going to sleep.
4381  *
4382  * So for a change of @wl to the local @cpu with an overall group weight change
4383  * of @wl we can compute the new shares distribution (s'_i) using:
4384  *
4385  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4386  *
4387  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4388  * differences in waking a task to CPU 0. The additional task changes the
4389  * weight and shares distributions like:
4390  *
4391  *   rw'_i = {   3,   4,   1,   0 }
4392  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4393  *
4394  * We can then compute the difference in effective weight by using:
4395  *
4396  *   dw_i = S * (s'_i - s_i)						(3)
4397  *
4398  * Where 'S' is the group weight as seen by its parent.
4399  *
4400  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4401  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4402  * 4/7) times the weight of the group.
4403  */
4404 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4405 {
4406 	struct sched_entity *se = tg->se[cpu];
4407 
4408 	if (!tg->parent)	/* the trivial, non-cgroup case */
4409 		return wl;
4410 
4411 	for_each_sched_entity(se) {
4412 		long w, W;
4413 
4414 		tg = se->my_q->tg;
4415 
4416 		/*
4417 		 * W = @wg + \Sum rw_j
4418 		 */
4419 		W = wg + calc_tg_weight(tg, se->my_q);
4420 
4421 		/*
4422 		 * w = rw_i + @wl
4423 		 */
4424 		w = se->my_q->load.weight + wl;
4425 
4426 		/*
4427 		 * wl = S * s'_i; see (2)
4428 		 */
4429 		if (W > 0 && w < W)
4430 			wl = (w * (long)tg->shares) / W;
4431 		else
4432 			wl = tg->shares;
4433 
4434 		/*
4435 		 * Per the above, wl is the new se->load.weight value; since
4436 		 * those are clipped to [MIN_SHARES, ...) do so now. See
4437 		 * calc_cfs_shares().
4438 		 */
4439 		if (wl < MIN_SHARES)
4440 			wl = MIN_SHARES;
4441 
4442 		/*
4443 		 * wl = dw_i = S * (s'_i - s_i); see (3)
4444 		 */
4445 		wl -= se->load.weight;
4446 
4447 		/*
4448 		 * Recursively apply this logic to all parent groups to compute
4449 		 * the final effective load change on the root group. Since
4450 		 * only the @tg group gets extra weight, all parent groups can
4451 		 * only redistribute existing shares. @wl is the shift in shares
4452 		 * resulting from this level per the above.
4453 		 */
4454 		wg = 0;
4455 	}
4456 
4457 	return wl;
4458 }
4459 #else
4460 
4461 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4462 {
4463 	return wl;
4464 }
4465 
4466 #endif
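
/*
 * Worked example for the comment above effective_load(), using hypothetical
 * nice-0 weights of 1024 and tg->shares = 1024 for the rw_i = { 2, 4, 1, 0 }
 * case (so rw_i = { 2048, 4096, 1024, 0 } in weight units).  Waking one more
 * nice-0 task on cpu 0 gives, for the single group level:
 *
 *   W  = wg + \Sum rw_j    = 1024 + 7168 = 8192
 *   w  = rw_0 + wl         = 2048 + 1024 = 3072
 *   wl = w * shares / W    = 3072 * 1024 / 8192 = 384    (= S * 3/8)
 *   wl -= se->load.weight  = 384 - 292 ~= 92             (~= S * 5/56)
 *
 * i.e. the root sees roughly 92 units of extra weight on cpu 0, matching
 * the 5/56 result derived in the comment.
 */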
4467 
4468 static int wake_wide(struct task_struct *p)
4469 {
4470 	int factor = this_cpu_read(sd_llc_size);
4471 
4472 	/*
4473 	 * Yeah, it's the switching-frequency: it could mean many wakees or
4474 	 * rapid switching.  Using the LLC-size factor here just helps to
4475 	 * scale the threshold automatically, so a bigger node leads to more pull.
4476 	 */
4477 	if (p->wakee_flips > factor) {
4478 		/*
4479 		 * The wakee is somewhat hot and needs a certain amount of cpu
4480 		 * resources, so if the waker is far hotter, prefer to leave
4481 		 * it alone.
4482 		 */
4483 		if (current->wakee_flips > (factor * p->wakee_flips))
4484 			return 1;
4485 	}
4486 
4487 	return 0;
4488 }
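
/*
 * Example with a hypothetical LLC of 8 cpus (factor == 8): a waker that has
 * recently flipped between more than 8 * p->wakee_flips distinct wakees,
 * while p itself has flipped more than 8 times, is considered too "wide" to
 * pull p onto its own LLC, and wake_affine() below bails out early.
 */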
4489 
4490 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4491 {
4492 	s64 this_load, load;
4493 	s64 this_eff_load, prev_eff_load;
4494 	int idx, this_cpu, prev_cpu;
4495 	struct task_group *tg;
4496 	unsigned long weight;
4497 	int balanced;
4498 
4499 	/*
4500 	 * If we wake multiple tasks be careful to not bounce
4501 	 * ourselves around too much.
4502 	 */
4503 	if (wake_wide(p))
4504 		return 0;
4505 
4506 	idx	  = sd->wake_idx;
4507 	this_cpu  = smp_processor_id();
4508 	prev_cpu  = task_cpu(p);
4509 	load	  = source_load(prev_cpu, idx);
4510 	this_load = target_load(this_cpu, idx);
4511 
4512 	/*
4513 	 * If sync wakeup then subtract the (maximum possible)
4514 	 * effect of the currently running task from the load
4515 	 * of the current CPU:
4516 	 */
4517 	if (sync) {
4518 		tg = task_group(current);
4519 		weight = current->se.load.weight;
4520 
4521 		this_load += effective_load(tg, this_cpu, -weight, -weight);
4522 		load += effective_load(tg, prev_cpu, 0, -weight);
4523 	}
4524 
4525 	tg = task_group(p);
4526 	weight = p->se.load.weight;
4527 
4528 	/*
4529 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4530 	 * due to the sync cause above having dropped this_load to 0, we'll
4531 	 * always have an imbalance, but there's really nothing you can do
4532 	 * about that, so that's good too.
4533 	 *
4534 	 * Otherwise check if either cpus are near enough in load to allow this
4535 	 * task to be woken on this_cpu.
4536 	 */
4537 	this_eff_load = 100;
4538 	this_eff_load *= capacity_of(prev_cpu);
4539 
4540 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4541 	prev_eff_load *= capacity_of(this_cpu);
4542 
4543 	if (this_load > 0) {
4544 		this_eff_load *= this_load +
4545 			effective_load(tg, this_cpu, weight, weight);
4546 
4547 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4548 	}
4549 
4550 	balanced = this_eff_load <= prev_eff_load;
4551 
4552 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4553 
4554 	if (!balanced)
4555 		return 0;
4556 
4557 	schedstat_inc(sd, ttwu_move_affine);
4558 	schedstat_inc(p, se.statistics.nr_wakeups_affine);
4559 
4560 	return 1;
4561 }
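
/*
 * Numeric sketch of the balance test above, assuming equal cpu capacities
 * and a hypothetical sd->imbalance_pct of 125: the prev side is weighted by
 * 100 + (125 - 100) / 2 = 112 versus 100 for this side, so the affine
 * wakeup is allowed as long as the waking cpu's effective load does not
 * exceed the previous cpu's by more than ~12%.  E.g. 1100 vs 1000 passes
 * (100 * 1100 <= 112 * 1000), while 1200 vs 1000 does not.
 */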
4562 
4563 /*
4564  * find_idlest_group finds and returns the least busy CPU group within the
4565  * domain.
4566  */
4567 static struct sched_group *
4568 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4569 		  int this_cpu, int sd_flag)
4570 {
4571 	struct sched_group *idlest = NULL, *group = sd->groups;
4572 	unsigned long min_load = ULONG_MAX, this_load = 0;
4573 	int load_idx = sd->forkexec_idx;
4574 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4575 
4576 	if (sd_flag & SD_BALANCE_WAKE)
4577 		load_idx = sd->wake_idx;
4578 
4579 	do {
4580 		unsigned long load, avg_load;
4581 		int local_group;
4582 		int i;
4583 
4584 		/* Skip over this group if it has no CPUs allowed */
4585 		if (!cpumask_intersects(sched_group_cpus(group),
4586 					tsk_cpus_allowed(p)))
4587 			continue;
4588 
4589 		local_group = cpumask_test_cpu(this_cpu,
4590 					       sched_group_cpus(group));
4591 
4592 		/* Tally up the load of all CPUs in the group */
4593 		avg_load = 0;
4594 
4595 		for_each_cpu(i, sched_group_cpus(group)) {
4596 			/* Bias balancing toward cpus of our domain */
4597 			if (local_group)
4598 				load = source_load(i, load_idx);
4599 			else
4600 				load = target_load(i, load_idx);
4601 
4602 			avg_load += load;
4603 		}
4604 
4605 		/* Adjust by relative CPU capacity of the group */
4606 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4607 
4608 		if (local_group) {
4609 			this_load = avg_load;
4610 		} else if (avg_load < min_load) {
4611 			min_load = avg_load;
4612 			idlest = group;
4613 		}
4614 	} while (group = group->next, group != sd->groups);
4615 
4616 	if (!idlest || 100*this_load < imbalance*min_load)
4617 		return NULL;
4618 	return idlest;
4619 }
4620 
4621 /*
4622  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4623  */
4624 static int
4625 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4626 {
4627 	unsigned long load, min_load = ULONG_MAX;
4628 	unsigned int min_exit_latency = UINT_MAX;
4629 	u64 latest_idle_timestamp = 0;
4630 	int least_loaded_cpu = this_cpu;
4631 	int shallowest_idle_cpu = -1;
4632 	int i;
4633 
4634 	/* Traverse only the allowed CPUs */
4635 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4636 		if (idle_cpu(i)) {
4637 			struct rq *rq = cpu_rq(i);
4638 			struct cpuidle_state *idle = idle_get_state(rq);
4639 			if (idle && idle->exit_latency < min_exit_latency) {
4640 				/*
4641 				 * We give priority to a CPU whose idle state
4642 				 * has the smallest exit latency irrespective
4643 				 * of any idle timestamp.
4644 				 */
4645 				min_exit_latency = idle->exit_latency;
4646 				latest_idle_timestamp = rq->idle_stamp;
4647 				shallowest_idle_cpu = i;
4648 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
4649 				   rq->idle_stamp > latest_idle_timestamp) {
4650 				/*
4651 				 * If equal or no active idle state, then
4652 				 * the most recently idled CPU might have
4653 				 * a warmer cache.
4654 				 */
4655 				latest_idle_timestamp = rq->idle_stamp;
4656 				shallowest_idle_cpu = i;
4657 			}
4658 		} else if (shallowest_idle_cpu == -1) {
4659 			load = weighted_cpuload(i);
4660 			if (load < min_load || (load == min_load && i == this_cpu)) {
4661 				min_load = load;
4662 				least_loaded_cpu = i;
4663 			}
4664 		}
4665 	}
4666 
4667 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4668 }
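
/*
 * The selection above prefers, in order: an idle cpu whose idle state has
 * the lowest exit latency, then (on ties or without cpuidle data) the most
 * recently idled cpu, and only if no allowed cpu is idle the least loaded
 * one, with this_cpu winning load ties.  E.g. with cpu1 in a deep C-state
 * (hypothetical 100us exit latency), cpu2 in a shallow one (10us) and cpu3
 * busy, cpu2 is returned.
 */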
4669 
4670 /*
4671  * Try and locate an idle CPU in the sched_domain.
4672  */
4673 static int select_idle_sibling(struct task_struct *p, int target)
4674 {
4675 	struct sched_domain *sd;
4676 	struct sched_group *sg;
4677 	int i = task_cpu(p);
4678 
4679 	if (idle_cpu(target))
4680 		return target;
4681 
4682 	/*
4683 	 * If the previous cpu is cache affine and idle, don't be stupid.
4684 	 */
4685 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4686 		return i;
4687 
4688 	/*
4689 	 * Otherwise, iterate the domains and find an eligible idle cpu.
4690 	 */
4691 	sd = rcu_dereference(per_cpu(sd_llc, target));
4692 	for_each_lower_domain(sd) {
4693 		sg = sd->groups;
4694 		do {
4695 			if (!cpumask_intersects(sched_group_cpus(sg),
4696 						tsk_cpus_allowed(p)))
4697 				goto next;
4698 
4699 			for_each_cpu(i, sched_group_cpus(sg)) {
4700 				if (i == target || !idle_cpu(i))
4701 					goto next;
4702 			}
4703 
4704 			target = cpumask_first_and(sched_group_cpus(sg),
4705 					tsk_cpus_allowed(p));
4706 			goto done;
4707 next:
4708 			sg = sg->next;
4709 		} while (sg != sd->groups);
4710 	}
4711 done:
4712 	return target;
4713 }
4714 
4715 /*
4716  * select_task_rq_fair: Select target runqueue for the waking task in domains
4717  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4718  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4719  *
4720  * Balances load by selecting the idlest cpu in the idlest group, or under
4721  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4722  *
4723  * Returns the target cpu number.
4724  *
4725  * preempt must be disabled.
4726  */
4727 static int
4728 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4729 {
4730 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4731 	int cpu = smp_processor_id();
4732 	int new_cpu = cpu;
4733 	int want_affine = 0;
4734 	int sync = wake_flags & WF_SYNC;
4735 
4736 	if (sd_flag & SD_BALANCE_WAKE)
4737 		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4738 
4739 	rcu_read_lock();
4740 	for_each_domain(cpu, tmp) {
4741 		if (!(tmp->flags & SD_LOAD_BALANCE))
4742 			continue;
4743 
4744 		/*
4745 		 * If both cpu and prev_cpu are part of this domain,
4746 		 * cpu is a valid SD_WAKE_AFFINE target.
4747 		 */
4748 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4749 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4750 			affine_sd = tmp;
4751 			break;
4752 		}
4753 
4754 		if (tmp->flags & sd_flag)
4755 			sd = tmp;
4756 	}
4757 
4758 	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4759 		prev_cpu = cpu;
4760 
4761 	if (sd_flag & SD_BALANCE_WAKE) {
4762 		new_cpu = select_idle_sibling(p, prev_cpu);
4763 		goto unlock;
4764 	}
4765 
4766 	while (sd) {
4767 		struct sched_group *group;
4768 		int weight;
4769 
4770 		if (!(sd->flags & sd_flag)) {
4771 			sd = sd->child;
4772 			continue;
4773 		}
4774 
4775 		group = find_idlest_group(sd, p, cpu, sd_flag);
4776 		if (!group) {
4777 			sd = sd->child;
4778 			continue;
4779 		}
4780 
4781 		new_cpu = find_idlest_cpu(group, p, cpu);
4782 		if (new_cpu == -1 || new_cpu == cpu) {
4783 			/* Now try balancing at a lower domain level of cpu */
4784 			sd = sd->child;
4785 			continue;
4786 		}
4787 
4788 		/* Now try balancing at a lower domain level of new_cpu */
4789 		cpu = new_cpu;
4790 		weight = sd->span_weight;
4791 		sd = NULL;
4792 		for_each_domain(cpu, tmp) {
4793 			if (weight <= tmp->span_weight)
4794 				break;
4795 			if (tmp->flags & sd_flag)
4796 				sd = tmp;
4797 		}
4798 		/* while loop will break here if sd == NULL */
4799 	}
4800 unlock:
4801 	rcu_read_unlock();
4802 
4803 	return new_cpu;
4804 }
4805 
4806 /*
4807  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4808  * cfs_rq_of(p) references at time of call are still valid and identify the
4809  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
4810  * other assumptions, including the state of rq->lock, should be made.
4811  */
4812 static void
4813 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4814 {
4815 	struct sched_entity *se = &p->se;
4816 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4817 
4818 	/*
4819 	 * Load tracking: accumulate removed load so that it can be processed
4820 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
4821 	 * to blocked load iff they have a positive decay-count.  It can never
4822 	 * be negative here since on-rq tasks have decay-count == 0.
4823 	 */
4824 	if (se->avg.decay_count) {
4825 		se->avg.decay_count = -__synchronize_entity_decay(se);
4826 		atomic_long_add(se->avg.load_avg_contrib,
4827 						&cfs_rq->removed_load);
4828 	}
4829 
4830 	/* We have migrated, no longer consider this task hot */
4831 	se->exec_start = 0;
4832 }
4833 #endif /* CONFIG_SMP */
4834 
4835 static unsigned long
4836 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4837 {
4838 	unsigned long gran = sysctl_sched_wakeup_granularity;
4839 
4840 	/*
4841 	 * Since it's curr that is running now, convert the gran from real-time
4842 	 * to virtual-time in its units.
4843 	 *
4844 	 * By using 'se' instead of 'curr' we penalize light tasks, so
4845 	 * they get preempted easier. That is, if 'se' < 'curr' then
4846 	 * the resulting gran will be larger, therefore penalizing the
4847 	 * lighter task; if, on the other hand, 'se' > 'curr' then the resulting gran will
4848 	 * be smaller, again penalizing the lighter task.
4849 	 *
4850 	 * This is especially important for buddies when the leftmost
4851 	 * task is higher priority than the buddy.
4852 	 */
4853 	return calc_delta_fair(gran, se);
4854 }
4855 
4856 /*
4857  * Should 'se' preempt 'curr'.
4858  *
4859  *             |s1
4860  *        |s2
4861  *   |s3
4862  *         g
4863  *      |<--->|c
4864  *
4865  *  w(c, s1) = -1
4866  *  w(c, s2) =  0
4867  *  w(c, s3) =  1
4868  *
4869  */
4870 static int
4871 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4872 {
4873 	s64 gran, vdiff = curr->vruntime - se->vruntime;
4874 
4875 	if (vdiff <= 0)
4876 		return -1;
4877 
4878 	gran = wakeup_gran(curr, se);
4879 	if (vdiff > gran)
4880 		return 1;
4881 
4882 	return 0;
4883 }
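
/*
 * With both tasks at nice 0 and a hypothetical wakeup granularity of 1ms,
 * gran is simply 1ms of vruntime.  A wakee lagging curr by 2ms of vruntime
 * returns 1 (preempt, the s3 case above), one within 1ms returns 0 (s2),
 * and one that is ahead of curr returns -1 (s1).
 */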
4884 
4885 static void set_last_buddy(struct sched_entity *se)
4886 {
4887 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4888 		return;
4889 
4890 	for_each_sched_entity(se)
4891 		cfs_rq_of(se)->last = se;
4892 }
4893 
4894 static void set_next_buddy(struct sched_entity *se)
4895 {
4896 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4897 		return;
4898 
4899 	for_each_sched_entity(se)
4900 		cfs_rq_of(se)->next = se;
4901 }
4902 
4903 static void set_skip_buddy(struct sched_entity *se)
4904 {
4905 	for_each_sched_entity(se)
4906 		cfs_rq_of(se)->skip = se;
4907 }
4908 
4909 /*
4910  * Preempt the current task with a newly woken task if needed:
4911  */
4912 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4913 {
4914 	struct task_struct *curr = rq->curr;
4915 	struct sched_entity *se = &curr->se, *pse = &p->se;
4916 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4917 	int scale = cfs_rq->nr_running >= sched_nr_latency;
4918 	int next_buddy_marked = 0;
4919 
4920 	if (unlikely(se == pse))
4921 		return;
4922 
4923 	/*
4924 	 * This is possible from callers such as attach_tasks(), in which we
4925 	 * unconditionally check_preempt_curr() after an enqueue (which may have
4926 	 * led to a throttle).  This both saves work and prevents false
4927 	 * next-buddy nomination below.
4928 	 */
4929 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4930 		return;
4931 
4932 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4933 		set_next_buddy(pse);
4934 		next_buddy_marked = 1;
4935 	}
4936 
4937 	/*
4938 	 * We can come here with TIF_NEED_RESCHED already set from new task
4939 	 * wake up path.
4940 	 *
4941 	 * Note: this also catches the edge-case of curr being in a throttled
4942 	 * group (e.g. via set_curr_task), since update_curr() (in the
4943 	 * enqueue of curr) will have resulted in resched being set.  This
4944 	 * prevents us from potentially nominating it as a false LAST_BUDDY
4945 	 * below.
4946 	 */
4947 	if (test_tsk_need_resched(curr))
4948 		return;
4949 
4950 	/* Idle tasks are by definition preempted by non-idle tasks. */
4951 	if (unlikely(curr->policy == SCHED_IDLE) &&
4952 	    likely(p->policy != SCHED_IDLE))
4953 		goto preempt;
4954 
4955 	/*
4956 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
4957 	 * is driven by the tick):
4958 	 */
4959 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4960 		return;
4961 
4962 	find_matching_se(&se, &pse);
4963 	update_curr(cfs_rq_of(se));
4964 	BUG_ON(!pse);
4965 	if (wakeup_preempt_entity(se, pse) == 1) {
4966 		/*
4967 		 * Bias pick_next to pick the sched entity that is
4968 		 * triggering this preemption.
4969 		 */
4970 		if (!next_buddy_marked)
4971 			set_next_buddy(pse);
4972 		goto preempt;
4973 	}
4974 
4975 	return;
4976 
4977 preempt:
4978 	resched_curr(rq);
4979 	/*
4980 	 * Only set the backward buddy when the current task is still
4981 	 * on the rq. This can happen when a wakeup gets interleaved
4982 	 * with schedule on the ->pre_schedule() or idle_balance()
4983 	 * point, either of which can drop the rq lock.
4984 	 *
4985 	 * Also, during early boot the idle thread is in the fair class,
4986 	 * for obvious reasons it's a bad idea to schedule back to it.
4987 	 */
4988 	if (unlikely(!se->on_rq || curr == rq->idle))
4989 		return;
4990 
4991 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4992 		set_last_buddy(se);
4993 }
4994 
4995 static struct task_struct *
4996 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4997 {
4998 	struct cfs_rq *cfs_rq = &rq->cfs;
4999 	struct sched_entity *se;
5000 	struct task_struct *p;
5001 	int new_tasks;
5002 
5003 again:
5004 #ifdef CONFIG_FAIR_GROUP_SCHED
5005 	if (!cfs_rq->nr_running)
5006 		goto idle;
5007 
5008 	if (prev->sched_class != &fair_sched_class)
5009 		goto simple;
5010 
5011 	/*
5012 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5013 	 * likely that a next task is from the same cgroup as the current.
5014 	 *
5015 	 * Therefore attempt to avoid putting and setting the entire cgroup
5016 	 * hierarchy, only change the part that actually changes.
5017 	 */
5018 
5019 	do {
5020 		struct sched_entity *curr = cfs_rq->curr;
5021 
5022 		/*
5023 		 * Since we got here without doing put_prev_entity() we also
5024 		 * have to consider cfs_rq->curr. If it is still a runnable
5025 		 * entity, update_curr() will update its vruntime, otherwise
5026 		 * forget we've ever seen it.
5027 		 */
5028 		if (curr && curr->on_rq)
5029 			update_curr(cfs_rq);
5030 		else
5031 			curr = NULL;
5032 
5033 		/*
5034 		 * This call to check_cfs_rq_runtime() will do the throttle and
5035 		 * dequeue its entity in the parent(s). Therefore the 'simple'
5036 		 * nr_running test will indeed be correct.
5037 		 */
5038 		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5039 			goto simple;
5040 
5041 		se = pick_next_entity(cfs_rq, curr);
5042 		cfs_rq = group_cfs_rq(se);
5043 	} while (cfs_rq);
5044 
5045 	p = task_of(se);
5046 
5047 	/*
5048 	 * Since we haven't yet done put_prev_entity and if the selected task
5049 	 * is a different task than we started out with, try and touch the
5050 	 * least amount of cfs_rqs.
5051 	 */
5052 	if (prev != p) {
5053 		struct sched_entity *pse = &prev->se;
5054 
5055 		while (!(cfs_rq = is_same_group(se, pse))) {
5056 			int se_depth = se->depth;
5057 			int pse_depth = pse->depth;
5058 
5059 			if (se_depth <= pse_depth) {
5060 				put_prev_entity(cfs_rq_of(pse), pse);
5061 				pse = parent_entity(pse);
5062 			}
5063 			if (se_depth >= pse_depth) {
5064 				set_next_entity(cfs_rq_of(se), se);
5065 				se = parent_entity(se);
5066 			}
5067 		}
5068 
5069 		put_prev_entity(cfs_rq, pse);
5070 		set_next_entity(cfs_rq, se);
5071 	}
5072 
5073 	if (hrtick_enabled(rq))
5074 		hrtick_start_fair(rq, p);
5075 
5076 	return p;
5077 simple:
5078 	cfs_rq = &rq->cfs;
5079 #endif
5080 
5081 	if (!cfs_rq->nr_running)
5082 		goto idle;
5083 
5084 	put_prev_task(rq, prev);
5085 
5086 	do {
5087 		se = pick_next_entity(cfs_rq, NULL);
5088 		set_next_entity(cfs_rq, se);
5089 		cfs_rq = group_cfs_rq(se);
5090 	} while (cfs_rq);
5091 
5092 	p = task_of(se);
5093 
5094 	if (hrtick_enabled(rq))
5095 		hrtick_start_fair(rq, p);
5096 
5097 	return p;
5098 
5099 idle:
5100 	new_tasks = idle_balance(rq);
5101 	/*
5102 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5103 	 * possible for any higher priority task to appear. In that case we
5104 	 * must re-start the pick_next_entity() loop.
5105 	 */
5106 	if (new_tasks < 0)
5107 		return RETRY_TASK;
5108 
5109 	if (new_tasks > 0)
5110 		goto again;
5111 
5112 	return NULL;
5113 }
5114 
5115 /*
5116  * Account for a descheduled task:
5117  */
5118 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5119 {
5120 	struct sched_entity *se = &prev->se;
5121 	struct cfs_rq *cfs_rq;
5122 
5123 	for_each_sched_entity(se) {
5124 		cfs_rq = cfs_rq_of(se);
5125 		put_prev_entity(cfs_rq, se);
5126 	}
5127 }
5128 
5129 /*
5130  * sched_yield() is very simple
5131  *
5132  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5133  */
5134 static void yield_task_fair(struct rq *rq)
5135 {
5136 	struct task_struct *curr = rq->curr;
5137 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5138 	struct sched_entity *se = &curr->se;
5139 
5140 	/*
5141 	 * Are we the only task in the tree?
5142 	 */
5143 	if (unlikely(rq->nr_running == 1))
5144 		return;
5145 
5146 	clear_buddies(cfs_rq, se);
5147 
5148 	if (curr->policy != SCHED_BATCH) {
5149 		update_rq_clock(rq);
5150 		/*
5151 		 * Update run-time statistics of the 'current'.
5152 		 */
5153 		update_curr(cfs_rq);
5154 		/*
5155 		 * Tell update_rq_clock() that we've just updated,
5156 		 * so we don't do microscopic update in schedule()
5157 		 * and double the fastpath cost.
5158 		 */
5159 		rq_clock_skip_update(rq, true);
5160 	}
5161 
5162 	set_skip_buddy(se);
5163 }
5164 
5165 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5166 {
5167 	struct sched_entity *se = &p->se;
5168 
5169 	/* throttled hierarchies are not runnable */
5170 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5171 		return false;
5172 
5173 	/* Tell the scheduler that we'd really like pse to run next. */
5174 	set_next_buddy(se);
5175 
5176 	yield_task_fair(rq);
5177 
5178 	return true;
5179 }
5180 
5181 #ifdef CONFIG_SMP
5182 /**************************************************
5183  * Fair scheduling class load-balancing methods.
5184  *
5185  * BASICS
5186  *
5187  * The purpose of load-balancing is to achieve the same basic fairness the
5188  * per-cpu scheduler provides, namely provide a proportional amount of compute
5189  * per-cpu scheduler provides, namely to provide a proportional amount of compute
5190  *
5191  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5192  *
5193  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5194  * W_i,0 is defined as:
5195  *
5196  *   W_i,0 = \Sum_j w_i,j                                             (2)
5197  *
5198  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5199  * is derived from the nice value as per prio_to_weight[].
5200  *
5201  * The weight average is an exponential decay average of the instantaneous
5202  * weight:
5203  *
5204  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5205  *
5206  * C_i is the compute capacity of cpu i; typically it is the
5207  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5208  * can also include other factors [XXX].
5209  *
5210  * To achieve this balance we define a measure of imbalance which follows
5211  * directly from (1):
5212  *
5213  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5214  *
5215  * We then move tasks around to minimize the imbalance. In the continuous
5216  * function space it is obvious this converges, in the discrete case we get
5217  * a few fun cases generally called infeasible weight scenarios.
5218  *
5219  * [XXX expand on:
5220  *     - infeasible weights;
5221  *     - local vs global optima in the discrete case. ]
5222  *
5223  *
5224  * SCHED DOMAINS
5225  *
5226  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5227  * for all i,j solution, we create a tree of cpus that follows the hardware
5228  * topology where each level pairs two lower groups (or better). This results
5229  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5230  * tree to only the first of the previous level and we decrease the frequency
5231  * of load-balance at each level in inverse proportion to the number of cpus in
5232  * the groups.
5233  *
5234  * This yields:
5235  *
5236  *     log_2 n     1     n
5237  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5238  *     i = 0      2^i   2^i
5239  *                               `- size of each group
5240  *         |         |     `- number of cpus doing load-balance
5241  *         |         `- freq
5242  *         `- sum over all levels
5243  *
5244  * Coupled with a limit on how many tasks we can migrate every balance pass,
5245  * this makes (5) the runtime complexity of the balancer.
5246  *
5247  * An important property here is that each CPU is still (indirectly) connected
5248  * to every other cpu in at most O(log n) steps:
5249  *
5250  * The adjacency matrix of the resulting graph is given by:
5251  *
5252  *             log_2 n
5253  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5254  *             k = 0
5255  *
5256  * And you'll find that:
5257  *
5258  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5259  *
5260  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5261  * The task movement gives a factor of O(m), giving a convergence complexity
5262  * of:
5263  *
5264  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5265  *
5266  *
5267  * WORK CONSERVING
5268  *
5269  * In order to avoid CPUs going idle while there's still work to do, new idle
5270  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5271  * tree itself instead of relying on other CPUs to bring it work.
5272  *
5273  * This adds some complexity to both (5) and (8) but it reduces the total idle
5274  * time.
5275  *
5276  * [XXX more?]
5277  *
5278  *
5279  * CGROUPS
5280  *
5281  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5282  *
5283  *                                s_k,i
5284  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5285  *                                 S_k
5286  *
5287  * Where
5288  *
5289  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5290  *
5291  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5292  *
5293  * The big problem is S_k: it's a global sum needed to compute a local (W_i)
5294  * property.
5295  *
5296  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5297  *      rewrite all of this once again.]
5298  */
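
/*
 * Plugging numbers into (5) and (7): with n = 16 cpus each term of the sum
 * is n / 2^i, i.e. 16 + 8 + 4 + 2 + 1 = 31 = 2n - 1, so a full set of
 * balance passes costs O(n) work; and per (7) any cpu's load can reach any
 * other cpu within log_2 16 = 4 balance steps.
 */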
5299 
5300 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5301 
5302 enum fbq_type { regular, remote, all };
5303 
5304 #define LBF_ALL_PINNED	0x01
5305 #define LBF_NEED_BREAK	0x02
5306 #define LBF_DST_PINNED  0x04
5307 #define LBF_SOME_PINNED	0x08
5308 
5309 struct lb_env {
5310 	struct sched_domain	*sd;
5311 
5312 	struct rq		*src_rq;
5313 	int			src_cpu;
5314 
5315 	int			dst_cpu;
5316 	struct rq		*dst_rq;
5317 
5318 	struct cpumask		*dst_grpmask;
5319 	int			new_dst_cpu;
5320 	enum cpu_idle_type	idle;
5321 	long			imbalance;
5322 	/* The set of CPUs under consideration for load-balancing */
5323 	struct cpumask		*cpus;
5324 
5325 	unsigned int		flags;
5326 
5327 	unsigned int		loop;
5328 	unsigned int		loop_break;
5329 	unsigned int		loop_max;
5330 
5331 	enum fbq_type		fbq_type;
5332 	struct list_head	tasks;
5333 };
5334 
5335 /*
5336  * Is this task likely cache-hot:
5337  */
5338 static int task_hot(struct task_struct *p, struct lb_env *env)
5339 {
5340 	s64 delta;
5341 
5342 	lockdep_assert_held(&env->src_rq->lock);
5343 
5344 	if (p->sched_class != &fair_sched_class)
5345 		return 0;
5346 
5347 	if (unlikely(p->policy == SCHED_IDLE))
5348 		return 0;
5349 
5350 	/*
5351 	 * Buddy candidates are cache hot:
5352 	 */
5353 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5354 			(&p->se == cfs_rq_of(&p->se)->next ||
5355 			 &p->se == cfs_rq_of(&p->se)->last))
5356 		return 1;
5357 
5358 	if (sysctl_sched_migration_cost == -1)
5359 		return 1;
5360 	if (sysctl_sched_migration_cost == 0)
5361 		return 0;
5362 
5363 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5364 
5365 	return delta < (s64)sysctl_sched_migration_cost;
5366 }
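
/*
 * In short: with CACHE_HOT_BUDDY, next/last buddies on a busy destination
 * are always hot, sysctl_sched_migration_cost of -1 forces "hot" and 0
 * forces "cold", and otherwise a task is hot iff it last ran within that
 * many nanoseconds of the source rq's task clock -- e.g. with a
 * hypothetical 0.5ms setting, a task preempted 0.3ms ago is left alone
 * while one idle for 2ms may be migrated.
 */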
5367 
5368 #ifdef CONFIG_NUMA_BALANCING
5369 /* Returns true if the destination node has incurred more faults */
5370 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5371 {
5372 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5373 	int src_nid, dst_nid;
5374 
5375 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5376 	    !(env->sd->flags & SD_NUMA)) {
5377 		return false;
5378 	}
5379 
5380 	src_nid = cpu_to_node(env->src_cpu);
5381 	dst_nid = cpu_to_node(env->dst_cpu);
5382 
5383 	if (src_nid == dst_nid)
5384 		return false;
5385 
5386 	if (numa_group) {
5387 		/* Task is already in the group's interleave set. */
5388 		if (node_isset(src_nid, numa_group->active_nodes))
5389 			return false;
5390 
5391 		/* Task is moving into the group's interleave set. */
5392 		if (node_isset(dst_nid, numa_group->active_nodes))
5393 			return true;
5394 
5395 		return group_faults(p, dst_nid) > group_faults(p, src_nid);
5396 	}
5397 
5398 	/* Encourage migration to the preferred node. */
5399 	if (dst_nid == p->numa_preferred_nid)
5400 		return true;
5401 
5402 	return task_faults(p, dst_nid) > task_faults(p, src_nid);
5403 }
5404 
5405 
5406 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5407 {
5408 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5409 	int src_nid, dst_nid;
5410 
5411 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5412 		return false;
5413 
5414 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5415 		return false;
5416 
5417 	src_nid = cpu_to_node(env->src_cpu);
5418 	dst_nid = cpu_to_node(env->dst_cpu);
5419 
5420 	if (src_nid == dst_nid)
5421 		return false;
5422 
5423 	if (numa_group) {
5424 		/* Task is moving within/into the group's interleave set. */
5425 		if (node_isset(dst_nid, numa_group->active_nodes))
5426 			return false;
5427 
5428 		/* Task is moving out of the group's interleave set. */
5429 		if (node_isset(src_nid, numa_group->active_nodes))
5430 			return true;
5431 
5432 		return group_faults(p, dst_nid) < group_faults(p, src_nid);
5433 	}
5434 
5435 	/* Migrating away from the preferred node is always bad. */
5436 	if (src_nid == p->numa_preferred_nid)
5437 		return true;
5438 
5439 	return task_faults(p, dst_nid) < task_faults(p, src_nid);
5440 }
5441 
5442 #else
5443 static inline bool migrate_improves_locality(struct task_struct *p,
5444 					     struct lb_env *env)
5445 {
5446 	return false;
5447 }
5448 
5449 static inline bool migrate_degrades_locality(struct task_struct *p,
5450 					     struct lb_env *env)
5451 {
5452 	return false;
5453 }
5454 #endif
5455 
5456 /*
5457  * can_migrate_task - can task p from runqueue rq be migrated to this_cpu?
5458  */
5459 static
5460 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5461 {
5462 	int tsk_cache_hot = 0;
5463 
5464 	lockdep_assert_held(&env->src_rq->lock);
5465 
5466 	/*
5467 	 * We do not migrate tasks that:
5468 	 * 1) are throttled (throttled_lb_pair), or
5469 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5470 	 * 3) are running (obviously), or
5471 	 * 4) are cache-hot on their current CPU.
5472 	 */
5473 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5474 		return 0;
5475 
5476 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5477 		int cpu;
5478 
5479 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5480 
5481 		env->flags |= LBF_SOME_PINNED;
5482 
5483 		/*
5484 		 * Remember if this task can be migrated to any other cpu in
5485 		 * our sched_group. We may want to revisit it if we couldn't
5486 		 * meet load balance goals by pulling other tasks on src_cpu.
5487 		 *
5488 		 * Also avoid computing new_dst_cpu if we have already computed
5489 		 * one in current iteration.
5490 		 */
5491 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5492 			return 0;
5493 
5494 		/* Prevent re-selecting dst_cpu via env's cpus */
5495 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5496 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5497 				env->flags |= LBF_DST_PINNED;
5498 				env->new_dst_cpu = cpu;
5499 				break;
5500 			}
5501 		}
5502 
5503 		return 0;
5504 	}
5505 
5506 	/* Record that we found at least one task that could run on dst_cpu */
5507 	env->flags &= ~LBF_ALL_PINNED;
5508 
5509 	if (task_running(env->src_rq, p)) {
5510 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5511 		return 0;
5512 	}
5513 
5514 	/*
5515 	 * Aggressive migration if:
5516 	 * 1) destination numa is preferred, or
5517 	 * 2) task is cache cold, or
5518 	 * 3) too many balance attempts have failed.
5519 	 */
5520 	tsk_cache_hot = task_hot(p, env);
5521 	if (!tsk_cache_hot)
5522 		tsk_cache_hot = migrate_degrades_locality(p, env);
5523 
5524 	if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5525 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5526 		if (tsk_cache_hot) {
5527 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5528 			schedstat_inc(p, se.statistics.nr_forced_migrations);
5529 		}
5530 		return 1;
5531 	}
5532 
5533 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5534 	return 0;
5535 }
5536 
5537 /*
5538  * detach_task() -- detach the task for the migration specified in env
5539  */
5540 static void detach_task(struct task_struct *p, struct lb_env *env)
5541 {
5542 	lockdep_assert_held(&env->src_rq->lock);
5543 
5544 	deactivate_task(env->src_rq, p, 0);
5545 	p->on_rq = TASK_ON_RQ_MIGRATING;
5546 	set_task_cpu(p, env->dst_cpu);
5547 }
5548 
5549 /*
5550  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5551  * part of active balancing operations within "domain".
5552  *
5553  * Returns a task if successful and NULL otherwise.
5554  */
5555 static struct task_struct *detach_one_task(struct lb_env *env)
5556 {
5557 	struct task_struct *p, *n;
5558 
5559 	lockdep_assert_held(&env->src_rq->lock);
5560 
5561 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5562 		if (!can_migrate_task(p, env))
5563 			continue;
5564 
5565 		detach_task(p, env);
5566 
5567 		/*
5568 		 * Right now, this is only the second place where
5569 		 * lb_gained[env->idle] is updated (the other is detach_tasks())
5570 		 * so we can safely collect stats here rather than
5571 		 * inside detach_tasks().
5572 		 */
5573 		schedstat_inc(env->sd, lb_gained[env->idle]);
5574 		return p;
5575 	}
5576 	return NULL;
5577 }
5578 
5579 static const unsigned int sched_nr_migrate_break = 32;
5580 
5581 /*
5582  * detach_tasks() -- tries to detach up to imbalance weighted load from
5583  * busiest_rq, as part of a balancing operation within domain "sd".
5584  *
5585  * Returns number of detached tasks if successful and 0 otherwise.
5586  */
5587 static int detach_tasks(struct lb_env *env)
5588 {
5589 	struct list_head *tasks = &env->src_rq->cfs_tasks;
5590 	struct task_struct *p;
5591 	unsigned long load;
5592 	int detached = 0;
5593 
5594 	lockdep_assert_held(&env->src_rq->lock);
5595 
5596 	if (env->imbalance <= 0)
5597 		return 0;
5598 
5599 	while (!list_empty(tasks)) {
5600 		p = list_first_entry(tasks, struct task_struct, se.group_node);
5601 
5602 		env->loop++;
5603 		/* We've more or less seen every task there is, call it quits */
5604 		if (env->loop > env->loop_max)
5605 			break;
5606 
5607 		/* take a breather every nr_migrate tasks */
5608 		if (env->loop > env->loop_break) {
5609 			env->loop_break += sched_nr_migrate_break;
5610 			env->flags |= LBF_NEED_BREAK;
5611 			break;
5612 		}
5613 
5614 		if (!can_migrate_task(p, env))
5615 			goto next;
5616 
5617 		load = task_h_load(p);
5618 
5619 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5620 			goto next;
5621 
5622 		if ((load / 2) > env->imbalance)
5623 			goto next;
5624 
5625 		detach_task(p, env);
5626 		list_add(&p->se.group_node, &env->tasks);
5627 
5628 		detached++;
5629 		env->imbalance -= load;
5630 
5631 #ifdef CONFIG_PREEMPT
5632 		/*
5633 		 * NEWIDLE balancing is a source of latency, so preemptible
5634 		 * kernels will stop after the first task is detached to minimize
5635 		 * the critical section.
5636 		 */
5637 		if (env->idle == CPU_NEWLY_IDLE)
5638 			break;
5639 #endif
5640 
5641 		/*
5642 		 * We only want to steal up to the prescribed amount of
5643 		 * weighted load.
5644 		 */
5645 		if (env->imbalance <= 0)
5646 			break;
5647 
5648 		continue;
5649 next:
5650 		list_move_tail(&p->se.group_node, tasks);
5651 	}
5652 
5653 	/*
5654 	 * Right now, this is one of only two places we collect this stat
5655 	 * so we can safely collect detach_one_task() stats here rather
5656 	 * than inside detach_one_task().
5657 	 */
5658 	schedstat_add(env->sd, lb_gained[env->idle], detached);
5659 
5660 	return detached;
5661 }
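/*
 * Hypothetical walk through the loop above: with env->imbalance == 300, a
 * task whose task_h_load() is 1024 is skipped (1024/2 = 512 > 300), while a
 * task with a load of 400 is detached (400/2 = 200 <= 300); env->imbalance
 * then drops to -100 and the loop stops.  The detached load may therefore
 * overshoot the requested imbalance slightly, and the LBF_NEED_BREAK path
 * above makes sure the busiest rq lock is released every
 * sched_nr_migrate_break (32) iterations so it isn't held for too long.
 */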
5662 
5663 /*
5664  * attach_task() -- attach the task detached by detach_task() to its new rq.
5665  */
5666 static void attach_task(struct rq *rq, struct task_struct *p)
5667 {
5668 	lockdep_assert_held(&rq->lock);
5669 
5670 	BUG_ON(task_rq(p) != rq);
5671 	p->on_rq = TASK_ON_RQ_QUEUED;
5672 	activate_task(rq, p, 0);
5673 	check_preempt_curr(rq, p, 0);
5674 }
5675 
5676 /*
5677  * attach_one_task() -- attaches the task returned from detach_one_task() to
5678  * its new rq.
5679  */
5680 static void attach_one_task(struct rq *rq, struct task_struct *p)
5681 {
5682 	raw_spin_lock(&rq->lock);
5683 	attach_task(rq, p);
5684 	raw_spin_unlock(&rq->lock);
5685 }
5686 
5687 /*
5688  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5689  * new rq.
5690  */
5691 static void attach_tasks(struct lb_env *env)
5692 {
5693 	struct list_head *tasks = &env->tasks;
5694 	struct task_struct *p;
5695 
5696 	raw_spin_lock(&env->dst_rq->lock);
5697 
5698 	while (!list_empty(tasks)) {
5699 		p = list_first_entry(tasks, struct task_struct, se.group_node);
5700 		list_del_init(&p->se.group_node);
5701 
5702 		attach_task(env->dst_rq, p);
5703 	}
5704 
5705 	raw_spin_unlock(&env->dst_rq->lock);
5706 }
5707 
5708 #ifdef CONFIG_FAIR_GROUP_SCHED
5709 /*
5710  * update tg->load_weight by folding this cpu's load_avg
5711  */
5712 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5713 {
5714 	struct sched_entity *se = tg->se[cpu];
5715 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5716 
5717 	/* throttled entities do not contribute to load */
5718 	if (throttled_hierarchy(cfs_rq))
5719 		return;
5720 
5721 	update_cfs_rq_blocked_load(cfs_rq, 1);
5722 
5723 	if (se) {
5724 		update_entity_load_avg(se, 1);
5725 		/*
5726 		 * We pivot on our runnable average having decayed to zero for
5727 		 * list removal.  This generally implies that all our children
5728 		 * have also been removed (modulo rounding error or bandwidth
5729 		 * control); however, such cases are rare and we can fix these
5730 		 * at enqueue.
5731 		 *
5732 		 * TODO: fix up out-of-order children on enqueue.
5733 		 */
5734 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5735 			list_del_leaf_cfs_rq(cfs_rq);
5736 	} else {
5737 		struct rq *rq = rq_of(cfs_rq);
5738 		update_rq_runnable_avg(rq, rq->nr_running);
5739 	}
5740 }
5741 
5742 static void update_blocked_averages(int cpu)
5743 {
5744 	struct rq *rq = cpu_rq(cpu);
5745 	struct cfs_rq *cfs_rq;
5746 	unsigned long flags;
5747 
5748 	raw_spin_lock_irqsave(&rq->lock, flags);
5749 	update_rq_clock(rq);
5750 	/*
5751 	 * Iterates the task_group tree in a bottom up fashion, see
5752 	 * list_add_leaf_cfs_rq() for details.
5753 	 */
5754 	for_each_leaf_cfs_rq(rq, cfs_rq) {
5755 		/*
5756 		 * Note: We may want to consider periodically releasing
5757 		 * rq->lock about these updates so that creating many task
5758 		 * groups does not result in continually extending hold time.
5759 		 */
5760 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5761 	}
5762 
5763 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5764 }
5765 
5766 /*
5767  * Compute the hierarchical load factor for cfs_rq and all its ancestors.
5768  * This needs to be done in a top-down fashion because the load of a child
5769  * group is a fraction of its parent's load.
5770  */
5771 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5772 {
5773 	struct rq *rq = rq_of(cfs_rq);
5774 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5775 	unsigned long now = jiffies;
5776 	unsigned long load;
5777 
5778 	if (cfs_rq->last_h_load_update == now)
5779 		return;
5780 
5781 	cfs_rq->h_load_next = NULL;
5782 	for_each_sched_entity(se) {
5783 		cfs_rq = cfs_rq_of(se);
5784 		cfs_rq->h_load_next = se;
5785 		if (cfs_rq->last_h_load_update == now)
5786 			break;
5787 	}
5788 
5789 	if (!se) {
5790 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
5791 		cfs_rq->last_h_load_update = now;
5792 	}
5793 
5794 	while ((se = cfs_rq->h_load_next) != NULL) {
5795 		load = cfs_rq->h_load;
5796 		load = div64_ul(load * se->avg.load_avg_contrib,
5797 				cfs_rq->runnable_load_avg + 1);
5798 		cfs_rq = group_cfs_rq(se);
5799 		cfs_rq->h_load = load;
5800 		cfs_rq->last_h_load_update = now;
5801 	}
5802 }
5803 
5804 static unsigned long task_h_load(struct task_struct *p)
5805 {
5806 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
5807 
5808 	update_cfs_rq_h_load(cfs_rq);
5809 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
5810 			cfs_rq->runnable_load_avg + 1);
5811 }
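/*
 * Sketch, with made-up numbers, of how the h_load chain above composes:
 * say the root cfs_rq has runnable_load_avg = 2048 and a group entity
 * contributing load_avg_contrib = 1024; update_cfs_rq_h_load() then gives
 * that group's cfs_rq h_load = 2048 * 1024 / 2049 ~= 1023.  A task on that
 * group cfs_rq with load_avg_contrib = 512, where the group cfs_rq's
 * runnable_load_avg is 1024, ends up with task_h_load() ~= 512 * 1023 / 1025
 * ~= 511, i.e. roughly a quarter of the root-level load, as expected for
 * half of a half.
 */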
5812 #else
5813 static inline void update_blocked_averages(int cpu)
5814 {
5815 }
5816 
5817 static unsigned long task_h_load(struct task_struct *p)
5818 {
5819 	return p->se.avg.load_avg_contrib;
5820 }
5821 #endif
5822 
5823 /********** Helpers for find_busiest_group ************************/
5824 
5825 enum group_type {
5826 	group_other = 0,
5827 	group_imbalanced,
5828 	group_overloaded,
5829 };
5830 
5831 /*
5832  * sg_lb_stats - stats of a sched_group required for load_balancing
5833  */
5834 struct sg_lb_stats {
5835 	unsigned long avg_load; /* Avg load across the CPUs of the group */
5836 	unsigned long group_load; /* Total load over the CPUs of the group */
5837 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5838 	unsigned long load_per_task;
5839 	unsigned long group_capacity;
5840 	unsigned int sum_nr_running; /* Nr tasks running in the group */
5841 	unsigned int group_capacity_factor;
5842 	unsigned int idle_cpus;
5843 	unsigned int group_weight;
5844 	enum group_type group_type;
5845 	int group_has_free_capacity;
5846 #ifdef CONFIG_NUMA_BALANCING
5847 	unsigned int nr_numa_running;
5848 	unsigned int nr_preferred_running;
5849 #endif
5850 };
5851 
5852 /*
5853  * sd_lb_stats - Structure to store the statistics of a sched_domain
5854  *		 during load balancing.
5855  */
5856 struct sd_lb_stats {
5857 	struct sched_group *busiest;	/* Busiest group in this sd */
5858 	struct sched_group *local;	/* Local group in this sd */
5859 	unsigned long total_load;	/* Total load of all groups in sd */
5860 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
5861 	unsigned long avg_load;	/* Average load across all groups in sd */
5862 
5863 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5864 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
5865 };
5866 
5867 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5868 {
5869 	/*
5870 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
5871 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
5872 	 * We must however clear busiest_stat::avg_load because
5873 	 * update_sd_pick_busiest() reads this before assignment.
5874 	 */
5875 	*sds = (struct sd_lb_stats){
5876 		.busiest = NULL,
5877 		.local = NULL,
5878 		.total_load = 0UL,
5879 		.total_capacity = 0UL,
5880 		.busiest_stat = {
5881 			.avg_load = 0UL,
5882 			.sum_nr_running = 0,
5883 			.group_type = group_other,
5884 		},
5885 	};
5886 }
5887 
5888 /**
5889  * get_sd_load_idx - Obtain the load index for a given sched domain.
5890  * @sd: The sched_domain whose load_idx is to be obtained.
5891  * @idle: The idle status of the CPU whose sd's load_idx is being obtained.
5892  *
5893  * Return: The load index.
5894  */
5895 static inline int get_sd_load_idx(struct sched_domain *sd,
5896 					enum cpu_idle_type idle)
5897 {
5898 	int load_idx;
5899 
5900 	switch (idle) {
5901 	case CPU_NOT_IDLE:
5902 		load_idx = sd->busy_idx;
5903 		break;
5904 
5905 	case CPU_NEWLY_IDLE:
5906 		load_idx = sd->newidle_idx;
5907 		break;
5908 	default:
5909 		load_idx = sd->idle_idx;
5910 		break;
5911 	}
5912 
5913 	return load_idx;
5914 }
5915 
5916 static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5917 {
5918 	return SCHED_CAPACITY_SCALE;
5919 }
5920 
5921 unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5922 {
5923 	return default_scale_capacity(sd, cpu);
5924 }
5925 
5926 static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5927 {
5928 	if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5929 		return sd->smt_gain / sd->span_weight;
5930 
5931 	return SCHED_CAPACITY_SCALE;
5932 }
5933 
5934 unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5935 {
5936 	return default_scale_cpu_capacity(sd, cpu);
5937 }
5938 
5939 static unsigned long scale_rt_capacity(int cpu)
5940 {
5941 	struct rq *rq = cpu_rq(cpu);
5942 	u64 total, available, age_stamp, avg;
5943 	s64 delta;
5944 
5945 	/*
5946 	 * Since we're reading these variables without serialization, make sure
5947 	 * we read them once before doing sanity checks on them.
5948 	 */
5949 	age_stamp = ACCESS_ONCE(rq->age_stamp);
5950 	avg = ACCESS_ONCE(rq->rt_avg);
5951 	delta = __rq_clock_broken(rq) - age_stamp;
5952 
5953 	if (unlikely(delta < 0))
5954 		delta = 0;
5955 
5956 	total = sched_avg_period() + delta;
5957 
5958 	if (unlikely(total < avg)) {
5959 		/* Ensures that capacity won't end up being negative */
5960 		available = 0;
5961 	} else {
5962 		available = total - avg;
5963 	}
5964 
5965 	if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5966 		total = SCHED_CAPACITY_SCALE;
5967 
5968 	total >>= SCHED_CAPACITY_SHIFT;
5969 
5970 	return div_u64(available, total);
5971 }
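/*
 * The value returned above is scaled: dividing 'available' by
 * (total >> SCHED_CAPACITY_SHIFT) is equivalent to
 * available / total * SCHED_CAPACITY_SCALE.  As a hypothetical example
 * (assuming SCHED_CAPACITY_SCALE == 1024): if the time accumulated in
 * rq->rt_avg amounts to ~25% of the averaging window, the result is roughly
 * 0.75 * 1024 = 768, i.e. only three quarters of the CPU remains for CFS
 * tasks.
 */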
5972 
5973 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5974 {
5975 	unsigned long capacity = SCHED_CAPACITY_SCALE;
5976 	struct sched_group *sdg = sd->groups;
5977 
5978 	if (sched_feat(ARCH_CAPACITY))
5979 		capacity *= arch_scale_cpu_capacity(sd, cpu);
5980 	else
5981 		capacity *= default_scale_cpu_capacity(sd, cpu);
5982 
5983 	capacity >>= SCHED_CAPACITY_SHIFT;
5984 
5985 	sdg->sgc->capacity_orig = capacity;
5986 
5987 	if (sched_feat(ARCH_CAPACITY))
5988 		capacity *= arch_scale_freq_capacity(sd, cpu);
5989 	else
5990 		capacity *= default_scale_capacity(sd, cpu);
5991 
5992 	capacity >>= SCHED_CAPACITY_SHIFT;
5993 
5994 	capacity *= scale_rt_capacity(cpu);
5995 	capacity >>= SCHED_CAPACITY_SHIFT;
5996 
5997 	if (!capacity)
5998 		capacity = 1;
5999 
6000 	cpu_rq(cpu)->cpu_capacity = capacity;
6001 	sdg->sgc->capacity = capacity;
6002 }
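/*
 * Worked example, with hypothetical numbers, for the scaling chain above
 * (assuming SCHED_CAPACITY_SCALE == 1024): if arch_scale_cpu_capacity()
 * reports 589 for an SMT sibling, capacity_orig becomes
 * (1024 * 589) >> 10 = 589.  With full-speed frequency scaling (1024) that
 * stays 589, and if scale_rt_capacity() returns 768 (~25% of the CPU eaten
 * by non-CFS work) the final capacity is (589 * 768) >> 10 = 441.
 */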
6003 
6004 void update_group_capacity(struct sched_domain *sd, int cpu)
6005 {
6006 	struct sched_domain *child = sd->child;
6007 	struct sched_group *group, *sdg = sd->groups;
6008 	unsigned long capacity, capacity_orig;
6009 	unsigned long interval;
6010 
6011 	interval = msecs_to_jiffies(sd->balance_interval);
6012 	interval = clamp(interval, 1UL, max_load_balance_interval);
6013 	sdg->sgc->next_update = jiffies + interval;
6014 
6015 	if (!child) {
6016 		update_cpu_capacity(sd, cpu);
6017 		return;
6018 	}
6019 
6020 	capacity_orig = capacity = 0;
6021 
6022 	if (child->flags & SD_OVERLAP) {
6023 		/*
6024 		 * SD_OVERLAP domains cannot assume that child groups
6025 		 * span the current group.
6026 		 */
6027 
6028 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
6029 			struct sched_group_capacity *sgc;
6030 			struct rq *rq = cpu_rq(cpu);
6031 
6032 			/*
6033 			 * build_sched_domains() -> init_sched_groups_capacity()
6034 			 * gets here before we've attached the domains to the
6035 			 * runqueues.
6036 			 *
6037 			 * Use capacity_of(), which is set irrespective of domains
6038 			 * in update_cpu_capacity().
6039 			 *
6040 			 * This avoids capacity/capacity_orig from being 0 and
6041 			 * causing divide-by-zero issues on boot.
6042 			 *
6043 			 * Runtime updates will correct capacity_orig.
6044 			 */
6045 			if (unlikely(!rq->sd)) {
6046 				capacity_orig += capacity_of(cpu);
6047 				capacity += capacity_of(cpu);
6048 				continue;
6049 			}
6050 
6051 			sgc = rq->sd->groups->sgc;
6052 			capacity_orig += sgc->capacity_orig;
6053 			capacity += sgc->capacity;
6054 		}
6055 	} else  {
6056 		/*
6057 		 * !SD_OVERLAP domains can assume that child groups
6058 		 * span the current group.
6059 		 */
6060 
6061 		group = child->groups;
6062 		do {
6063 			capacity_orig += group->sgc->capacity_orig;
6064 			capacity += group->sgc->capacity;
6065 			group = group->next;
6066 		} while (group != child->groups);
6067 	}
6068 
6069 	sdg->sgc->capacity_orig = capacity_orig;
6070 	sdg->sgc->capacity = capacity;
6071 }
6072 
6073 /*
6074  * Try to fix up capacity for tiny siblings; this is needed when
6075  * things like SD_ASYM_PACKING need f_b_g to select another sibling
6076  * which on its own isn't powerful enough.
6077  *
6078  * See update_sd_pick_busiest() and check_asym_packing().
6079  */
6080 static inline int
6081 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
6082 {
6083 	/*
6084 	 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
6085 	 */
6086 	if (!(sd->flags & SD_SHARE_CPUCAPACITY))
6087 		return 0;
6088 
6089 	/*
6090 	 * If ~90% of the cpu_capacity is still there, we're good.
6091 	 */
6092 	if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
6093 		return 1;
6094 
6095 	return 0;
6096 }
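/*
 * The 32/29 ratio above approximates 90.6%: with a hypothetical
 * capacity_orig of 589, a remaining capacity of 540 still passes
 * (540 * 32 = 17280 > 589 * 29 = 17081) and the group is credited with one
 * unit of capacity, while a capacity of 500 fails the check and the
 * capacity factor stays at 0.
 */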
6097 
6098 /*
6099  * Group imbalance indicates (and tries to solve) the problem where balancing
6100  * groups is inadequate due to tsk_cpus_allowed() constraints.
6101  *
6102  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6103  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6104  * Something like:
6105  *
6106  * 	{ 0 1 2 3 } { 4 5 6 7 }
6107  * 	        *     * * *
6108  *
6109  * If we were to balance group-wise we'd place two tasks in the first group and
6110  * two tasks in the second group. Clearly this is undesired as it will overload
6111  * cpu 3 and leave one of the cpus in the second group unused.
6112  *
6113  * The current solution to this issue is detecting the skew in the first group
6114  * by noticing the lower domain failed to reach balance and had difficulty
6115  * moving tasks due to affinity constraints.
6116  *
6117  * When this is detected, this group becomes a candidate for busiest; see
6118  * update_sd_pick_busiest(). Then calculate_imbalance() and
6119  * find_busiest_group() avoid some of the usual balance conditions to allow it
6120  * to create an effective group imbalance.
6121  *
6122  * This is a somewhat tricky proposition since the next run might not find the
6123  * group imbalance and decide the groups need to be balanced again. A most
6124  * subtle and fragile situation.
6125  */
6126 
6127 static inline int sg_imbalanced(struct sched_group *group)
6128 {
6129 	return group->sgc->imbalance;
6130 }
6131 
6132 /*
6133  * Compute the group capacity factor.
6134  *
6135  * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
6136  * first dividing out the smt factor and computing the actual number of cores,
6137  * and limiting the unit capacity with that.
6138  */
6139 static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
6140 {
6141 	unsigned int capacity_factor, smt, cpus;
6142 	unsigned int capacity, capacity_orig;
6143 
6144 	capacity = group->sgc->capacity;
6145 	capacity_orig = group->sgc->capacity_orig;
6146 	cpus = group->group_weight;
6147 
6148 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
6149 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
6150 	capacity_factor = cpus / smt; /* cores */
6151 
6152 	capacity_factor = min_t(unsigned,
6153 		capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
6154 	if (!capacity_factor)
6155 		capacity_factor = fix_small_capacity(env->sd, group);
6156 
6157 	return capacity_factor;
6158 }
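/*
 * Example with made-up SMT numbers (SCHED_CAPACITY_SCALE == 1024 assumed):
 * a group of 2 siblings with capacity_orig = 1178 gives
 * smt = DIV_ROUND_UP(1024 * 2, 1178) = 2 and hence one real core
 * (capacity_factor = 2 / 2 = 1).  If the remaining capacity is still ~1178,
 * DIV_ROUND_CLOSEST(1178, 1024) = 1 keeps the factor at 1; if rt pressure
 * dropped capacity to 400, DIV_ROUND_CLOSEST(400, 1024) = 0 and
 * fix_small_capacity() decides whether the group still counts as one unit.
 */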
6159 
6160 static enum group_type
6161 group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
6162 {
6163 	if (sgs->sum_nr_running > sgs->group_capacity_factor)
6164 		return group_overloaded;
6165 
6166 	if (sg_imbalanced(group))
6167 		return group_imbalanced;
6168 
6169 	return group_other;
6170 }
6171 
6172 /**
6173  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6174  * @env: The load balancing environment.
6175  * @group: sched_group whose statistics are to be updated.
6176  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6177  * @local_group: Does group contain this_cpu.
6178  * @sgs: variable to hold the statistics for this group.
6179  * @overload: Set to true if any CPU in the group has more than one runnable task.
6180  */
6181 static inline void update_sg_lb_stats(struct lb_env *env,
6182 			struct sched_group *group, int load_idx,
6183 			int local_group, struct sg_lb_stats *sgs,
6184 			bool *overload)
6185 {
6186 	unsigned long load;
6187 	int i;
6188 
6189 	memset(sgs, 0, sizeof(*sgs));
6190 
6191 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6192 		struct rq *rq = cpu_rq(i);
6193 
6194 		/* Bias balancing toward cpus of our domain */
6195 		if (local_group)
6196 			load = target_load(i, load_idx);
6197 		else
6198 			load = source_load(i, load_idx);
6199 
6200 		sgs->group_load += load;
6201 		sgs->sum_nr_running += rq->cfs.h_nr_running;
6202 
6203 		if (rq->nr_running > 1)
6204 			*overload = true;
6205 
6206 #ifdef CONFIG_NUMA_BALANCING
6207 		sgs->nr_numa_running += rq->nr_numa_running;
6208 		sgs->nr_preferred_running += rq->nr_preferred_running;
6209 #endif
6210 		sgs->sum_weighted_load += weighted_cpuload(i);
6211 		if (idle_cpu(i))
6212 			sgs->idle_cpus++;
6213 	}
6214 
6215 	/* Adjust by relative CPU capacity of the group */
6216 	sgs->group_capacity = group->sgc->capacity;
6217 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6218 
6219 	if (sgs->sum_nr_running)
6220 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6221 
6222 	sgs->group_weight = group->group_weight;
6223 	sgs->group_capacity_factor = sg_capacity_factor(env, group);
6224 	sgs->group_type = group_classify(group, sgs);
6225 
6226 	if (sgs->group_capacity_factor > sgs->sum_nr_running)
6227 		sgs->group_has_free_capacity = 1;
6228 }
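/*
 * Note that avg_load above is normalized by capacity, not by CPU count: a
 * hypothetical group with group_load = 3072 and group_capacity = 2048 gets
 * avg_load = 3072 * 1024 / 2048 = 1536, which is directly comparable with
 * the avg_load of a group of a different size or with differently scaled
 * CPUs.
 */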
6229 
6230 /**
6231  * update_sd_pick_busiest - return 1 on busiest group
6232  * @env: The load balancing environment.
6233  * @sds: sched_domain statistics
6234  * @sg: sched_group candidate to be checked for being the busiest
6235  * @sgs: sched_group statistics
6236  *
6237  * Determine if @sg is a busier group than the previously selected
6238  * busiest group.
6239  *
6240  * Return: %true if @sg is a busier group than the previously selected
6241  * busiest group. %false otherwise.
6242  */
6243 static bool update_sd_pick_busiest(struct lb_env *env,
6244 				   struct sd_lb_stats *sds,
6245 				   struct sched_group *sg,
6246 				   struct sg_lb_stats *sgs)
6247 {
6248 	struct sg_lb_stats *busiest = &sds->busiest_stat;
6249 
6250 	if (sgs->group_type > busiest->group_type)
6251 		return true;
6252 
6253 	if (sgs->group_type < busiest->group_type)
6254 		return false;
6255 
6256 	if (sgs->avg_load <= busiest->avg_load)
6257 		return false;
6258 
6259 	/* This is the busiest node in its class. */
6260 	if (!(env->sd->flags & SD_ASYM_PACKING))
6261 		return true;
6262 
6263 	/*
6264 	 * ASYM_PACKING needs to move all the work to the lowest
6265 	 * numbered CPUs in the group; therefore, mark all groups
6266 	 * higher than ourselves as busy.
6267 	 */
6268 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6269 		if (!sds->busiest)
6270 			return true;
6271 
6272 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6273 			return true;
6274 	}
6275 
6276 	return false;
6277 }
6278 
6279 #ifdef CONFIG_NUMA_BALANCING
6280 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6281 {
6282 	if (sgs->sum_nr_running > sgs->nr_numa_running)
6283 		return regular;
6284 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
6285 		return remote;
6286 	return all;
6287 }
6288 
6289 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6290 {
6291 	if (rq->nr_running > rq->nr_numa_running)
6292 		return regular;
6293 	if (rq->nr_running > rq->nr_preferred_running)
6294 		return remote;
6295 	return all;
6296 }
6297 #else
6298 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6299 {
6300 	return all;
6301 }
6302 
6303 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6304 {
6305 	return regular;
6306 }
6307 #endif /* CONFIG_NUMA_BALANCING */
6308 
6309 /**
6310  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6311  * @env: The load balancing environment.
6312  * @sds: variable to hold the statistics for this sched_domain.
6313  */
6314 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6315 {
6316 	struct sched_domain *child = env->sd->child;
6317 	struct sched_group *sg = env->sd->groups;
6318 	struct sg_lb_stats tmp_sgs;
6319 	int load_idx, prefer_sibling = 0;
6320 	bool overload = false;
6321 
6322 	if (child && child->flags & SD_PREFER_SIBLING)
6323 		prefer_sibling = 1;
6324 
6325 	load_idx = get_sd_load_idx(env->sd, env->idle);
6326 
6327 	do {
6328 		struct sg_lb_stats *sgs = &tmp_sgs;
6329 		int local_group;
6330 
6331 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6332 		if (local_group) {
6333 			sds->local = sg;
6334 			sgs = &sds->local_stat;
6335 
6336 			if (env->idle != CPU_NEWLY_IDLE ||
6337 			    time_after_eq(jiffies, sg->sgc->next_update))
6338 				update_group_capacity(env->sd, env->dst_cpu);
6339 		}
6340 
6341 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6342 						&overload);
6343 
6344 		if (local_group)
6345 			goto next_group;
6346 
6347 		/*
6348 		 * In case the child domain prefers tasks go to siblings
6349 		 * first, lower the sg capacity factor to one so that we'll try
6350 		 * and move all the excess tasks away. We lower the capacity
6351 		 * of a group only if the local group has the capacity to fit
6352 		 * these excess tasks, i.e. nr_running < group_capacity_factor. The
6353 		 * extra check prevents the case where you always pull from the
6354 		 * heaviest group when it is already under-utilized (possible when
6355 		 * a single large-weight task outweighs the other tasks on the system).
6356 		 */
6357 		if (prefer_sibling && sds->local &&
6358 		    sds->local_stat.group_has_free_capacity) {
6359 			sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6360 			sgs->group_type = group_classify(sg, sgs);
6361 		}
6362 
6363 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6364 			sds->busiest = sg;
6365 			sds->busiest_stat = *sgs;
6366 		}
6367 
6368 next_group:
6369 		/* Now, start updating sd_lb_stats */
6370 		sds->total_load += sgs->group_load;
6371 		sds->total_capacity += sgs->group_capacity;
6372 
6373 		sg = sg->next;
6374 	} while (sg != env->sd->groups);
6375 
6376 	if (env->sd->flags & SD_NUMA)
6377 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6378 
6379 	if (!env->sd->parent) {
6380 		/* update overload indicator if we are at root domain */
6381 		if (env->dst_rq->rd->overload != overload)
6382 			env->dst_rq->rd->overload = overload;
6383 	}
6384 
6385 }
6386 
6387 /**
6388  * check_asym_packing - Check to see if the group is packed into the
6389  *			sched domain.
6390  *
6391  * This is primarily intended to be used at the sibling level.  Some
6392  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6393  * case of POWER7, it can move to lower SMT modes only when higher
6394  * threads are idle.  When in lower SMT modes, the threads will
6395  * perform better since they share less core resources.  Hence when we
6396  * have idle threads, we want them to be the higher ones.
6397  *
6398  * This packing function is run on idle threads.  It checks to see if
6399  * the busiest CPU in this domain (core in the P7 case) has a higher
6400  * CPU number than the packing function is being run on.  Here we are
6401  * assuming lower CPU number will be equivalent to lower a SMT thread
6402  * assuming a lower CPU number is equivalent to a lower SMT thread
6403  *
6404  * Return: 1 when packing is required and a task should be moved to
6405  * this CPU.  The amount of the imbalance is returned in env->imbalance.
6406  *
6407  * @env: The load balancing environment.
6408  * @sds: Statistics of the sched_domain which is to be packed
6409  */
6410 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6411 {
6412 	int busiest_cpu;
6413 
6414 	if (!(env->sd->flags & SD_ASYM_PACKING))
6415 		return 0;
6416 
6417 	if (!sds->busiest)
6418 		return 0;
6419 
6420 	busiest_cpu = group_first_cpu(sds->busiest);
6421 	if (env->dst_cpu > busiest_cpu)
6422 		return 0;
6423 
6424 	env->imbalance = DIV_ROUND_CLOSEST(
6425 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6426 		SCHED_CAPACITY_SCALE);
6427 
6428 	return 1;
6429 }
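/*
 * The computation above converts the capacity-normalized avg_load of the
 * busiest group back into an absolute amount of weighted load to pull.
 * Hypothetically, avg_load = 2048 with group_capacity = 512 yields
 * env->imbalance = DIV_ROUND_CLOSEST(2048 * 512, 1024) = 1024.
 */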
6430 
6431 /**
6432  * fix_small_imbalance - Calculate the minor imbalance that exists
6433  *			amongst the groups of a sched_domain, during
6434  *			load balancing.
6435  * @env: The load balancing environment.
6436  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6437  */
6438 static inline
6439 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6440 {
6441 	unsigned long tmp, capa_now = 0, capa_move = 0;
6442 	unsigned int imbn = 2;
6443 	unsigned long scaled_busy_load_per_task;
6444 	struct sg_lb_stats *local, *busiest;
6445 
6446 	local = &sds->local_stat;
6447 	busiest = &sds->busiest_stat;
6448 
6449 	if (!local->sum_nr_running)
6450 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6451 	else if (busiest->load_per_task > local->load_per_task)
6452 		imbn = 1;
6453 
6454 	scaled_busy_load_per_task =
6455 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6456 		busiest->group_capacity;
6457 
6458 	if (busiest->avg_load + scaled_busy_load_per_task >=
6459 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
6460 		env->imbalance = busiest->load_per_task;
6461 		return;
6462 	}
6463 
6464 	/*
6465 	 * OK, we don't have enough imbalance to justify moving tasks;
6466 	 * however, we may be able to increase total CPU capacity used by
6467 	 * moving them.
6468 	 */
6469 
6470 	capa_now += busiest->group_capacity *
6471 			min(busiest->load_per_task, busiest->avg_load);
6472 	capa_now += local->group_capacity *
6473 			min(local->load_per_task, local->avg_load);
6474 	capa_now /= SCHED_CAPACITY_SCALE;
6475 
6476 	/* Amount of load we'd subtract */
6477 	if (busiest->avg_load > scaled_busy_load_per_task) {
6478 		capa_move += busiest->group_capacity *
6479 			    min(busiest->load_per_task,
6480 				busiest->avg_load - scaled_busy_load_per_task);
6481 	}
6482 
6483 	/* Amount of load we'd add */
6484 	if (busiest->avg_load * busiest->group_capacity <
6485 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6486 		tmp = (busiest->avg_load * busiest->group_capacity) /
6487 		      local->group_capacity;
6488 	} else {
6489 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6490 		      local->group_capacity;
6491 	}
6492 	capa_move += local->group_capacity *
6493 		    min(local->load_per_task, local->avg_load + tmp);
6494 	capa_move /= SCHED_CAPACITY_SCALE;
6495 
6496 	/* Move if we gain throughput */
6497 	if (capa_move > capa_now)
6498 		env->imbalance = busiest->load_per_task;
6499 }
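/*
 * Illustration with invented numbers for the early return above:
 * busiest->load_per_task = 400 with group_capacity = 1024 gives
 * scaled_busy_load_per_task = 400.  With busiest->avg_load = 900,
 * local->avg_load = 300 and imbn = 2, 900 + 400 >= 300 + 800 holds, so
 * env->imbalance is set to 400 and exactly one "typical" task is worth
 * moving.  Otherwise the capa_now/capa_move estimate decides whether moving
 * a single task would still increase the total capacity in use.
 */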
6500 
6501 /**
6502  * calculate_imbalance - Calculate the amount of imbalance present within the
6503  *			 groups of a given sched_domain during load balance.
6504  * @env: load balance environment
6505  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6506  */
6507 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6508 {
6509 	unsigned long max_pull, load_above_capacity = ~0UL;
6510 	struct sg_lb_stats *local, *busiest;
6511 
6512 	local = &sds->local_stat;
6513 	busiest = &sds->busiest_stat;
6514 
6515 	if (busiest->group_type == group_imbalanced) {
6516 		/*
6517 		 * In the group_imb case we cannot rely on group-wide averages
6518 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
6519 		 */
6520 		busiest->load_per_task =
6521 			min(busiest->load_per_task, sds->avg_load);
6522 	}
6523 
6524 	/*
6525 	 * In the presence of smp nice balancing, certain scenarios can have
6526 	 * max load less than avg load (as we skip the groups at or below
6527 	 * their cpu_capacity while calculating max_load).
6528 	 */
6529 	if (busiest->avg_load <= sds->avg_load ||
6530 	    local->avg_load >= sds->avg_load) {
6531 		env->imbalance = 0;
6532 		return fix_small_imbalance(env, sds);
6533 	}
6534 
6535 	/*
6536 	 * If there aren't any idle cpus, avoid creating some.
6537 	 */
6538 	if (busiest->group_type == group_overloaded &&
6539 	    local->group_type   == group_overloaded) {
6540 		load_above_capacity =
6541 			(busiest->sum_nr_running - busiest->group_capacity_factor);
6542 
6543 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
6544 		load_above_capacity /= busiest->group_capacity;
6545 	}
6546 
6547 	/*
6548 	 * We're trying to get all the cpus to the average_load, so we don't
6549 	 * want to push ourselves above the average load, nor do we wish to
6550 	 * reduce the max loaded cpu below the average load. At the same time,
6551 	 * we also don't want to reduce the group load below the group capacity
6552 	 * (so that we can implement power-savings policies etc). Thus we look
6553 	 * for the minimum possible imbalance.
6554 	 */
6555 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6556 
6557 	/* How much load to actually move to equalise the imbalance */
6558 	env->imbalance = min(
6559 		max_pull * busiest->group_capacity,
6560 		(sds->avg_load - local->avg_load) * local->group_capacity
6561 	) / SCHED_CAPACITY_SCALE;
6562 
6563 	/*
6564 	 * If env->imbalance is less than the average load per runnable task,
6565 	 * there is no guarantee that any tasks will be moved, so let
6566 	 * fix_small_imbalance() consider bumping its value to force at least
6567 	 * one task to be moved.
6568 	 */
6569 	if (env->imbalance < busiest->load_per_task)
6570 		return fix_small_imbalance(env, sds);
6571 }
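/*
 * A rough example of the min() above, with all capacities at 1024 and
 * neither group overloaded (load_above_capacity left at ~0UL):
 * busiest->avg_load = 1000, local->avg_load = 400 and sds->avg_load = 700
 * give max_pull = 300 and env->imbalance = min(300 * 1024, 300 * 1024) /
 * 1024 = 300, i.e. just enough load to bring both groups to the domain
 * average.  If that ends up below load_per_task, fix_small_imbalance()
 * takes over.
 */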
6572 
6573 /******* find_busiest_group() helpers end here *********************/
6574 
6575 /**
6576  * find_busiest_group - Returns the busiest group within the sched_domain
6577  * if there is an imbalance. If there isn't an imbalance, and
6578  * the user has opted for power-savings, it returns a group whose
6579  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6580  * such a group exists.
6581  *
6582  * Also calculates the amount of weighted load which should be moved
6583  * to restore balance.
6584  *
6585  * @env: The load balancing environment.
6586  *
6587  * Return:	- The busiest group if imbalance exists.
6588  *		- If no imbalance and user has opted for power-savings balance,
6589  *		   return the least loaded group whose CPUs can be
6590  *		   put to idle by rebalancing its tasks onto our group.
6591  */
6592 static struct sched_group *find_busiest_group(struct lb_env *env)
6593 {
6594 	struct sg_lb_stats *local, *busiest;
6595 	struct sd_lb_stats sds;
6596 
6597 	init_sd_lb_stats(&sds);
6598 
6599 	/*
6600 	 * Compute the various statistics relevant for load balancing at
6601 	 * this level.
6602 	 */
6603 	update_sd_lb_stats(env, &sds);
6604 	local = &sds.local_stat;
6605 	busiest = &sds.busiest_stat;
6606 
6607 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6608 	    check_asym_packing(env, &sds))
6609 		return sds.busiest;
6610 
6611 	/* There is no busy sibling group to pull tasks from */
6612 	if (!sds.busiest || busiest->sum_nr_running == 0)
6613 		goto out_balanced;
6614 
6615 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6616 						/ sds.total_capacity;
6617 
6618 	/*
6619 	 * If the busiest group is imbalanced the below checks don't
6620 	 * work because they assume all things are equal, which typically
6621 	 * isn't true due to cpus_allowed constraints and the like.
6622 	 */
6623 	if (busiest->group_type == group_imbalanced)
6624 		goto force_balance;
6625 
6626 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6627 	if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
6628 	    !busiest->group_has_free_capacity)
6629 		goto force_balance;
6630 
6631 	/*
6632 	 * If the local group is busier than the selected busiest group
6633 	 * don't try and pull any tasks.
6634 	 */
6635 	if (local->avg_load >= busiest->avg_load)
6636 		goto out_balanced;
6637 
6638 	/*
6639 	 * Don't pull any tasks if this group is already above the domain
6640 	 * average load.
6641 	 */
6642 	if (local->avg_load >= sds.avg_load)
6643 		goto out_balanced;
6644 
6645 	if (env->idle == CPU_IDLE) {
6646 		/*
6647 		 * This cpu is idle. If the busiest group is not overloaded
6648 		 * and there is no imbalance between this and busiest group
6649 		 * wrt idle cpus, it is balanced. The imbalance becomes
6650 		 * significant if the diff is greater than 1; otherwise we
6651 		 * might end up just moving the imbalance to another group.
6652 		 */
6653 		if ((busiest->group_type != group_overloaded) &&
6654 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
6655 			goto out_balanced;
6656 	} else {
6657 		/*
6658 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6659 		 * imbalance_pct to be conservative.
6660 		 */
6661 		if (100 * busiest->avg_load <=
6662 				env->sd->imbalance_pct * local->avg_load)
6663 			goto out_balanced;
6664 	}
6665 
6666 force_balance:
6667 	/* Looks like there is an imbalance. Compute it */
6668 	calculate_imbalance(env, &sds);
6669 	return sds.busiest;
6670 
6671 out_balanced:
6672 	env->imbalance = 0;
6673 	return NULL;
6674 }
6675 
6676 /*
6677  * find_busiest_queue - find the busiest runqueue among the cpus in group.
6678  */
6679 static struct rq *find_busiest_queue(struct lb_env *env,
6680 				     struct sched_group *group)
6681 {
6682 	struct rq *busiest = NULL, *rq;
6683 	unsigned long busiest_load = 0, busiest_capacity = 1;
6684 	int i;
6685 
6686 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6687 		unsigned long capacity, capacity_factor, wl;
6688 		enum fbq_type rt;
6689 
6690 		rq = cpu_rq(i);
6691 		rt = fbq_classify_rq(rq);
6692 
6693 		/*
6694 		 * We classify groups/runqueues into three groups:
6695 		 *  - regular: there are !numa tasks
6696 		 *  - remote:  there are numa tasks that run on the 'wrong' node
6697 		 *  - all:     there is no distinction
6698 		 *
6699 		 * In order to avoid migrating ideally placed numa tasks,
6700 		 * ignore those when there are better options.
6701 		 *
6702 		 * If we ignore the actual busiest queue to migrate another
6703 		 * task, the next balance pass can still reduce the busiest
6704 		 * queue by moving tasks around inside the node.
6705 		 *
6706 		 * If we cannot move enough load due to this classification
6707 		 * the next pass will adjust the group classification and
6708 		 * allow migration of more tasks.
6709 		 *
6710 		 * Both cases only affect the total convergence complexity.
6711 		 */
6712 		if (rt > env->fbq_type)
6713 			continue;
6714 
6715 		capacity = capacity_of(i);
6716 		capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6717 		if (!capacity_factor)
6718 			capacity_factor = fix_small_capacity(env->sd, group);
6719 
6720 		wl = weighted_cpuload(i);
6721 
6722 		/*
6723 		 * When comparing with imbalance, use weighted_cpuload()
6724 		 * which is not scaled with the cpu capacity.
6725 		 */
6726 		if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
6727 			continue;
6728 
6729 		/*
6730 		 * For the load comparisons with the other cpus, consider
6731 		 * the weighted_cpuload() scaled with the cpu capacity, so
6732 		 * that the load can be moved away from the cpu that is
6733 		 * potentially running at a lower capacity.
6734 		 *
6735 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
6736 		 * multiplication to rid ourselves of the division works out
6737 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
6738 		 * our previous maximum.
6739 		 */
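		/*
		 * E.g. (hypothetical numbers): a queue with wl = 2048 on a CPU
		 * of capacity 1024 (ratio 2.0) loses to one with wl = 1536 on
		 * a CPU of capacity 512 (ratio 3.0), because
		 * 1536 * 1024 > 2048 * 512.
		 */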
6740 		if (wl * busiest_capacity > busiest_load * capacity) {
6741 			busiest_load = wl;
6742 			busiest_capacity = capacity;
6743 			busiest = rq;
6744 		}
6745 	}
6746 
6747 	return busiest;
6748 }
6749 
6750 /*
6751  * Max backoff if we encounter pinned tasks. The exact value is pretty
6752  * arbitrary, as long as it is large enough.
6753  */
6754 #define MAX_PINNED_INTERVAL	512
6755 
6756 /* Working cpumask for load_balance and load_balance_newidle. */
6757 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6758 
6759 static int need_active_balance(struct lb_env *env)
6760 {
6761 	struct sched_domain *sd = env->sd;
6762 
6763 	if (env->idle == CPU_NEWLY_IDLE) {
6764 
6765 		/*
6766 		 * ASYM_PACKING needs to force migrate tasks from busy but
6767 		 * higher numbered CPUs in order to pack all tasks in the
6768 		 * lowest numbered CPUs.
6769 		 */
6770 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6771 			return 1;
6772 	}
6773 
6774 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6775 }
6776 
6777 static int active_load_balance_cpu_stop(void *data);
6778 
6779 static int should_we_balance(struct lb_env *env)
6780 {
6781 	struct sched_group *sg = env->sd->groups;
6782 	struct cpumask *sg_cpus, *sg_mask;
6783 	int cpu, balance_cpu = -1;
6784 
6785 	/*
6786 	 * In the newly idle case, we will allow all the cpus
6787 	 * to do the newly idle load balance.
6788 	 */
6789 	if (env->idle == CPU_NEWLY_IDLE)
6790 		return 1;
6791 
6792 	sg_cpus = sched_group_cpus(sg);
6793 	sg_mask = sched_group_mask(sg);
6794 	/* Try to find first idle cpu */
6795 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6796 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6797 			continue;
6798 
6799 		balance_cpu = cpu;
6800 		break;
6801 	}
6802 
6803 	if (balance_cpu == -1)
6804 		balance_cpu = group_balance_cpu(sg);
6805 
6806 	/*
6807 	 * The first idle cpu or the first cpu (busiest) in this sched group
6808 	 * is eligible for doing load balancing at this and higher domains.
6809 	 */
6810 	return balance_cpu == env->dst_cpu;
6811 }
6812 
6813 /*
6814  * Check this_cpu to ensure it is balanced within domain. Attempt to move
6815  * tasks if there is an imbalance.
6816  */
6817 static int load_balance(int this_cpu, struct rq *this_rq,
6818 			struct sched_domain *sd, enum cpu_idle_type idle,
6819 			int *continue_balancing)
6820 {
6821 	int ld_moved, cur_ld_moved, active_balance = 0;
6822 	struct sched_domain *sd_parent = sd->parent;
6823 	struct sched_group *group;
6824 	struct rq *busiest;
6825 	unsigned long flags;
6826 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6827 
6828 	struct lb_env env = {
6829 		.sd		= sd,
6830 		.dst_cpu	= this_cpu,
6831 		.dst_rq		= this_rq,
6832 		.dst_grpmask    = sched_group_cpus(sd->groups),
6833 		.idle		= idle,
6834 		.loop_break	= sched_nr_migrate_break,
6835 		.cpus		= cpus,
6836 		.fbq_type	= all,
6837 		.tasks		= LIST_HEAD_INIT(env.tasks),
6838 	};
6839 
6840 	/*
6841 	 * For NEWLY_IDLE load_balancing, we don't need to consider
6842 	 * other cpus in our group
6843 	 */
6844 	if (idle == CPU_NEWLY_IDLE)
6845 		env.dst_grpmask = NULL;
6846 
6847 	cpumask_copy(cpus, cpu_active_mask);
6848 
6849 	schedstat_inc(sd, lb_count[idle]);
6850 
6851 redo:
6852 	if (!should_we_balance(&env)) {
6853 		*continue_balancing = 0;
6854 		goto out_balanced;
6855 	}
6856 
6857 	group = find_busiest_group(&env);
6858 	if (!group) {
6859 		schedstat_inc(sd, lb_nobusyg[idle]);
6860 		goto out_balanced;
6861 	}
6862 
6863 	busiest = find_busiest_queue(&env, group);
6864 	if (!busiest) {
6865 		schedstat_inc(sd, lb_nobusyq[idle]);
6866 		goto out_balanced;
6867 	}
6868 
6869 	BUG_ON(busiest == env.dst_rq);
6870 
6871 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6872 
6873 	ld_moved = 0;
6874 	if (busiest->nr_running > 1) {
6875 		/*
6876 		 * Attempt to move tasks. If find_busiest_group has found
6877 		 * an imbalance but busiest->nr_running <= 1, the group is
6878 		 * still unbalanced. ld_moved simply stays zero, so it is
6879 		 * correctly treated as an imbalance.
6880 		 */
6881 		env.flags |= LBF_ALL_PINNED;
6882 		env.src_cpu   = busiest->cpu;
6883 		env.src_rq    = busiest;
6884 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
6885 
6886 more_balance:
6887 		raw_spin_lock_irqsave(&busiest->lock, flags);
6888 
6889 		/*
6890 		 * cur_ld_moved - load moved in current iteration
6891 		 * ld_moved     - cumulative load moved across iterations
6892 		 */
6893 		cur_ld_moved = detach_tasks(&env);
6894 
6895 		/*
6896 		 * We've detached some tasks from busiest_rq. Every
6897 		 * detached task is marked TASK_ON_RQ_MIGRATING, so we can
6898 		 * safely unlock busiest->lock and be sure that nobody can
6899 		 * manipulate the tasks in parallel.
6900 		 * See task_rq_lock() family for the details.
6901 		 */
6902 
6903 		raw_spin_unlock(&busiest->lock);
6904 
6905 		if (cur_ld_moved) {
6906 			attach_tasks(&env);
6907 			ld_moved += cur_ld_moved;
6908 		}
6909 
6910 		local_irq_restore(flags);
6911 
6912 		if (env.flags & LBF_NEED_BREAK) {
6913 			env.flags &= ~LBF_NEED_BREAK;
6914 			goto more_balance;
6915 		}
6916 
6917 		/*
6918 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
6919 		 * us and move them to an alternate dst_cpu in our sched_group
6920 		 * where they can run. The upper limit on how many times we
6921 		 * iterate on the same src_cpu depends on the number of cpus in our
6922 		 * sched_group.
6923 		 *
6924 		 * This changes load balance semantics a bit on who can move
6925 		 * load to a given_cpu. In addition to the given_cpu itself
6926 		 * (or an ilb_cpu acting on its behalf where given_cpu is
6927 		 * nohz-idle), we now have balance_cpu in a position to move
6928 		 * load to given_cpu. In rare situations, this may cause
6929 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
6930 		 * _independently_ and at the _same_ time to move some load to
6931 		 * given_cpu), causing excess load to be moved to given_cpu.
6932 		 * This, however, should not happen often in practice, and
6933 		 * moreover subsequent load balance cycles should correct the
6934 		 * excess load moved.
6935 		 */
6936 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6937 
6938 			/* Prevent re-selecting dst_cpu via env's cpus */
6939 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
6940 
6941 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
6942 			env.dst_cpu	 = env.new_dst_cpu;
6943 			env.flags	&= ~LBF_DST_PINNED;
6944 			env.loop	 = 0;
6945 			env.loop_break	 = sched_nr_migrate_break;
6946 
6947 			/*
6948 			 * Go back to "more_balance" rather than "redo" since we
6949 			 * need to continue with the same src_cpu.
6950 			 */
6951 			goto more_balance;
6952 		}
6953 
6954 		/*
6955 		 * We failed to reach balance because of affinity.
6956 		 */
6957 		if (sd_parent) {
6958 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6959 
6960 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6961 				*group_imbalance = 1;
6962 		}
6963 
6964 		/* All tasks on this runqueue were pinned by CPU affinity */
6965 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
6966 			cpumask_clear_cpu(cpu_of(busiest), cpus);
6967 			if (!cpumask_empty(cpus)) {
6968 				env.loop = 0;
6969 				env.loop_break = sched_nr_migrate_break;
6970 				goto redo;
6971 			}
6972 			goto out_all_pinned;
6973 		}
6974 	}
6975 
6976 	if (!ld_moved) {
6977 		schedstat_inc(sd, lb_failed[idle]);
6978 		/*
6979 		 * Increment the failure counter only on periodic balance.
6980 		 * We do not want newidle balance, which can be very
6981 		 * frequent, pollute the failure counter causing
6982 		 * excessive cache_hot migrations and active balances.
6983 		 */
6984 		if (idle != CPU_NEWLY_IDLE)
6985 			sd->nr_balance_failed++;
6986 
6987 		if (need_active_balance(&env)) {
6988 			raw_spin_lock_irqsave(&busiest->lock, flags);
6989 
6990 			/* Don't kick the active_load_balance_cpu_stop
6991 			 * if the curr task on the busiest cpu can't be
6992 			 * moved to this_cpu.
6993 			 */
6994 			if (!cpumask_test_cpu(this_cpu,
6995 					tsk_cpus_allowed(busiest->curr))) {
6996 				raw_spin_unlock_irqrestore(&busiest->lock,
6997 							    flags);
6998 				env.flags |= LBF_ALL_PINNED;
6999 				goto out_one_pinned;
7000 			}
7001 
7002 			/*
7003 			 * ->active_balance synchronizes accesses to
7004 			 * ->active_balance_work.  Once set, it's cleared
7005 			 * only after active load balance is finished.
7006 			 */
7007 			if (!busiest->active_balance) {
7008 				busiest->active_balance = 1;
7009 				busiest->push_cpu = this_cpu;
7010 				active_balance = 1;
7011 			}
7012 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
7013 
7014 			if (active_balance) {
7015 				stop_one_cpu_nowait(cpu_of(busiest),
7016 					active_load_balance_cpu_stop, busiest,
7017 					&busiest->active_balance_work);
7018 			}
7019 
7020 			/*
7021 			 * We've kicked active balancing, reset the failure
7022 			 * counter.
7023 			 */
7024 			sd->nr_balance_failed = sd->cache_nice_tries+1;
7025 		}
7026 	} else
7027 		sd->nr_balance_failed = 0;
7028 
7029 	if (likely(!active_balance)) {
7030 		/* We were unbalanced, so reset the balancing interval */
7031 		sd->balance_interval = sd->min_interval;
7032 	} else {
7033 		/*
7034 		 * If we've begun active balancing, start to back off. This
7035 		 * case may not be covered by the all_pinned logic if there
7036 		 * is only 1 task on the busy runqueue (because we don't call
7037 		 * detach_tasks).
7038 		 */
7039 		if (sd->balance_interval < sd->max_interval)
7040 			sd->balance_interval *= 2;
7041 	}
7042 
7043 	goto out;
7044 
7045 out_balanced:
7046 	/*
7047 	 * We reach balance although we may have faced some affinity
7048 	 * constraints. Clear the imbalance flag if it was set.
7049 	 */
7050 	if (sd_parent) {
7051 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7052 
7053 		if (*group_imbalance)
7054 			*group_imbalance = 0;
7055 	}
7056 
7057 out_all_pinned:
7058 	/*
7059 	 * We reach balance because all tasks are pinned at this level so
7060 	 * we can't migrate them. Leave the imbalance flag set so the parent level
7061 	 * can try to migrate them.
7062 	 */
7063 	schedstat_inc(sd, lb_balanced[idle]);
7064 
7065 	sd->nr_balance_failed = 0;
7066 
7067 out_one_pinned:
7068 	/* tune up the balancing interval */
7069 	if (((env.flags & LBF_ALL_PINNED) &&
7070 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
7071 			(sd->balance_interval < sd->max_interval))
7072 		sd->balance_interval *= 2;
7073 
7074 	ld_moved = 0;
7075 out:
7076 	return ld_moved;
7077 }
7078 
7079 static inline unsigned long
7080 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7081 {
7082 	unsigned long interval = sd->balance_interval;
7083 
7084 	if (cpu_busy)
7085 		interval *= sd->busy_factor;
7086 
7087 	/* scale ms to jiffies */
7088 	interval = msecs_to_jiffies(interval);
7089 	interval = clamp(interval, 1UL, max_load_balance_interval);
7090 
7091 	return interval;
7092 }
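/*
 * E.g. (hypothetical values): a domain with balance_interval = 8 (ms) and
 * busy_factor = 32 would ask for 256ms when the CPU is busy, but the clamp
 * against max_load_balance_interval (HZ/10 jiffies, i.e. 100ms) caps the
 * effective interval at 100ms.
 */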
7093 
7094 static inline void
7095 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7096 {
7097 	unsigned long interval, next;
7098 
7099 	interval = get_sd_balance_interval(sd, cpu_busy);
7100 	next = sd->last_balance + interval;
7101 
7102 	if (time_after(*next_balance, next))
7103 		*next_balance = next;
7104 }
7105 
7106 /*
7107  * idle_balance is called by schedule() if this_cpu is about to become
7108  * idle. Attempts to pull tasks from other CPUs.
7109  */
7110 static int idle_balance(struct rq *this_rq)
7111 {
7112 	unsigned long next_balance = jiffies + HZ;
7113 	int this_cpu = this_rq->cpu;
7114 	struct sched_domain *sd;
7115 	int pulled_task = 0;
7116 	u64 curr_cost = 0;
7117 
7118 	idle_enter_fair(this_rq);
7119 
7120 	/*
7121 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
7122 	 * measure the duration of idle_balance() as idle time.
7123 	 */
7124 	this_rq->idle_stamp = rq_clock(this_rq);
7125 
7126 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7127 	    !this_rq->rd->overload) {
7128 		rcu_read_lock();
7129 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
7130 		if (sd)
7131 			update_next_balance(sd, 0, &next_balance);
7132 		rcu_read_unlock();
7133 
7134 		goto out;
7135 	}
7136 
7137 	/*
7138 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
7139 	 */
7140 	raw_spin_unlock(&this_rq->lock);
7141 
7142 	update_blocked_averages(this_cpu);
7143 	rcu_read_lock();
7144 	for_each_domain(this_cpu, sd) {
7145 		int continue_balancing = 1;
7146 		u64 t0, domain_cost;
7147 
7148 		if (!(sd->flags & SD_LOAD_BALANCE))
7149 			continue;
7150 
7151 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7152 			update_next_balance(sd, 0, &next_balance);
7153 			break;
7154 		}
7155 
7156 		if (sd->flags & SD_BALANCE_NEWIDLE) {
7157 			t0 = sched_clock_cpu(this_cpu);
7158 
7159 			pulled_task = load_balance(this_cpu, this_rq,
7160 						   sd, CPU_NEWLY_IDLE,
7161 						   &continue_balancing);
7162 
7163 			domain_cost = sched_clock_cpu(this_cpu) - t0;
7164 			if (domain_cost > sd->max_newidle_lb_cost)
7165 				sd->max_newidle_lb_cost = domain_cost;
7166 
7167 			curr_cost += domain_cost;
7168 		}
7169 
7170 		update_next_balance(sd, 0, &next_balance);
7171 
7172 		/*
7173 		 * Stop searching for tasks to pull if there are
7174 		 * now runnable tasks on this rq.
7175 		 */
7176 		if (pulled_task || this_rq->nr_running > 0)
7177 			break;
7178 	}
7179 	rcu_read_unlock();
7180 
7181 	raw_spin_lock(&this_rq->lock);
7182 
7183 	if (curr_cost > this_rq->max_idle_balance_cost)
7184 		this_rq->max_idle_balance_cost = curr_cost;
7185 
7186 	/*
7187 	 * While browsing the domains we released the rq lock; a task could
7188 	 * have been enqueued in the meantime. Since we're not going idle,
7189 	 * pretend we pulled a task.
7190 	 */
7191 	if (this_rq->cfs.h_nr_running && !pulled_task)
7192 		pulled_task = 1;
7193 
7194 out:
7195 	/* Pull this_rq->next_balance earlier if a domain needs balancing sooner */
7196 	if (time_after(this_rq->next_balance, next_balance))
7197 		this_rq->next_balance = next_balance;
7198 
7199 	/* Is there a task of a higher priority class? */
7200 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7201 		pulled_task = -1;
7202 
7203 	if (pulled_task) {
7204 		idle_exit_fair(this_rq);
7205 		this_rq->idle_stamp = 0;
7206 	}
7207 
7208 	return pulled_task;
7209 }
7210 
7211 /*
7212  * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
7213  * a running task off the busiest CPU onto an idle CPU. It requires at
7214  * least 1 task to be running on each physical CPU where possible, and
7215  * avoids physical / logical imbalances.
7216  */
7217 static int active_load_balance_cpu_stop(void *data)
7218 {
7219 	struct rq *busiest_rq = data;
7220 	int busiest_cpu = cpu_of(busiest_rq);
7221 	int target_cpu = busiest_rq->push_cpu;
7222 	struct rq *target_rq = cpu_rq(target_cpu);
7223 	struct sched_domain *sd;
7224 	struct task_struct *p = NULL;
7225 
7226 	raw_spin_lock_irq(&busiest_rq->lock);
7227 
7228 	/* make sure the requested cpu hasn't gone down in the meantime */
7229 	if (unlikely(busiest_cpu != smp_processor_id() ||
7230 		     !busiest_rq->active_balance))
7231 		goto out_unlock;
7232 
7233 	/* Is there any task to move? */
7234 	if (busiest_rq->nr_running <= 1)
7235 		goto out_unlock;
7236 
7237 	/*
7238 	 * This condition is "impossible"; if it occurs
7239 	 * we need to fix it. Originally reported by
7240 	 * Bjorn Helgaas on a 128-cpu setup.
7241 	 */
7242 	BUG_ON(busiest_rq == target_rq);
7243 
7244 	/* Search for an sd spanning us and the target CPU. */
7245 	rcu_read_lock();
7246 	for_each_domain(target_cpu, sd) {
7247 		if ((sd->flags & SD_LOAD_BALANCE) &&
7248 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7249 			break;
7250 	}
7251 
7252 	if (likely(sd)) {
7253 		struct lb_env env = {
7254 			.sd		= sd,
7255 			.dst_cpu	= target_cpu,
7256 			.dst_rq		= target_rq,
7257 			.src_cpu	= busiest_rq->cpu,
7258 			.src_rq		= busiest_rq,
7259 			.idle		= CPU_IDLE,
7260 		};
7261 
7262 		schedstat_inc(sd, alb_count);
7263 
7264 		p = detach_one_task(&env);
7265 		if (p)
7266 			schedstat_inc(sd, alb_pushed);
7267 		else
7268 			schedstat_inc(sd, alb_failed);
7269 	}
7270 	rcu_read_unlock();
7271 out_unlock:
7272 	busiest_rq->active_balance = 0;
7273 	raw_spin_unlock(&busiest_rq->lock);
7274 
7275 	if (p)
7276 		attach_one_task(target_rq, p);
7277 
7278 	local_irq_enable();
7279 
7280 	return 0;
7281 }
7282 
7283 static inline int on_null_domain(struct rq *rq)
7284 {
7285 	return unlikely(!rcu_dereference_sched(rq->sd));
7286 }
7287 
7288 #ifdef CONFIG_NO_HZ_COMMON
7289 /*
7290  * idle load balancing details
7291  * - When one of the busy CPUs notices that idle rebalancing may be
7292  *   needed, it kicks the idle load balancer, which then does idle
7293  *   load balancing for all the idle CPUs.
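 * - The flow within this file: nohz_kick_needed() (called from
 *   trigger_load_balance()) detects the condition, nohz_balancer_kick()
 *   IPIs the CPU chosen by find_new_ilb(), and that CPU's SCHED_SOFTIRQ
 *   handler runs nohz_idle_balance() on behalf of all the idle CPUs.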
7294  */
7295 static struct {
7296 	cpumask_var_t idle_cpus_mask;
7297 	atomic_t nr_cpus;
7298 	unsigned long next_balance;     /* in jiffy units */
7299 } nohz ____cacheline_aligned;
7300 
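/*
 * Pick the idle load balancer: the first idle CPU in nohz.idle_cpus_mask.
 * Returns nr_cpu_ids when there is no suitable CPU, which the caller treats
 * as "nobody to kick".
 */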
7301 static inline int find_new_ilb(void)
7302 {
7303 	int ilb = cpumask_first(nohz.idle_cpus_mask);
7304 
7305 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
7306 		return ilb;
7307 
7308 	return nr_cpu_ids;
7309 }
7310 
7311 /*
7312  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7313  * idle load balancer CPU returned by find_new_ilb(), i.e. the first idle
7314  * CPU in nohz.idle_cpus_mask (if there is one).
7315  */
7316 static void nohz_balancer_kick(void)
7317 {
7318 	int ilb_cpu;
7319 
7320 	nohz.next_balance++;
7321 
7322 	ilb_cpu = find_new_ilb();
7323 
7324 	if (ilb_cpu >= nr_cpu_ids)
7325 		return;
7326 
7327 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7328 		return;
7329 	/*
7330 	 * Use smp_send_reschedule() instead of resched_cpu().
7331 	 * This way we generate a sched IPI on the target cpu, which
7332 	 * is idle, and the softirq performing the nohz idle load balance
7333 	 * will be run before returning from the IPI.
7334 	 */
7335 	smp_send_reschedule(ilb_cpu);
7336 	return;
7337 }
7338 
7339 static inline void nohz_balance_exit_idle(int cpu)
7340 {
7341 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7342 		/*
7343 		 * Completely isolated CPUs never set themselves here, so we must test.
7344 		 */
7345 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7346 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7347 			atomic_dec(&nohz.nr_cpus);
7348 		}
7349 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7350 	}
7351 }
7352 
7353 static inline void set_cpu_sd_state_busy(void)
7354 {
7355 	struct sched_domain *sd;
7356 	int cpu = smp_processor_id();
7357 
7358 	rcu_read_lock();
7359 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7360 
7361 	if (!sd || !sd->nohz_idle)
7362 		goto unlock;
7363 	sd->nohz_idle = 0;
7364 
7365 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7366 unlock:
7367 	rcu_read_unlock();
7368 }
7369 
7370 void set_cpu_sd_state_idle(void)
7371 {
7372 	struct sched_domain *sd;
7373 	int cpu = smp_processor_id();
7374 
7375 	rcu_read_lock();
7376 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7377 
7378 	if (!sd || sd->nohz_idle)
7379 		goto unlock;
7380 	sd->nohz_idle = 1;
7381 
7382 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7383 unlock:
7384 	rcu_read_unlock();
7385 }
7386 
7387 /*
7388  * This routine will record that the cpu is going idle with tick stopped.
7389  * This info will be used in performing idle load balancing in the future.
7390  */
7391 void nohz_balance_enter_idle(int cpu)
7392 {
7393 	/*
7394 	 * If this cpu is going down, then nothing needs to be done.
7395 	 */
7396 	if (!cpu_active(cpu))
7397 		return;
7398 
7399 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7400 		return;
7401 
7402 	/*
7403 	 * If we're a completely isolated CPU, we don't play.
7404 	 */
7405 	if (on_null_domain(cpu_rq(cpu)))
7406 		return;
7407 
7408 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7409 	atomic_inc(&nohz.nr_cpus);
7410 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7411 }
7412 
7413 static int sched_ilb_notifier(struct notifier_block *nfb,
7414 					unsigned long action, void *hcpu)
7415 {
7416 	switch (action & ~CPU_TASKS_FROZEN) {
7417 	case CPU_DYING:
7418 		nohz_balance_exit_idle(smp_processor_id());
7419 		return NOTIFY_OK;
7420 	default:
7421 		return NOTIFY_DONE;
7422 	}
7423 }
7424 #endif
7425 
7426 static DEFINE_SPINLOCK(balancing);
7427 
7428 /*
7429  * Scale the max load_balance interval with the number of CPUs in the system.
7430  * This trades load-balance latency on larger machines for less cross talk.
7431  */
7432 void update_max_interval(void)
7433 {
7434 	max_load_balance_interval = HZ*num_online_cpus()/10;
7435 }
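/*
 * For example (illustrative values): with HZ == 250 and 4 CPUs online this
 * gives 250 * 4 / 10 = 100 jiffies, i.e. a 400ms ceiling for the intervals
 * computed in get_sd_balance_interval().
 */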
7436 
7437 /*
7438  * It checks each scheduling domain to see if it is due to be balanced,
7439  * and initiates a balancing operation if so.
7440  *
7441  * Balancing parameters are set up in init_sched_domains.
7442  */
7443 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7444 {
7445 	int continue_balancing = 1;
7446 	int cpu = rq->cpu;
7447 	unsigned long interval;
7448 	struct sched_domain *sd;
7449 	/* Earliest time when we have to do rebalance again */
7450 	unsigned long next_balance = jiffies + 60*HZ;
7451 	int update_next_balance = 0;
7452 	int need_serialize, need_decay = 0;
7453 	u64 max_cost = 0;
7454 
7455 	update_blocked_averages(cpu);
7456 
7457 	rcu_read_lock();
7458 	for_each_domain(cpu, sd) {
7459 		/*
7460 		 * Decay the newidle max times here because this is a regular
7461 		 * visit to all the domains. Decay ~1% per second.
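		 * (253/256 is roughly 0.988, and the decay is gated to once
		 * per second by next_decay_max_lb_cost below, so each second
		 * trims the value by a little over 1%.)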
7462 		 */
7463 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7464 			sd->max_newidle_lb_cost =
7465 				(sd->max_newidle_lb_cost * 253) / 256;
7466 			sd->next_decay_max_lb_cost = jiffies + HZ;
7467 			need_decay = 1;
7468 		}
7469 		max_cost += sd->max_newidle_lb_cost;
7470 
7471 		if (!(sd->flags & SD_LOAD_BALANCE))
7472 			continue;
7473 
7474 		/*
7475 		 * Stop the load balance at this level. There is another
7476 		 * CPU in our sched group which is doing load balancing more
7477 		 * actively.
7478 		 */
7479 		if (!continue_balancing) {
7480 			if (need_decay)
7481 				continue;
7482 			break;
7483 		}
7484 
7485 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7486 
7487 		need_serialize = sd->flags & SD_SERIALIZE;
7488 		if (need_serialize) {
7489 			if (!spin_trylock(&balancing))
7490 				goto out;
7491 		}
7492 
7493 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
7494 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7495 				/*
7496 				 * The LBF_DST_PINNED logic could have changed
7497 				 * env->dst_cpu, so we can't know our idle
7498 				 * state even if we migrated tasks. Update it.
7499 				 */
7500 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7501 			}
7502 			sd->last_balance = jiffies;
7503 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7504 		}
7505 		if (need_serialize)
7506 			spin_unlock(&balancing);
7507 out:
7508 		if (time_after(next_balance, sd->last_balance + interval)) {
7509 			next_balance = sd->last_balance + interval;
7510 			update_next_balance = 1;
7511 		}
7512 	}
7513 	if (need_decay) {
7514 		/*
7515 		 * Ensure the rq-wide value also decays but keep it at a
7516 		 * reasonable floor to avoid funnies with rq->avg_idle.
7517 		 */
7518 		rq->max_idle_balance_cost =
7519 			max((u64)sysctl_sched_migration_cost, max_cost);
7520 	}
7521 	rcu_read_unlock();
7522 
7523 	/*
7524 	 * next_balance will be updated only when there is a need.
7525 	 * When the cpu is attached to a null domain, for example, it will
7526 	 * not be updated.
7527 	 */
7528 	if (likely(update_next_balance))
7529 		rq->next_balance = next_balance;
7530 }
7531 
7532 #ifdef CONFIG_NO_HZ_COMMON
7533 /*
7534  * In the CONFIG_NO_HZ_COMMON case, the idle balance kickee does the
7535  * rebalancing for all the cpus whose scheduler ticks are stopped.
7536  */
7537 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7538 {
7539 	int this_cpu = this_rq->cpu;
7540 	struct rq *rq;
7541 	int balance_cpu;
7542 
7543 	if (idle != CPU_IDLE ||
7544 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7545 		goto end;
7546 
7547 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7548 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7549 			continue;
7550 
7551 		/*
7552 		 * If this cpu gets work to do, stop the load balancing
7553 		 * work being done for other cpus. The next load
7554 		 * balancing owner will pick it up.
7555 		 */
7556 		if (need_resched())
7557 			break;
7558 
7559 		rq = cpu_rq(balance_cpu);
7560 
7561 		/*
7562 		 * If the time for the next balance is due,
7563 		 * do the balance.
7564 		 */
7565 		if (time_after_eq(jiffies, rq->next_balance)) {
7566 			raw_spin_lock_irq(&rq->lock);
7567 			update_rq_clock(rq);
7568 			update_idle_cpu_load(rq);
7569 			raw_spin_unlock_irq(&rq->lock);
7570 			rebalance_domains(rq, CPU_IDLE);
7571 		}
7572 
7573 		if (time_after(this_rq->next_balance, rq->next_balance))
7574 			this_rq->next_balance = rq->next_balance;
7575 	}
7576 	nohz.next_balance = this_rq->next_balance;
7577 end:
7578 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7579 }
7580 
7581 /*
7582  * Current heuristic for kicking the idle load balancer in the presence
7583  * of an idle cpu in the system.
7584  *   - This rq has more than one task.
7585  *   - At any scheduler domain level, this cpu's scheduler group has multiple
7586  *     busy cpu's exceeding the group's capacity.
7587  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7588  *     domain span are idle.
7589  */
7590 static inline int nohz_kick_needed(struct rq *rq)
7591 {
7592 	unsigned long now = jiffies;
7593 	struct sched_domain *sd;
7594 	struct sched_group_capacity *sgc;
7595 	int nr_busy, cpu = rq->cpu;
7596 
7597 	if (unlikely(rq->idle_balance))
7598 		return 0;
7599 
7600 	/*
7601 	 * We may have been in ticked or tickless idle mode recently. At the
7602 	 * first busy tick after returning from idle, update the busy stats.
7603 	 */
7604 	set_cpu_sd_state_busy();
7605 	nohz_balance_exit_idle(cpu);
7606 
7607 	/*
7608 	 * None are in tickless mode and hence no need for NOHZ idle load
7609 	 * balancing.
7610 	 */
7611 	if (likely(!atomic_read(&nohz.nr_cpus)))
7612 		return 0;
7613 
7614 	if (time_before(now, nohz.next_balance))
7615 		return 0;
7616 
7617 	if (rq->nr_running >= 2)
7618 		goto need_kick;
7619 
7620 	rcu_read_lock();
7621 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7622 
7623 	if (sd) {
7624 		sgc = sd->groups->sgc;
7625 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
7626 
7627 		if (nr_busy > 1)
7628 			goto need_kick_unlock;
7629 	}
7630 
7631 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
7632 
7633 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7634 				  sched_domain_span(sd)) < cpu))
7635 		goto need_kick_unlock;
7636 
7637 	rcu_read_unlock();
7638 	return 0;
7639 
7640 need_kick_unlock:
7641 	rcu_read_unlock();
7642 need_kick:
7643 	return 1;
7644 }
7645 #else
7646 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7647 #endif
7648 
7649 /*
7650  * run_rebalance_domains is triggered when needed from the scheduler tick.
7651  * Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set).
7652  */
7653 static void run_rebalance_domains(struct softirq_action *h)
7654 {
7655 	struct rq *this_rq = this_rq();
7656 	enum cpu_idle_type idle = this_rq->idle_balance ?
7657 						CPU_IDLE : CPU_NOT_IDLE;
7658 
7659 	rebalance_domains(this_rq, idle);
7660 
7661 	/*
7662 	 * If this cpu has a pending nohz_balance_kick, then do the
7663 	 * balancing on behalf of the other idle cpus whose ticks are
7664 	 * stopped.
7665 	 */
7666 	nohz_idle_balance(this_rq, idle);
7667 }
7668 
7669 /*
7670  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
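 * The softirq raised here is serviced by run_rebalance_domains(), which is
 * registered for SCHED_SOFTIRQ in init_sched_fair_class() below.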
7671  */
7672 void trigger_load_balance(struct rq *rq)
7673 {
7674 	/* Don't need to rebalance while attached to NULL domain */
7675 	if (unlikely(on_null_domain(rq)))
7676 		return;
7677 
7678 	if (time_after_eq(jiffies, rq->next_balance))
7679 		raise_softirq(SCHED_SOFTIRQ);
7680 #ifdef CONFIG_NO_HZ_COMMON
7681 	if (nohz_kick_needed(rq))
7682 		nohz_balancer_kick();
7683 #endif
7684 }
7685 
7686 static void rq_online_fair(struct rq *rq)
7687 {
7688 	update_sysctl();
7689 
7690 	update_runtime_enabled(rq);
7691 }
7692 
7693 static void rq_offline_fair(struct rq *rq)
7694 {
7695 	update_sysctl();
7696 
7697 	/* Ensure any throttled groups are reachable by pick_next_task */
7698 	unthrottle_offline_cfs_rqs(rq);
7699 }
7700 
7701 #endif /* CONFIG_SMP */
7702 
7703 /*
7704  * scheduler tick hitting a task of our scheduling class:
7705  */
7706 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7707 {
7708 	struct cfs_rq *cfs_rq;
7709 	struct sched_entity *se = &curr->se;
7710 
7711 	for_each_sched_entity(se) {
7712 		cfs_rq = cfs_rq_of(se);
7713 		entity_tick(cfs_rq, se, queued);
7714 	}
7715 
7716 	if (numabalancing_enabled)
7717 		task_tick_numa(rq, curr);
7718 
7719 	update_rq_runnable_avg(rq, 1);
7720 }
7721 
7722 /*
7723  * called on fork with the child task as argument from the parent's context
7724  *  - child not yet on the tasklist
7725  *  - preemption disabled
7726  */
7727 static void task_fork_fair(struct task_struct *p)
7728 {
7729 	struct cfs_rq *cfs_rq;
7730 	struct sched_entity *se = &p->se, *curr;
7731 	int this_cpu = smp_processor_id();
7732 	struct rq *rq = this_rq();
7733 	unsigned long flags;
7734 
7735 	raw_spin_lock_irqsave(&rq->lock, flags);
7736 
7737 	update_rq_clock(rq);
7738 
7739 	cfs_rq = task_cfs_rq(current);
7740 	curr = cfs_rq->curr;
7741 
7742 	/*
7743 	 * Not only the cpu but also the task_group of the parent might have
7744 	 * been changed after parent->se.parent,cfs_rq were copied to
7745 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make the
7746 	 * child's pointers refer to valid ones.
7747 	 */
7748 	rcu_read_lock();
7749 	__set_task_cpu(p, this_cpu);
7750 	rcu_read_unlock();
7751 
7752 	update_curr(cfs_rq);
7753 
7754 	if (curr)
7755 		se->vruntime = curr->vruntime;
7756 	place_entity(cfs_rq, se, 1);
7757 
7758 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7759 		/*
7760 		 * Upon rescheduling, sched_class::put_prev_task() will place
7761 		 * 'current' within the tree based on its new key value.
7762 		 */
7763 		swap(curr->vruntime, se->vruntime);
7764 		resched_curr(rq);
7765 	}
7766 
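	/*
	 * Make vruntime relative to this cfs_rq's min_vruntime: the child may
	 * later be enqueued on a different cfs_rq, where enqueue_entity(.flags=0)
	 * re-adds that cfs_rq's min_vruntime (see the normalization comments in
	 * switched_from_fair() and task_move_group_fair() below).
	 */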
7767 	se->vruntime -= cfs_rq->min_vruntime;
7768 
7769 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7770 }
7771 
7772 /*
7773  * Priority of the task has changed. Check to see if we preempt
7774  * the current task.
7775  */
7776 static void
7777 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7778 {
7779 	if (!task_on_rq_queued(p))
7780 		return;
7781 
7782 	/*
7783 	 * Reschedule if we are currently running on this runqueue and
7784 	 * our priority decreased, or if we are not currently running on
7785 	 * this runqueue and our priority is higher than the current's
7786 	 */
7787 	if (rq->curr == p) {
7788 		if (p->prio > oldprio)
7789 			resched_curr(rq);
7790 	} else
7791 		check_preempt_curr(rq, p, 0);
7792 }
7793 
7794 static void switched_from_fair(struct rq *rq, struct task_struct *p)
7795 {
7796 	struct sched_entity *se = &p->se;
7797 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
7798 
7799 	/*
7800 	 * Ensure the task's vruntime is normalized, so that when it's
7801 	 * switched back to the fair class the enqueue_entity(.flags=0) will
7802 	 * do the right thing.
7803 	 *
7804 	 * If it's queued, then the dequeue_entity(.flags=0) will already
7805 	 * have normalized the vruntime; if it's !queued, then only when
7806 	 * the task is sleeping will it still have non-normalized vruntime.
7807 	 */
7808 	if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7809 		/*
7810 		 * Fix up our vruntime so that the current sleep doesn't
7811 		 * cause 'unlimited' sleep bonus.
7812 		 */
7813 		place_entity(cfs_rq, se, 0);
7814 		se->vruntime -= cfs_rq->min_vruntime;
7815 	}
7816 
7817 #ifdef CONFIG_SMP
7818 	/*
7819 	 * Remove our load from contribution when we leave sched_fair
7820 	 * and ensure we don't carry in an old decay_count if we
7821 	 * switch back.
7822 	 */
7823 	if (se->avg.decay_count) {
7824 		__synchronize_entity_decay(se);
7825 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
7826 	}
7827 #endif
7828 }
7829 
7830 /*
7831  * We switched to the sched_fair class.
7832  */
7833 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7834 {
7835 #ifdef CONFIG_FAIR_GROUP_SCHED
7836 	struct sched_entity *se = &p->se;
7837 	/*
7838 	 * Since the real depth could have been changed (only the FAIR
7839 	 * class maintains the depth value), reset it properly.
7840 	 */
7841 	se->depth = se->parent ? se->parent->depth + 1 : 0;
7842 #endif
7843 	if (!task_on_rq_queued(p))
7844 		return;
7845 
7846 	/*
7847 	 * We were most likely switched from sched_rt, so
7848 	 * kick off the schedule if running, otherwise just see
7849 	 * if we can still preempt the current task.
7850 	 */
7851 	if (rq->curr == p)
7852 		resched_curr(rq);
7853 	else
7854 		check_preempt_curr(rq, p, 0);
7855 }
7856 
7857 /* Account for a task changing its policy or group.
7858  *
7859  * This routine is mostly called to set cfs_rq->curr field when a task
7860  * migrates between groups/classes.
7861  */
7862 static void set_curr_task_fair(struct rq *rq)
7863 {
7864 	struct sched_entity *se = &rq->curr->se;
7865 
7866 	for_each_sched_entity(se) {
7867 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
7868 
7869 		set_next_entity(cfs_rq, se);
7870 		/* ensure bandwidth has been allocated on our new cfs_rq */
7871 		account_cfs_rq_runtime(cfs_rq, 0);
7872 	}
7873 }
7874 
7875 void init_cfs_rq(struct cfs_rq *cfs_rq)
7876 {
7877 	cfs_rq->tasks_timeline = RB_ROOT;
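	/*
	 * min_vruntime starts just below the u64 wrap point (-(1 << 20) ns is
	 * about -1ms); presumably so the wrap-safe vruntime comparisons are
	 * exercised early. That rationale is inferred, not stated here.
	 */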
7878 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7879 #ifndef CONFIG_64BIT
7880 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7881 #endif
7882 #ifdef CONFIG_SMP
7883 	atomic64_set(&cfs_rq->decay_counter, 1);
7884 	atomic_long_set(&cfs_rq->removed_load, 0);
7885 #endif
7886 }
7887 
7888 #ifdef CONFIG_FAIR_GROUP_SCHED
7889 static void task_move_group_fair(struct task_struct *p, int queued)
7890 {
7891 	struct sched_entity *se = &p->se;
7892 	struct cfs_rq *cfs_rq;
7893 
7894 	/*
7895 	 * If the task was not on the rq at the time of this cgroup movement
7896 	 * it must have been asleep, sleeping tasks keep their ->vruntime
7897 	 * absolute on their old rq until wakeup (needed for the fair sleeper
7898 	 * bonus in place_entity()).
7899 	 *
7900 	 * If it was on the rq, we've just 'preempted' it, which does convert
7901 	 * ->vruntime to a relative base.
7902 	 *
7903 	 * Make sure both cases convert their relative position when migrating
7904 	 * to another cgroup's rq. This does somewhat interfere with the
7905 	 * fair sleeper stuff for the first placement, but who cares.
7906 	 */
7907 	/*
7908 	 * When !queued, vruntime of the task has usually NOT been normalized.
7909 	 * But there are some cases where it has already been normalized:
7910 	 *
7911 	 * - Moving a forked child which is waiting for being woken up by
7912 	 *   wake_up_new_task().
7913 	 * - Moving a task which has been woken up by try_to_wake_up() and
7914 	 *   waiting for actually being woken up by sched_ttwu_pending().
7915 	 *
7916 	 * To prevent boost or penalty in the new cfs_rq caused by delta
7917 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7918 	 */
7919 	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7920 		queued = 1;
7921 
7922 	if (!queued)
7923 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
7924 	set_task_rq(p, task_cpu(p));
7925 	se->depth = se->parent ? se->parent->depth + 1 : 0;
7926 	if (!queued) {
7927 		cfs_rq = cfs_rq_of(se);
7928 		se->vruntime += cfs_rq->min_vruntime;
7929 #ifdef CONFIG_SMP
7930 		/*
7931 		 * migrate_task_rq_fair() will have removed our previous
7932 		 * contribution, but we must synchronize for ongoing future
7933 		 * decay.
7934 		 */
7935 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7936 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7937 #endif
7938 	}
7939 }
7940 
7941 void free_fair_sched_group(struct task_group *tg)
7942 {
7943 	int i;
7944 
7945 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7946 
7947 	for_each_possible_cpu(i) {
7948 		if (tg->cfs_rq)
7949 			kfree(tg->cfs_rq[i]);
7950 		if (tg->se)
7951 			kfree(tg->se[i]);
7952 	}
7953 
7954 	kfree(tg->cfs_rq);
7955 	kfree(tg->se);
7956 }
7957 
7958 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7959 {
7960 	struct cfs_rq *cfs_rq;
7961 	struct sched_entity *se;
7962 	int i;
7963 
7964 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7965 	if (!tg->cfs_rq)
7966 		goto err;
7967 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7968 	if (!tg->se)
7969 		goto err;
7970 
7971 	tg->shares = NICE_0_LOAD;
7972 
7973 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7974 
7975 	for_each_possible_cpu(i) {
7976 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7977 				      GFP_KERNEL, cpu_to_node(i));
7978 		if (!cfs_rq)
7979 			goto err;
7980 
7981 		se = kzalloc_node(sizeof(struct sched_entity),
7982 				  GFP_KERNEL, cpu_to_node(i));
7983 		if (!se)
7984 			goto err_free_rq;
7985 
7986 		init_cfs_rq(cfs_rq);
7987 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7988 	}
7989 
7990 	return 1;
7991 
7992 err_free_rq:
7993 	kfree(cfs_rq);
7994 err:
7995 	return 0;
7996 }
7997 
7998 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7999 {
8000 	struct rq *rq = cpu_rq(cpu);
8001 	unsigned long flags;
8002 
8003 	/*
8004 	 * Only empty task groups can be destroyed; so we can speculatively
8005 	 * check on_list without danger of it being re-added.
8006 	 */
8007 	if (!tg->cfs_rq[cpu]->on_list)
8008 		return;
8009 
8010 	raw_spin_lock_irqsave(&rq->lock, flags);
8011 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8012 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8013 }
8014 
8015 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8016 			struct sched_entity *se, int cpu,
8017 			struct sched_entity *parent)
8018 {
8019 	struct rq *rq = cpu_rq(cpu);
8020 
8021 	cfs_rq->tg = tg;
8022 	cfs_rq->rq = rq;
8023 	init_cfs_rq_runtime(cfs_rq);
8024 
8025 	tg->cfs_rq[cpu] = cfs_rq;
8026 	tg->se[cpu] = se;
8027 
8028 	/* se could be NULL for root_task_group */
8029 	if (!se)
8030 		return;
8031 
8032 	if (!parent) {
8033 		se->cfs_rq = &rq->cfs;
8034 		se->depth = 0;
8035 	} else {
8036 		se->cfs_rq = parent->my_q;
8037 		se->depth = parent->depth + 1;
8038 	}
8039 
8040 	se->my_q = cfs_rq;
8041 	/* guarantee group entities always have weight */
8042 	update_load_set(&se->load, NICE_0_LOAD);
8043 	se->parent = parent;
8044 }
8045 
8046 static DEFINE_MUTEX(shares_mutex);
8047 
8048 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8049 {
8050 	int i;
8051 	unsigned long flags;
8052 
8053 	/*
8054 	 * We can't change the weight of the root cgroup.
8055 	 */
8056 	if (!tg->se[0])
8057 		return -EINVAL;
8058 
8059 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8060 
8061 	mutex_lock(&shares_mutex);
8062 	if (tg->shares == shares)
8063 		goto done;
8064 
8065 	tg->shares = shares;
8066 	for_each_possible_cpu(i) {
8067 		struct rq *rq = cpu_rq(i);
8068 		struct sched_entity *se;
8069 
8070 		se = tg->se[i];
8071 		/* Propagate contribution to hierarchy */
8072 		raw_spin_lock_irqsave(&rq->lock, flags);
8073 
8074 		/* Possible calls to update_curr() need rq clock */
8075 		update_rq_clock(rq);
8076 		for_each_sched_entity(se)
8077 			update_cfs_shares(group_cfs_rq(se));
8078 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8079 	}
8080 
8081 done:
8082 	mutex_unlock(&shares_mutex);
8083 	return 0;
8084 }
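/*
 * A typical caller (an assumption; the call sites live outside this file) is
 * the cgroup cpu controller handling a write to cpu.shares, with the value
 * clamped to [MIN_SHARES, MAX_SHARES] above.
 */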
8085 #else /* CONFIG_FAIR_GROUP_SCHED */
8086 
8087 void free_fair_sched_group(struct task_group *tg) { }
8088 
8089 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8090 {
8091 	return 1;
8092 }
8093 
8094 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
8095 
8096 #endif /* CONFIG_FAIR_GROUP_SCHED */
8097 
8098 
8099 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8100 {
8101 	struct sched_entity *se = &task->se;
8102 	unsigned int rr_interval = 0;
8103 
8104 	/*
8105 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8106 	 * idle runqueue:
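	 * (the result can also truncate to 0 when the computed slice is
	 * shorter than one jiffy, since NS_TO_JIFFIES() rounds down)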
8107 	 */
8108 	if (rq->cfs.load.weight)
8109 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8110 
8111 	return rr_interval;
8112 }
8113 
8114 /*
8115  * All the scheduling class methods:
8116  */
8117 const struct sched_class fair_sched_class = {
8118 	.next			= &idle_sched_class,
8119 	.enqueue_task		= enqueue_task_fair,
8120 	.dequeue_task		= dequeue_task_fair,
8121 	.yield_task		= yield_task_fair,
8122 	.yield_to_task		= yield_to_task_fair,
8123 
8124 	.check_preempt_curr	= check_preempt_wakeup,
8125 
8126 	.pick_next_task		= pick_next_task_fair,
8127 	.put_prev_task		= put_prev_task_fair,
8128 
8129 #ifdef CONFIG_SMP
8130 	.select_task_rq		= select_task_rq_fair,
8131 	.migrate_task_rq	= migrate_task_rq_fair,
8132 
8133 	.rq_online		= rq_online_fair,
8134 	.rq_offline		= rq_offline_fair,
8135 
8136 	.task_waking		= task_waking_fair,
8137 #endif
8138 
8139 	.set_curr_task          = set_curr_task_fair,
8140 	.task_tick		= task_tick_fair,
8141 	.task_fork		= task_fork_fair,
8142 
8143 	.prio_changed		= prio_changed_fair,
8144 	.switched_from		= switched_from_fair,
8145 	.switched_to		= switched_to_fair,
8146 
8147 	.get_rr_interval	= get_rr_interval_fair,
8148 
8149 	.update_curr		= update_curr_fair,
8150 
8151 #ifdef CONFIG_FAIR_GROUP_SCHED
8152 	.task_move_group	= task_move_group_fair,
8153 #endif
8154 };
8155 
8156 #ifdef CONFIG_SCHED_DEBUG
8157 void print_cfs_stats(struct seq_file *m, int cpu)
8158 {
8159 	struct cfs_rq *cfs_rq;
8160 
8161 	rcu_read_lock();
8162 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8163 		print_cfs_rq(m, cpu, cfs_rq);
8164 	rcu_read_unlock();
8165 }
8166 #endif
8167 
8168 __init void init_sched_fair_class(void)
8169 {
8170 #ifdef CONFIG_SMP
8171 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8172 
8173 #ifdef CONFIG_NO_HZ_COMMON
8174 	nohz.next_balance = jiffies;
8175 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8176 	cpu_notifier(sched_ilb_notifier, 0);
8177 #endif
8178 #endif /* SMP */
8179 
8180 }
8181