xref: /openbmc/linux/kernel/sched/fair.c (revision 1d240875)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32 
33 #include <trace/events/sched.h>
34 
35 #include "sched.h"
36 
37 /*
38  * Targeted preemption latency for CPU-bound tasks:
39  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40  *
41  * NOTE: this latency value is not the same as the concept of
42  * 'timeslice length' - timeslices in CFS are of variable length
43  * and have no persistent notion like in traditional, time-slice
44  * based scheduling concepts.
45  *
46  * (to see the precise effective timeslice length of your workload,
47  *  run vmstat and monitor the context-switches (cs) field)
48  */
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51 
52 /*
53  * The initial- and re-scaling of tunables is configurable
54  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55  *
56  * Options are:
57  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
59  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60  */
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 	= SCHED_TUNABLESCALING_LOG;
63 
64 /*
65  * Minimal preemption granularity for CPU-bound tasks:
66  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67  */
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70 
71 /*
72  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73  */
74 static unsigned int sched_nr_latency = 8;
75 
76 /*
77  * After fork, child runs first. If set to 0 (default) then
78  * parent will (try to) run first.
79  */
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
81 
82 /*
83  * SCHED_OTHER wake-up granularity.
84  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85  *
86  * This option delays the preemption effects of decoupled workloads
87  * and reduces their over-scheduling. Synchronous workloads will still
88  * have immediate wakeup/sleep latencies.
89  */
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92 
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94 
95 /*
96  * The exponential sliding  window over which load is averaged for shares
97  * distribution.
98  * (default: 10msec)
99  */
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101 
102 #ifdef CONFIG_CFS_BANDWIDTH
103 /*
104  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105  * each time a cfs_rq requests quota.
106  *
107  * Note: in the case that the slice exceeds the runtime remaining (either due
108  * to consumption or the quota being specified to be smaller than the slice)
109  * we will always only issue the remaining available time.
110  *
111  * default: 5 msec, units: microseconds
112   */
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114 #endif
115 
116 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117 {
118 	lw->weight += inc;
119 	lw->inv_weight = 0;
120 }
121 
122 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123 {
124 	lw->weight -= dec;
125 	lw->inv_weight = 0;
126 }
127 
128 static inline void update_load_set(struct load_weight *lw, unsigned long w)
129 {
130 	lw->weight = w;
131 	lw->inv_weight = 0;
132 }
133 
134 /*
135  * Increase the granularity value when there are more CPUs,
136  * because with more CPUs the 'effective latency' as visible
137  * to users decreases. But the relationship is not linear,
138  * so pick a second-best guess by going with the log2 of the
139  * number of CPUs.
140  *
141  * This idea comes from the SD scheduler of Con Kolivas:
142  */
143 static int get_update_sysctl_factor(void)
144 {
145 	unsigned int cpus = min_t(int, num_online_cpus(), 8);
146 	unsigned int factor;
147 
148 	switch (sysctl_sched_tunable_scaling) {
149 	case SCHED_TUNABLESCALING_NONE:
150 		factor = 1;
151 		break;
152 	case SCHED_TUNABLESCALING_LINEAR:
153 		factor = cpus;
154 		break;
155 	case SCHED_TUNABLESCALING_LOG:
156 	default:
157 		factor = 1 + ilog2(cpus);
158 		break;
159 	}
160 
161 	return factor;
162 }
163 
164 static void update_sysctl(void)
165 {
166 	unsigned int factor = get_update_sysctl_factor();
167 
168 #define SET_SYSCTL(name) \
169 	(sysctl_##name = (factor) * normalized_sysctl_##name)
170 	SET_SYSCTL(sched_min_granularity);
171 	SET_SYSCTL(sched_latency);
172 	SET_SYSCTL(sched_wakeup_granularity);
173 #undef SET_SYSCTL
174 }
175 
176 void sched_init_granularity(void)
177 {
178 	update_sysctl();
179 }
180 
181 #define WMULT_CONST	(~0U)
182 #define WMULT_SHIFT	32
183 
184 static void __update_inv_weight(struct load_weight *lw)
185 {
186 	unsigned long w;
187 
188 	if (likely(lw->inv_weight))
189 		return;
190 
191 	w = scale_load_down(lw->weight);
192 
193 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
194 		lw->inv_weight = 1;
195 	else if (unlikely(!w))
196 		lw->inv_weight = WMULT_CONST;
197 	else
198 		lw->inv_weight = WMULT_CONST / w;
199 }
200 
201 /*
202  * delta_exec * weight / lw.weight
203  *   OR
204  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
205  *
206  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
207  * we're guaranteed shift stays positive because inv_weight is guaranteed to
208  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
209  *
210  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
211  * weight/lw.weight <= 1, and therefore our shift will also be positive.
212  */
213 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
214 {
215 	u64 fact = scale_load_down(weight);
216 	int shift = WMULT_SHIFT;
217 
218 	__update_inv_weight(lw);
219 
220 	if (unlikely(fact >> 32)) {
221 		while (fact >> 32) {
222 			fact >>= 1;
223 			shift--;
224 		}
225 	}
226 
227 	/* hint to use a 32x32->64 mul */
228 	fact = (u64)(u32)fact * lw->inv_weight;
229 
230 	while (fact >> 32) {
231 		fact >>= 1;
232 		shift--;
233 	}
234 
235 	return mul_u64_u32_shr(delta_exec, fact, shift);
236 }
237 
238 
239 const struct sched_class fair_sched_class;
240 
241 /**************************************************************
242  * CFS operations on generic schedulable entities:
243  */
244 
245 #ifdef CONFIG_FAIR_GROUP_SCHED
246 
247 /* cpu runqueue to which this cfs_rq is attached */
248 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
249 {
250 	return cfs_rq->rq;
251 }
252 
253 /* An entity is a task if it doesn't "own" a runqueue */
254 #define entity_is_task(se)	(!se->my_q)
255 
256 static inline struct task_struct *task_of(struct sched_entity *se)
257 {
258 #ifdef CONFIG_SCHED_DEBUG
259 	WARN_ON_ONCE(!entity_is_task(se));
260 #endif
261 	return container_of(se, struct task_struct, se);
262 }
263 
264 /* Walk up scheduling entities hierarchy */
265 #define for_each_sched_entity(se) \
266 		for (; se; se = se->parent)
267 
268 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
269 {
270 	return p->se.cfs_rq;
271 }
272 
273 /* runqueue on which this entity is (to be) queued */
274 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
275 {
276 	return se->cfs_rq;
277 }
278 
279 /* runqueue "owned" by this group */
280 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
281 {
282 	return grp->my_q;
283 }
284 
285 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
286 				       int force_update);
287 
288 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
289 {
290 	if (!cfs_rq->on_list) {
291 		/*
292 		 * Ensure we either appear before our parent (if already
293 		 * enqueued) or force our parent to appear after us when it is
294 		 * enqueued.  The fact that we always enqueue bottom-up
295 		 * reduces this to two cases.
296 		 */
297 		if (cfs_rq->tg->parent &&
298 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
299 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
300 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
301 		} else {
302 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
303 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
304 		}
305 
306 		cfs_rq->on_list = 1;
307 		/* We should have no load, but we need to update last_decay. */
308 		update_cfs_rq_blocked_load(cfs_rq, 0);
309 	}
310 }
311 
312 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
313 {
314 	if (cfs_rq->on_list) {
315 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
316 		cfs_rq->on_list = 0;
317 	}
318 }
319 
320 /* Iterate thr' all leaf cfs_rq's on a runqueue */
321 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
322 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 
324 /* Do the two (enqueued) entities belong to the same group ? */
325 static inline struct cfs_rq *
326 is_same_group(struct sched_entity *se, struct sched_entity *pse)
327 {
328 	if (se->cfs_rq == pse->cfs_rq)
329 		return se->cfs_rq;
330 
331 	return NULL;
332 }
333 
334 static inline struct sched_entity *parent_entity(struct sched_entity *se)
335 {
336 	return se->parent;
337 }
338 
339 static void
340 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
341 {
342 	int se_depth, pse_depth;
343 
344 	/*
345 	 * preemption test can be made between sibling entities who are in the
346 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
347 	 * both tasks until we find their ancestors who are siblings of common
348 	 * parent.
349 	 */
350 
351 	/* First walk up until both entities are at same depth */
352 	se_depth = (*se)->depth;
353 	pse_depth = (*pse)->depth;
354 
355 	while (se_depth > pse_depth) {
356 		se_depth--;
357 		*se = parent_entity(*se);
358 	}
359 
360 	while (pse_depth > se_depth) {
361 		pse_depth--;
362 		*pse = parent_entity(*pse);
363 	}
364 
365 	while (!is_same_group(*se, *pse)) {
366 		*se = parent_entity(*se);
367 		*pse = parent_entity(*pse);
368 	}
369 }
370 
371 #else	/* !CONFIG_FAIR_GROUP_SCHED */
372 
373 static inline struct task_struct *task_of(struct sched_entity *se)
374 {
375 	return container_of(se, struct task_struct, se);
376 }
377 
378 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
379 {
380 	return container_of(cfs_rq, struct rq, cfs);
381 }
382 
383 #define entity_is_task(se)	1
384 
385 #define for_each_sched_entity(se) \
386 		for (; se; se = NULL)
387 
388 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
389 {
390 	return &task_rq(p)->cfs;
391 }
392 
393 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
394 {
395 	struct task_struct *p = task_of(se);
396 	struct rq *rq = task_rq(p);
397 
398 	return &rq->cfs;
399 }
400 
401 /* runqueue "owned" by this group */
402 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
403 {
404 	return NULL;
405 }
406 
407 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
408 {
409 }
410 
411 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
412 {
413 }
414 
415 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
416 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
417 
418 static inline struct sched_entity *parent_entity(struct sched_entity *se)
419 {
420 	return NULL;
421 }
422 
423 static inline void
424 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
425 {
426 }
427 
428 #endif	/* CONFIG_FAIR_GROUP_SCHED */
429 
430 static __always_inline
431 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
432 
433 /**************************************************************
434  * Scheduling class tree data structure manipulation methods:
435  */
436 
437 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
438 {
439 	s64 delta = (s64)(vruntime - max_vruntime);
440 	if (delta > 0)
441 		max_vruntime = vruntime;
442 
443 	return max_vruntime;
444 }
445 
446 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
447 {
448 	s64 delta = (s64)(vruntime - min_vruntime);
449 	if (delta < 0)
450 		min_vruntime = vruntime;
451 
452 	return min_vruntime;
453 }
454 
455 static inline int entity_before(struct sched_entity *a,
456 				struct sched_entity *b)
457 {
458 	return (s64)(a->vruntime - b->vruntime) < 0;
459 }
460 
461 static void update_min_vruntime(struct cfs_rq *cfs_rq)
462 {
463 	u64 vruntime = cfs_rq->min_vruntime;
464 
465 	if (cfs_rq->curr)
466 		vruntime = cfs_rq->curr->vruntime;
467 
468 	if (cfs_rq->rb_leftmost) {
469 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
470 						   struct sched_entity,
471 						   run_node);
472 
473 		if (!cfs_rq->curr)
474 			vruntime = se->vruntime;
475 		else
476 			vruntime = min_vruntime(vruntime, se->vruntime);
477 	}
478 
479 	/* ensure we never gain time by being placed backwards. */
480 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
481 #ifndef CONFIG_64BIT
482 	smp_wmb();
483 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
484 #endif
485 }
486 
487 /*
488  * Enqueue an entity into the rb-tree:
489  */
490 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
491 {
492 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
493 	struct rb_node *parent = NULL;
494 	struct sched_entity *entry;
495 	int leftmost = 1;
496 
497 	/*
498 	 * Find the right place in the rbtree:
499 	 */
500 	while (*link) {
501 		parent = *link;
502 		entry = rb_entry(parent, struct sched_entity, run_node);
503 		/*
504 		 * We dont care about collisions. Nodes with
505 		 * the same key stay together.
506 		 */
507 		if (entity_before(se, entry)) {
508 			link = &parent->rb_left;
509 		} else {
510 			link = &parent->rb_right;
511 			leftmost = 0;
512 		}
513 	}
514 
515 	/*
516 	 * Maintain a cache of leftmost tree entries (it is frequently
517 	 * used):
518 	 */
519 	if (leftmost)
520 		cfs_rq->rb_leftmost = &se->run_node;
521 
522 	rb_link_node(&se->run_node, parent, link);
523 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
524 }
525 
526 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
527 {
528 	if (cfs_rq->rb_leftmost == &se->run_node) {
529 		struct rb_node *next_node;
530 
531 		next_node = rb_next(&se->run_node);
532 		cfs_rq->rb_leftmost = next_node;
533 	}
534 
535 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
536 }
537 
538 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
539 {
540 	struct rb_node *left = cfs_rq->rb_leftmost;
541 
542 	if (!left)
543 		return NULL;
544 
545 	return rb_entry(left, struct sched_entity, run_node);
546 }
547 
548 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
549 {
550 	struct rb_node *next = rb_next(&se->run_node);
551 
552 	if (!next)
553 		return NULL;
554 
555 	return rb_entry(next, struct sched_entity, run_node);
556 }
557 
558 #ifdef CONFIG_SCHED_DEBUG
559 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
560 {
561 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
562 
563 	if (!last)
564 		return NULL;
565 
566 	return rb_entry(last, struct sched_entity, run_node);
567 }
568 
569 /**************************************************************
570  * Scheduling class statistics methods:
571  */
572 
573 int sched_proc_update_handler(struct ctl_table *table, int write,
574 		void __user *buffer, size_t *lenp,
575 		loff_t *ppos)
576 {
577 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
578 	int factor = get_update_sysctl_factor();
579 
580 	if (ret || !write)
581 		return ret;
582 
583 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
584 					sysctl_sched_min_granularity);
585 
586 #define WRT_SYSCTL(name) \
587 	(normalized_sysctl_##name = sysctl_##name / (factor))
588 	WRT_SYSCTL(sched_min_granularity);
589 	WRT_SYSCTL(sched_latency);
590 	WRT_SYSCTL(sched_wakeup_granularity);
591 #undef WRT_SYSCTL
592 
593 	return 0;
594 }
595 #endif
596 
597 /*
598  * delta /= w
599  */
600 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
601 {
602 	if (unlikely(se->load.weight != NICE_0_LOAD))
603 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
604 
605 	return delta;
606 }
607 
608 /*
609  * The idea is to set a period in which each task runs once.
610  *
611  * When there are too many tasks (sched_nr_latency) we have to stretch
612  * this period because otherwise the slices get too small.
613  *
614  * p = (nr <= nl) ? l : l*nr/nl
615  */
616 static u64 __sched_period(unsigned long nr_running)
617 {
618 	u64 period = sysctl_sched_latency;
619 	unsigned long nr_latency = sched_nr_latency;
620 
621 	if (unlikely(nr_running > nr_latency)) {
622 		period = sysctl_sched_min_granularity;
623 		period *= nr_running;
624 	}
625 
626 	return period;
627 }
628 
629 /*
630  * We calculate the wall-time slice from the period by taking a part
631  * proportional to the weight.
632  *
633  * s = p*P[w/rw]
634  */
635 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
636 {
637 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
638 
639 	for_each_sched_entity(se) {
640 		struct load_weight *load;
641 		struct load_weight lw;
642 
643 		cfs_rq = cfs_rq_of(se);
644 		load = &cfs_rq->load;
645 
646 		if (unlikely(!se->on_rq)) {
647 			lw = cfs_rq->load;
648 
649 			update_load_add(&lw, se->load.weight);
650 			load = &lw;
651 		}
652 		slice = __calc_delta(slice, se->load.weight, load);
653 	}
654 	return slice;
655 }
656 
657 /*
658  * We calculate the vruntime slice of a to-be-inserted task.
659  *
660  * vs = s/w
661  */
662 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
663 {
664 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
665 }
666 
667 #ifdef CONFIG_SMP
668 static unsigned long task_h_load(struct task_struct *p);
669 
670 static inline void __update_task_entity_contrib(struct sched_entity *se);
671 
672 /* Give new task start runnable values to heavy its load in infant time */
673 void init_task_runnable_average(struct task_struct *p)
674 {
675 	u32 slice;
676 
677 	p->se.avg.decay_count = 0;
678 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
679 	p->se.avg.runnable_avg_sum = slice;
680 	p->se.avg.runnable_avg_period = slice;
681 	__update_task_entity_contrib(&p->se);
682 }
683 #else
684 void init_task_runnable_average(struct task_struct *p)
685 {
686 }
687 #endif
688 
689 /*
690  * Update the current task's runtime statistics.
691  */
692 static void update_curr(struct cfs_rq *cfs_rq)
693 {
694 	struct sched_entity *curr = cfs_rq->curr;
695 	u64 now = rq_clock_task(rq_of(cfs_rq));
696 	u64 delta_exec;
697 
698 	if (unlikely(!curr))
699 		return;
700 
701 	delta_exec = now - curr->exec_start;
702 	if (unlikely((s64)delta_exec <= 0))
703 		return;
704 
705 	curr->exec_start = now;
706 
707 	schedstat_set(curr->statistics.exec_max,
708 		      max(delta_exec, curr->statistics.exec_max));
709 
710 	curr->sum_exec_runtime += delta_exec;
711 	schedstat_add(cfs_rq, exec_clock, delta_exec);
712 
713 	curr->vruntime += calc_delta_fair(delta_exec, curr);
714 	update_min_vruntime(cfs_rq);
715 
716 	if (entity_is_task(curr)) {
717 		struct task_struct *curtask = task_of(curr);
718 
719 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
720 		cpuacct_charge(curtask, delta_exec);
721 		account_group_exec_runtime(curtask, delta_exec);
722 	}
723 
724 	account_cfs_rq_runtime(cfs_rq, delta_exec);
725 }
726 
727 static inline void
728 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
729 {
730 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
731 }
732 
733 /*
734  * Task is being enqueued - update stats:
735  */
736 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
737 {
738 	/*
739 	 * Are we enqueueing a waiting task? (for current tasks
740 	 * a dequeue/enqueue event is a NOP)
741 	 */
742 	if (se != cfs_rq->curr)
743 		update_stats_wait_start(cfs_rq, se);
744 }
745 
746 static void
747 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
748 {
749 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
750 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
751 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
752 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
753 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
754 #ifdef CONFIG_SCHEDSTATS
755 	if (entity_is_task(se)) {
756 		trace_sched_stat_wait(task_of(se),
757 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
758 	}
759 #endif
760 	schedstat_set(se->statistics.wait_start, 0);
761 }
762 
763 static inline void
764 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
765 {
766 	/*
767 	 * Mark the end of the wait period if dequeueing a
768 	 * waiting task:
769 	 */
770 	if (se != cfs_rq->curr)
771 		update_stats_wait_end(cfs_rq, se);
772 }
773 
774 /*
775  * We are picking a new current task - update its stats:
776  */
777 static inline void
778 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
779 {
780 	/*
781 	 * We are starting a new run period:
782 	 */
783 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
784 }
785 
786 /**************************************************
787  * Scheduling class queueing methods:
788  */
789 
790 #ifdef CONFIG_NUMA_BALANCING
791 /*
792  * Approximate time to scan a full NUMA task in ms. The task scan period is
793  * calculated based on the tasks virtual memory size and
794  * numa_balancing_scan_size.
795  */
796 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
797 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
798 
799 /* Portion of address space to scan in MB */
800 unsigned int sysctl_numa_balancing_scan_size = 256;
801 
802 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
803 unsigned int sysctl_numa_balancing_scan_delay = 1000;
804 
805 static unsigned int task_nr_scan_windows(struct task_struct *p)
806 {
807 	unsigned long rss = 0;
808 	unsigned long nr_scan_pages;
809 
810 	/*
811 	 * Calculations based on RSS as non-present and empty pages are skipped
812 	 * by the PTE scanner and NUMA hinting faults should be trapped based
813 	 * on resident pages
814 	 */
815 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
816 	rss = get_mm_rss(p->mm);
817 	if (!rss)
818 		rss = nr_scan_pages;
819 
820 	rss = round_up(rss, nr_scan_pages);
821 	return rss / nr_scan_pages;
822 }
823 
824 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
825 #define MAX_SCAN_WINDOW 2560
826 
827 static unsigned int task_scan_min(struct task_struct *p)
828 {
829 	unsigned int scan, floor;
830 	unsigned int windows = 1;
831 
832 	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
833 		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
834 	floor = 1000 / windows;
835 
836 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
837 	return max_t(unsigned int, floor, scan);
838 }
839 
840 static unsigned int task_scan_max(struct task_struct *p)
841 {
842 	unsigned int smin = task_scan_min(p);
843 	unsigned int smax;
844 
845 	/* Watch for min being lower than max due to floor calculations */
846 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
847 	return max(smin, smax);
848 }
849 
850 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
851 {
852 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
853 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
854 }
855 
856 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
857 {
858 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
859 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
860 }
861 
862 struct numa_group {
863 	atomic_t refcount;
864 
865 	spinlock_t lock; /* nr_tasks, tasks */
866 	int nr_tasks;
867 	pid_t gid;
868 	struct list_head task_list;
869 
870 	struct rcu_head rcu;
871 	nodemask_t active_nodes;
872 	unsigned long total_faults;
873 	/*
874 	 * Faults_cpu is used to decide whether memory should move
875 	 * towards the CPU. As a consequence, these stats are weighted
876 	 * more by CPU use than by memory faults.
877 	 */
878 	unsigned long *faults_cpu;
879 	unsigned long faults[0];
880 };
881 
882 /* Shared or private faults. */
883 #define NR_NUMA_HINT_FAULT_TYPES 2
884 
885 /* Memory and CPU locality */
886 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887 
888 /* Averaged statistics, and temporary buffers. */
889 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890 
891 pid_t task_numa_group_id(struct task_struct *p)
892 {
893 	return p->numa_group ? p->numa_group->gid : 0;
894 }
895 
896 static inline int task_faults_idx(int nid, int priv)
897 {
898 	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
899 }
900 
901 static inline unsigned long task_faults(struct task_struct *p, int nid)
902 {
903 	if (!p->numa_faults_memory)
904 		return 0;
905 
906 	return p->numa_faults_memory[task_faults_idx(nid, 0)] +
907 		p->numa_faults_memory[task_faults_idx(nid, 1)];
908 }
909 
910 static inline unsigned long group_faults(struct task_struct *p, int nid)
911 {
912 	if (!p->numa_group)
913 		return 0;
914 
915 	return p->numa_group->faults[task_faults_idx(nid, 0)] +
916 		p->numa_group->faults[task_faults_idx(nid, 1)];
917 }
918 
919 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920 {
921 	return group->faults_cpu[task_faults_idx(nid, 0)] +
922 		group->faults_cpu[task_faults_idx(nid, 1)];
923 }
924 
925 /*
926  * These return the fraction of accesses done by a particular task, or
927  * task group, on a particular numa node.  The group weight is given a
928  * larger multiplier, in order to group tasks together that are almost
929  * evenly spread out between numa nodes.
930  */
931 static inline unsigned long task_weight(struct task_struct *p, int nid)
932 {
933 	unsigned long total_faults;
934 
935 	if (!p->numa_faults_memory)
936 		return 0;
937 
938 	total_faults = p->total_numa_faults;
939 
940 	if (!total_faults)
941 		return 0;
942 
943 	return 1000 * task_faults(p, nid) / total_faults;
944 }
945 
946 static inline unsigned long group_weight(struct task_struct *p, int nid)
947 {
948 	if (!p->numa_group || !p->numa_group->total_faults)
949 		return 0;
950 
951 	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
952 }
953 
954 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 				int src_nid, int dst_cpu)
956 {
957 	struct numa_group *ng = p->numa_group;
958 	int dst_nid = cpu_to_node(dst_cpu);
959 	int last_cpupid, this_cpupid;
960 
961 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962 
963 	/*
964 	 * Multi-stage node selection is used in conjunction with a periodic
965 	 * migration fault to build a temporal task<->page relation. By using
966 	 * a two-stage filter we remove short/unlikely relations.
967 	 *
968 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 	 * a task's usage of a particular page (n_p) per total usage of this
970 	 * page (n_t) (in a given time-span) to a probability.
971 	 *
972 	 * Our periodic faults will sample this probability and getting the
973 	 * same result twice in a row, given these samples are fully
974 	 * independent, is then given by P(n)^2, provided our sample period
975 	 * is sufficiently short compared to the usage pattern.
976 	 *
977 	 * This quadric squishes small probabilities, making it less likely we
978 	 * act on an unlikely task<->page relation.
979 	 */
980 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 	if (!cpupid_pid_unset(last_cpupid) &&
982 				cpupid_to_nid(last_cpupid) != dst_nid)
983 		return false;
984 
985 	/* Always allow migrate on private faults */
986 	if (cpupid_match_pid(p, last_cpupid))
987 		return true;
988 
989 	/* A shared fault, but p->numa_group has not been set up yet. */
990 	if (!ng)
991 		return true;
992 
993 	/*
994 	 * Do not migrate if the destination is not a node that
995 	 * is actively used by this numa group.
996 	 */
997 	if (!node_isset(dst_nid, ng->active_nodes))
998 		return false;
999 
1000 	/*
1001 	 * Source is a node that is not actively used by this
1002 	 * numa group, while the destination is. Migrate.
1003 	 */
1004 	if (!node_isset(src_nid, ng->active_nodes))
1005 		return true;
1006 
1007 	/*
1008 	 * Both source and destination are nodes in active
1009 	 * use by this numa group. Maximize memory bandwidth
1010 	 * by migrating from more heavily used groups, to less
1011 	 * heavily used ones, spreading the load around.
1012 	 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 	 */
1014 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015 }
1016 
1017 static unsigned long weighted_cpuload(const int cpu);
1018 static unsigned long source_load(int cpu, int type);
1019 static unsigned long target_load(int cpu, int type);
1020 static unsigned long capacity_of(int cpu);
1021 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1022 
1023 /* Cached statistics for all CPUs within a node */
1024 struct numa_stats {
1025 	unsigned long nr_running;
1026 	unsigned long load;
1027 
1028 	/* Total compute capacity of CPUs on a node */
1029 	unsigned long compute_capacity;
1030 
1031 	/* Approximate capacity in terms of runnable tasks on a node */
1032 	unsigned long task_capacity;
1033 	int has_free_capacity;
1034 };
1035 
1036 /*
1037  * XXX borrowed from update_sg_lb_stats
1038  */
1039 static void update_numa_stats(struct numa_stats *ns, int nid)
1040 {
1041 	int cpu, cpus = 0;
1042 
1043 	memset(ns, 0, sizeof(*ns));
1044 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1045 		struct rq *rq = cpu_rq(cpu);
1046 
1047 		ns->nr_running += rq->nr_running;
1048 		ns->load += weighted_cpuload(cpu);
1049 		ns->compute_capacity += capacity_of(cpu);
1050 
1051 		cpus++;
1052 	}
1053 
1054 	/*
1055 	 * If we raced with hotplug and there are no CPUs left in our mask
1056 	 * the @ns structure is NULL'ed and task_numa_compare() will
1057 	 * not find this node attractive.
1058 	 *
1059 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1060 	 * imbalance and bail there.
1061 	 */
1062 	if (!cpus)
1063 		return;
1064 
1065 	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 	ns->task_capacity =
1067 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1069 }
1070 
1071 struct task_numa_env {
1072 	struct task_struct *p;
1073 
1074 	int src_cpu, src_nid;
1075 	int dst_cpu, dst_nid;
1076 
1077 	struct numa_stats src_stats, dst_stats;
1078 
1079 	int imbalance_pct;
1080 
1081 	struct task_struct *best_task;
1082 	long best_imp;
1083 	int best_cpu;
1084 };
1085 
1086 static void task_numa_assign(struct task_numa_env *env,
1087 			     struct task_struct *p, long imp)
1088 {
1089 	if (env->best_task)
1090 		put_task_struct(env->best_task);
1091 	if (p)
1092 		get_task_struct(p);
1093 
1094 	env->best_task = p;
1095 	env->best_imp = imp;
1096 	env->best_cpu = env->dst_cpu;
1097 }
1098 
1099 static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1100 				long src_load, long dst_load,
1101 				struct task_numa_env *env)
1102 {
1103 	long imb, old_imb;
1104 
1105 	/* We care about the slope of the imbalance, not the direction. */
1106 	if (dst_load < src_load)
1107 		swap(dst_load, src_load);
1108 
1109 	/* Is the difference below the threshold? */
1110 	imb = dst_load * 100 - src_load * env->imbalance_pct;
1111 	if (imb <= 0)
1112 		return false;
1113 
1114 	/*
1115 	 * The imbalance is above the allowed threshold.
1116 	 * Compare it with the old imbalance.
1117 	 */
1118 	if (orig_dst_load < orig_src_load)
1119 		swap(orig_dst_load, orig_src_load);
1120 
1121 	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1122 
1123 	/* Would this change make things worse? */
1124 	return (imb > old_imb);
1125 }
1126 
1127 /*
1128  * This checks if the overall compute and NUMA accesses of the system would
1129  * be improved if the source tasks was migrated to the target dst_cpu taking
1130  * into account that it might be best if task running on the dst_cpu should
1131  * be exchanged with the source task
1132  */
1133 static void task_numa_compare(struct task_numa_env *env,
1134 			      long taskimp, long groupimp)
1135 {
1136 	struct rq *src_rq = cpu_rq(env->src_cpu);
1137 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 	struct task_struct *cur;
1139 	long orig_src_load, src_load;
1140 	long orig_dst_load, dst_load;
1141 	long load;
1142 	long imp = (groupimp > 0) ? groupimp : taskimp;
1143 
1144 	rcu_read_lock();
1145 	cur = ACCESS_ONCE(dst_rq->curr);
1146 	if (cur->pid == 0) /* idle */
1147 		cur = NULL;
1148 
1149 	/*
1150 	 * "imp" is the fault differential for the source task between the
1151 	 * source and destination node. Calculate the total differential for
1152 	 * the source task and potential destination task. The more negative
1153 	 * the value is, the more rmeote accesses that would be expected to
1154 	 * be incurred if the tasks were swapped.
1155 	 */
1156 	if (cur) {
1157 		/* Skip this swap candidate if cannot move to the source cpu */
1158 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1159 			goto unlock;
1160 
1161 		/*
1162 		 * If dst and source tasks are in the same NUMA group, or not
1163 		 * in any group then look only at task weights.
1164 		 */
1165 		if (cur->numa_group == env->p->numa_group) {
1166 			imp = taskimp + task_weight(cur, env->src_nid) -
1167 			      task_weight(cur, env->dst_nid);
1168 			/*
1169 			 * Add some hysteresis to prevent swapping the
1170 			 * tasks within a group over tiny differences.
1171 			 */
1172 			if (cur->numa_group)
1173 				imp -= imp/16;
1174 		} else {
1175 			/*
1176 			 * Compare the group weights. If a task is all by
1177 			 * itself (not part of a group), use the task weight
1178 			 * instead.
1179 			 */
1180 			if (env->p->numa_group)
1181 				imp = groupimp;
1182 			else
1183 				imp = taskimp;
1184 
1185 			if (cur->numa_group)
1186 				imp += group_weight(cur, env->src_nid) -
1187 				       group_weight(cur, env->dst_nid);
1188 			else
1189 				imp += task_weight(cur, env->src_nid) -
1190 				       task_weight(cur, env->dst_nid);
1191 		}
1192 	}
1193 
1194 	if (imp < env->best_imp)
1195 		goto unlock;
1196 
1197 	if (!cur) {
1198 		/* Is there capacity at our destination? */
1199 		if (env->src_stats.has_free_capacity &&
1200 		    !env->dst_stats.has_free_capacity)
1201 			goto unlock;
1202 
1203 		goto balance;
1204 	}
1205 
1206 	/* Balance doesn't matter much if we're running a task per cpu */
1207 	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1208 		goto assign;
1209 
1210 	/*
1211 	 * In the overloaded case, try and keep the load balanced.
1212 	 */
1213 balance:
1214 	orig_dst_load = env->dst_stats.load;
1215 	orig_src_load = env->src_stats.load;
1216 
1217 	/* XXX missing capacity terms */
1218 	load = task_h_load(env->p);
1219 	dst_load = orig_dst_load + load;
1220 	src_load = orig_src_load - load;
1221 
1222 	if (cur) {
1223 		load = task_h_load(cur);
1224 		dst_load -= load;
1225 		src_load += load;
1226 	}
1227 
1228 	if (load_too_imbalanced(orig_src_load, orig_dst_load,
1229 				src_load, dst_load, env))
1230 		goto unlock;
1231 
1232 assign:
1233 	task_numa_assign(env, cur, imp);
1234 unlock:
1235 	rcu_read_unlock();
1236 }
1237 
1238 static void task_numa_find_cpu(struct task_numa_env *env,
1239 				long taskimp, long groupimp)
1240 {
1241 	int cpu;
1242 
1243 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1244 		/* Skip this CPU if the source task cannot migrate */
1245 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1246 			continue;
1247 
1248 		env->dst_cpu = cpu;
1249 		task_numa_compare(env, taskimp, groupimp);
1250 	}
1251 }
1252 
1253 static int task_numa_migrate(struct task_struct *p)
1254 {
1255 	struct task_numa_env env = {
1256 		.p = p,
1257 
1258 		.src_cpu = task_cpu(p),
1259 		.src_nid = task_node(p),
1260 
1261 		.imbalance_pct = 112,
1262 
1263 		.best_task = NULL,
1264 		.best_imp = 0,
1265 		.best_cpu = -1
1266 	};
1267 	struct sched_domain *sd;
1268 	unsigned long taskweight, groupweight;
1269 	int nid, ret;
1270 	long taskimp, groupimp;
1271 
1272 	/*
1273 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1274 	 * imbalance and would be the first to start moving tasks about.
1275 	 *
1276 	 * And we want to avoid any moving of tasks about, as that would create
1277 	 * random movement of tasks -- counter the numa conditions we're trying
1278 	 * to satisfy here.
1279 	 */
1280 	rcu_read_lock();
1281 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1282 	if (sd)
1283 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1284 	rcu_read_unlock();
1285 
1286 	/*
1287 	 * Cpusets can break the scheduler domain tree into smaller
1288 	 * balance domains, some of which do not cross NUMA boundaries.
1289 	 * Tasks that are "trapped" in such domains cannot be migrated
1290 	 * elsewhere, so there is no point in (re)trying.
1291 	 */
1292 	if (unlikely(!sd)) {
1293 		p->numa_preferred_nid = task_node(p);
1294 		return -EINVAL;
1295 	}
1296 
1297 	taskweight = task_weight(p, env.src_nid);
1298 	groupweight = group_weight(p, env.src_nid);
1299 	update_numa_stats(&env.src_stats, env.src_nid);
1300 	env.dst_nid = p->numa_preferred_nid;
1301 	taskimp = task_weight(p, env.dst_nid) - taskweight;
1302 	groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 	update_numa_stats(&env.dst_stats, env.dst_nid);
1304 
1305 	/* If the preferred nid has free capacity, try to use it. */
1306 	if (env.dst_stats.has_free_capacity)
1307 		task_numa_find_cpu(&env, taskimp, groupimp);
1308 
1309 	/* No space available on the preferred nid. Look elsewhere. */
1310 	if (env.best_cpu == -1) {
1311 		for_each_online_node(nid) {
1312 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1313 				continue;
1314 
1315 			/* Only consider nodes where both task and groups benefit */
1316 			taskimp = task_weight(p, nid) - taskweight;
1317 			groupimp = group_weight(p, nid) - groupweight;
1318 			if (taskimp < 0 && groupimp < 0)
1319 				continue;
1320 
1321 			env.dst_nid = nid;
1322 			update_numa_stats(&env.dst_stats, env.dst_nid);
1323 			task_numa_find_cpu(&env, taskimp, groupimp);
1324 		}
1325 	}
1326 
1327 	/* No better CPU than the current one was found. */
1328 	if (env.best_cpu == -1)
1329 		return -EAGAIN;
1330 
1331 	/*
1332 	 * If the task is part of a workload that spans multiple NUMA nodes,
1333 	 * and is migrating into one of the workload's active nodes, remember
1334 	 * this node as the task's preferred numa node, so the workload can
1335 	 * settle down.
1336 	 * A task that migrated to a second choice node will be better off
1337 	 * trying for a better one later. Do not set the preferred node here.
1338 	 */
1339 	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1340 		sched_setnuma(p, env.dst_nid);
1341 
1342 	/*
1343 	 * Reset the scan period if the task is being rescheduled on an
1344 	 * alternative node to recheck if the tasks is now properly placed.
1345 	 */
1346 	p->numa_scan_period = task_scan_min(p);
1347 
1348 	if (env.best_task == NULL) {
1349 		ret = migrate_task_to(p, env.best_cpu);
1350 		if (ret != 0)
1351 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1352 		return ret;
1353 	}
1354 
1355 	ret = migrate_swap(p, env.best_task);
1356 	if (ret != 0)
1357 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1358 	put_task_struct(env.best_task);
1359 	return ret;
1360 }
1361 
1362 /* Attempt to migrate a task to a CPU on the preferred node. */
1363 static void numa_migrate_preferred(struct task_struct *p)
1364 {
1365 	unsigned long interval = HZ;
1366 
1367 	/* This task has no NUMA fault statistics yet */
1368 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1369 		return;
1370 
1371 	/* Periodically retry migrating the task to the preferred node */
1372 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1373 	p->numa_migrate_retry = jiffies + interval;
1374 
1375 	/* Success if task is already running on preferred CPU */
1376 	if (task_node(p) == p->numa_preferred_nid)
1377 		return;
1378 
1379 	/* Otherwise, try migrate to a CPU on the preferred node */
1380 	task_numa_migrate(p);
1381 }
1382 
1383 /*
1384  * Find the nodes on which the workload is actively running. We do this by
1385  * tracking the nodes from which NUMA hinting faults are triggered. This can
1386  * be different from the set of nodes where the workload's memory is currently
1387  * located.
1388  *
1389  * The bitmask is used to make smarter decisions on when to do NUMA page
1390  * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1391  * are added when they cause over 6/16 of the maximum number of faults, but
1392  * only removed when they drop below 3/16.
1393  */
1394 static void update_numa_active_node_mask(struct numa_group *numa_group)
1395 {
1396 	unsigned long faults, max_faults = 0;
1397 	int nid;
1398 
1399 	for_each_online_node(nid) {
1400 		faults = group_faults_cpu(numa_group, nid);
1401 		if (faults > max_faults)
1402 			max_faults = faults;
1403 	}
1404 
1405 	for_each_online_node(nid) {
1406 		faults = group_faults_cpu(numa_group, nid);
1407 		if (!node_isset(nid, numa_group->active_nodes)) {
1408 			if (faults > max_faults * 6 / 16)
1409 				node_set(nid, numa_group->active_nodes);
1410 		} else if (faults < max_faults * 3 / 16)
1411 			node_clear(nid, numa_group->active_nodes);
1412 	}
1413 }
1414 
1415 /*
1416  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417  * increments. The more local the fault statistics are, the higher the scan
1418  * period will be for the next scan window. If local/remote ratio is below
1419  * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1420  * scan period will decrease
1421  */
1422 #define NUMA_PERIOD_SLOTS 10
1423 #define NUMA_PERIOD_THRESHOLD 3
1424 
1425 /*
1426  * Increase the scan period (slow down scanning) if the majority of
1427  * our memory is already on our local node, or if the majority of
1428  * the page accesses are shared with other processes.
1429  * Otherwise, decrease the scan period.
1430  */
1431 static void update_task_scan_period(struct task_struct *p,
1432 			unsigned long shared, unsigned long private)
1433 {
1434 	unsigned int period_slot;
1435 	int ratio;
1436 	int diff;
1437 
1438 	unsigned long remote = p->numa_faults_locality[0];
1439 	unsigned long local = p->numa_faults_locality[1];
1440 
1441 	/*
1442 	 * If there were no record hinting faults then either the task is
1443 	 * completely idle or all activity is areas that are not of interest
1444 	 * to automatic numa balancing. Scan slower
1445 	 */
1446 	if (local + shared == 0) {
1447 		p->numa_scan_period = min(p->numa_scan_period_max,
1448 			p->numa_scan_period << 1);
1449 
1450 		p->mm->numa_next_scan = jiffies +
1451 			msecs_to_jiffies(p->numa_scan_period);
1452 
1453 		return;
1454 	}
1455 
1456 	/*
1457 	 * Prepare to scale scan period relative to the current period.
1458 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1459 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1460 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1461 	 */
1462 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1463 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1464 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1465 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1466 		if (!slot)
1467 			slot = 1;
1468 		diff = slot * period_slot;
1469 	} else {
1470 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1471 
1472 		/*
1473 		 * Scale scan rate increases based on sharing. There is an
1474 		 * inverse relationship between the degree of sharing and
1475 		 * the adjustment made to the scanning period. Broadly
1476 		 * speaking the intent is that there is little point
1477 		 * scanning faster if shared accesses dominate as it may
1478 		 * simply bounce migrations uselessly
1479 		 */
1480 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1481 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1482 	}
1483 
1484 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1485 			task_scan_min(p), task_scan_max(p));
1486 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1487 }
1488 
1489 /*
1490  * Get the fraction of time the task has been running since the last
1491  * NUMA placement cycle. The scheduler keeps similar statistics, but
1492  * decays those on a 32ms period, which is orders of magnitude off
1493  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1494  * stats only if the task is so new there are no NUMA statistics yet.
1495  */
1496 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1497 {
1498 	u64 runtime, delta, now;
1499 	/* Use the start of this time slice to avoid calculations. */
1500 	now = p->se.exec_start;
1501 	runtime = p->se.sum_exec_runtime;
1502 
1503 	if (p->last_task_numa_placement) {
1504 		delta = runtime - p->last_sum_exec_runtime;
1505 		*period = now - p->last_task_numa_placement;
1506 	} else {
1507 		delta = p->se.avg.runnable_avg_sum;
1508 		*period = p->se.avg.runnable_avg_period;
1509 	}
1510 
1511 	p->last_sum_exec_runtime = runtime;
1512 	p->last_task_numa_placement = now;
1513 
1514 	return delta;
1515 }
1516 
1517 static void task_numa_placement(struct task_struct *p)
1518 {
1519 	int seq, nid, max_nid = -1, max_group_nid = -1;
1520 	unsigned long max_faults = 0, max_group_faults = 0;
1521 	unsigned long fault_types[2] = { 0, 0 };
1522 	unsigned long total_faults;
1523 	u64 runtime, period;
1524 	spinlock_t *group_lock = NULL;
1525 
1526 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1527 	if (p->numa_scan_seq == seq)
1528 		return;
1529 	p->numa_scan_seq = seq;
1530 	p->numa_scan_period_max = task_scan_max(p);
1531 
1532 	total_faults = p->numa_faults_locality[0] +
1533 		       p->numa_faults_locality[1];
1534 	runtime = numa_get_avg_runtime(p, &period);
1535 
1536 	/* If the task is part of a group prevent parallel updates to group stats */
1537 	if (p->numa_group) {
1538 		group_lock = &p->numa_group->lock;
1539 		spin_lock_irq(group_lock);
1540 	}
1541 
1542 	/* Find the node with the highest number of faults */
1543 	for_each_online_node(nid) {
1544 		unsigned long faults = 0, group_faults = 0;
1545 		int priv, i;
1546 
1547 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1548 			long diff, f_diff, f_weight;
1549 
1550 			i = task_faults_idx(nid, priv);
1551 
1552 			/* Decay existing window, copy faults since last scan */
1553 			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1554 			fault_types[priv] += p->numa_faults_buffer_memory[i];
1555 			p->numa_faults_buffer_memory[i] = 0;
1556 
1557 			/*
1558 			 * Normalize the faults_from, so all tasks in a group
1559 			 * count according to CPU use, instead of by the raw
1560 			 * number of faults. Tasks with little runtime have
1561 			 * little over-all impact on throughput, and thus their
1562 			 * faults are less important.
1563 			 */
1564 			f_weight = div64_u64(runtime << 16, period + 1);
1565 			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1566 				   (total_faults + 1);
1567 			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1568 			p->numa_faults_buffer_cpu[i] = 0;
1569 
1570 			p->numa_faults_memory[i] += diff;
1571 			p->numa_faults_cpu[i] += f_diff;
1572 			faults += p->numa_faults_memory[i];
1573 			p->total_numa_faults += diff;
1574 			if (p->numa_group) {
1575 				/* safe because we can only change our own group */
1576 				p->numa_group->faults[i] += diff;
1577 				p->numa_group->faults_cpu[i] += f_diff;
1578 				p->numa_group->total_faults += diff;
1579 				group_faults += p->numa_group->faults[i];
1580 			}
1581 		}
1582 
1583 		if (faults > max_faults) {
1584 			max_faults = faults;
1585 			max_nid = nid;
1586 		}
1587 
1588 		if (group_faults > max_group_faults) {
1589 			max_group_faults = group_faults;
1590 			max_group_nid = nid;
1591 		}
1592 	}
1593 
1594 	update_task_scan_period(p, fault_types[0], fault_types[1]);
1595 
1596 	if (p->numa_group) {
1597 		update_numa_active_node_mask(p->numa_group);
1598 		/*
1599 		 * If the preferred task and group nids are different,
1600 		 * iterate over the nodes again to find the best place.
1601 		 */
1602 		if (max_nid != max_group_nid) {
1603 			unsigned long weight, max_weight = 0;
1604 
1605 			for_each_online_node(nid) {
1606 				weight = task_weight(p, nid) + group_weight(p, nid);
1607 				if (weight > max_weight) {
1608 					max_weight = weight;
1609 					max_nid = nid;
1610 				}
1611 			}
1612 		}
1613 
1614 		spin_unlock_irq(group_lock);
1615 	}
1616 
1617 	/* Preferred node as the node with the most faults */
1618 	if (max_faults && max_nid != p->numa_preferred_nid) {
1619 		/* Update the preferred nid and migrate task if possible */
1620 		sched_setnuma(p, max_nid);
1621 		numa_migrate_preferred(p);
1622 	}
1623 }
1624 
1625 static inline int get_numa_group(struct numa_group *grp)
1626 {
1627 	return atomic_inc_not_zero(&grp->refcount);
1628 }
1629 
1630 static inline void put_numa_group(struct numa_group *grp)
1631 {
1632 	if (atomic_dec_and_test(&grp->refcount))
1633 		kfree_rcu(grp, rcu);
1634 }
1635 
1636 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1637 			int *priv)
1638 {
1639 	struct numa_group *grp, *my_grp;
1640 	struct task_struct *tsk;
1641 	bool join = false;
1642 	int cpu = cpupid_to_cpu(cpupid);
1643 	int i;
1644 
1645 	if (unlikely(!p->numa_group)) {
1646 		unsigned int size = sizeof(struct numa_group) +
1647 				    4*nr_node_ids*sizeof(unsigned long);
1648 
1649 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1650 		if (!grp)
1651 			return;
1652 
1653 		atomic_set(&grp->refcount, 1);
1654 		spin_lock_init(&grp->lock);
1655 		INIT_LIST_HEAD(&grp->task_list);
1656 		grp->gid = p->pid;
1657 		/* Second half of the array tracks nids where faults happen */
1658 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1659 						nr_node_ids;
1660 
1661 		node_set(task_node(current), grp->active_nodes);
1662 
1663 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1664 			grp->faults[i] = p->numa_faults_memory[i];
1665 
1666 		grp->total_faults = p->total_numa_faults;
1667 
1668 		list_add(&p->numa_entry, &grp->task_list);
1669 		grp->nr_tasks++;
1670 		rcu_assign_pointer(p->numa_group, grp);
1671 	}
1672 
1673 	rcu_read_lock();
1674 	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1675 
1676 	if (!cpupid_match_pid(tsk, cpupid))
1677 		goto no_join;
1678 
1679 	grp = rcu_dereference(tsk->numa_group);
1680 	if (!grp)
1681 		goto no_join;
1682 
1683 	my_grp = p->numa_group;
1684 	if (grp == my_grp)
1685 		goto no_join;
1686 
1687 	/*
1688 	 * Only join the other group if its bigger; if we're the bigger group,
1689 	 * the other task will join us.
1690 	 */
1691 	if (my_grp->nr_tasks > grp->nr_tasks)
1692 		goto no_join;
1693 
1694 	/*
1695 	 * Tie-break on the grp address.
1696 	 */
1697 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1698 		goto no_join;
1699 
1700 	/* Always join threads in the same process. */
1701 	if (tsk->mm == current->mm)
1702 		join = true;
1703 
1704 	/* Simple filter to avoid false positives due to PID collisions */
1705 	if (flags & TNF_SHARED)
1706 		join = true;
1707 
1708 	/* Update priv based on whether false sharing was detected */
1709 	*priv = !join;
1710 
1711 	if (join && !get_numa_group(grp))
1712 		goto no_join;
1713 
1714 	rcu_read_unlock();
1715 
1716 	if (!join)
1717 		return;
1718 
1719 	BUG_ON(irqs_disabled());
1720 	double_lock_irq(&my_grp->lock, &grp->lock);
1721 
1722 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1723 		my_grp->faults[i] -= p->numa_faults_memory[i];
1724 		grp->faults[i] += p->numa_faults_memory[i];
1725 	}
1726 	my_grp->total_faults -= p->total_numa_faults;
1727 	grp->total_faults += p->total_numa_faults;
1728 
1729 	list_move(&p->numa_entry, &grp->task_list);
1730 	my_grp->nr_tasks--;
1731 	grp->nr_tasks++;
1732 
1733 	spin_unlock(&my_grp->lock);
1734 	spin_unlock_irq(&grp->lock);
1735 
1736 	rcu_assign_pointer(p->numa_group, grp);
1737 
1738 	put_numa_group(my_grp);
1739 	return;
1740 
1741 no_join:
1742 	rcu_read_unlock();
1743 	return;
1744 }
1745 
1746 void task_numa_free(struct task_struct *p)
1747 {
1748 	struct numa_group *grp = p->numa_group;
1749 	void *numa_faults = p->numa_faults_memory;
1750 	unsigned long flags;
1751 	int i;
1752 
1753 	if (grp) {
1754 		spin_lock_irqsave(&grp->lock, flags);
1755 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1756 			grp->faults[i] -= p->numa_faults_memory[i];
1757 		grp->total_faults -= p->total_numa_faults;
1758 
1759 		list_del(&p->numa_entry);
1760 		grp->nr_tasks--;
1761 		spin_unlock_irqrestore(&grp->lock, flags);
1762 		rcu_assign_pointer(p->numa_group, NULL);
1763 		put_numa_group(grp);
1764 	}
1765 
1766 	p->numa_faults_memory = NULL;
1767 	p->numa_faults_buffer_memory = NULL;
1768 	p->numa_faults_cpu= NULL;
1769 	p->numa_faults_buffer_cpu = NULL;
1770 	kfree(numa_faults);
1771 }
1772 
1773 /*
1774  * Got a PROT_NONE fault for a page on @node.
1775  */
1776 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1777 {
1778 	struct task_struct *p = current;
1779 	bool migrated = flags & TNF_MIGRATED;
1780 	int cpu_node = task_node(current);
1781 	int local = !!(flags & TNF_FAULT_LOCAL);
1782 	int priv;
1783 
1784 	if (!numabalancing_enabled)
1785 		return;
1786 
1787 	/* for example, ksmd faulting in a user's mm */
1788 	if (!p->mm)
1789 		return;
1790 
1791 	/* Do not worry about placement if exiting */
1792 	if (p->state == TASK_DEAD)
1793 		return;
1794 
1795 	/* Allocate buffer to track faults on a per-node basis */
1796 	if (unlikely(!p->numa_faults_memory)) {
1797 		int size = sizeof(*p->numa_faults_memory) *
1798 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1799 
1800 		p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1801 		if (!p->numa_faults_memory)
1802 			return;
1803 
1804 		BUG_ON(p->numa_faults_buffer_memory);
1805 		/*
1806 		 * The averaged statistics, shared & private, memory & cpu,
1807 		 * occupy the first half of the array. The second half of the
1808 		 * array is for current counters, which are averaged into the
1809 		 * first set by task_numa_placement.
1810 		 */
1811 		p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1812 		p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1813 		p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1814 		p->total_numa_faults = 0;
1815 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1816 	}
1817 
1818 	/*
1819 	 * First accesses are treated as private, otherwise consider accesses
1820 	 * to be private if the accessing pid has not changed
1821 	 */
1822 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
1823 		priv = 1;
1824 	} else {
1825 		priv = cpupid_match_pid(p, last_cpupid);
1826 		if (!priv && !(flags & TNF_NO_GROUP))
1827 			task_numa_group(p, last_cpupid, flags, &priv);
1828 	}
1829 
1830 	/*
1831 	 * If a workload spans multiple NUMA nodes, a shared fault that
1832 	 * occurs wholly within the set of nodes that the workload is
1833 	 * actively using should be counted as local. This allows the
1834 	 * scan rate to slow down when a workload has settled down.
1835 	 */
1836 	if (!priv && !local && p->numa_group &&
1837 			node_isset(cpu_node, p->numa_group->active_nodes) &&
1838 			node_isset(mem_node, p->numa_group->active_nodes))
1839 		local = 1;
1840 
1841 	task_numa_placement(p);
1842 
1843 	/*
1844 	 * Retry task to preferred node migration periodically, in case it
1845 	 * case it previously failed, or the scheduler moved us.
1846 	 */
1847 	if (time_after(jiffies, p->numa_migrate_retry))
1848 		numa_migrate_preferred(p);
1849 
1850 	if (migrated)
1851 		p->numa_pages_migrated += pages;
1852 
1853 	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1854 	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1855 	p->numa_faults_locality[local] += pages;
1856 }
1857 
1858 static void reset_ptenuma_scan(struct task_struct *p)
1859 {
1860 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
1861 	p->mm->numa_scan_offset = 0;
1862 }
1863 
1864 /*
1865  * The expensive part of numa migration is done from task_work context.
1866  * Triggered from task_tick_numa().
1867  */
1868 void task_numa_work(struct callback_head *work)
1869 {
1870 	unsigned long migrate, next_scan, now = jiffies;
1871 	struct task_struct *p = current;
1872 	struct mm_struct *mm = p->mm;
1873 	struct vm_area_struct *vma;
1874 	unsigned long start, end;
1875 	unsigned long nr_pte_updates = 0;
1876 	long pages;
1877 
1878 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1879 
1880 	work->next = work; /* protect against double add */
1881 	/*
1882 	 * Who cares about NUMA placement when they're dying.
1883 	 *
1884 	 * NOTE: make sure not to dereference p->mm before this check,
1885 	 * exit_task_work() happens _after_ exit_mm() so we could be called
1886 	 * without p->mm even though we still had it when we enqueued this
1887 	 * work.
1888 	 */
1889 	if (p->flags & PF_EXITING)
1890 		return;
1891 
1892 	if (!mm->numa_next_scan) {
1893 		mm->numa_next_scan = now +
1894 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1895 	}
1896 
1897 	/*
1898 	 * Enforce maximal scan/migration frequency..
1899 	 */
1900 	migrate = mm->numa_next_scan;
1901 	if (time_before(now, migrate))
1902 		return;
1903 
1904 	if (p->numa_scan_period == 0) {
1905 		p->numa_scan_period_max = task_scan_max(p);
1906 		p->numa_scan_period = task_scan_min(p);
1907 	}
1908 
1909 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1910 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1911 		return;
1912 
1913 	/*
1914 	 * Delay this task enough that another task of this mm will likely win
1915 	 * the next time around.
1916 	 */
1917 	p->node_stamp += 2 * TICK_NSEC;
1918 
1919 	start = mm->numa_scan_offset;
1920 	pages = sysctl_numa_balancing_scan_size;
1921 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1922 	if (!pages)
1923 		return;
1924 
1925 	down_read(&mm->mmap_sem);
1926 	vma = find_vma(mm, start);
1927 	if (!vma) {
1928 		reset_ptenuma_scan(p);
1929 		start = 0;
1930 		vma = mm->mmap;
1931 	}
1932 	for (; vma; vma = vma->vm_next) {
1933 		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
1934 			continue;
1935 
1936 		/*
1937 		 * Shared library pages mapped by multiple processes are not
1938 		 * migrated as it is expected they are cache replicated. Avoid
1939 		 * hinting faults in read-only file-backed mappings or the vdso
1940 		 * as migrating the pages will be of marginal benefit.
1941 		 */
1942 		if (!vma->vm_mm ||
1943 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
1944 			continue;
1945 
1946 		/*
1947 		 * Skip inaccessible VMAs to avoid any confusion between
1948 		 * PROT_NONE and NUMA hinting ptes
1949 		 */
1950 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1951 			continue;
1952 
1953 		do {
1954 			start = max(start, vma->vm_start);
1955 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1956 			end = min(end, vma->vm_end);
1957 			nr_pte_updates += change_prot_numa(vma, start, end);
1958 
1959 			/*
1960 			 * Scan sysctl_numa_balancing_scan_size but ensure that
1961 			 * at least one PTE is updated so that unused virtual
1962 			 * address space is quickly skipped.
1963 			 */
1964 			if (nr_pte_updates)
1965 				pages -= (end - start) >> PAGE_SHIFT;
1966 
1967 			start = end;
1968 			if (pages <= 0)
1969 				goto out;
1970 
1971 			cond_resched();
1972 		} while (end != vma->vm_end);
1973 	}
1974 
1975 out:
1976 	/*
1977 	 * It is possible to reach the end of the VMA list but the last few
1978 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
1979 	 * would find the !migratable VMA on the next scan but not reset the
1980 	 * scanner to the start so check it now.
1981 	 */
1982 	if (vma)
1983 		mm->numa_scan_offset = start;
1984 	else
1985 		reset_ptenuma_scan(p);
1986 	up_read(&mm->mmap_sem);
1987 }
1988 
1989 /*
1990  * Drive the periodic memory faults..
1991  */
1992 void task_tick_numa(struct rq *rq, struct task_struct *curr)
1993 {
1994 	struct callback_head *work = &curr->numa_work;
1995 	u64 period, now;
1996 
1997 	/*
1998 	 * We don't care about NUMA placement if we don't have memory.
1999 	 */
2000 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2001 		return;
2002 
2003 	/*
2004 	 * Using runtime rather than walltime has the dual advantage that
2005 	 * we (mostly) drive the selection from busy threads and that the
2006 	 * task needs to have done some actual work before we bother with
2007 	 * NUMA placement.
2008 	 */
2009 	now = curr->se.sum_exec_runtime;
2010 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2011 
2012 	if (now - curr->node_stamp > period) {
2013 		if (!curr->node_stamp)
2014 			curr->numa_scan_period = task_scan_min(curr);
2015 		curr->node_stamp += period;
2016 
2017 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2018 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2019 			task_work_add(curr, work, true);
2020 		}
2021 	}
2022 }
2023 #else
2024 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2025 {
2026 }
2027 
2028 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2029 {
2030 }
2031 
2032 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2033 {
2034 }
2035 #endif /* CONFIG_NUMA_BALANCING */
2036 
2037 static void
2038 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2039 {
2040 	update_load_add(&cfs_rq->load, se->load.weight);
2041 	if (!parent_entity(se))
2042 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2043 #ifdef CONFIG_SMP
2044 	if (entity_is_task(se)) {
2045 		struct rq *rq = rq_of(cfs_rq);
2046 
2047 		account_numa_enqueue(rq, task_of(se));
2048 		list_add(&se->group_node, &rq->cfs_tasks);
2049 	}
2050 #endif
2051 	cfs_rq->nr_running++;
2052 }
2053 
2054 static void
2055 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2056 {
2057 	update_load_sub(&cfs_rq->load, se->load.weight);
2058 	if (!parent_entity(se))
2059 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2060 	if (entity_is_task(se)) {
2061 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2062 		list_del_init(&se->group_node);
2063 	}
2064 	cfs_rq->nr_running--;
2065 }
2066 
2067 #ifdef CONFIG_FAIR_GROUP_SCHED
2068 # ifdef CONFIG_SMP
2069 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2070 {
2071 	long tg_weight;
2072 
2073 	/*
2074 	 * Use this CPU's actual weight instead of the last load_contribution
2075 	 * to gain a more accurate current total weight. See
2076 	 * update_cfs_rq_load_contribution().
2077 	 */
2078 	tg_weight = atomic_long_read(&tg->load_avg);
2079 	tg_weight -= cfs_rq->tg_load_contrib;
2080 	tg_weight += cfs_rq->load.weight;
2081 
2082 	return tg_weight;
2083 }
2084 
2085 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2086 {
2087 	long tg_weight, load, shares;
2088 
2089 	tg_weight = calc_tg_weight(tg, cfs_rq);
2090 	load = cfs_rq->load.weight;
2091 
2092 	shares = (tg->shares * load);
2093 	if (tg_weight)
2094 		shares /= tg_weight;
2095 
2096 	if (shares < MIN_SHARES)
2097 		shares = MIN_SHARES;
2098 	if (shares > tg->shares)
2099 		shares = tg->shares;
2100 
2101 	return shares;
2102 }
2103 # else /* CONFIG_SMP */
2104 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2105 {
2106 	return tg->shares;
2107 }
2108 # endif /* CONFIG_SMP */
2109 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2110 			    unsigned long weight)
2111 {
2112 	if (se->on_rq) {
2113 		/* commit outstanding execution time */
2114 		if (cfs_rq->curr == se)
2115 			update_curr(cfs_rq);
2116 		account_entity_dequeue(cfs_rq, se);
2117 	}
2118 
2119 	update_load_set(&se->load, weight);
2120 
2121 	if (se->on_rq)
2122 		account_entity_enqueue(cfs_rq, se);
2123 }
2124 
2125 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2126 
2127 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2128 {
2129 	struct task_group *tg;
2130 	struct sched_entity *se;
2131 	long shares;
2132 
2133 	tg = cfs_rq->tg;
2134 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2135 	if (!se || throttled_hierarchy(cfs_rq))
2136 		return;
2137 #ifndef CONFIG_SMP
2138 	if (likely(se->load.weight == tg->shares))
2139 		return;
2140 #endif
2141 	shares = calc_cfs_shares(cfs_rq, tg);
2142 
2143 	reweight_entity(cfs_rq_of(se), se, shares);
2144 }
2145 #else /* CONFIG_FAIR_GROUP_SCHED */
2146 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2147 {
2148 }
2149 #endif /* CONFIG_FAIR_GROUP_SCHED */
2150 
2151 #ifdef CONFIG_SMP
2152 /*
2153  * We choose a half-life close to 1 scheduling period.
2154  * Note: The tables below are dependent on this value.
2155  */
2156 #define LOAD_AVG_PERIOD 32
2157 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2158 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
2159 
2160 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2161 static const u32 runnable_avg_yN_inv[] = {
2162 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2163 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2164 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2165 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2166 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2167 	0x85aac367, 0x82cd8698,
2168 };
2169 
2170 /*
2171  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2172  * over-estimates when re-combining.
2173  */
2174 static const u32 runnable_avg_yN_sum[] = {
2175 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2176 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2177 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2178 };
2179 
2180 /*
2181  * Approximate:
2182  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2183  */
2184 static __always_inline u64 decay_load(u64 val, u64 n)
2185 {
2186 	unsigned int local_n;
2187 
2188 	if (!n)
2189 		return val;
2190 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2191 		return 0;
2192 
2193 	/* after bounds checking we can collapse to 32-bit */
2194 	local_n = n;
2195 
2196 	/*
2197 	 * As y^PERIOD = 1/2, we can combine
2198 	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
2199 	 * With a look-up table which covers k^n (n<PERIOD)
2200 	 *
2201 	 * To achieve constant time decay_load.
2202 	 */
2203 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2204 		val >>= local_n / LOAD_AVG_PERIOD;
2205 		local_n %= LOAD_AVG_PERIOD;
2206 	}
2207 
2208 	val *= runnable_avg_yN_inv[local_n];
2209 	/* We don't use SRR here since we always want to round down. */
2210 	return val >> 32;
2211 }
2212 
2213 /*
2214  * For updates fully spanning n periods, the contribution to runnable
2215  * average will be: \Sum 1024*y^n
2216  *
2217  * We can compute this reasonably efficiently by combining:
2218  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2219  */
2220 static u32 __compute_runnable_contrib(u64 n)
2221 {
2222 	u32 contrib = 0;
2223 
2224 	if (likely(n <= LOAD_AVG_PERIOD))
2225 		return runnable_avg_yN_sum[n];
2226 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2227 		return LOAD_AVG_MAX;
2228 
2229 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2230 	do {
2231 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2232 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2233 
2234 		n -= LOAD_AVG_PERIOD;
2235 	} while (n > LOAD_AVG_PERIOD);
2236 
2237 	contrib = decay_load(contrib, n);
2238 	return contrib + runnable_avg_yN_sum[n];
2239 }
2240 
2241 /*
2242  * We can represent the historical contribution to runnable average as the
2243  * coefficients of a geometric series.  To do this we sub-divide our runnable
2244  * history into segments of approximately 1ms (1024us); label the segment that
2245  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2246  *
2247  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2248  *      p0            p1           p2
2249  *     (now)       (~1ms ago)  (~2ms ago)
2250  *
2251  * Let u_i denote the fraction of p_i that the entity was runnable.
2252  *
2253  * We then designate the fractions u_i as our co-efficients, yielding the
2254  * following representation of historical load:
2255  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2256  *
2257  * We choose y based on the with of a reasonably scheduling period, fixing:
2258  *   y^32 = 0.5
2259  *
2260  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2261  * approximately half as much as the contribution to load within the last ms
2262  * (u_0).
2263  *
2264  * When a period "rolls over" and we have new u_0`, multiplying the previous
2265  * sum again by y is sufficient to update:
2266  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2267  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2268  */
2269 static __always_inline int __update_entity_runnable_avg(u64 now,
2270 							struct sched_avg *sa,
2271 							int runnable)
2272 {
2273 	u64 delta, periods;
2274 	u32 runnable_contrib;
2275 	int delta_w, decayed = 0;
2276 
2277 	delta = now - sa->last_runnable_update;
2278 	/*
2279 	 * This should only happen when time goes backwards, which it
2280 	 * unfortunately does during sched clock init when we swap over to TSC.
2281 	 */
2282 	if ((s64)delta < 0) {
2283 		sa->last_runnable_update = now;
2284 		return 0;
2285 	}
2286 
2287 	/*
2288 	 * Use 1024ns as the unit of measurement since it's a reasonable
2289 	 * approximation of 1us and fast to compute.
2290 	 */
2291 	delta >>= 10;
2292 	if (!delta)
2293 		return 0;
2294 	sa->last_runnable_update = now;
2295 
2296 	/* delta_w is the amount already accumulated against our next period */
2297 	delta_w = sa->runnable_avg_period % 1024;
2298 	if (delta + delta_w >= 1024) {
2299 		/* period roll-over */
2300 		decayed = 1;
2301 
2302 		/*
2303 		 * Now that we know we're crossing a period boundary, figure
2304 		 * out how much from delta we need to complete the current
2305 		 * period and accrue it.
2306 		 */
2307 		delta_w = 1024 - delta_w;
2308 		if (runnable)
2309 			sa->runnable_avg_sum += delta_w;
2310 		sa->runnable_avg_period += delta_w;
2311 
2312 		delta -= delta_w;
2313 
2314 		/* Figure out how many additional periods this update spans */
2315 		periods = delta / 1024;
2316 		delta %= 1024;
2317 
2318 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2319 						  periods + 1);
2320 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
2321 						     periods + 1);
2322 
2323 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2324 		runnable_contrib = __compute_runnable_contrib(periods);
2325 		if (runnable)
2326 			sa->runnable_avg_sum += runnable_contrib;
2327 		sa->runnable_avg_period += runnable_contrib;
2328 	}
2329 
2330 	/* Remainder of delta accrued against u_0` */
2331 	if (runnable)
2332 		sa->runnable_avg_sum += delta;
2333 	sa->runnable_avg_period += delta;
2334 
2335 	return decayed;
2336 }
2337 
2338 /* Synchronize an entity's decay with its parenting cfs_rq.*/
2339 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2340 {
2341 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2342 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
2343 
2344 	decays -= se->avg.decay_count;
2345 	if (!decays)
2346 		return 0;
2347 
2348 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2349 	se->avg.decay_count = 0;
2350 
2351 	return decays;
2352 }
2353 
2354 #ifdef CONFIG_FAIR_GROUP_SCHED
2355 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2356 						 int force_update)
2357 {
2358 	struct task_group *tg = cfs_rq->tg;
2359 	long tg_contrib;
2360 
2361 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2362 	tg_contrib -= cfs_rq->tg_load_contrib;
2363 
2364 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2365 		atomic_long_add(tg_contrib, &tg->load_avg);
2366 		cfs_rq->tg_load_contrib += tg_contrib;
2367 	}
2368 }
2369 
2370 /*
2371  * Aggregate cfs_rq runnable averages into an equivalent task_group
2372  * representation for computing load contributions.
2373  */
2374 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2375 						  struct cfs_rq *cfs_rq)
2376 {
2377 	struct task_group *tg = cfs_rq->tg;
2378 	long contrib;
2379 
2380 	/* The fraction of a cpu used by this cfs_rq */
2381 	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2382 			  sa->runnable_avg_period + 1);
2383 	contrib -= cfs_rq->tg_runnable_contrib;
2384 
2385 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2386 		atomic_add(contrib, &tg->runnable_avg);
2387 		cfs_rq->tg_runnable_contrib += contrib;
2388 	}
2389 }
2390 
2391 static inline void __update_group_entity_contrib(struct sched_entity *se)
2392 {
2393 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2394 	struct task_group *tg = cfs_rq->tg;
2395 	int runnable_avg;
2396 
2397 	u64 contrib;
2398 
2399 	contrib = cfs_rq->tg_load_contrib * tg->shares;
2400 	se->avg.load_avg_contrib = div_u64(contrib,
2401 				     atomic_long_read(&tg->load_avg) + 1);
2402 
2403 	/*
2404 	 * For group entities we need to compute a correction term in the case
2405 	 * that they are consuming <1 cpu so that we would contribute the same
2406 	 * load as a task of equal weight.
2407 	 *
2408 	 * Explicitly co-ordinating this measurement would be expensive, but
2409 	 * fortunately the sum of each cpus contribution forms a usable
2410 	 * lower-bound on the true value.
2411 	 *
2412 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
2413 	 * (and the sum represents true value) or they are disjoint and we are
2414 	 * understating by the aggregate of their overlap.
2415 	 *
2416 	 * Extending this to N cpus, for a given overlap, the maximum amount we
2417 	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2418 	 * cpus that overlap for this interval and w_i is the interval width.
2419 	 *
2420 	 * On a small machine; the first term is well-bounded which bounds the
2421 	 * total error since w_i is a subset of the period.  Whereas on a
2422 	 * larger machine, while this first term can be larger, if w_i is the
2423 	 * of consequential size guaranteed to see n_i*w_i quickly converge to
2424 	 * our upper bound of 1-cpu.
2425 	 */
2426 	runnable_avg = atomic_read(&tg->runnable_avg);
2427 	if (runnable_avg < NICE_0_LOAD) {
2428 		se->avg.load_avg_contrib *= runnable_avg;
2429 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2430 	}
2431 }
2432 
2433 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2434 {
2435 	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2436 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
2437 }
2438 #else /* CONFIG_FAIR_GROUP_SCHED */
2439 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2440 						 int force_update) {}
2441 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2442 						  struct cfs_rq *cfs_rq) {}
2443 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2444 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2445 #endif /* CONFIG_FAIR_GROUP_SCHED */
2446 
2447 static inline void __update_task_entity_contrib(struct sched_entity *se)
2448 {
2449 	u32 contrib;
2450 
2451 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2452 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2453 	contrib /= (se->avg.runnable_avg_period + 1);
2454 	se->avg.load_avg_contrib = scale_load(contrib);
2455 }
2456 
2457 /* Compute the current contribution to load_avg by se, return any delta */
2458 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2459 {
2460 	long old_contrib = se->avg.load_avg_contrib;
2461 
2462 	if (entity_is_task(se)) {
2463 		__update_task_entity_contrib(se);
2464 	} else {
2465 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2466 		__update_group_entity_contrib(se);
2467 	}
2468 
2469 	return se->avg.load_avg_contrib - old_contrib;
2470 }
2471 
2472 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2473 						 long load_contrib)
2474 {
2475 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
2476 		cfs_rq->blocked_load_avg -= load_contrib;
2477 	else
2478 		cfs_rq->blocked_load_avg = 0;
2479 }
2480 
2481 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2482 
2483 /* Update a sched_entity's runnable average */
2484 static inline void update_entity_load_avg(struct sched_entity *se,
2485 					  int update_cfs_rq)
2486 {
2487 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2488 	long contrib_delta;
2489 	u64 now;
2490 
2491 	/*
2492 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
2493 	 * case they are the parent of a throttled hierarchy.
2494 	 */
2495 	if (entity_is_task(se))
2496 		now = cfs_rq_clock_task(cfs_rq);
2497 	else
2498 		now = cfs_rq_clock_task(group_cfs_rq(se));
2499 
2500 	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2501 		return;
2502 
2503 	contrib_delta = __update_entity_load_avg_contrib(se);
2504 
2505 	if (!update_cfs_rq)
2506 		return;
2507 
2508 	if (se->on_rq)
2509 		cfs_rq->runnable_load_avg += contrib_delta;
2510 	else
2511 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2512 }
2513 
2514 /*
2515  * Decay the load contributed by all blocked children and account this so that
2516  * their contribution may appropriately discounted when they wake up.
2517  */
2518 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2519 {
2520 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2521 	u64 decays;
2522 
2523 	decays = now - cfs_rq->last_decay;
2524 	if (!decays && !force_update)
2525 		return;
2526 
2527 	if (atomic_long_read(&cfs_rq->removed_load)) {
2528 		unsigned long removed_load;
2529 		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2530 		subtract_blocked_load_contrib(cfs_rq, removed_load);
2531 	}
2532 
2533 	if (decays) {
2534 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2535 						      decays);
2536 		atomic64_add(decays, &cfs_rq->decay_counter);
2537 		cfs_rq->last_decay = now;
2538 	}
2539 
2540 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2541 }
2542 
2543 /* Add the load generated by se into cfs_rq's child load-average */
2544 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2545 						  struct sched_entity *se,
2546 						  int wakeup)
2547 {
2548 	/*
2549 	 * We track migrations using entity decay_count <= 0, on a wake-up
2550 	 * migration we use a negative decay count to track the remote decays
2551 	 * accumulated while sleeping.
2552 	 *
2553 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2554 	 * are seen by enqueue_entity_load_avg() as a migration with an already
2555 	 * constructed load_avg_contrib.
2556 	 */
2557 	if (unlikely(se->avg.decay_count <= 0)) {
2558 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2559 		if (se->avg.decay_count) {
2560 			/*
2561 			 * In a wake-up migration we have to approximate the
2562 			 * time sleeping.  This is because we can't synchronize
2563 			 * clock_task between the two cpus, and it is not
2564 			 * guaranteed to be read-safe.  Instead, we can
2565 			 * approximate this using our carried decays, which are
2566 			 * explicitly atomically readable.
2567 			 */
2568 			se->avg.last_runnable_update -= (-se->avg.decay_count)
2569 							<< 20;
2570 			update_entity_load_avg(se, 0);
2571 			/* Indicate that we're now synchronized and on-rq */
2572 			se->avg.decay_count = 0;
2573 		}
2574 		wakeup = 0;
2575 	} else {
2576 		__synchronize_entity_decay(se);
2577 	}
2578 
2579 	/* migrated tasks did not contribute to our blocked load */
2580 	if (wakeup) {
2581 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2582 		update_entity_load_avg(se, 0);
2583 	}
2584 
2585 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2586 	/* we force update consideration on load-balancer moves */
2587 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2588 }
2589 
2590 /*
2591  * Remove se's load from this cfs_rq child load-average, if the entity is
2592  * transitioning to a blocked state we track its projected decay using
2593  * blocked_load_avg.
2594  */
2595 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2596 						  struct sched_entity *se,
2597 						  int sleep)
2598 {
2599 	update_entity_load_avg(se, 1);
2600 	/* we force update consideration on load-balancer moves */
2601 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
2602 
2603 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2604 	if (sleep) {
2605 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2606 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2607 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2608 }
2609 
2610 /*
2611  * Update the rq's load with the elapsed running time before entering
2612  * idle. if the last scheduled task is not a CFS task, idle_enter will
2613  * be the only way to update the runnable statistic.
2614  */
2615 void idle_enter_fair(struct rq *this_rq)
2616 {
2617 	update_rq_runnable_avg(this_rq, 1);
2618 }
2619 
2620 /*
2621  * Update the rq's load with the elapsed idle time before a task is
2622  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2623  * be the only way to update the runnable statistic.
2624  */
2625 void idle_exit_fair(struct rq *this_rq)
2626 {
2627 	update_rq_runnable_avg(this_rq, 0);
2628 }
2629 
2630 static int idle_balance(struct rq *this_rq);
2631 
2632 #else /* CONFIG_SMP */
2633 
2634 static inline void update_entity_load_avg(struct sched_entity *se,
2635 					  int update_cfs_rq) {}
2636 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2637 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2638 					   struct sched_entity *se,
2639 					   int wakeup) {}
2640 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2641 					   struct sched_entity *se,
2642 					   int sleep) {}
2643 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2644 					      int force_update) {}
2645 
2646 static inline int idle_balance(struct rq *rq)
2647 {
2648 	return 0;
2649 }
2650 
2651 #endif /* CONFIG_SMP */
2652 
2653 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2654 {
2655 #ifdef CONFIG_SCHEDSTATS
2656 	struct task_struct *tsk = NULL;
2657 
2658 	if (entity_is_task(se))
2659 		tsk = task_of(se);
2660 
2661 	if (se->statistics.sleep_start) {
2662 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2663 
2664 		if ((s64)delta < 0)
2665 			delta = 0;
2666 
2667 		if (unlikely(delta > se->statistics.sleep_max))
2668 			se->statistics.sleep_max = delta;
2669 
2670 		se->statistics.sleep_start = 0;
2671 		se->statistics.sum_sleep_runtime += delta;
2672 
2673 		if (tsk) {
2674 			account_scheduler_latency(tsk, delta >> 10, 1);
2675 			trace_sched_stat_sleep(tsk, delta);
2676 		}
2677 	}
2678 	if (se->statistics.block_start) {
2679 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2680 
2681 		if ((s64)delta < 0)
2682 			delta = 0;
2683 
2684 		if (unlikely(delta > se->statistics.block_max))
2685 			se->statistics.block_max = delta;
2686 
2687 		se->statistics.block_start = 0;
2688 		se->statistics.sum_sleep_runtime += delta;
2689 
2690 		if (tsk) {
2691 			if (tsk->in_iowait) {
2692 				se->statistics.iowait_sum += delta;
2693 				se->statistics.iowait_count++;
2694 				trace_sched_stat_iowait(tsk, delta);
2695 			}
2696 
2697 			trace_sched_stat_blocked(tsk, delta);
2698 
2699 			/*
2700 			 * Blocking time is in units of nanosecs, so shift by
2701 			 * 20 to get a milliseconds-range estimation of the
2702 			 * amount of time that the task spent sleeping:
2703 			 */
2704 			if (unlikely(prof_on == SLEEP_PROFILING)) {
2705 				profile_hits(SLEEP_PROFILING,
2706 						(void *)get_wchan(tsk),
2707 						delta >> 20);
2708 			}
2709 			account_scheduler_latency(tsk, delta >> 10, 0);
2710 		}
2711 	}
2712 #endif
2713 }
2714 
2715 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2716 {
2717 #ifdef CONFIG_SCHED_DEBUG
2718 	s64 d = se->vruntime - cfs_rq->min_vruntime;
2719 
2720 	if (d < 0)
2721 		d = -d;
2722 
2723 	if (d > 3*sysctl_sched_latency)
2724 		schedstat_inc(cfs_rq, nr_spread_over);
2725 #endif
2726 }
2727 
2728 static void
2729 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2730 {
2731 	u64 vruntime = cfs_rq->min_vruntime;
2732 
2733 	/*
2734 	 * The 'current' period is already promised to the current tasks,
2735 	 * however the extra weight of the new task will slow them down a
2736 	 * little, place the new task so that it fits in the slot that
2737 	 * stays open at the end.
2738 	 */
2739 	if (initial && sched_feat(START_DEBIT))
2740 		vruntime += sched_vslice(cfs_rq, se);
2741 
2742 	/* sleeps up to a single latency don't count. */
2743 	if (!initial) {
2744 		unsigned long thresh = sysctl_sched_latency;
2745 
2746 		/*
2747 		 * Halve their sleep time's effect, to allow
2748 		 * for a gentler effect of sleepers:
2749 		 */
2750 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
2751 			thresh >>= 1;
2752 
2753 		vruntime -= thresh;
2754 	}
2755 
2756 	/* ensure we never gain time by being placed backwards. */
2757 	se->vruntime = max_vruntime(se->vruntime, vruntime);
2758 }
2759 
2760 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2761 
2762 static void
2763 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2764 {
2765 	/*
2766 	 * Update the normalized vruntime before updating min_vruntime
2767 	 * through calling update_curr().
2768 	 */
2769 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2770 		se->vruntime += cfs_rq->min_vruntime;
2771 
2772 	/*
2773 	 * Update run-time statistics of the 'current'.
2774 	 */
2775 	update_curr(cfs_rq);
2776 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2777 	account_entity_enqueue(cfs_rq, se);
2778 	update_cfs_shares(cfs_rq);
2779 
2780 	if (flags & ENQUEUE_WAKEUP) {
2781 		place_entity(cfs_rq, se, 0);
2782 		enqueue_sleeper(cfs_rq, se);
2783 	}
2784 
2785 	update_stats_enqueue(cfs_rq, se);
2786 	check_spread(cfs_rq, se);
2787 	if (se != cfs_rq->curr)
2788 		__enqueue_entity(cfs_rq, se);
2789 	se->on_rq = 1;
2790 
2791 	if (cfs_rq->nr_running == 1) {
2792 		list_add_leaf_cfs_rq(cfs_rq);
2793 		check_enqueue_throttle(cfs_rq);
2794 	}
2795 }
2796 
2797 static void __clear_buddies_last(struct sched_entity *se)
2798 {
2799 	for_each_sched_entity(se) {
2800 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2801 		if (cfs_rq->last != se)
2802 			break;
2803 
2804 		cfs_rq->last = NULL;
2805 	}
2806 }
2807 
2808 static void __clear_buddies_next(struct sched_entity *se)
2809 {
2810 	for_each_sched_entity(se) {
2811 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2812 		if (cfs_rq->next != se)
2813 			break;
2814 
2815 		cfs_rq->next = NULL;
2816 	}
2817 }
2818 
2819 static void __clear_buddies_skip(struct sched_entity *se)
2820 {
2821 	for_each_sched_entity(se) {
2822 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
2823 		if (cfs_rq->skip != se)
2824 			break;
2825 
2826 		cfs_rq->skip = NULL;
2827 	}
2828 }
2829 
2830 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2831 {
2832 	if (cfs_rq->last == se)
2833 		__clear_buddies_last(se);
2834 
2835 	if (cfs_rq->next == se)
2836 		__clear_buddies_next(se);
2837 
2838 	if (cfs_rq->skip == se)
2839 		__clear_buddies_skip(se);
2840 }
2841 
2842 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2843 
2844 static void
2845 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2846 {
2847 	/*
2848 	 * Update run-time statistics of the 'current'.
2849 	 */
2850 	update_curr(cfs_rq);
2851 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2852 
2853 	update_stats_dequeue(cfs_rq, se);
2854 	if (flags & DEQUEUE_SLEEP) {
2855 #ifdef CONFIG_SCHEDSTATS
2856 		if (entity_is_task(se)) {
2857 			struct task_struct *tsk = task_of(se);
2858 
2859 			if (tsk->state & TASK_INTERRUPTIBLE)
2860 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
2861 			if (tsk->state & TASK_UNINTERRUPTIBLE)
2862 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
2863 		}
2864 #endif
2865 	}
2866 
2867 	clear_buddies(cfs_rq, se);
2868 
2869 	if (se != cfs_rq->curr)
2870 		__dequeue_entity(cfs_rq, se);
2871 	se->on_rq = 0;
2872 	account_entity_dequeue(cfs_rq, se);
2873 
2874 	/*
2875 	 * Normalize the entity after updating the min_vruntime because the
2876 	 * update can refer to the ->curr item and we need to reflect this
2877 	 * movement in our normalized position.
2878 	 */
2879 	if (!(flags & DEQUEUE_SLEEP))
2880 		se->vruntime -= cfs_rq->min_vruntime;
2881 
2882 	/* return excess runtime on last dequeue */
2883 	return_cfs_rq_runtime(cfs_rq);
2884 
2885 	update_min_vruntime(cfs_rq);
2886 	update_cfs_shares(cfs_rq);
2887 }
2888 
2889 /*
2890  * Preempt the current task with a newly woken task if needed:
2891  */
2892 static void
2893 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2894 {
2895 	unsigned long ideal_runtime, delta_exec;
2896 	struct sched_entity *se;
2897 	s64 delta;
2898 
2899 	ideal_runtime = sched_slice(cfs_rq, curr);
2900 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 	if (delta_exec > ideal_runtime) {
2902 		resched_task(rq_of(cfs_rq)->curr);
2903 		/*
2904 		 * The current task ran long enough, ensure it doesn't get
2905 		 * re-elected due to buddy favours.
2906 		 */
2907 		clear_buddies(cfs_rq, curr);
2908 		return;
2909 	}
2910 
2911 	/*
2912 	 * Ensure that a task that missed wakeup preemption by a
2913 	 * narrow margin doesn't have to wait for a full slice.
2914 	 * This also mitigates buddy induced latencies under load.
2915 	 */
2916 	if (delta_exec < sysctl_sched_min_granularity)
2917 		return;
2918 
2919 	se = __pick_first_entity(cfs_rq);
2920 	delta = curr->vruntime - se->vruntime;
2921 
2922 	if (delta < 0)
2923 		return;
2924 
2925 	if (delta > ideal_runtime)
2926 		resched_task(rq_of(cfs_rq)->curr);
2927 }
2928 
2929 static void
2930 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2931 {
2932 	/* 'current' is not kept within the tree. */
2933 	if (se->on_rq) {
2934 		/*
2935 		 * Any task has to be enqueued before it get to execute on
2936 		 * a CPU. So account for the time it spent waiting on the
2937 		 * runqueue.
2938 		 */
2939 		update_stats_wait_end(cfs_rq, se);
2940 		__dequeue_entity(cfs_rq, se);
2941 	}
2942 
2943 	update_stats_curr_start(cfs_rq, se);
2944 	cfs_rq->curr = se;
2945 #ifdef CONFIG_SCHEDSTATS
2946 	/*
2947 	 * Track our maximum slice length, if the CPU's load is at
2948 	 * least twice that of our own weight (i.e. dont track it
2949 	 * when there are only lesser-weight tasks around):
2950 	 */
2951 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2952 		se->statistics.slice_max = max(se->statistics.slice_max,
2953 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
2954 	}
2955 #endif
2956 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
2957 }
2958 
2959 static int
2960 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2961 
2962 /*
2963  * Pick the next process, keeping these things in mind, in this order:
2964  * 1) keep things fair between processes/task groups
2965  * 2) pick the "next" process, since someone really wants that to run
2966  * 3) pick the "last" process, for cache locality
2967  * 4) do not run the "skip" process, if something else is available
2968  */
2969 static struct sched_entity *
2970 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2971 {
2972 	struct sched_entity *left = __pick_first_entity(cfs_rq);
2973 	struct sched_entity *se;
2974 
2975 	/*
2976 	 * If curr is set we have to see if its left of the leftmost entity
2977 	 * still in the tree, provided there was anything in the tree at all.
2978 	 */
2979 	if (!left || (curr && entity_before(curr, left)))
2980 		left = curr;
2981 
2982 	se = left; /* ideally we run the leftmost entity */
2983 
2984 	/*
2985 	 * Avoid running the skip buddy, if running something else can
2986 	 * be done without getting too unfair.
2987 	 */
2988 	if (cfs_rq->skip == se) {
2989 		struct sched_entity *second;
2990 
2991 		if (se == curr) {
2992 			second = __pick_first_entity(cfs_rq);
2993 		} else {
2994 			second = __pick_next_entity(se);
2995 			if (!second || (curr && entity_before(curr, second)))
2996 				second = curr;
2997 		}
2998 
2999 		if (second && wakeup_preempt_entity(second, left) < 1)
3000 			se = second;
3001 	}
3002 
3003 	/*
3004 	 * Prefer last buddy, try to return the CPU to a preempted task.
3005 	 */
3006 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3007 		se = cfs_rq->last;
3008 
3009 	/*
3010 	 * Someone really wants this to run. If it's not unfair, run it.
3011 	 */
3012 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3013 		se = cfs_rq->next;
3014 
3015 	clear_buddies(cfs_rq, se);
3016 
3017 	return se;
3018 }
3019 
3020 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3021 
3022 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3023 {
3024 	/*
3025 	 * If still on the runqueue then deactivate_task()
3026 	 * was not called and update_curr() has to be done:
3027 	 */
3028 	if (prev->on_rq)
3029 		update_curr(cfs_rq);
3030 
3031 	/* throttle cfs_rqs exceeding runtime */
3032 	check_cfs_rq_runtime(cfs_rq);
3033 
3034 	check_spread(cfs_rq, prev);
3035 	if (prev->on_rq) {
3036 		update_stats_wait_start(cfs_rq, prev);
3037 		/* Put 'current' back into the tree. */
3038 		__enqueue_entity(cfs_rq, prev);
3039 		/* in !on_rq case, update occurred at dequeue */
3040 		update_entity_load_avg(prev, 1);
3041 	}
3042 	cfs_rq->curr = NULL;
3043 }
3044 
3045 static void
3046 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3047 {
3048 	/*
3049 	 * Update run-time statistics of the 'current'.
3050 	 */
3051 	update_curr(cfs_rq);
3052 
3053 	/*
3054 	 * Ensure that runnable average is periodically updated.
3055 	 */
3056 	update_entity_load_avg(curr, 1);
3057 	update_cfs_rq_blocked_load(cfs_rq, 1);
3058 	update_cfs_shares(cfs_rq);
3059 
3060 #ifdef CONFIG_SCHED_HRTICK
3061 	/*
3062 	 * queued ticks are scheduled to match the slice, so don't bother
3063 	 * validating it and just reschedule.
3064 	 */
3065 	if (queued) {
3066 		resched_task(rq_of(cfs_rq)->curr);
3067 		return;
3068 	}
3069 	/*
3070 	 * don't let the period tick interfere with the hrtick preemption
3071 	 */
3072 	if (!sched_feat(DOUBLE_TICK) &&
3073 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3074 		return;
3075 #endif
3076 
3077 	if (cfs_rq->nr_running > 1)
3078 		check_preempt_tick(cfs_rq, curr);
3079 }
3080 
3081 
3082 /**************************************************
3083  * CFS bandwidth control machinery
3084  */
3085 
3086 #ifdef CONFIG_CFS_BANDWIDTH
3087 
3088 #ifdef HAVE_JUMP_LABEL
3089 static struct static_key __cfs_bandwidth_used;
3090 
3091 static inline bool cfs_bandwidth_used(void)
3092 {
3093 	return static_key_false(&__cfs_bandwidth_used);
3094 }
3095 
3096 void cfs_bandwidth_usage_inc(void)
3097 {
3098 	static_key_slow_inc(&__cfs_bandwidth_used);
3099 }
3100 
3101 void cfs_bandwidth_usage_dec(void)
3102 {
3103 	static_key_slow_dec(&__cfs_bandwidth_used);
3104 }
3105 #else /* HAVE_JUMP_LABEL */
3106 static bool cfs_bandwidth_used(void)
3107 {
3108 	return true;
3109 }
3110 
3111 void cfs_bandwidth_usage_inc(void) {}
3112 void cfs_bandwidth_usage_dec(void) {}
3113 #endif /* HAVE_JUMP_LABEL */
3114 
3115 /*
3116  * default period for cfs group bandwidth.
3117  * default: 0.1s, units: nanoseconds
3118  */
3119 static inline u64 default_cfs_period(void)
3120 {
3121 	return 100000000ULL;
3122 }
3123 
3124 static inline u64 sched_cfs_bandwidth_slice(void)
3125 {
3126 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3127 }
3128 
3129 /*
3130  * Replenish runtime according to assigned quota and update expiration time.
3131  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3132  * additional synchronization around rq->lock.
3133  *
3134  * requires cfs_b->lock
3135  */
3136 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3137 {
3138 	u64 now;
3139 
3140 	if (cfs_b->quota == RUNTIME_INF)
3141 		return;
3142 
3143 	now = sched_clock_cpu(smp_processor_id());
3144 	cfs_b->runtime = cfs_b->quota;
3145 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3146 }
3147 
3148 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3149 {
3150 	return &tg->cfs_bandwidth;
3151 }
3152 
3153 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3154 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3155 {
3156 	if (unlikely(cfs_rq->throttle_count))
3157 		return cfs_rq->throttled_clock_task;
3158 
3159 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3160 }
3161 
3162 /* returns 0 on failure to allocate runtime */
3163 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3164 {
3165 	struct task_group *tg = cfs_rq->tg;
3166 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3167 	u64 amount = 0, min_amount, expires;
3168 
3169 	/* note: this is a positive sum as runtime_remaining <= 0 */
3170 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3171 
3172 	raw_spin_lock(&cfs_b->lock);
3173 	if (cfs_b->quota == RUNTIME_INF)
3174 		amount = min_amount;
3175 	else {
3176 		/*
3177 		 * If the bandwidth pool has become inactive, then at least one
3178 		 * period must have elapsed since the last consumption.
3179 		 * Refresh the global state and ensure bandwidth timer becomes
3180 		 * active.
3181 		 */
3182 		if (!cfs_b->timer_active) {
3183 			__refill_cfs_bandwidth_runtime(cfs_b);
3184 			__start_cfs_bandwidth(cfs_b, false);
3185 		}
3186 
3187 		if (cfs_b->runtime > 0) {
3188 			amount = min(cfs_b->runtime, min_amount);
3189 			cfs_b->runtime -= amount;
3190 			cfs_b->idle = 0;
3191 		}
3192 	}
3193 	expires = cfs_b->runtime_expires;
3194 	raw_spin_unlock(&cfs_b->lock);
3195 
3196 	cfs_rq->runtime_remaining += amount;
3197 	/*
3198 	 * we may have advanced our local expiration to account for allowed
3199 	 * spread between our sched_clock and the one on which runtime was
3200 	 * issued.
3201 	 */
3202 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3203 		cfs_rq->runtime_expires = expires;
3204 
3205 	return cfs_rq->runtime_remaining > 0;
3206 }
3207 
3208 /*
3209  * Note: This depends on the synchronization provided by sched_clock and the
3210  * fact that rq->clock snapshots this value.
3211  */
3212 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3213 {
3214 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3215 
3216 	/* if the deadline is ahead of our clock, nothing to do */
3217 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3218 		return;
3219 
3220 	if (cfs_rq->runtime_remaining < 0)
3221 		return;
3222 
3223 	/*
3224 	 * If the local deadline has passed we have to consider the
3225 	 * possibility that our sched_clock is 'fast' and the global deadline
3226 	 * has not truly expired.
3227 	 *
3228 	 * Fortunately we can check determine whether this the case by checking
3229 	 * whether the global deadline has advanced. It is valid to compare
3230 	 * cfs_b->runtime_expires without any locks since we only care about
3231 	 * exact equality, so a partial write will still work.
3232 	 */
3233 
3234 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3235 		/* extend local deadline, drift is bounded above by 2 ticks */
3236 		cfs_rq->runtime_expires += TICK_NSEC;
3237 	} else {
3238 		/* global deadline is ahead, expiration has passed */
3239 		cfs_rq->runtime_remaining = 0;
3240 	}
3241 }
3242 
3243 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3244 {
3245 	/* dock delta_exec before expiring quota (as it could span periods) */
3246 	cfs_rq->runtime_remaining -= delta_exec;
3247 	expire_cfs_rq_runtime(cfs_rq);
3248 
3249 	if (likely(cfs_rq->runtime_remaining > 0))
3250 		return;
3251 
3252 	/*
3253 	 * if we're unable to extend our runtime we resched so that the active
3254 	 * hierarchy can be throttled
3255 	 */
3256 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 		resched_task(rq_of(cfs_rq)->curr);
3258 }
3259 
3260 static __always_inline
3261 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3262 {
3263 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3264 		return;
3265 
3266 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3267 }
3268 
3269 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3270 {
3271 	return cfs_bandwidth_used() && cfs_rq->throttled;
3272 }
3273 
3274 /* check whether cfs_rq, or any parent, is throttled */
3275 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3276 {
3277 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3278 }
3279 
3280 /*
3281  * Ensure that neither of the group entities corresponding to src_cpu or
3282  * dest_cpu are members of a throttled hierarchy when performing group
3283  * load-balance operations.
3284  */
3285 static inline int throttled_lb_pair(struct task_group *tg,
3286 				    int src_cpu, int dest_cpu)
3287 {
3288 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3289 
3290 	src_cfs_rq = tg->cfs_rq[src_cpu];
3291 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3292 
3293 	return throttled_hierarchy(src_cfs_rq) ||
3294 	       throttled_hierarchy(dest_cfs_rq);
3295 }
3296 
3297 /* updated child weight may affect parent so we have to do this bottom up */
3298 static int tg_unthrottle_up(struct task_group *tg, void *data)
3299 {
3300 	struct rq *rq = data;
3301 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3302 
3303 	cfs_rq->throttle_count--;
3304 #ifdef CONFIG_SMP
3305 	if (!cfs_rq->throttle_count) {
3306 		/* adjust cfs_rq_clock_task() */
3307 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3308 					     cfs_rq->throttled_clock_task;
3309 	}
3310 #endif
3311 
3312 	return 0;
3313 }
3314 
3315 static int tg_throttle_down(struct task_group *tg, void *data)
3316 {
3317 	struct rq *rq = data;
3318 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3319 
3320 	/* group is entering throttled state, stop time */
3321 	if (!cfs_rq->throttle_count)
3322 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3323 	cfs_rq->throttle_count++;
3324 
3325 	return 0;
3326 }
3327 
3328 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3329 {
3330 	struct rq *rq = rq_of(cfs_rq);
3331 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3332 	struct sched_entity *se;
3333 	long task_delta, dequeue = 1;
3334 
3335 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3336 
3337 	/* freeze hierarchy runnable averages while throttled */
3338 	rcu_read_lock();
3339 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3340 	rcu_read_unlock();
3341 
3342 	task_delta = cfs_rq->h_nr_running;
3343 	for_each_sched_entity(se) {
3344 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3345 		/* throttled entity or throttle-on-deactivate */
3346 		if (!se->on_rq)
3347 			break;
3348 
3349 		if (dequeue)
3350 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3351 		qcfs_rq->h_nr_running -= task_delta;
3352 
3353 		if (qcfs_rq->load.weight)
3354 			dequeue = 0;
3355 	}
3356 
3357 	if (!se)
3358 		sub_nr_running(rq, task_delta);
3359 
3360 	cfs_rq->throttled = 1;
3361 	cfs_rq->throttled_clock = rq_clock(rq);
3362 	raw_spin_lock(&cfs_b->lock);
3363 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 	if (!cfs_b->timer_active)
3365 		__start_cfs_bandwidth(cfs_b, false);
3366 	raw_spin_unlock(&cfs_b->lock);
3367 }
3368 
3369 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3370 {
3371 	struct rq *rq = rq_of(cfs_rq);
3372 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3373 	struct sched_entity *se;
3374 	int enqueue = 1;
3375 	long task_delta;
3376 
3377 	se = cfs_rq->tg->se[cpu_of(rq)];
3378 
3379 	cfs_rq->throttled = 0;
3380 
3381 	update_rq_clock(rq);
3382 
3383 	raw_spin_lock(&cfs_b->lock);
3384 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3385 	list_del_rcu(&cfs_rq->throttled_list);
3386 	raw_spin_unlock(&cfs_b->lock);
3387 
3388 	/* update hierarchical throttle state */
3389 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3390 
3391 	if (!cfs_rq->load.weight)
3392 		return;
3393 
3394 	task_delta = cfs_rq->h_nr_running;
3395 	for_each_sched_entity(se) {
3396 		if (se->on_rq)
3397 			enqueue = 0;
3398 
3399 		cfs_rq = cfs_rq_of(se);
3400 		if (enqueue)
3401 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3402 		cfs_rq->h_nr_running += task_delta;
3403 
3404 		if (cfs_rq_throttled(cfs_rq))
3405 			break;
3406 	}
3407 
3408 	if (!se)
3409 		add_nr_running(rq, task_delta);
3410 
3411 	/* determine whether we need to wake up potentially idle cpu */
3412 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 		resched_task(rq->curr);
3414 }
3415 
3416 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 		u64 remaining, u64 expires)
3418 {
3419 	struct cfs_rq *cfs_rq;
3420 	u64 runtime = remaining;
3421 
3422 	rcu_read_lock();
3423 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3424 				throttled_list) {
3425 		struct rq *rq = rq_of(cfs_rq);
3426 
3427 		raw_spin_lock(&rq->lock);
3428 		if (!cfs_rq_throttled(cfs_rq))
3429 			goto next;
3430 
3431 		runtime = -cfs_rq->runtime_remaining + 1;
3432 		if (runtime > remaining)
3433 			runtime = remaining;
3434 		remaining -= runtime;
3435 
3436 		cfs_rq->runtime_remaining += runtime;
3437 		cfs_rq->runtime_expires = expires;
3438 
3439 		/* we check whether we're throttled above */
3440 		if (cfs_rq->runtime_remaining > 0)
3441 			unthrottle_cfs_rq(cfs_rq);
3442 
3443 next:
3444 		raw_spin_unlock(&rq->lock);
3445 
3446 		if (!remaining)
3447 			break;
3448 	}
3449 	rcu_read_unlock();
3450 
3451 	return remaining;
3452 }
3453 
3454 /*
3455  * Responsible for refilling a task_group's bandwidth and unthrottling its
3456  * cfs_rqs as appropriate. If there has been no activity within the last
3457  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3458  * used to track this state.
3459  */
3460 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3461 {
3462 	u64 runtime, runtime_expires;
3463 	int throttled;
3464 
3465 	/* no need to continue the timer with no bandwidth constraint */
3466 	if (cfs_b->quota == RUNTIME_INF)
3467 		goto out_deactivate;
3468 
3469 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3470 	cfs_b->nr_periods += overrun;
3471 
3472 	/*
3473 	 * idle depends on !throttled (for the case of a large deficit), and if
3474 	 * we're going inactive then everything else can be deferred
3475 	 */
3476 	if (cfs_b->idle && !throttled)
3477 		goto out_deactivate;
3478 
3479 	/*
3480 	 * if we have relooped after returning idle once, we need to update our
3481 	 * status as actually running, so that other cpus doing
3482 	 * __start_cfs_bandwidth will stop trying to cancel us.
3483 	 */
3484 	cfs_b->timer_active = 1;
3485 
3486 	__refill_cfs_bandwidth_runtime(cfs_b);
3487 
3488 	if (!throttled) {
3489 		/* mark as potentially idle for the upcoming period */
3490 		cfs_b->idle = 1;
3491 		return 0;
3492 	}
3493 
3494 	/* account preceding periods in which throttling occurred */
3495 	cfs_b->nr_throttled += overrun;
3496 
3497 	/*
3498 	 * There are throttled entities so we must first use the new bandwidth
3499 	 * to unthrottle them before making it generally available.  This
3500 	 * ensures that all existing debts will be paid before a new cfs_rq is
3501 	 * allowed to run.
3502 	 */
3503 	runtime = cfs_b->runtime;
3504 	runtime_expires = cfs_b->runtime_expires;
3505 	cfs_b->runtime = 0;
3506 
3507 	/*
3508 	 * This check is repeated as we are holding onto the new bandwidth
3509 	 * while we unthrottle.  This can potentially race with an unthrottled
3510 	 * group trying to acquire new bandwidth from the global pool.
3511 	 */
3512 	while (throttled && runtime > 0) {
3513 		raw_spin_unlock(&cfs_b->lock);
3514 		/* we can't nest cfs_b->lock while distributing bandwidth */
3515 		runtime = distribute_cfs_runtime(cfs_b, runtime,
3516 						 runtime_expires);
3517 		raw_spin_lock(&cfs_b->lock);
3518 
3519 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3520 	}
3521 
3522 	/* return (any) remaining runtime */
3523 	cfs_b->runtime = runtime;
3524 	/*
3525 	 * While we are ensured activity in the period following an
3526 	 * unthrottle, this also covers the case in which the new bandwidth is
3527 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
3528 	 * timer to remain active while there are any throttled entities.)
3529 	 */
3530 	cfs_b->idle = 0;
3531 
3532 	return 0;
3533 
3534 out_deactivate:
3535 	cfs_b->timer_active = 0;
3536 	return 1;
3537 }
3538 
3539 /* a cfs_rq won't donate quota below this amount */
3540 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3541 /* minimum remaining period time to redistribute slack quota */
3542 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3543 /* how long we wait to gather additional slack before distributing */
3544 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3545 
3546 /*
3547  * Are we near the end of the current quota period?
3548  *
3549  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3550  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3551  * migrate_hrtimers, base is never cleared, so we are fine.
3552  */
3553 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3554 {
3555 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3556 	u64 remaining;
3557 
3558 	/* if the call-back is running a quota refresh is already occurring */
3559 	if (hrtimer_callback_running(refresh_timer))
3560 		return 1;
3561 
3562 	/* is a quota refresh about to occur? */
3563 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3564 	if (remaining < min_expire)
3565 		return 1;
3566 
3567 	return 0;
3568 }
3569 
3570 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3571 {
3572 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3573 
3574 	/* if there's a quota refresh soon don't bother with slack */
3575 	if (runtime_refresh_within(cfs_b, min_left))
3576 		return;
3577 
3578 	start_bandwidth_timer(&cfs_b->slack_timer,
3579 				ns_to_ktime(cfs_bandwidth_slack_period));
3580 }
3581 
3582 /* we know any runtime found here is valid as update_curr() precedes return */
3583 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3584 {
3585 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3586 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3587 
3588 	if (slack_runtime <= 0)
3589 		return;
3590 
3591 	raw_spin_lock(&cfs_b->lock);
3592 	if (cfs_b->quota != RUNTIME_INF &&
3593 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3594 		cfs_b->runtime += slack_runtime;
3595 
3596 		/* we are under rq->lock, defer unthrottling using a timer */
3597 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3598 		    !list_empty(&cfs_b->throttled_cfs_rq))
3599 			start_cfs_slack_bandwidth(cfs_b);
3600 	}
3601 	raw_spin_unlock(&cfs_b->lock);
3602 
3603 	/* even if it's not valid for return we don't want to try again */
3604 	cfs_rq->runtime_remaining -= slack_runtime;
3605 }
3606 
3607 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3608 {
3609 	if (!cfs_bandwidth_used())
3610 		return;
3611 
3612 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3613 		return;
3614 
3615 	__return_cfs_rq_runtime(cfs_rq);
3616 }
3617 
3618 /*
3619  * This is done with a timer (instead of inline with bandwidth return) since
3620  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3621  */
3622 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3623 {
3624 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3625 	u64 expires;
3626 
3627 	/* confirm we're still not at a refresh boundary */
3628 	raw_spin_lock(&cfs_b->lock);
3629 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3630 		raw_spin_unlock(&cfs_b->lock);
3631 		return;
3632 	}
3633 
3634 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3635 		runtime = cfs_b->runtime;
3636 		cfs_b->runtime = 0;
3637 	}
3638 	expires = cfs_b->runtime_expires;
3639 	raw_spin_unlock(&cfs_b->lock);
3640 
3641 	if (!runtime)
3642 		return;
3643 
3644 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3645 
3646 	raw_spin_lock(&cfs_b->lock);
3647 	if (expires == cfs_b->runtime_expires)
3648 		cfs_b->runtime = runtime;
3649 	raw_spin_unlock(&cfs_b->lock);
3650 }
3651 
3652 /*
3653  * When a group wakes up we want to make sure that its quota is not already
3654  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3655  * runtime as update_curr() throttling can not not trigger until it's on-rq.
3656  */
3657 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3658 {
3659 	if (!cfs_bandwidth_used())
3660 		return;
3661 
3662 	/* an active group must be handled by the update_curr()->put() path */
3663 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3664 		return;
3665 
3666 	/* ensure the group is not already throttled */
3667 	if (cfs_rq_throttled(cfs_rq))
3668 		return;
3669 
3670 	/* update runtime allocation */
3671 	account_cfs_rq_runtime(cfs_rq, 0);
3672 	if (cfs_rq->runtime_remaining <= 0)
3673 		throttle_cfs_rq(cfs_rq);
3674 }
3675 
3676 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3677 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3678 {
3679 	if (!cfs_bandwidth_used())
3680 		return false;
3681 
3682 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3683 		return false;
3684 
3685 	/*
3686 	 * it's possible for a throttled entity to be forced into a running
3687 	 * state (e.g. set_curr_task), in this case we're finished.
3688 	 */
3689 	if (cfs_rq_throttled(cfs_rq))
3690 		return true;
3691 
3692 	throttle_cfs_rq(cfs_rq);
3693 	return true;
3694 }
3695 
3696 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3697 {
3698 	struct cfs_bandwidth *cfs_b =
3699 		container_of(timer, struct cfs_bandwidth, slack_timer);
3700 	do_sched_cfs_slack_timer(cfs_b);
3701 
3702 	return HRTIMER_NORESTART;
3703 }
3704 
3705 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3706 {
3707 	struct cfs_bandwidth *cfs_b =
3708 		container_of(timer, struct cfs_bandwidth, period_timer);
3709 	ktime_t now;
3710 	int overrun;
3711 	int idle = 0;
3712 
3713 	raw_spin_lock(&cfs_b->lock);
3714 	for (;;) {
3715 		now = hrtimer_cb_get_time(timer);
3716 		overrun = hrtimer_forward(timer, now, cfs_b->period);
3717 
3718 		if (!overrun)
3719 			break;
3720 
3721 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
3722 	}
3723 	raw_spin_unlock(&cfs_b->lock);
3724 
3725 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3726 }
3727 
3728 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3729 {
3730 	raw_spin_lock_init(&cfs_b->lock);
3731 	cfs_b->runtime = 0;
3732 	cfs_b->quota = RUNTIME_INF;
3733 	cfs_b->period = ns_to_ktime(default_cfs_period());
3734 
3735 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3736 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3737 	cfs_b->period_timer.function = sched_cfs_period_timer;
3738 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3739 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
3740 }
3741 
3742 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3743 {
3744 	cfs_rq->runtime_enabled = 0;
3745 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
3746 }
3747 
3748 /* requires cfs_b->lock, may release to reprogram timer */
3749 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3750 {
3751 	/*
3752 	 * The timer may be active because we're trying to set a new bandwidth
3753 	 * period or because we're racing with the tear-down path
3754 	 * (timer_active==0 becomes visible before the hrtimer call-back
3755 	 * terminates).  In either case we ensure that it's re-programmed
3756 	 */
3757 	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3758 	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3759 		/* bounce the lock to allow do_sched_cfs_period_timer to run */
3760 		raw_spin_unlock(&cfs_b->lock);
3761 		cpu_relax();
3762 		raw_spin_lock(&cfs_b->lock);
3763 		/* if someone else restarted the timer then we're done */
3764 		if (!force && cfs_b->timer_active)
3765 			return;
3766 	}
3767 
3768 	cfs_b->timer_active = 1;
3769 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3770 }
3771 
3772 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3773 {
3774 	hrtimer_cancel(&cfs_b->period_timer);
3775 	hrtimer_cancel(&cfs_b->slack_timer);
3776 }
3777 
3778 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779 {
3780 	struct cfs_rq *cfs_rq;
3781 
3782 	for_each_leaf_cfs_rq(rq, cfs_rq) {
3783 		if (!cfs_rq->runtime_enabled)
3784 			continue;
3785 
3786 		/*
3787 		 * clock_task is not advancing so we just need to make sure
3788 		 * there's some valid quota amount
3789 		 */
3790 		cfs_rq->runtime_remaining = 1;
3791 		if (cfs_rq_throttled(cfs_rq))
3792 			unthrottle_cfs_rq(cfs_rq);
3793 	}
3794 }
3795 
3796 #else /* CONFIG_CFS_BANDWIDTH */
3797 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3798 {
3799 	return rq_clock_task(rq_of(cfs_rq));
3800 }
3801 
3802 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3803 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3804 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3805 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3806 
3807 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3808 {
3809 	return 0;
3810 }
3811 
3812 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3813 {
3814 	return 0;
3815 }
3816 
3817 static inline int throttled_lb_pair(struct task_group *tg,
3818 				    int src_cpu, int dest_cpu)
3819 {
3820 	return 0;
3821 }
3822 
3823 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3824 
3825 #ifdef CONFIG_FAIR_GROUP_SCHED
3826 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3827 #endif
3828 
3829 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3830 {
3831 	return NULL;
3832 }
3833 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3834 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 
3836 #endif /* CONFIG_CFS_BANDWIDTH */
3837 
3838 /**************************************************
3839  * CFS operations on tasks:
3840  */
3841 
3842 #ifdef CONFIG_SCHED_HRTICK
3843 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3844 {
3845 	struct sched_entity *se = &p->se;
3846 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3847 
3848 	WARN_ON(task_rq(p) != rq);
3849 
3850 	if (cfs_rq->nr_running > 1) {
3851 		u64 slice = sched_slice(cfs_rq, se);
3852 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3853 		s64 delta = slice - ran;
3854 
3855 		if (delta < 0) {
3856 			if (rq->curr == p)
3857 				resched_task(p);
3858 			return;
3859 		}
3860 
3861 		/*
3862 		 * Don't schedule slices shorter than 10000ns, that just
3863 		 * doesn't make sense. Rely on vruntime for fairness.
3864 		 */
3865 		if (rq->curr != p)
3866 			delta = max_t(s64, 10000LL, delta);
3867 
3868 		hrtick_start(rq, delta);
3869 	}
3870 }
3871 
3872 /*
3873  * called from enqueue/dequeue and updates the hrtick when the
3874  * current task is from our class and nr_running is low enough
3875  * to matter.
3876  */
3877 static void hrtick_update(struct rq *rq)
3878 {
3879 	struct task_struct *curr = rq->curr;
3880 
3881 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3882 		return;
3883 
3884 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3885 		hrtick_start_fair(rq, curr);
3886 }
3887 #else /* !CONFIG_SCHED_HRTICK */
3888 static inline void
3889 hrtick_start_fair(struct rq *rq, struct task_struct *p)
3890 {
3891 }
3892 
3893 static inline void hrtick_update(struct rq *rq)
3894 {
3895 }
3896 #endif
3897 
3898 /*
3899  * The enqueue_task method is called before nr_running is
3900  * increased. Here we update the fair scheduling stats and
3901  * then put the task into the rbtree:
3902  */
3903 static void
3904 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3905 {
3906 	struct cfs_rq *cfs_rq;
3907 	struct sched_entity *se = &p->se;
3908 
3909 	for_each_sched_entity(se) {
3910 		if (se->on_rq)
3911 			break;
3912 		cfs_rq = cfs_rq_of(se);
3913 		enqueue_entity(cfs_rq, se, flags);
3914 
3915 		/*
3916 		 * end evaluation on encountering a throttled cfs_rq
3917 		 *
3918 		 * note: in the case of encountering a throttled cfs_rq we will
3919 		 * post the final h_nr_running increment below.
3920 		*/
3921 		if (cfs_rq_throttled(cfs_rq))
3922 			break;
3923 		cfs_rq->h_nr_running++;
3924 
3925 		flags = ENQUEUE_WAKEUP;
3926 	}
3927 
3928 	for_each_sched_entity(se) {
3929 		cfs_rq = cfs_rq_of(se);
3930 		cfs_rq->h_nr_running++;
3931 
3932 		if (cfs_rq_throttled(cfs_rq))
3933 			break;
3934 
3935 		update_cfs_shares(cfs_rq);
3936 		update_entity_load_avg(se, 1);
3937 	}
3938 
3939 	if (!se) {
3940 		update_rq_runnable_avg(rq, rq->nr_running);
3941 		add_nr_running(rq, 1);
3942 	}
3943 	hrtick_update(rq);
3944 }
3945 
3946 static void set_next_buddy(struct sched_entity *se);
3947 
3948 /*
3949  * The dequeue_task method is called before nr_running is
3950  * decreased. We remove the task from the rbtree and
3951  * update the fair scheduling stats:
3952  */
3953 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3954 {
3955 	struct cfs_rq *cfs_rq;
3956 	struct sched_entity *se = &p->se;
3957 	int task_sleep = flags & DEQUEUE_SLEEP;
3958 
3959 	for_each_sched_entity(se) {
3960 		cfs_rq = cfs_rq_of(se);
3961 		dequeue_entity(cfs_rq, se, flags);
3962 
3963 		/*
3964 		 * end evaluation on encountering a throttled cfs_rq
3965 		 *
3966 		 * note: in the case of encountering a throttled cfs_rq we will
3967 		 * post the final h_nr_running decrement below.
3968 		*/
3969 		if (cfs_rq_throttled(cfs_rq))
3970 			break;
3971 		cfs_rq->h_nr_running--;
3972 
3973 		/* Don't dequeue parent if it has other entities besides us */
3974 		if (cfs_rq->load.weight) {
3975 			/*
3976 			 * Bias pick_next to pick a task from this cfs_rq, as
3977 			 * p is sleeping when it is within its sched_slice.
3978 			 */
3979 			if (task_sleep && parent_entity(se))
3980 				set_next_buddy(parent_entity(se));
3981 
3982 			/* avoid re-evaluating load for this entity */
3983 			se = parent_entity(se);
3984 			break;
3985 		}
3986 		flags |= DEQUEUE_SLEEP;
3987 	}
3988 
3989 	for_each_sched_entity(se) {
3990 		cfs_rq = cfs_rq_of(se);
3991 		cfs_rq->h_nr_running--;
3992 
3993 		if (cfs_rq_throttled(cfs_rq))
3994 			break;
3995 
3996 		update_cfs_shares(cfs_rq);
3997 		update_entity_load_avg(se, 1);
3998 	}
3999 
4000 	if (!se) {
4001 		sub_nr_running(rq, 1);
4002 		update_rq_runnable_avg(rq, 1);
4003 	}
4004 	hrtick_update(rq);
4005 }
4006 
4007 #ifdef CONFIG_SMP
4008 /* Used instead of source_load when we know the type == 0 */
4009 static unsigned long weighted_cpuload(const int cpu)
4010 {
4011 	return cpu_rq(cpu)->cfs.runnable_load_avg;
4012 }
4013 
4014 /*
4015  * Return a low guess at the load of a migration-source cpu weighted
4016  * according to the scheduling class and "nice" value.
4017  *
4018  * We want to under-estimate the load of migration sources, to
4019  * balance conservatively.
4020  */
4021 static unsigned long source_load(int cpu, int type)
4022 {
4023 	struct rq *rq = cpu_rq(cpu);
4024 	unsigned long total = weighted_cpuload(cpu);
4025 
4026 	if (type == 0 || !sched_feat(LB_BIAS))
4027 		return total;
4028 
4029 	return min(rq->cpu_load[type-1], total);
4030 }
4031 
4032 /*
4033  * Return a high guess at the load of a migration-target cpu weighted
4034  * according to the scheduling class and "nice" value.
4035  */
4036 static unsigned long target_load(int cpu, int type)
4037 {
4038 	struct rq *rq = cpu_rq(cpu);
4039 	unsigned long total = weighted_cpuload(cpu);
4040 
4041 	if (type == 0 || !sched_feat(LB_BIAS))
4042 		return total;
4043 
4044 	return max(rq->cpu_load[type-1], total);
4045 }
4046 
4047 static unsigned long capacity_of(int cpu)
4048 {
4049 	return cpu_rq(cpu)->cpu_capacity;
4050 }
4051 
4052 static unsigned long cpu_avg_load_per_task(int cpu)
4053 {
4054 	struct rq *rq = cpu_rq(cpu);
4055 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
4056 	unsigned long load_avg = rq->cfs.runnable_load_avg;
4057 
4058 	if (nr_running)
4059 		return load_avg / nr_running;
4060 
4061 	return 0;
4062 }
4063 
4064 static void record_wakee(struct task_struct *p)
4065 {
4066 	/*
4067 	 * Rough decay (wiping) for cost saving, don't worry
4068 	 * about the boundary, really active task won't care
4069 	 * about the loss.
4070 	 */
4071 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4072 		current->wakee_flips >>= 1;
4073 		current->wakee_flip_decay_ts = jiffies;
4074 	}
4075 
4076 	if (current->last_wakee != p) {
4077 		current->last_wakee = p;
4078 		current->wakee_flips++;
4079 	}
4080 }
4081 
4082 static void task_waking_fair(struct task_struct *p)
4083 {
4084 	struct sched_entity *se = &p->se;
4085 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4086 	u64 min_vruntime;
4087 
4088 #ifndef CONFIG_64BIT
4089 	u64 min_vruntime_copy;
4090 
4091 	do {
4092 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
4093 		smp_rmb();
4094 		min_vruntime = cfs_rq->min_vruntime;
4095 	} while (min_vruntime != min_vruntime_copy);
4096 #else
4097 	min_vruntime = cfs_rq->min_vruntime;
4098 #endif
4099 
4100 	se->vruntime -= min_vruntime;
4101 	record_wakee(p);
4102 }
4103 
4104 #ifdef CONFIG_FAIR_GROUP_SCHED
4105 /*
4106  * effective_load() calculates the load change as seen from the root_task_group
4107  *
4108  * Adding load to a group doesn't make a group heavier, but can cause movement
4109  * of group shares between cpus. Assuming the shares were perfectly aligned one
4110  * can calculate the shift in shares.
4111  *
4112  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4113  * on this @cpu and results in a total addition (subtraction) of @wg to the
4114  * total group weight.
4115  *
4116  * Given a runqueue weight distribution (rw_i) we can compute a shares
4117  * distribution (s_i) using:
4118  *
4119  *   s_i = rw_i / \Sum rw_j						(1)
4120  *
4121  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4122  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4123  * shares distribution (s_i):
4124  *
4125  *   rw_i = {   2,   4,   1,   0 }
4126  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4127  *
4128  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4129  * task used to run on and the CPU the waker is running on), we need to
4130  * compute the effect of waking a task on either CPU and, in case of a sync
4131  * wakeup, compute the effect of the current task going to sleep.
4132  *
4133  * So for a change of @wl to the local @cpu with an overall group weight change
4134  * of @wl we can compute the new shares distribution (s'_i) using:
4135  *
4136  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4137  *
4138  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4139  * differences in waking a task to CPU 0. The additional task changes the
4140  * weight and shares distributions like:
4141  *
4142  *   rw'_i = {   3,   4,   1,   0 }
4143  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4144  *
4145  * We can then compute the difference in effective weight by using:
4146  *
4147  *   dw_i = S * (s'_i - s_i)						(3)
4148  *
4149  * Where 'S' is the group weight as seen by its parent.
4150  *
4151  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4152  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4153  * 4/7) times the weight of the group.
4154  */
4155 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4156 {
4157 	struct sched_entity *se = tg->se[cpu];
4158 
4159 	if (!tg->parent)	/* the trivial, non-cgroup case */
4160 		return wl;
4161 
4162 	for_each_sched_entity(se) {
4163 		long w, W;
4164 
4165 		tg = se->my_q->tg;
4166 
4167 		/*
4168 		 * W = @wg + \Sum rw_j
4169 		 */
4170 		W = wg + calc_tg_weight(tg, se->my_q);
4171 
4172 		/*
4173 		 * w = rw_i + @wl
4174 		 */
4175 		w = se->my_q->load.weight + wl;
4176 
4177 		/*
4178 		 * wl = S * s'_i; see (2)
4179 		 */
4180 		if (W > 0 && w < W)
4181 			wl = (w * tg->shares) / W;
4182 		else
4183 			wl = tg->shares;
4184 
4185 		/*
4186 		 * Per the above, wl is the new se->load.weight value; since
4187 		 * those are clipped to [MIN_SHARES, ...) do so now. See
4188 		 * calc_cfs_shares().
4189 		 */
4190 		if (wl < MIN_SHARES)
4191 			wl = MIN_SHARES;
4192 
4193 		/*
4194 		 * wl = dw_i = S * (s'_i - s_i); see (3)
4195 		 */
4196 		wl -= se->load.weight;
4197 
4198 		/*
4199 		 * Recursively apply this logic to all parent groups to compute
4200 		 * the final effective load change on the root group. Since
4201 		 * only the @tg group gets extra weight, all parent groups can
4202 		 * only redistribute existing shares. @wl is the shift in shares
4203 		 * resulting from this level per the above.
4204 		 */
4205 		wg = 0;
4206 	}
4207 
4208 	return wl;
4209 }
4210 #else
4211 
4212 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4213 {
4214 	return wl;
4215 }
4216 
4217 #endif
4218 
4219 static int wake_wide(struct task_struct *p)
4220 {
4221 	int factor = this_cpu_read(sd_llc_size);
4222 
4223 	/*
4224 	 * Yeah, it's the switching-frequency, could means many wakee or
4225 	 * rapidly switch, use factor here will just help to automatically
4226 	 * adjust the loose-degree, so bigger node will lead to more pull.
4227 	 */
4228 	if (p->wakee_flips > factor) {
4229 		/*
4230 		 * wakee is somewhat hot, it needs certain amount of cpu
4231 		 * resource, so if waker is far more hot, prefer to leave
4232 		 * it alone.
4233 		 */
4234 		if (current->wakee_flips > (factor * p->wakee_flips))
4235 			return 1;
4236 	}
4237 
4238 	return 0;
4239 }
4240 
4241 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4242 {
4243 	s64 this_load, load;
4244 	int idx, this_cpu, prev_cpu;
4245 	unsigned long tl_per_task;
4246 	struct task_group *tg;
4247 	unsigned long weight;
4248 	int balanced;
4249 
4250 	/*
4251 	 * If we wake multiple tasks be careful to not bounce
4252 	 * ourselves around too much.
4253 	 */
4254 	if (wake_wide(p))
4255 		return 0;
4256 
4257 	idx	  = sd->wake_idx;
4258 	this_cpu  = smp_processor_id();
4259 	prev_cpu  = task_cpu(p);
4260 	load	  = source_load(prev_cpu, idx);
4261 	this_load = target_load(this_cpu, idx);
4262 
4263 	/*
4264 	 * If sync wakeup then subtract the (maximum possible)
4265 	 * effect of the currently running task from the load
4266 	 * of the current CPU:
4267 	 */
4268 	if (sync) {
4269 		tg = task_group(current);
4270 		weight = current->se.load.weight;
4271 
4272 		this_load += effective_load(tg, this_cpu, -weight, -weight);
4273 		load += effective_load(tg, prev_cpu, 0, -weight);
4274 	}
4275 
4276 	tg = task_group(p);
4277 	weight = p->se.load.weight;
4278 
4279 	/*
4280 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4281 	 * due to the sync cause above having dropped this_load to 0, we'll
4282 	 * always have an imbalance, but there's really nothing you can do
4283 	 * about that, so that's good too.
4284 	 *
4285 	 * Otherwise check if either cpus are near enough in load to allow this
4286 	 * task to be woken on this_cpu.
4287 	 */
4288 	if (this_load > 0) {
4289 		s64 this_eff_load, prev_eff_load;
4290 
4291 		this_eff_load = 100;
4292 		this_eff_load *= capacity_of(prev_cpu);
4293 		this_eff_load *= this_load +
4294 			effective_load(tg, this_cpu, weight, weight);
4295 
4296 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4297 		prev_eff_load *= capacity_of(this_cpu);
4298 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4299 
4300 		balanced = this_eff_load <= prev_eff_load;
4301 	} else
4302 		balanced = true;
4303 
4304 	/*
4305 	 * If the currently running task will sleep within
4306 	 * a reasonable amount of time then attract this newly
4307 	 * woken task:
4308 	 */
4309 	if (sync && balanced)
4310 		return 1;
4311 
4312 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4313 	tl_per_task = cpu_avg_load_per_task(this_cpu);
4314 
4315 	if (balanced ||
4316 	    (this_load <= load &&
4317 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4318 		/*
4319 		 * This domain has SD_WAKE_AFFINE and
4320 		 * p is cache cold in this domain, and
4321 		 * there is no bad imbalance.
4322 		 */
4323 		schedstat_inc(sd, ttwu_move_affine);
4324 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
4325 
4326 		return 1;
4327 	}
4328 	return 0;
4329 }
4330 
4331 /*
4332  * find_idlest_group finds and returns the least busy CPU group within the
4333  * domain.
4334  */
4335 static struct sched_group *
4336 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4337 		  int this_cpu, int sd_flag)
4338 {
4339 	struct sched_group *idlest = NULL, *group = sd->groups;
4340 	unsigned long min_load = ULONG_MAX, this_load = 0;
4341 	int load_idx = sd->forkexec_idx;
4342 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4343 
4344 	if (sd_flag & SD_BALANCE_WAKE)
4345 		load_idx = sd->wake_idx;
4346 
4347 	do {
4348 		unsigned long load, avg_load;
4349 		int local_group;
4350 		int i;
4351 
4352 		/* Skip over this group if it has no CPUs allowed */
4353 		if (!cpumask_intersects(sched_group_cpus(group),
4354 					tsk_cpus_allowed(p)))
4355 			continue;
4356 
4357 		local_group = cpumask_test_cpu(this_cpu,
4358 					       sched_group_cpus(group));
4359 
4360 		/* Tally up the load of all CPUs in the group */
4361 		avg_load = 0;
4362 
4363 		for_each_cpu(i, sched_group_cpus(group)) {
4364 			/* Bias balancing toward cpus of our domain */
4365 			if (local_group)
4366 				load = source_load(i, load_idx);
4367 			else
4368 				load = target_load(i, load_idx);
4369 
4370 			avg_load += load;
4371 		}
4372 
4373 		/* Adjust by relative CPU capacity of the group */
4374 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4375 
4376 		if (local_group) {
4377 			this_load = avg_load;
4378 		} else if (avg_load < min_load) {
4379 			min_load = avg_load;
4380 			idlest = group;
4381 		}
4382 	} while (group = group->next, group != sd->groups);
4383 
4384 	if (!idlest || 100*this_load < imbalance*min_load)
4385 		return NULL;
4386 	return idlest;
4387 }
4388 
4389 /*
4390  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4391  */
4392 static int
4393 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4394 {
4395 	unsigned long load, min_load = ULONG_MAX;
4396 	int idlest = -1;
4397 	int i;
4398 
4399 	/* Traverse only the allowed CPUs */
4400 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4401 		load = weighted_cpuload(i);
4402 
4403 		if (load < min_load || (load == min_load && i == this_cpu)) {
4404 			min_load = load;
4405 			idlest = i;
4406 		}
4407 	}
4408 
4409 	return idlest;
4410 }
4411 
4412 /*
4413  * Try and locate an idle CPU in the sched_domain.
4414  */
4415 static int select_idle_sibling(struct task_struct *p, int target)
4416 {
4417 	struct sched_domain *sd;
4418 	struct sched_group *sg;
4419 	int i = task_cpu(p);
4420 
4421 	if (idle_cpu(target))
4422 		return target;
4423 
4424 	/*
4425 	 * If the prevous cpu is cache affine and idle, don't be stupid.
4426 	 */
4427 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4428 		return i;
4429 
4430 	/*
4431 	 * Otherwise, iterate the domains and find an elegible idle cpu.
4432 	 */
4433 	sd = rcu_dereference(per_cpu(sd_llc, target));
4434 	for_each_lower_domain(sd) {
4435 		sg = sd->groups;
4436 		do {
4437 			if (!cpumask_intersects(sched_group_cpus(sg),
4438 						tsk_cpus_allowed(p)))
4439 				goto next;
4440 
4441 			for_each_cpu(i, sched_group_cpus(sg)) {
4442 				if (i == target || !idle_cpu(i))
4443 					goto next;
4444 			}
4445 
4446 			target = cpumask_first_and(sched_group_cpus(sg),
4447 					tsk_cpus_allowed(p));
4448 			goto done;
4449 next:
4450 			sg = sg->next;
4451 		} while (sg != sd->groups);
4452 	}
4453 done:
4454 	return target;
4455 }
4456 
4457 /*
4458  * select_task_rq_fair: Select target runqueue for the waking task in domains
4459  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4460  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4461  *
4462  * Balances load by selecting the idlest cpu in the idlest group, or under
4463  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4464  *
4465  * Returns the target cpu number.
4466  *
4467  * preempt must be disabled.
4468  */
4469 static int
4470 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4471 {
4472 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4473 	int cpu = smp_processor_id();
4474 	int new_cpu = cpu;
4475 	int want_affine = 0;
4476 	int sync = wake_flags & WF_SYNC;
4477 
4478 	if (p->nr_cpus_allowed == 1)
4479 		return prev_cpu;
4480 
4481 	if (sd_flag & SD_BALANCE_WAKE) {
4482 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4483 			want_affine = 1;
4484 		new_cpu = prev_cpu;
4485 	}
4486 
4487 	rcu_read_lock();
4488 	for_each_domain(cpu, tmp) {
4489 		if (!(tmp->flags & SD_LOAD_BALANCE))
4490 			continue;
4491 
4492 		/*
4493 		 * If both cpu and prev_cpu are part of this domain,
4494 		 * cpu is a valid SD_WAKE_AFFINE target.
4495 		 */
4496 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4497 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4498 			affine_sd = tmp;
4499 			break;
4500 		}
4501 
4502 		if (tmp->flags & sd_flag)
4503 			sd = tmp;
4504 	}
4505 
4506 	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4507 		prev_cpu = cpu;
4508 
4509 	if (sd_flag & SD_BALANCE_WAKE) {
4510 		new_cpu = select_idle_sibling(p, prev_cpu);
4511 		goto unlock;
4512 	}
4513 
4514 	while (sd) {
4515 		struct sched_group *group;
4516 		int weight;
4517 
4518 		if (!(sd->flags & sd_flag)) {
4519 			sd = sd->child;
4520 			continue;
4521 		}
4522 
4523 		group = find_idlest_group(sd, p, cpu, sd_flag);
4524 		if (!group) {
4525 			sd = sd->child;
4526 			continue;
4527 		}
4528 
4529 		new_cpu = find_idlest_cpu(group, p, cpu);
4530 		if (new_cpu == -1 || new_cpu == cpu) {
4531 			/* Now try balancing at a lower domain level of cpu */
4532 			sd = sd->child;
4533 			continue;
4534 		}
4535 
4536 		/* Now try balancing at a lower domain level of new_cpu */
4537 		cpu = new_cpu;
4538 		weight = sd->span_weight;
4539 		sd = NULL;
4540 		for_each_domain(cpu, tmp) {
4541 			if (weight <= tmp->span_weight)
4542 				break;
4543 			if (tmp->flags & sd_flag)
4544 				sd = tmp;
4545 		}
4546 		/* while loop will break here if sd == NULL */
4547 	}
4548 unlock:
4549 	rcu_read_unlock();
4550 
4551 	return new_cpu;
4552 }
4553 
4554 /*
4555  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4556  * cfs_rq_of(p) references at time of call are still valid and identify the
4557  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
4558  * other assumptions, including the state of rq->lock, should be made.
4559  */
4560 static void
4561 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4562 {
4563 	struct sched_entity *se = &p->se;
4564 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4565 
4566 	/*
4567 	 * Load tracking: accumulate removed load so that it can be processed
4568 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
4569 	 * to blocked load iff they have a positive decay-count.  It can never
4570 	 * be negative here since on-rq tasks have decay-count == 0.
4571 	 */
4572 	if (se->avg.decay_count) {
4573 		se->avg.decay_count = -__synchronize_entity_decay(se);
4574 		atomic_long_add(se->avg.load_avg_contrib,
4575 						&cfs_rq->removed_load);
4576 	}
4577 
4578 	/* We have migrated, no longer consider this task hot */
4579 	se->exec_start = 0;
4580 }
4581 #endif /* CONFIG_SMP */
4582 
4583 static unsigned long
4584 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4585 {
4586 	unsigned long gran = sysctl_sched_wakeup_granularity;
4587 
4588 	/*
4589 	 * Since its curr running now, convert the gran from real-time
4590 	 * to virtual-time in his units.
4591 	 *
4592 	 * By using 'se' instead of 'curr' we penalize light tasks, so
4593 	 * they get preempted easier. That is, if 'se' < 'curr' then
4594 	 * the resulting gran will be larger, therefore penalizing the
4595 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
4596 	 * be smaller, again penalizing the lighter task.
4597 	 *
4598 	 * This is especially important for buddies when the leftmost
4599 	 * task is higher priority than the buddy.
4600 	 */
4601 	return calc_delta_fair(gran, se);
4602 }
4603 
4604 /*
4605  * Should 'se' preempt 'curr'.
4606  *
4607  *             |s1
4608  *        |s2
4609  *   |s3
4610  *         g
4611  *      |<--->|c
4612  *
4613  *  w(c, s1) = -1
4614  *  w(c, s2) =  0
4615  *  w(c, s3) =  1
4616  *
4617  */
4618 static int
4619 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4620 {
4621 	s64 gran, vdiff = curr->vruntime - se->vruntime;
4622 
4623 	if (vdiff <= 0)
4624 		return -1;
4625 
4626 	gran = wakeup_gran(curr, se);
4627 	if (vdiff > gran)
4628 		return 1;
4629 
4630 	return 0;
4631 }
4632 
4633 static void set_last_buddy(struct sched_entity *se)
4634 {
4635 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4636 		return;
4637 
4638 	for_each_sched_entity(se)
4639 		cfs_rq_of(se)->last = se;
4640 }
4641 
4642 static void set_next_buddy(struct sched_entity *se)
4643 {
4644 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4645 		return;
4646 
4647 	for_each_sched_entity(se)
4648 		cfs_rq_of(se)->next = se;
4649 }
4650 
4651 static void set_skip_buddy(struct sched_entity *se)
4652 {
4653 	for_each_sched_entity(se)
4654 		cfs_rq_of(se)->skip = se;
4655 }
4656 
4657 /*
4658  * Preempt the current task with a newly woken task if needed:
4659  */
4660 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4661 {
4662 	struct task_struct *curr = rq->curr;
4663 	struct sched_entity *se = &curr->se, *pse = &p->se;
4664 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4665 	int scale = cfs_rq->nr_running >= sched_nr_latency;
4666 	int next_buddy_marked = 0;
4667 
4668 	if (unlikely(se == pse))
4669 		return;
4670 
4671 	/*
4672 	 * This is possible from callers such as move_task(), in which we
4673 	 * unconditionally check_prempt_curr() after an enqueue (which may have
4674 	 * lead to a throttle).  This both saves work and prevents false
4675 	 * next-buddy nomination below.
4676 	 */
4677 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4678 		return;
4679 
4680 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4681 		set_next_buddy(pse);
4682 		next_buddy_marked = 1;
4683 	}
4684 
4685 	/*
4686 	 * We can come here with TIF_NEED_RESCHED already set from new task
4687 	 * wake up path.
4688 	 *
4689 	 * Note: this also catches the edge-case of curr being in a throttled
4690 	 * group (e.g. via set_curr_task), since update_curr() (in the
4691 	 * enqueue of curr) will have resulted in resched being set.  This
4692 	 * prevents us from potentially nominating it as a false LAST_BUDDY
4693 	 * below.
4694 	 */
4695 	if (test_tsk_need_resched(curr))
4696 		return;
4697 
4698 	/* Idle tasks are by definition preempted by non-idle tasks. */
4699 	if (unlikely(curr->policy == SCHED_IDLE) &&
4700 	    likely(p->policy != SCHED_IDLE))
4701 		goto preempt;
4702 
4703 	/*
4704 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
4705 	 * is driven by the tick):
4706 	 */
4707 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4708 		return;
4709 
4710 	find_matching_se(&se, &pse);
4711 	update_curr(cfs_rq_of(se));
4712 	BUG_ON(!pse);
4713 	if (wakeup_preempt_entity(se, pse) == 1) {
4714 		/*
4715 		 * Bias pick_next to pick the sched entity that is
4716 		 * triggering this preemption.
4717 		 */
4718 		if (!next_buddy_marked)
4719 			set_next_buddy(pse);
4720 		goto preempt;
4721 	}
4722 
4723 	return;
4724 
4725 preempt:
4726 	resched_task(curr);
4727 	/*
4728 	 * Only set the backward buddy when the current task is still
4729 	 * on the rq. This can happen when a wakeup gets interleaved
4730 	 * with schedule on the ->pre_schedule() or idle_balance()
4731 	 * point, either of which can * drop the rq lock.
4732 	 *
4733 	 * Also, during early boot the idle thread is in the fair class,
4734 	 * for obvious reasons its a bad idea to schedule back to it.
4735 	 */
4736 	if (unlikely(!se->on_rq || curr == rq->idle))
4737 		return;
4738 
4739 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4740 		set_last_buddy(se);
4741 }
4742 
4743 static struct task_struct *
4744 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4745 {
4746 	struct cfs_rq *cfs_rq = &rq->cfs;
4747 	struct sched_entity *se;
4748 	struct task_struct *p;
4749 	int new_tasks;
4750 
4751 again:
4752 #ifdef CONFIG_FAIR_GROUP_SCHED
4753 	if (!cfs_rq->nr_running)
4754 		goto idle;
4755 
4756 	if (prev->sched_class != &fair_sched_class)
4757 		goto simple;
4758 
4759 	/*
4760 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4761 	 * likely that a next task is from the same cgroup as the current.
4762 	 *
4763 	 * Therefore attempt to avoid putting and setting the entire cgroup
4764 	 * hierarchy, only change the part that actually changes.
4765 	 */
4766 
4767 	do {
4768 		struct sched_entity *curr = cfs_rq->curr;
4769 
4770 		/*
4771 		 * Since we got here without doing put_prev_entity() we also
4772 		 * have to consider cfs_rq->curr. If it is still a runnable
4773 		 * entity, update_curr() will update its vruntime, otherwise
4774 		 * forget we've ever seen it.
4775 		 */
4776 		if (curr && curr->on_rq)
4777 			update_curr(cfs_rq);
4778 		else
4779 			curr = NULL;
4780 
4781 		/*
4782 		 * This call to check_cfs_rq_runtime() will do the throttle and
4783 		 * dequeue its entity in the parent(s). Therefore the 'simple'
4784 		 * nr_running test will indeed be correct.
4785 		 */
4786 		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4787 			goto simple;
4788 
4789 		se = pick_next_entity(cfs_rq, curr);
4790 		cfs_rq = group_cfs_rq(se);
4791 	} while (cfs_rq);
4792 
4793 	p = task_of(se);
4794 
4795 	/*
4796 	 * Since we haven't yet done put_prev_entity and if the selected task
4797 	 * is a different task than we started out with, try and touch the
4798 	 * least amount of cfs_rqs.
4799 	 */
4800 	if (prev != p) {
4801 		struct sched_entity *pse = &prev->se;
4802 
4803 		while (!(cfs_rq = is_same_group(se, pse))) {
4804 			int se_depth = se->depth;
4805 			int pse_depth = pse->depth;
4806 
4807 			if (se_depth <= pse_depth) {
4808 				put_prev_entity(cfs_rq_of(pse), pse);
4809 				pse = parent_entity(pse);
4810 			}
4811 			if (se_depth >= pse_depth) {
4812 				set_next_entity(cfs_rq_of(se), se);
4813 				se = parent_entity(se);
4814 			}
4815 		}
4816 
4817 		put_prev_entity(cfs_rq, pse);
4818 		set_next_entity(cfs_rq, se);
4819 	}
4820 
4821 	if (hrtick_enabled(rq))
4822 		hrtick_start_fair(rq, p);
4823 
4824 	return p;
4825 simple:
4826 	cfs_rq = &rq->cfs;
4827 #endif
4828 
4829 	if (!cfs_rq->nr_running)
4830 		goto idle;
4831 
4832 	put_prev_task(rq, prev);
4833 
4834 	do {
4835 		se = pick_next_entity(cfs_rq, NULL);
4836 		set_next_entity(cfs_rq, se);
4837 		cfs_rq = group_cfs_rq(se);
4838 	} while (cfs_rq);
4839 
4840 	p = task_of(se);
4841 
4842 	if (hrtick_enabled(rq))
4843 		hrtick_start_fair(rq, p);
4844 
4845 	return p;
4846 
4847 idle:
4848 	new_tasks = idle_balance(rq);
4849 	/*
4850 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4851 	 * possible for any higher priority task to appear. In that case we
4852 	 * must re-start the pick_next_entity() loop.
4853 	 */
4854 	if (new_tasks < 0)
4855 		return RETRY_TASK;
4856 
4857 	if (new_tasks > 0)
4858 		goto again;
4859 
4860 	return NULL;
4861 }
4862 
4863 /*
4864  * Account for a descheduled task:
4865  */
4866 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4867 {
4868 	struct sched_entity *se = &prev->se;
4869 	struct cfs_rq *cfs_rq;
4870 
4871 	for_each_sched_entity(se) {
4872 		cfs_rq = cfs_rq_of(se);
4873 		put_prev_entity(cfs_rq, se);
4874 	}
4875 }
4876 
4877 /*
4878  * sched_yield() is very simple
4879  *
4880  * The magic of dealing with the ->skip buddy is in pick_next_entity.
4881  */
4882 static void yield_task_fair(struct rq *rq)
4883 {
4884 	struct task_struct *curr = rq->curr;
4885 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4886 	struct sched_entity *se = &curr->se;
4887 
4888 	/*
4889 	 * Are we the only task in the tree?
4890 	 */
4891 	if (unlikely(rq->nr_running == 1))
4892 		return;
4893 
4894 	clear_buddies(cfs_rq, se);
4895 
4896 	if (curr->policy != SCHED_BATCH) {
4897 		update_rq_clock(rq);
4898 		/*
4899 		 * Update run-time statistics of the 'current'.
4900 		 */
4901 		update_curr(cfs_rq);
4902 		/*
4903 		 * Tell update_rq_clock() that we've just updated,
4904 		 * so we don't do microscopic update in schedule()
4905 		 * and double the fastpath cost.
4906 		 */
4907 		 rq->skip_clock_update = 1;
4908 	}
4909 
4910 	set_skip_buddy(se);
4911 }
4912 
4913 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4914 {
4915 	struct sched_entity *se = &p->se;
4916 
4917 	/* throttled hierarchies are not runnable */
4918 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4919 		return false;
4920 
4921 	/* Tell the scheduler that we'd really like pse to run next. */
4922 	set_next_buddy(se);
4923 
4924 	yield_task_fair(rq);
4925 
4926 	return true;
4927 }
4928 
4929 #ifdef CONFIG_SMP
4930 /**************************************************
4931  * Fair scheduling class load-balancing methods.
4932  *
4933  * BASICS
4934  *
4935  * The purpose of load-balancing is to achieve the same basic fairness the
4936  * per-cpu scheduler provides, namely provide a proportional amount of compute
4937  * time to each task. This is expressed in the following equation:
4938  *
4939  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
4940  *
4941  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
4942  * W_i,0 is defined as:
4943  *
4944  *   W_i,0 = \Sum_j w_i,j                                             (2)
4945  *
4946  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
4947  * is derived from the nice value as per prio_to_weight[].
4948  *
4949  * The weight average is an exponential decay average of the instantaneous
4950  * weight:
4951  *
4952  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
4953  *
4954  * C_i is the compute capacity of cpu i, typically it is the
4955  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4956  * can also include other factors [XXX].
4957  *
4958  * To achieve this balance we define a measure of imbalance which follows
4959  * directly from (1):
4960  *
4961  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
4962  *
4963  * We them move tasks around to minimize the imbalance. In the continuous
4964  * function space it is obvious this converges, in the discrete case we get
4965  * a few fun cases generally called infeasible weight scenarios.
4966  *
4967  * [XXX expand on:
4968  *     - infeasible weights;
4969  *     - local vs global optima in the discrete case. ]
4970  *
4971  *
4972  * SCHED DOMAINS
4973  *
4974  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
4975  * for all i,j solution, we create a tree of cpus that follows the hardware
4976  * topology where each level pairs two lower groups (or better). This results
4977  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
4978  * tree to only the first of the previous level and we decrease the frequency
4979  * of load-balance at each level inv. proportional to the number of cpus in
4980  * the groups.
4981  *
4982  * This yields:
4983  *
4984  *     log_2 n     1     n
4985  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
4986  *     i = 0      2^i   2^i
4987  *                               `- size of each group
4988  *         |         |     `- number of cpus doing load-balance
4989  *         |         `- freq
4990  *         `- sum over all levels
4991  *
4992  * Coupled with a limit on how many tasks we can migrate every balance pass,
4993  * this makes (5) the runtime complexity of the balancer.
4994  *
4995  * An important property here is that each CPU is still (indirectly) connected
4996  * to every other cpu in at most O(log n) steps:
4997  *
4998  * The adjacency matrix of the resulting graph is given by:
4999  *
5000  *             log_2 n
5001  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5002  *             k = 0
5003  *
5004  * And you'll find that:
5005  *
5006  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5007  *
5008  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5009  * The task movement gives a factor of O(m), giving a convergence complexity
5010  * of:
5011  *
5012  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5013  *
5014  *
5015  * WORK CONSERVING
5016  *
5017  * In order to avoid CPUs going idle while there's still work to do, new idle
5018  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5019  * tree itself instead of relying on other CPUs to bring it work.
5020  *
5021  * This adds some complexity to both (5) and (8) but it reduces the total idle
5022  * time.
5023  *
5024  * [XXX more?]
5025  *
5026  *
5027  * CGROUPS
5028  *
5029  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5030  *
5031  *                                s_k,i
5032  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5033  *                                 S_k
5034  *
5035  * Where
5036  *
5037  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5038  *
5039  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5040  *
5041  * The big problem is S_k, its a global sum needed to compute a local (W_i)
5042  * property.
5043  *
5044  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5045  *      rewrite all of this once again.]
5046  */
5047 
5048 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5049 
5050 enum fbq_type { regular, remote, all };
5051 
5052 #define LBF_ALL_PINNED	0x01
5053 #define LBF_NEED_BREAK	0x02
5054 #define LBF_DST_PINNED  0x04
5055 #define LBF_SOME_PINNED	0x08
5056 
5057 struct lb_env {
5058 	struct sched_domain	*sd;
5059 
5060 	struct rq		*src_rq;
5061 	int			src_cpu;
5062 
5063 	int			dst_cpu;
5064 	struct rq		*dst_rq;
5065 
5066 	struct cpumask		*dst_grpmask;
5067 	int			new_dst_cpu;
5068 	enum cpu_idle_type	idle;
5069 	long			imbalance;
5070 	/* The set of CPUs under consideration for load-balancing */
5071 	struct cpumask		*cpus;
5072 
5073 	unsigned int		flags;
5074 
5075 	unsigned int		loop;
5076 	unsigned int		loop_break;
5077 	unsigned int		loop_max;
5078 
5079 	enum fbq_type		fbq_type;
5080 };
5081 
5082 /*
5083  * move_task - move a task from one runqueue to another runqueue.
5084  * Both runqueues must be locked.
5085  */
5086 static void move_task(struct task_struct *p, struct lb_env *env)
5087 {
5088 	deactivate_task(env->src_rq, p, 0);
5089 	set_task_cpu(p, env->dst_cpu);
5090 	activate_task(env->dst_rq, p, 0);
5091 	check_preempt_curr(env->dst_rq, p, 0);
5092 }
5093 
5094 /*
5095  * Is this task likely cache-hot:
5096  */
5097 static int
5098 task_hot(struct task_struct *p, u64 now)
5099 {
5100 	s64 delta;
5101 
5102 	if (p->sched_class != &fair_sched_class)
5103 		return 0;
5104 
5105 	if (unlikely(p->policy == SCHED_IDLE))
5106 		return 0;
5107 
5108 	/*
5109 	 * Buddy candidates are cache hot:
5110 	 */
5111 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
5112 			(&p->se == cfs_rq_of(&p->se)->next ||
5113 			 &p->se == cfs_rq_of(&p->se)->last))
5114 		return 1;
5115 
5116 	if (sysctl_sched_migration_cost == -1)
5117 		return 1;
5118 	if (sysctl_sched_migration_cost == 0)
5119 		return 0;
5120 
5121 	delta = now - p->se.exec_start;
5122 
5123 	return delta < (s64)sysctl_sched_migration_cost;
5124 }
5125 
5126 #ifdef CONFIG_NUMA_BALANCING
5127 /* Returns true if the destination node has incurred more faults */
5128 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5129 {
5130 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5131 	int src_nid, dst_nid;
5132 
5133 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
5134 	    !(env->sd->flags & SD_NUMA)) {
5135 		return false;
5136 	}
5137 
5138 	src_nid = cpu_to_node(env->src_cpu);
5139 	dst_nid = cpu_to_node(env->dst_cpu);
5140 
5141 	if (src_nid == dst_nid)
5142 		return false;
5143 
5144 	if (numa_group) {
5145 		/* Task is already in the group's interleave set. */
5146 		if (node_isset(src_nid, numa_group->active_nodes))
5147 			return false;
5148 
5149 		/* Task is moving into the group's interleave set. */
5150 		if (node_isset(dst_nid, numa_group->active_nodes))
5151 			return true;
5152 
5153 		return group_faults(p, dst_nid) > group_faults(p, src_nid);
5154 	}
5155 
5156 	/* Encourage migration to the preferred node. */
5157 	if (dst_nid == p->numa_preferred_nid)
5158 		return true;
5159 
5160 	return task_faults(p, dst_nid) > task_faults(p, src_nid);
5161 }
5162 
5163 
5164 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5165 {
5166 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5167 	int src_nid, dst_nid;
5168 
5169 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5170 		return false;
5171 
5172 	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
5173 		return false;
5174 
5175 	src_nid = cpu_to_node(env->src_cpu);
5176 	dst_nid = cpu_to_node(env->dst_cpu);
5177 
5178 	if (src_nid == dst_nid)
5179 		return false;
5180 
5181 	if (numa_group) {
5182 		/* Task is moving within/into the group's interleave set. */
5183 		if (node_isset(dst_nid, numa_group->active_nodes))
5184 			return false;
5185 
5186 		/* Task is moving out of the group's interleave set. */
5187 		if (node_isset(src_nid, numa_group->active_nodes))
5188 			return true;
5189 
5190 		return group_faults(p, dst_nid) < group_faults(p, src_nid);
5191 	}
5192 
5193 	/* Migrating away from the preferred node is always bad. */
5194 	if (src_nid == p->numa_preferred_nid)
5195 		return true;
5196 
5197 	return task_faults(p, dst_nid) < task_faults(p, src_nid);
5198 }
5199 
5200 #else
5201 static inline bool migrate_improves_locality(struct task_struct *p,
5202 					     struct lb_env *env)
5203 {
5204 	return false;
5205 }
5206 
5207 static inline bool migrate_degrades_locality(struct task_struct *p,
5208 					     struct lb_env *env)
5209 {
5210 	return false;
5211 }
5212 #endif
5213 
5214 /*
5215  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5216  */
5217 static
5218 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5219 {
5220 	int tsk_cache_hot = 0;
5221 	/*
5222 	 * We do not migrate tasks that are:
5223 	 * 1) throttled_lb_pair, or
5224 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5225 	 * 3) running (obviously), or
5226 	 * 4) are cache-hot on their current CPU.
5227 	 */
5228 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5229 		return 0;
5230 
5231 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5232 		int cpu;
5233 
5234 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5235 
5236 		env->flags |= LBF_SOME_PINNED;
5237 
5238 		/*
5239 		 * Remember if this task can be migrated to any other cpu in
5240 		 * our sched_group. We may want to revisit it if we couldn't
5241 		 * meet load balance goals by pulling other tasks on src_cpu.
5242 		 *
5243 		 * Also avoid computing new_dst_cpu if we have already computed
5244 		 * one in current iteration.
5245 		 */
5246 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5247 			return 0;
5248 
5249 		/* Prevent to re-select dst_cpu via env's cpus */
5250 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5251 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5252 				env->flags |= LBF_DST_PINNED;
5253 				env->new_dst_cpu = cpu;
5254 				break;
5255 			}
5256 		}
5257 
5258 		return 0;
5259 	}
5260 
5261 	/* Record that we found atleast one task that could run on dst_cpu */
5262 	env->flags &= ~LBF_ALL_PINNED;
5263 
5264 	if (task_running(env->src_rq, p)) {
5265 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5266 		return 0;
5267 	}
5268 
5269 	/*
5270 	 * Aggressive migration if:
5271 	 * 1) destination numa is preferred
5272 	 * 2) task is cache cold, or
5273 	 * 3) too many balance attempts have failed.
5274 	 */
5275 	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
5276 	if (!tsk_cache_hot)
5277 		tsk_cache_hot = migrate_degrades_locality(p, env);
5278 
5279 	if (migrate_improves_locality(p, env)) {
5280 #ifdef CONFIG_SCHEDSTATS
5281 		if (tsk_cache_hot) {
5282 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5283 			schedstat_inc(p, se.statistics.nr_forced_migrations);
5284 		}
5285 #endif
5286 		return 1;
5287 	}
5288 
5289 	if (!tsk_cache_hot ||
5290 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5291 
5292 		if (tsk_cache_hot) {
5293 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5294 			schedstat_inc(p, se.statistics.nr_forced_migrations);
5295 		}
5296 
5297 		return 1;
5298 	}
5299 
5300 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5301 	return 0;
5302 }
5303 
5304 /*
5305  * move_one_task tries to move exactly one task from busiest to this_rq, as
5306  * part of active balancing operations within "domain".
5307  * Returns 1 if successful and 0 otherwise.
5308  *
5309  * Called with both runqueues locked.
5310  */
5311 static int move_one_task(struct lb_env *env)
5312 {
5313 	struct task_struct *p, *n;
5314 
5315 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5316 		if (!can_migrate_task(p, env))
5317 			continue;
5318 
5319 		move_task(p, env);
5320 		/*
5321 		 * Right now, this is only the second place move_task()
5322 		 * is called, so we can safely collect move_task()
5323 		 * stats here rather than inside move_task().
5324 		 */
5325 		schedstat_inc(env->sd, lb_gained[env->idle]);
5326 		return 1;
5327 	}
5328 	return 0;
5329 }
5330 
5331 static const unsigned int sched_nr_migrate_break = 32;
5332 
5333 /*
5334  * move_tasks tries to move up to imbalance weighted load from busiest to
5335  * this_rq, as part of a balancing operation within domain "sd".
5336  * Returns 1 if successful and 0 otherwise.
5337  *
5338  * Called with both runqueues locked.
5339  */
5340 static int move_tasks(struct lb_env *env)
5341 {
5342 	struct list_head *tasks = &env->src_rq->cfs_tasks;
5343 	struct task_struct *p;
5344 	unsigned long load;
5345 	int pulled = 0;
5346 
5347 	if (env->imbalance <= 0)
5348 		return 0;
5349 
5350 	while (!list_empty(tasks)) {
5351 		p = list_first_entry(tasks, struct task_struct, se.group_node);
5352 
5353 		env->loop++;
5354 		/* We've more or less seen every task there is, call it quits */
5355 		if (env->loop > env->loop_max)
5356 			break;
5357 
5358 		/* take a breather every nr_migrate tasks */
5359 		if (env->loop > env->loop_break) {
5360 			env->loop_break += sched_nr_migrate_break;
5361 			env->flags |= LBF_NEED_BREAK;
5362 			break;
5363 		}
5364 
5365 		if (!can_migrate_task(p, env))
5366 			goto next;
5367 
5368 		load = task_h_load(p);
5369 
5370 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5371 			goto next;
5372 
5373 		if ((load / 2) > env->imbalance)
5374 			goto next;
5375 
5376 		move_task(p, env);
5377 		pulled++;
5378 		env->imbalance -= load;
5379 
5380 #ifdef CONFIG_PREEMPT
5381 		/*
5382 		 * NEWIDLE balancing is a source of latency, so preemptible
5383 		 * kernels will stop after the first task is pulled to minimize
5384 		 * the critical section.
5385 		 */
5386 		if (env->idle == CPU_NEWLY_IDLE)
5387 			break;
5388 #endif
5389 
5390 		/*
5391 		 * We only want to steal up to the prescribed amount of
5392 		 * weighted load.
5393 		 */
5394 		if (env->imbalance <= 0)
5395 			break;
5396 
5397 		continue;
5398 next:
5399 		list_move_tail(&p->se.group_node, tasks);
5400 	}
5401 
5402 	/*
5403 	 * Right now, this is one of only two places move_task() is called,
5404 	 * so we can safely collect move_task() stats here rather than
5405 	 * inside move_task().
5406 	 */
5407 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
5408 
5409 	return pulled;
5410 }
5411 
5412 #ifdef CONFIG_FAIR_GROUP_SCHED
5413 /*
5414  * update tg->load_weight by folding this cpu's load_avg
5415  */
5416 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
5417 {
5418 	struct sched_entity *se = tg->se[cpu];
5419 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
5420 
5421 	/* throttled entities do not contribute to load */
5422 	if (throttled_hierarchy(cfs_rq))
5423 		return;
5424 
5425 	update_cfs_rq_blocked_load(cfs_rq, 1);
5426 
5427 	if (se) {
5428 		update_entity_load_avg(se, 1);
5429 		/*
5430 		 * We pivot on our runnable average having decayed to zero for
5431 		 * list removal.  This generally implies that all our children
5432 		 * have also been removed (modulo rounding error or bandwidth
5433 		 * control); however, such cases are rare and we can fix these
5434 		 * at enqueue.
5435 		 *
5436 		 * TODO: fix up out-of-order children on enqueue.
5437 		 */
5438 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
5439 			list_del_leaf_cfs_rq(cfs_rq);
5440 	} else {
5441 		struct rq *rq = rq_of(cfs_rq);
5442 		update_rq_runnable_avg(rq, rq->nr_running);
5443 	}
5444 }
5445 
5446 static void update_blocked_averages(int cpu)
5447 {
5448 	struct rq *rq = cpu_rq(cpu);
5449 	struct cfs_rq *cfs_rq;
5450 	unsigned long flags;
5451 
5452 	raw_spin_lock_irqsave(&rq->lock, flags);
5453 	update_rq_clock(rq);
5454 	/*
5455 	 * Iterates the task_group tree in a bottom up fashion, see
5456 	 * list_add_leaf_cfs_rq() for details.
5457 	 */
5458 	for_each_leaf_cfs_rq(rq, cfs_rq) {
5459 		/*
5460 		 * Note: We may want to consider periodically releasing
5461 		 * rq->lock about these updates so that creating many task
5462 		 * groups does not result in continually extending hold time.
5463 		 */
5464 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5465 	}
5466 
5467 	raw_spin_unlock_irqrestore(&rq->lock, flags);
5468 }
5469 
5470 /*
5471  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
5472  * This needs to be done in a top-down fashion because the load of a child
5473  * group is a fraction of its parents load.
5474  */
5475 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5476 {
5477 	struct rq *rq = rq_of(cfs_rq);
5478 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5479 	unsigned long now = jiffies;
5480 	unsigned long load;
5481 
5482 	if (cfs_rq->last_h_load_update == now)
5483 		return;
5484 
5485 	cfs_rq->h_load_next = NULL;
5486 	for_each_sched_entity(se) {
5487 		cfs_rq = cfs_rq_of(se);
5488 		cfs_rq->h_load_next = se;
5489 		if (cfs_rq->last_h_load_update == now)
5490 			break;
5491 	}
5492 
5493 	if (!se) {
5494 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
5495 		cfs_rq->last_h_load_update = now;
5496 	}
5497 
5498 	while ((se = cfs_rq->h_load_next) != NULL) {
5499 		load = cfs_rq->h_load;
5500 		load = div64_ul(load * se->avg.load_avg_contrib,
5501 				cfs_rq->runnable_load_avg + 1);
5502 		cfs_rq = group_cfs_rq(se);
5503 		cfs_rq->h_load = load;
5504 		cfs_rq->last_h_load_update = now;
5505 	}
5506 }
5507 
5508 static unsigned long task_h_load(struct task_struct *p)
5509 {
5510 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
5511 
5512 	update_cfs_rq_h_load(cfs_rq);
5513 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
5514 			cfs_rq->runnable_load_avg + 1);
5515 }
5516 #else
5517 static inline void update_blocked_averages(int cpu)
5518 {
5519 }
5520 
5521 static unsigned long task_h_load(struct task_struct *p)
5522 {
5523 	return p->se.avg.load_avg_contrib;
5524 }
5525 #endif
5526 
5527 /********** Helpers for find_busiest_group ************************/
5528 /*
5529  * sg_lb_stats - stats of a sched_group required for load_balancing
5530  */
5531 struct sg_lb_stats {
5532 	unsigned long avg_load; /*Avg load across the CPUs of the group */
5533 	unsigned long group_load; /* Total load over the CPUs of the group */
5534 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5535 	unsigned long load_per_task;
5536 	unsigned long group_capacity;
5537 	unsigned int sum_nr_running; /* Nr tasks running in the group */
5538 	unsigned int group_capacity_factor;
5539 	unsigned int idle_cpus;
5540 	unsigned int group_weight;
5541 	int group_imb; /* Is there an imbalance in the group ? */
5542 	int group_has_free_capacity;
5543 #ifdef CONFIG_NUMA_BALANCING
5544 	unsigned int nr_numa_running;
5545 	unsigned int nr_preferred_running;
5546 #endif
5547 };
5548 
5549 /*
5550  * sd_lb_stats - Structure to store the statistics of a sched_domain
5551  *		 during load balancing.
5552  */
5553 struct sd_lb_stats {
5554 	struct sched_group *busiest;	/* Busiest group in this sd */
5555 	struct sched_group *local;	/* Local group in this sd */
5556 	unsigned long total_load;	/* Total load of all groups in sd */
5557 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
5558 	unsigned long avg_load;	/* Average load across all groups in sd */
5559 
5560 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5561 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
5562 };
5563 
5564 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5565 {
5566 	/*
5567 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
5568 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
5569 	 * We must however clear busiest_stat::avg_load because
5570 	 * update_sd_pick_busiest() reads this before assignment.
5571 	 */
5572 	*sds = (struct sd_lb_stats){
5573 		.busiest = NULL,
5574 		.local = NULL,
5575 		.total_load = 0UL,
5576 		.total_capacity = 0UL,
5577 		.busiest_stat = {
5578 			.avg_load = 0UL,
5579 		},
5580 	};
5581 }
5582 
5583 /**
5584  * get_sd_load_idx - Obtain the load index for a given sched domain.
5585  * @sd: The sched_domain whose load_idx is to be obtained.
5586  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
5587  *
5588  * Return: The load index.
5589  */
5590 static inline int get_sd_load_idx(struct sched_domain *sd,
5591 					enum cpu_idle_type idle)
5592 {
5593 	int load_idx;
5594 
5595 	switch (idle) {
5596 	case CPU_NOT_IDLE:
5597 		load_idx = sd->busy_idx;
5598 		break;
5599 
5600 	case CPU_NEWLY_IDLE:
5601 		load_idx = sd->newidle_idx;
5602 		break;
5603 	default:
5604 		load_idx = sd->idle_idx;
5605 		break;
5606 	}
5607 
5608 	return load_idx;
5609 }
5610 
5611 static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5612 {
5613 	return SCHED_CAPACITY_SCALE;
5614 }
5615 
5616 unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5617 {
5618 	return default_scale_capacity(sd, cpu);
5619 }
5620 
5621 static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
5622 {
5623 	unsigned long weight = sd->span_weight;
5624 	unsigned long smt_gain = sd->smt_gain;
5625 
5626 	smt_gain /= weight;
5627 
5628 	return smt_gain;
5629 }
5630 
5631 unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
5632 {
5633 	return default_scale_smt_capacity(sd, cpu);
5634 }
5635 
5636 static unsigned long scale_rt_capacity(int cpu)
5637 {
5638 	struct rq *rq = cpu_rq(cpu);
5639 	u64 total, available, age_stamp, avg;
5640 	s64 delta;
5641 
5642 	/*
5643 	 * Since we're reading these variables without serialization make sure
5644 	 * we read them once before doing sanity checks on them.
5645 	 */
5646 	age_stamp = ACCESS_ONCE(rq->age_stamp);
5647 	avg = ACCESS_ONCE(rq->rt_avg);
5648 
5649 	delta = rq_clock(rq) - age_stamp;
5650 	if (unlikely(delta < 0))
5651 		delta = 0;
5652 
5653 	total = sched_avg_period() + delta;
5654 
5655 	if (unlikely(total < avg)) {
5656 		/* Ensures that capacity won't end up being negative */
5657 		available = 0;
5658 	} else {
5659 		available = total - avg;
5660 	}
5661 
5662 	if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5663 		total = SCHED_CAPACITY_SCALE;
5664 
5665 	total >>= SCHED_CAPACITY_SHIFT;
5666 
5667 	return div_u64(available, total);
5668 }
5669 
5670 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5671 {
5672 	unsigned long weight = sd->span_weight;
5673 	unsigned long capacity = SCHED_CAPACITY_SCALE;
5674 	struct sched_group *sdg = sd->groups;
5675 
5676 	if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
5677 		if (sched_feat(ARCH_CAPACITY))
5678 			capacity *= arch_scale_smt_capacity(sd, cpu);
5679 		else
5680 			capacity *= default_scale_smt_capacity(sd, cpu);
5681 
5682 		capacity >>= SCHED_CAPACITY_SHIFT;
5683 	}
5684 
5685 	sdg->sgc->capacity_orig = capacity;
5686 
5687 	if (sched_feat(ARCH_CAPACITY))
5688 		capacity *= arch_scale_freq_capacity(sd, cpu);
5689 	else
5690 		capacity *= default_scale_capacity(sd, cpu);
5691 
5692 	capacity >>= SCHED_CAPACITY_SHIFT;
5693 
5694 	capacity *= scale_rt_capacity(cpu);
5695 	capacity >>= SCHED_CAPACITY_SHIFT;
5696 
5697 	if (!capacity)
5698 		capacity = 1;
5699 
5700 	cpu_rq(cpu)->cpu_capacity = capacity;
5701 	sdg->sgc->capacity = capacity;
5702 }
5703 
5704 void update_group_capacity(struct sched_domain *sd, int cpu)
5705 {
5706 	struct sched_domain *child = sd->child;
5707 	struct sched_group *group, *sdg = sd->groups;
5708 	unsigned long capacity, capacity_orig;
5709 	unsigned long interval;
5710 
5711 	interval = msecs_to_jiffies(sd->balance_interval);
5712 	interval = clamp(interval, 1UL, max_load_balance_interval);
5713 	sdg->sgc->next_update = jiffies + interval;
5714 
5715 	if (!child) {
5716 		update_cpu_capacity(sd, cpu);
5717 		return;
5718 	}
5719 
5720 	capacity_orig = capacity = 0;
5721 
5722 	if (child->flags & SD_OVERLAP) {
5723 		/*
5724 		 * SD_OVERLAP domains cannot assume that child groups
5725 		 * span the current group.
5726 		 */
5727 
5728 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
5729 			struct sched_group_capacity *sgc;
5730 			struct rq *rq = cpu_rq(cpu);
5731 
5732 			/*
5733 			 * build_sched_domains() -> init_sched_groups_capacity()
5734 			 * gets here before we've attached the domains to the
5735 			 * runqueues.
5736 			 *
5737 			 * Use capacity_of(), which is set irrespective of domains
5738 			 * in update_cpu_capacity().
5739 			 *
5740 			 * This avoids capacity/capacity_orig from being 0 and
5741 			 * causing divide-by-zero issues on boot.
5742 			 *
5743 			 * Runtime updates will correct capacity_orig.
5744 			 */
5745 			if (unlikely(!rq->sd)) {
5746 				capacity_orig += capacity_of(cpu);
5747 				capacity += capacity_of(cpu);
5748 				continue;
5749 			}
5750 
5751 			sgc = rq->sd->groups->sgc;
5752 			capacity_orig += sgc->capacity_orig;
5753 			capacity += sgc->capacity;
5754 		}
5755 	} else  {
5756 		/*
5757 		 * !SD_OVERLAP domains can assume that child groups
5758 		 * span the current group.
5759 		 */
5760 
5761 		group = child->groups;
5762 		do {
5763 			capacity_orig += group->sgc->capacity_orig;
5764 			capacity += group->sgc->capacity;
5765 			group = group->next;
5766 		} while (group != child->groups);
5767 	}
5768 
5769 	sdg->sgc->capacity_orig = capacity_orig;
5770 	sdg->sgc->capacity = capacity;
5771 }
5772 
5773 /*
5774  * Try and fix up capacity for tiny siblings, this is needed when
5775  * things like SD_ASYM_PACKING need f_b_g to select another sibling
5776  * which on its own isn't powerful enough.
5777  *
5778  * See update_sd_pick_busiest() and check_asym_packing().
5779  */
5780 static inline int
5781 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5782 {
5783 	/*
5784 	 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
5785 	 */
5786 	if (!(sd->flags & SD_SHARE_CPUCAPACITY))
5787 		return 0;
5788 
5789 	/*
5790 	 * If ~90% of the cpu_capacity is still there, we're good.
5791 	 */
5792 	if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
5793 		return 1;
5794 
5795 	return 0;
5796 }
5797 
5798 /*
5799  * Group imbalance indicates (and tries to solve) the problem where balancing
5800  * groups is inadequate due to tsk_cpus_allowed() constraints.
5801  *
5802  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
5803  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5804  * Something like:
5805  *
5806  * 	{ 0 1 2 3 } { 4 5 6 7 }
5807  * 	        *     * * *
5808  *
5809  * If we were to balance group-wise we'd place two tasks in the first group and
5810  * two tasks in the second group. Clearly this is undesired as it will overload
5811  * cpu 3 and leave one of the cpus in the second group unused.
5812  *
5813  * The current solution to this issue is detecting the skew in the first group
5814  * by noticing the lower domain failed to reach balance and had difficulty
5815  * moving tasks due to affinity constraints.
5816  *
5817  * When this is so detected; this group becomes a candidate for busiest; see
5818  * update_sd_pick_busiest(). And calculate_imbalance() and
5819  * find_busiest_group() avoid some of the usual balance conditions to allow it
5820  * to create an effective group imbalance.
5821  *
5822  * This is a somewhat tricky proposition since the next run might not find the
5823  * group imbalance and decide the groups need to be balanced again. A most
5824  * subtle and fragile situation.
5825  */
5826 
5827 static inline int sg_imbalanced(struct sched_group *group)
5828 {
5829 	return group->sgc->imbalance;
5830 }
5831 
5832 /*
5833  * Compute the group capacity factor.
5834  *
5835  * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
5836  * first dividing out the smt factor and computing the actual number of cores
5837  * and limit unit capacity with that.
5838  */
5839 static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
5840 {
5841 	unsigned int capacity_factor, smt, cpus;
5842 	unsigned int capacity, capacity_orig;
5843 
5844 	capacity = group->sgc->capacity;
5845 	capacity_orig = group->sgc->capacity_orig;
5846 	cpus = group->group_weight;
5847 
5848 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
5849 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
5850 	capacity_factor = cpus / smt; /* cores */
5851 
5852 	capacity_factor = min_t(unsigned,
5853 		capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
5854 	if (!capacity_factor)
5855 		capacity_factor = fix_small_capacity(env->sd, group);
5856 
5857 	return capacity_factor;
5858 }
5859 
5860 /**
5861  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5862  * @env: The load balancing environment.
5863  * @group: sched_group whose statistics are to be updated.
5864  * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865  * @local_group: Does group contain this_cpu.
5866  * @sgs: variable to hold the statistics for this group.
5867  */
5868 static inline void update_sg_lb_stats(struct lb_env *env,
5869 			struct sched_group *group, int load_idx,
5870 			int local_group, struct sg_lb_stats *sgs)
5871 {
5872 	unsigned long load;
5873 	int i;
5874 
5875 	memset(sgs, 0, sizeof(*sgs));
5876 
5877 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5878 		struct rq *rq = cpu_rq(i);
5879 
5880 		/* Bias balancing toward cpus of our domain */
5881 		if (local_group)
5882 			load = target_load(i, load_idx);
5883 		else
5884 			load = source_load(i, load_idx);
5885 
5886 		sgs->group_load += load;
5887 		sgs->sum_nr_running += rq->nr_running;
5888 #ifdef CONFIG_NUMA_BALANCING
5889 		sgs->nr_numa_running += rq->nr_numa_running;
5890 		sgs->nr_preferred_running += rq->nr_preferred_running;
5891 #endif
5892 		sgs->sum_weighted_load += weighted_cpuload(i);
5893 		if (idle_cpu(i))
5894 			sgs->idle_cpus++;
5895 	}
5896 
5897 	/* Adjust by relative CPU capacity of the group */
5898 	sgs->group_capacity = group->sgc->capacity;
5899 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
5900 
5901 	if (sgs->sum_nr_running)
5902 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5903 
5904 	sgs->group_weight = group->group_weight;
5905 
5906 	sgs->group_imb = sg_imbalanced(group);
5907 	sgs->group_capacity_factor = sg_capacity_factor(env, group);
5908 
5909 	if (sgs->group_capacity_factor > sgs->sum_nr_running)
5910 		sgs->group_has_free_capacity = 1;
5911 }
5912 
5913 /**
5914  * update_sd_pick_busiest - return 1 on busiest group
5915  * @env: The load balancing environment.
5916  * @sds: sched_domain statistics
5917  * @sg: sched_group candidate to be checked for being the busiest
5918  * @sgs: sched_group statistics
5919  *
5920  * Determine if @sg is a busier group than the previously selected
5921  * busiest group.
5922  *
5923  * Return: %true if @sg is a busier group than the previously selected
5924  * busiest group. %false otherwise.
5925  */
5926 static bool update_sd_pick_busiest(struct lb_env *env,
5927 				   struct sd_lb_stats *sds,
5928 				   struct sched_group *sg,
5929 				   struct sg_lb_stats *sgs)
5930 {
5931 	if (sgs->avg_load <= sds->busiest_stat.avg_load)
5932 		return false;
5933 
5934 	if (sgs->sum_nr_running > sgs->group_capacity_factor)
5935 		return true;
5936 
5937 	if (sgs->group_imb)
5938 		return true;
5939 
5940 	/*
5941 	 * ASYM_PACKING needs to move all the work to the lowest
5942 	 * numbered CPUs in the group, therefore mark all groups
5943 	 * higher than ourself as busy.
5944 	 */
5945 	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5946 	    env->dst_cpu < group_first_cpu(sg)) {
5947 		if (!sds->busiest)
5948 			return true;
5949 
5950 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5951 			return true;
5952 	}
5953 
5954 	return false;
5955 }
5956 
5957 #ifdef CONFIG_NUMA_BALANCING
5958 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5959 {
5960 	if (sgs->sum_nr_running > sgs->nr_numa_running)
5961 		return regular;
5962 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
5963 		return remote;
5964 	return all;
5965 }
5966 
5967 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5968 {
5969 	if (rq->nr_running > rq->nr_numa_running)
5970 		return regular;
5971 	if (rq->nr_running > rq->nr_preferred_running)
5972 		return remote;
5973 	return all;
5974 }
5975 #else
5976 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5977 {
5978 	return all;
5979 }
5980 
5981 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5982 {
5983 	return regular;
5984 }
5985 #endif /* CONFIG_NUMA_BALANCING */
5986 
5987 /**
5988  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5989  * @env: The load balancing environment.
5990  * @sds: variable to hold the statistics for this sched_domain.
5991  */
5992 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
5993 {
5994 	struct sched_domain *child = env->sd->child;
5995 	struct sched_group *sg = env->sd->groups;
5996 	struct sg_lb_stats tmp_sgs;
5997 	int load_idx, prefer_sibling = 0;
5998 
5999 	if (child && child->flags & SD_PREFER_SIBLING)
6000 		prefer_sibling = 1;
6001 
6002 	load_idx = get_sd_load_idx(env->sd, env->idle);
6003 
6004 	do {
6005 		struct sg_lb_stats *sgs = &tmp_sgs;
6006 		int local_group;
6007 
6008 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6009 		if (local_group) {
6010 			sds->local = sg;
6011 			sgs = &sds->local_stat;
6012 
6013 			if (env->idle != CPU_NEWLY_IDLE ||
6014 			    time_after_eq(jiffies, sg->sgc->next_update))
6015 				update_group_capacity(env->sd, env->dst_cpu);
6016 		}
6017 
6018 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
6019 
6020 		if (local_group)
6021 			goto next_group;
6022 
6023 		/*
6024 		 * In case the child domain prefers tasks go to siblings
6025 		 * first, lower the sg capacity factor to one so that we'll try
6026 		 * and move all the excess tasks away. We lower the capacity
6027 		 * of a group only if the local group has the capacity to fit
6028 		 * these excess tasks, i.e. nr_running < group_capacity_factor. The
6029 		 * extra check prevents the case where you always pull from the
6030 		 * heaviest group when it is already under-utilized (possible
6031 		 * with a large weight task outweighs the tasks on the system).
6032 		 */
6033 		if (prefer_sibling && sds->local &&
6034 		    sds->local_stat.group_has_free_capacity)
6035 			sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
6036 
6037 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6038 			sds->busiest = sg;
6039 			sds->busiest_stat = *sgs;
6040 		}
6041 
6042 next_group:
6043 		/* Now, start updating sd_lb_stats */
6044 		sds->total_load += sgs->group_load;
6045 		sds->total_capacity += sgs->group_capacity;
6046 
6047 		sg = sg->next;
6048 	} while (sg != env->sd->groups);
6049 
6050 	if (env->sd->flags & SD_NUMA)
6051 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6052 }
6053 
6054 /**
6055  * check_asym_packing - Check to see if the group is packed into the
6056  *			sched doman.
6057  *
6058  * This is primarily intended to used at the sibling level.  Some
6059  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6060  * case of POWER7, it can move to lower SMT modes only when higher
6061  * threads are idle.  When in lower SMT modes, the threads will
6062  * perform better since they share less core resources.  Hence when we
6063  * have idle threads, we want them to be the higher ones.
6064  *
6065  * This packing function is run on idle threads.  It checks to see if
6066  * the busiest CPU in this domain (core in the P7 case) has a higher
6067  * CPU number than the packing function is being run on.  Here we are
6068  * assuming lower CPU number will be equivalent to lower a SMT thread
6069  * number.
6070  *
6071  * Return: 1 when packing is required and a task should be moved to
6072  * this CPU.  The amount of the imbalance is returned in *imbalance.
6073  *
6074  * @env: The load balancing environment.
6075  * @sds: Statistics of the sched_domain which is to be packed
6076  */
6077 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6078 {
6079 	int busiest_cpu;
6080 
6081 	if (!(env->sd->flags & SD_ASYM_PACKING))
6082 		return 0;
6083 
6084 	if (!sds->busiest)
6085 		return 0;
6086 
6087 	busiest_cpu = group_first_cpu(sds->busiest);
6088 	if (env->dst_cpu > busiest_cpu)
6089 		return 0;
6090 
6091 	env->imbalance = DIV_ROUND_CLOSEST(
6092 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6093 		SCHED_CAPACITY_SCALE);
6094 
6095 	return 1;
6096 }
6097 
6098 /**
6099  * fix_small_imbalance - Calculate the minor imbalance that exists
6100  *			amongst the groups of a sched_domain, during
6101  *			load balancing.
6102  * @env: The load balancing environment.
6103  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6104  */
6105 static inline
6106 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6107 {
6108 	unsigned long tmp, capa_now = 0, capa_move = 0;
6109 	unsigned int imbn = 2;
6110 	unsigned long scaled_busy_load_per_task;
6111 	struct sg_lb_stats *local, *busiest;
6112 
6113 	local = &sds->local_stat;
6114 	busiest = &sds->busiest_stat;
6115 
6116 	if (!local->sum_nr_running)
6117 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6118 	else if (busiest->load_per_task > local->load_per_task)
6119 		imbn = 1;
6120 
6121 	scaled_busy_load_per_task =
6122 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6123 		busiest->group_capacity;
6124 
6125 	if (busiest->avg_load + scaled_busy_load_per_task >=
6126 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
6127 		env->imbalance = busiest->load_per_task;
6128 		return;
6129 	}
6130 
6131 	/*
6132 	 * OK, we don't have enough imbalance to justify moving tasks,
6133 	 * however we may be able to increase total CPU capacity used by
6134 	 * moving them.
6135 	 */
6136 
6137 	capa_now += busiest->group_capacity *
6138 			min(busiest->load_per_task, busiest->avg_load);
6139 	capa_now += local->group_capacity *
6140 			min(local->load_per_task, local->avg_load);
6141 	capa_now /= SCHED_CAPACITY_SCALE;
6142 
6143 	/* Amount of load we'd subtract */
6144 	if (busiest->avg_load > scaled_busy_load_per_task) {
6145 		capa_move += busiest->group_capacity *
6146 			    min(busiest->load_per_task,
6147 				busiest->avg_load - scaled_busy_load_per_task);
6148 	}
6149 
6150 	/* Amount of load we'd add */
6151 	if (busiest->avg_load * busiest->group_capacity <
6152 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6153 		tmp = (busiest->avg_load * busiest->group_capacity) /
6154 		      local->group_capacity;
6155 	} else {
6156 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6157 		      local->group_capacity;
6158 	}
6159 	capa_move += local->group_capacity *
6160 		    min(local->load_per_task, local->avg_load + tmp);
6161 	capa_move /= SCHED_CAPACITY_SCALE;
6162 
6163 	/* Move if we gain throughput */
6164 	if (capa_move > capa_now)
6165 		env->imbalance = busiest->load_per_task;
6166 }
6167 
6168 /**
6169  * calculate_imbalance - Calculate the amount of imbalance present within the
6170  *			 groups of a given sched_domain during load balance.
6171  * @env: load balance environment
6172  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6173  */
6174 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6175 {
6176 	unsigned long max_pull, load_above_capacity = ~0UL;
6177 	struct sg_lb_stats *local, *busiest;
6178 
6179 	local = &sds->local_stat;
6180 	busiest = &sds->busiest_stat;
6181 
6182 	if (busiest->group_imb) {
6183 		/*
6184 		 * In the group_imb case we cannot rely on group-wide averages
6185 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
6186 		 */
6187 		busiest->load_per_task =
6188 			min(busiest->load_per_task, sds->avg_load);
6189 	}
6190 
6191 	/*
6192 	 * In the presence of smp nice balancing, certain scenarios can have
6193 	 * max load less than avg load(as we skip the groups at or below
6194 	 * its cpu_capacity, while calculating max_load..)
6195 	 */
6196 	if (busiest->avg_load <= sds->avg_load ||
6197 	    local->avg_load >= sds->avg_load) {
6198 		env->imbalance = 0;
6199 		return fix_small_imbalance(env, sds);
6200 	}
6201 
6202 	if (!busiest->group_imb) {
6203 		/*
6204 		 * Don't want to pull so many tasks that a group would go idle.
6205 		 * Except of course for the group_imb case, since then we might
6206 		 * have to drop below capacity to reach cpu-load equilibrium.
6207 		 */
6208 		load_above_capacity =
6209 			(busiest->sum_nr_running - busiest->group_capacity_factor);
6210 
6211 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
6212 		load_above_capacity /= busiest->group_capacity;
6213 	}
6214 
6215 	/*
6216 	 * We're trying to get all the cpus to the average_load, so we don't
6217 	 * want to push ourselves above the average load, nor do we wish to
6218 	 * reduce the max loaded cpu below the average load. At the same time,
6219 	 * we also don't want to reduce the group load below the group capacity
6220 	 * (so that we can implement power-savings policies etc). Thus we look
6221 	 * for the minimum possible imbalance.
6222 	 */
6223 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6224 
6225 	/* How much load to actually move to equalise the imbalance */
6226 	env->imbalance = min(
6227 		max_pull * busiest->group_capacity,
6228 		(sds->avg_load - local->avg_load) * local->group_capacity
6229 	) / SCHED_CAPACITY_SCALE;
6230 
6231 	/*
6232 	 * if *imbalance is less than the average load per runnable task
6233 	 * there is no guarantee that any tasks will be moved so we'll have
6234 	 * a think about bumping its value to force at least one task to be
6235 	 * moved
6236 	 */
6237 	if (env->imbalance < busiest->load_per_task)
6238 		return fix_small_imbalance(env, sds);
6239 }
6240 
6241 /******* find_busiest_group() helpers end here *********************/
6242 
6243 /**
6244  * find_busiest_group - Returns the busiest group within the sched_domain
6245  * if there is an imbalance. If there isn't an imbalance, and
6246  * the user has opted for power-savings, it returns a group whose
6247  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6248  * such a group exists.
6249  *
6250  * Also calculates the amount of weighted load which should be moved
6251  * to restore balance.
6252  *
6253  * @env: The load balancing environment.
6254  *
6255  * Return:	- The busiest group if imbalance exists.
6256  *		- If no imbalance and user has opted for power-savings balance,
6257  *		   return the least loaded group whose CPUs can be
6258  *		   put to idle by rebalancing its tasks onto our group.
6259  */
6260 static struct sched_group *find_busiest_group(struct lb_env *env)
6261 {
6262 	struct sg_lb_stats *local, *busiest;
6263 	struct sd_lb_stats sds;
6264 
6265 	init_sd_lb_stats(&sds);
6266 
6267 	/*
6268 	 * Compute the various statistics relavent for load balancing at
6269 	 * this level.
6270 	 */
6271 	update_sd_lb_stats(env, &sds);
6272 	local = &sds.local_stat;
6273 	busiest = &sds.busiest_stat;
6274 
6275 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6276 	    check_asym_packing(env, &sds))
6277 		return sds.busiest;
6278 
6279 	/* There is no busy sibling group to pull tasks from */
6280 	if (!sds.busiest || busiest->sum_nr_running == 0)
6281 		goto out_balanced;
6282 
6283 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6284 						/ sds.total_capacity;
6285 
6286 	/*
6287 	 * If the busiest group is imbalanced the below checks don't
6288 	 * work because they assume all things are equal, which typically
6289 	 * isn't true due to cpus_allowed constraints and the like.
6290 	 */
6291 	if (busiest->group_imb)
6292 		goto force_balance;
6293 
6294 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6295 	if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
6296 	    !busiest->group_has_free_capacity)
6297 		goto force_balance;
6298 
6299 	/*
6300 	 * If the local group is more busy than the selected busiest group
6301 	 * don't try and pull any tasks.
6302 	 */
6303 	if (local->avg_load >= busiest->avg_load)
6304 		goto out_balanced;
6305 
6306 	/*
6307 	 * Don't pull any tasks if this group is already above the domain
6308 	 * average load.
6309 	 */
6310 	if (local->avg_load >= sds.avg_load)
6311 		goto out_balanced;
6312 
6313 	if (env->idle == CPU_IDLE) {
6314 		/*
6315 		 * This cpu is idle. If the busiest group load doesn't
6316 		 * have more tasks than the number of available cpu's and
6317 		 * there is no imbalance between this and busiest group
6318 		 * wrt to idle cpu's, it is balanced.
6319 		 */
6320 		if ((local->idle_cpus < busiest->idle_cpus) &&
6321 		    busiest->sum_nr_running <= busiest->group_weight)
6322 			goto out_balanced;
6323 	} else {
6324 		/*
6325 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6326 		 * imbalance_pct to be conservative.
6327 		 */
6328 		if (100 * busiest->avg_load <=
6329 				env->sd->imbalance_pct * local->avg_load)
6330 			goto out_balanced;
6331 	}
6332 
6333 force_balance:
6334 	/* Looks like there is an imbalance. Compute it */
6335 	calculate_imbalance(env, &sds);
6336 	return sds.busiest;
6337 
6338 out_balanced:
6339 	env->imbalance = 0;
6340 	return NULL;
6341 }
6342 
6343 /*
6344  * find_busiest_queue - find the busiest runqueue among the cpus in group.
6345  */
6346 static struct rq *find_busiest_queue(struct lb_env *env,
6347 				     struct sched_group *group)
6348 {
6349 	struct rq *busiest = NULL, *rq;
6350 	unsigned long busiest_load = 0, busiest_capacity = 1;
6351 	int i;
6352 
6353 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6354 		unsigned long capacity, capacity_factor, wl;
6355 		enum fbq_type rt;
6356 
6357 		rq = cpu_rq(i);
6358 		rt = fbq_classify_rq(rq);
6359 
6360 		/*
6361 		 * We classify groups/runqueues into three groups:
6362 		 *  - regular: there are !numa tasks
6363 		 *  - remote:  there are numa tasks that run on the 'wrong' node
6364 		 *  - all:     there is no distinction
6365 		 *
6366 		 * In order to avoid migrating ideally placed numa tasks,
6367 		 * ignore those when there's better options.
6368 		 *
6369 		 * If we ignore the actual busiest queue to migrate another
6370 		 * task, the next balance pass can still reduce the busiest
6371 		 * queue by moving tasks around inside the node.
6372 		 *
6373 		 * If we cannot move enough load due to this classification
6374 		 * the next pass will adjust the group classification and
6375 		 * allow migration of more tasks.
6376 		 *
6377 		 * Both cases only affect the total convergence complexity.
6378 		 */
6379 		if (rt > env->fbq_type)
6380 			continue;
6381 
6382 		capacity = capacity_of(i);
6383 		capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6384 		if (!capacity_factor)
6385 			capacity_factor = fix_small_capacity(env->sd, group);
6386 
6387 		wl = weighted_cpuload(i);
6388 
6389 		/*
6390 		 * When comparing with imbalance, use weighted_cpuload()
6391 		 * which is not scaled with the cpu capacity.
6392 		 */
6393 		if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
6394 			continue;
6395 
6396 		/*
6397 		 * For the load comparisons with the other cpu's, consider
6398 		 * the weighted_cpuload() scaled with the cpu capacity, so
6399 		 * that the load can be moved away from the cpu that is
6400 		 * potentially running at a lower capacity.
6401 		 *
6402 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
6403 		 * multiplication to rid ourselves of the division works out
6404 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
6405 		 * our previous maximum.
6406 		 */
6407 		if (wl * busiest_capacity > busiest_load * capacity) {
6408 			busiest_load = wl;
6409 			busiest_capacity = capacity;
6410 			busiest = rq;
6411 		}
6412 	}
6413 
6414 	return busiest;
6415 }
6416 
6417 /*
6418  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
6419  * so long as it is large enough.
6420  */
6421 #define MAX_PINNED_INTERVAL	512
6422 
6423 /* Working cpumask for load_balance and load_balance_newidle. */
6424 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6425 
6426 static int need_active_balance(struct lb_env *env)
6427 {
6428 	struct sched_domain *sd = env->sd;
6429 
6430 	if (env->idle == CPU_NEWLY_IDLE) {
6431 
6432 		/*
6433 		 * ASYM_PACKING needs to force migrate tasks from busy but
6434 		 * higher numbered CPUs in order to pack all tasks in the
6435 		 * lowest numbered CPUs.
6436 		 */
6437 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6438 			return 1;
6439 	}
6440 
6441 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6442 }
6443 
6444 static int active_load_balance_cpu_stop(void *data);
6445 
6446 static int should_we_balance(struct lb_env *env)
6447 {
6448 	struct sched_group *sg = env->sd->groups;
6449 	struct cpumask *sg_cpus, *sg_mask;
6450 	int cpu, balance_cpu = -1;
6451 
6452 	/*
6453 	 * In the newly idle case, we will allow all the cpu's
6454 	 * to do the newly idle load balance.
6455 	 */
6456 	if (env->idle == CPU_NEWLY_IDLE)
6457 		return 1;
6458 
6459 	sg_cpus = sched_group_cpus(sg);
6460 	sg_mask = sched_group_mask(sg);
6461 	/* Try to find first idle cpu */
6462 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6463 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6464 			continue;
6465 
6466 		balance_cpu = cpu;
6467 		break;
6468 	}
6469 
6470 	if (balance_cpu == -1)
6471 		balance_cpu = group_balance_cpu(sg);
6472 
6473 	/*
6474 	 * First idle cpu or the first cpu(busiest) in this sched group
6475 	 * is eligible for doing load balancing at this and above domains.
6476 	 */
6477 	return balance_cpu == env->dst_cpu;
6478 }
6479 
6480 /*
6481  * Check this_cpu to ensure it is balanced within domain. Attempt to move
6482  * tasks if there is an imbalance.
6483  */
6484 static int load_balance(int this_cpu, struct rq *this_rq,
6485 			struct sched_domain *sd, enum cpu_idle_type idle,
6486 			int *continue_balancing)
6487 {
6488 	int ld_moved, cur_ld_moved, active_balance = 0;
6489 	struct sched_domain *sd_parent = sd->parent;
6490 	struct sched_group *group;
6491 	struct rq *busiest;
6492 	unsigned long flags;
6493 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
6494 
6495 	struct lb_env env = {
6496 		.sd		= sd,
6497 		.dst_cpu	= this_cpu,
6498 		.dst_rq		= this_rq,
6499 		.dst_grpmask    = sched_group_cpus(sd->groups),
6500 		.idle		= idle,
6501 		.loop_break	= sched_nr_migrate_break,
6502 		.cpus		= cpus,
6503 		.fbq_type	= all,
6504 	};
6505 
6506 	/*
6507 	 * For NEWLY_IDLE load_balancing, we don't need to consider
6508 	 * other cpus in our group
6509 	 */
6510 	if (idle == CPU_NEWLY_IDLE)
6511 		env.dst_grpmask = NULL;
6512 
6513 	cpumask_copy(cpus, cpu_active_mask);
6514 
6515 	schedstat_inc(sd, lb_count[idle]);
6516 
6517 redo:
6518 	if (!should_we_balance(&env)) {
6519 		*continue_balancing = 0;
6520 		goto out_balanced;
6521 	}
6522 
6523 	group = find_busiest_group(&env);
6524 	if (!group) {
6525 		schedstat_inc(sd, lb_nobusyg[idle]);
6526 		goto out_balanced;
6527 	}
6528 
6529 	busiest = find_busiest_queue(&env, group);
6530 	if (!busiest) {
6531 		schedstat_inc(sd, lb_nobusyq[idle]);
6532 		goto out_balanced;
6533 	}
6534 
6535 	BUG_ON(busiest == env.dst_rq);
6536 
6537 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
6538 
6539 	ld_moved = 0;
6540 	if (busiest->nr_running > 1) {
6541 		/*
6542 		 * Attempt to move tasks. If find_busiest_group has found
6543 		 * an imbalance but busiest->nr_running <= 1, the group is
6544 		 * still unbalanced. ld_moved simply stays zero, so it is
6545 		 * correctly treated as an imbalance.
6546 		 */
6547 		env.flags |= LBF_ALL_PINNED;
6548 		env.src_cpu   = busiest->cpu;
6549 		env.src_rq    = busiest;
6550 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
6551 
6552 more_balance:
6553 		local_irq_save(flags);
6554 		double_rq_lock(env.dst_rq, busiest);
6555 
6556 		/*
6557 		 * cur_ld_moved - load moved in current iteration
6558 		 * ld_moved     - cumulative load moved across iterations
6559 		 */
6560 		cur_ld_moved = move_tasks(&env);
6561 		ld_moved += cur_ld_moved;
6562 		double_rq_unlock(env.dst_rq, busiest);
6563 		local_irq_restore(flags);
6564 
6565 		/*
6566 		 * some other cpu did the load balance for us.
6567 		 */
6568 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
6569 			resched_cpu(env.dst_cpu);
6570 
6571 		if (env.flags & LBF_NEED_BREAK) {
6572 			env.flags &= ~LBF_NEED_BREAK;
6573 			goto more_balance;
6574 		}
6575 
6576 		/*
6577 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
6578 		 * us and move them to an alternate dst_cpu in our sched_group
6579 		 * where they can run. The upper limit on how many times we
6580 		 * iterate on same src_cpu is dependent on number of cpus in our
6581 		 * sched_group.
6582 		 *
6583 		 * This changes load balance semantics a bit on who can move
6584 		 * load to a given_cpu. In addition to the given_cpu itself
6585 		 * (or a ilb_cpu acting on its behalf where given_cpu is
6586 		 * nohz-idle), we now have balance_cpu in a position to move
6587 		 * load to given_cpu. In rare situations, this may cause
6588 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
6589 		 * _independently_ and at _same_ time to move some load to
6590 		 * given_cpu) causing exceess load to be moved to given_cpu.
6591 		 * This however should not happen so much in practice and
6592 		 * moreover subsequent load balance cycles should correct the
6593 		 * excess load moved.
6594 		 */
6595 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6596 
6597 			/* Prevent to re-select dst_cpu via env's cpus */
6598 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
6599 
6600 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
6601 			env.dst_cpu	 = env.new_dst_cpu;
6602 			env.flags	&= ~LBF_DST_PINNED;
6603 			env.loop	 = 0;
6604 			env.loop_break	 = sched_nr_migrate_break;
6605 
6606 			/*
6607 			 * Go back to "more_balance" rather than "redo" since we
6608 			 * need to continue with same src_cpu.
6609 			 */
6610 			goto more_balance;
6611 		}
6612 
6613 		/*
6614 		 * We failed to reach balance because of affinity.
6615 		 */
6616 		if (sd_parent) {
6617 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6618 
6619 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6620 				*group_imbalance = 1;
6621 			} else if (*group_imbalance)
6622 				*group_imbalance = 0;
6623 		}
6624 
6625 		/* All tasks on this runqueue were pinned by CPU affinity */
6626 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
6627 			cpumask_clear_cpu(cpu_of(busiest), cpus);
6628 			if (!cpumask_empty(cpus)) {
6629 				env.loop = 0;
6630 				env.loop_break = sched_nr_migrate_break;
6631 				goto redo;
6632 			}
6633 			goto out_balanced;
6634 		}
6635 	}
6636 
6637 	if (!ld_moved) {
6638 		schedstat_inc(sd, lb_failed[idle]);
6639 		/*
6640 		 * Increment the failure counter only on periodic balance.
6641 		 * We do not want newidle balance, which can be very
6642 		 * frequent, pollute the failure counter causing
6643 		 * excessive cache_hot migrations and active balances.
6644 		 */
6645 		if (idle != CPU_NEWLY_IDLE)
6646 			sd->nr_balance_failed++;
6647 
6648 		if (need_active_balance(&env)) {
6649 			raw_spin_lock_irqsave(&busiest->lock, flags);
6650 
6651 			/* don't kick the active_load_balance_cpu_stop,
6652 			 * if the curr task on busiest cpu can't be
6653 			 * moved to this_cpu
6654 			 */
6655 			if (!cpumask_test_cpu(this_cpu,
6656 					tsk_cpus_allowed(busiest->curr))) {
6657 				raw_spin_unlock_irqrestore(&busiest->lock,
6658 							    flags);
6659 				env.flags |= LBF_ALL_PINNED;
6660 				goto out_one_pinned;
6661 			}
6662 
6663 			/*
6664 			 * ->active_balance synchronizes accesses to
6665 			 * ->active_balance_work.  Once set, it's cleared
6666 			 * only after active load balance is finished.
6667 			 */
6668 			if (!busiest->active_balance) {
6669 				busiest->active_balance = 1;
6670 				busiest->push_cpu = this_cpu;
6671 				active_balance = 1;
6672 			}
6673 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
6674 
6675 			if (active_balance) {
6676 				stop_one_cpu_nowait(cpu_of(busiest),
6677 					active_load_balance_cpu_stop, busiest,
6678 					&busiest->active_balance_work);
6679 			}
6680 
6681 			/*
6682 			 * We've kicked active balancing, reset the failure
6683 			 * counter.
6684 			 */
6685 			sd->nr_balance_failed = sd->cache_nice_tries+1;
6686 		}
6687 	} else
6688 		sd->nr_balance_failed = 0;
6689 
6690 	if (likely(!active_balance)) {
6691 		/* We were unbalanced, so reset the balancing interval */
6692 		sd->balance_interval = sd->min_interval;
6693 	} else {
6694 		/*
6695 		 * If we've begun active balancing, start to back off. This
6696 		 * case may not be covered by the all_pinned logic if there
6697 		 * is only 1 task on the busy runqueue (because we don't call
6698 		 * move_tasks).
6699 		 */
6700 		if (sd->balance_interval < sd->max_interval)
6701 			sd->balance_interval *= 2;
6702 	}
6703 
6704 	goto out;
6705 
6706 out_balanced:
6707 	schedstat_inc(sd, lb_balanced[idle]);
6708 
6709 	sd->nr_balance_failed = 0;
6710 
6711 out_one_pinned:
6712 	/* tune up the balancing interval */
6713 	if (((env.flags & LBF_ALL_PINNED) &&
6714 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
6715 			(sd->balance_interval < sd->max_interval))
6716 		sd->balance_interval *= 2;
6717 
6718 	ld_moved = 0;
6719 out:
6720 	return ld_moved;
6721 }
6722 
6723 static inline unsigned long
6724 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6725 {
6726 	unsigned long interval = sd->balance_interval;
6727 
6728 	if (cpu_busy)
6729 		interval *= sd->busy_factor;
6730 
6731 	/* scale ms to jiffies */
6732 	interval = msecs_to_jiffies(interval);
6733 	interval = clamp(interval, 1UL, max_load_balance_interval);
6734 
6735 	return interval;
6736 }
6737 
6738 static inline void
6739 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6740 {
6741 	unsigned long interval, next;
6742 
6743 	interval = get_sd_balance_interval(sd, cpu_busy);
6744 	next = sd->last_balance + interval;
6745 
6746 	if (time_after(*next_balance, next))
6747 		*next_balance = next;
6748 }
6749 
6750 /*
6751  * idle_balance is called by schedule() if this_cpu is about to become
6752  * idle. Attempts to pull tasks from other CPUs.
6753  */
6754 static int idle_balance(struct rq *this_rq)
6755 {
6756 	unsigned long next_balance = jiffies + HZ;
6757 	int this_cpu = this_rq->cpu;
6758 	struct sched_domain *sd;
6759 	int pulled_task = 0;
6760 	u64 curr_cost = 0;
6761 
6762 	idle_enter_fair(this_rq);
6763 
6764 	/*
6765 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
6766 	 * measure the duration of idle_balance() as idle time.
6767 	 */
6768 	this_rq->idle_stamp = rq_clock(this_rq);
6769 
6770 	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6771 		rcu_read_lock();
6772 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 		if (sd)
6774 			update_next_balance(sd, 0, &next_balance);
6775 		rcu_read_unlock();
6776 
6777 		goto out;
6778 	}
6779 
6780 	/*
6781 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
6782 	 */
6783 	raw_spin_unlock(&this_rq->lock);
6784 
6785 	update_blocked_averages(this_cpu);
6786 	rcu_read_lock();
6787 	for_each_domain(this_cpu, sd) {
6788 		int continue_balancing = 1;
6789 		u64 t0, domain_cost;
6790 
6791 		if (!(sd->flags & SD_LOAD_BALANCE))
6792 			continue;
6793 
6794 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6795 			update_next_balance(sd, 0, &next_balance);
6796 			break;
6797 		}
6798 
6799 		if (sd->flags & SD_BALANCE_NEWIDLE) {
6800 			t0 = sched_clock_cpu(this_cpu);
6801 
6802 			pulled_task = load_balance(this_cpu, this_rq,
6803 						   sd, CPU_NEWLY_IDLE,
6804 						   &continue_balancing);
6805 
6806 			domain_cost = sched_clock_cpu(this_cpu) - t0;
6807 			if (domain_cost > sd->max_newidle_lb_cost)
6808 				sd->max_newidle_lb_cost = domain_cost;
6809 
6810 			curr_cost += domain_cost;
6811 		}
6812 
6813 		update_next_balance(sd, 0, &next_balance);
6814 
6815 		/*
6816 		 * Stop searching for tasks to pull if there are
6817 		 * now runnable tasks on this rq.
6818 		 */
6819 		if (pulled_task || this_rq->nr_running > 0)
6820 			break;
6821 	}
6822 	rcu_read_unlock();
6823 
6824 	raw_spin_lock(&this_rq->lock);
6825 
6826 	if (curr_cost > this_rq->max_idle_balance_cost)
6827 		this_rq->max_idle_balance_cost = curr_cost;
6828 
6829 	/*
6830 	 * While browsing the domains, we released the rq lock, a task could
6831 	 * have been enqueued in the meantime. Since we're not going idle,
6832 	 * pretend we pulled a task.
6833 	 */
6834 	if (this_rq->cfs.h_nr_running && !pulled_task)
6835 		pulled_task = 1;
6836 
6837 out:
6838 	/* Move the next balance forward */
6839 	if (time_after(this_rq->next_balance, next_balance))
6840 		this_rq->next_balance = next_balance;
6841 
6842 	/* Is there a task of a high priority class? */
6843 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6844 		pulled_task = -1;
6845 
6846 	if (pulled_task) {
6847 		idle_exit_fair(this_rq);
6848 		this_rq->idle_stamp = 0;
6849 	}
6850 
6851 	return pulled_task;
6852 }
6853 
6854 /*
6855  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
6856  * running tasks off the busiest CPU onto idle CPUs. It requires at
6857  * least 1 task to be running on each physical CPU where possible, and
6858  * avoids physical / logical imbalances.
6859  */
6860 static int active_load_balance_cpu_stop(void *data)
6861 {
6862 	struct rq *busiest_rq = data;
6863 	int busiest_cpu = cpu_of(busiest_rq);
6864 	int target_cpu = busiest_rq->push_cpu;
6865 	struct rq *target_rq = cpu_rq(target_cpu);
6866 	struct sched_domain *sd;
6867 
6868 	raw_spin_lock_irq(&busiest_rq->lock);
6869 
6870 	/* make sure the requested cpu hasn't gone down in the meantime */
6871 	if (unlikely(busiest_cpu != smp_processor_id() ||
6872 		     !busiest_rq->active_balance))
6873 		goto out_unlock;
6874 
6875 	/* Is there any task to move? */
6876 	if (busiest_rq->nr_running <= 1)
6877 		goto out_unlock;
6878 
6879 	/*
6880 	 * This condition is "impossible", if it occurs
6881 	 * we need to fix it. Originally reported by
6882 	 * Bjorn Helgaas on a 128-cpu setup.
6883 	 */
6884 	BUG_ON(busiest_rq == target_rq);
6885 
6886 	/* move a task from busiest_rq to target_rq */
6887 	double_lock_balance(busiest_rq, target_rq);
6888 
6889 	/* Search for an sd spanning us and the target CPU. */
6890 	rcu_read_lock();
6891 	for_each_domain(target_cpu, sd) {
6892 		if ((sd->flags & SD_LOAD_BALANCE) &&
6893 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6894 				break;
6895 	}
6896 
6897 	if (likely(sd)) {
6898 		struct lb_env env = {
6899 			.sd		= sd,
6900 			.dst_cpu	= target_cpu,
6901 			.dst_rq		= target_rq,
6902 			.src_cpu	= busiest_rq->cpu,
6903 			.src_rq		= busiest_rq,
6904 			.idle		= CPU_IDLE,
6905 		};
6906 
6907 		schedstat_inc(sd, alb_count);
6908 
6909 		if (move_one_task(&env))
6910 			schedstat_inc(sd, alb_pushed);
6911 		else
6912 			schedstat_inc(sd, alb_failed);
6913 	}
6914 	rcu_read_unlock();
6915 	double_unlock_balance(busiest_rq, target_rq);
6916 out_unlock:
6917 	busiest_rq->active_balance = 0;
6918 	raw_spin_unlock_irq(&busiest_rq->lock);
6919 	return 0;
6920 }
6921 
6922 static inline int on_null_domain(struct rq *rq)
6923 {
6924 	return unlikely(!rcu_dereference_sched(rq->sd));
6925 }
6926 
6927 #ifdef CONFIG_NO_HZ_COMMON
6928 /*
6929  * idle load balancing details
6930  * - When one of the busy CPUs notice that there may be an idle rebalancing
6931  *   needed, they will kick the idle load balancer, which then does idle
6932  *   load balancing for all the idle CPUs.
6933  */
6934 static struct {
6935 	cpumask_var_t idle_cpus_mask;
6936 	atomic_t nr_cpus;
6937 	unsigned long next_balance;     /* in jiffy units */
6938 } nohz ____cacheline_aligned;
6939 
6940 static inline int find_new_ilb(void)
6941 {
6942 	int ilb = cpumask_first(nohz.idle_cpus_mask);
6943 
6944 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
6945 		return ilb;
6946 
6947 	return nr_cpu_ids;
6948 }
6949 
6950 /*
6951  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
6952  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6953  * CPU (if there is one).
6954  */
6955 static void nohz_balancer_kick(void)
6956 {
6957 	int ilb_cpu;
6958 
6959 	nohz.next_balance++;
6960 
6961 	ilb_cpu = find_new_ilb();
6962 
6963 	if (ilb_cpu >= nr_cpu_ids)
6964 		return;
6965 
6966 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6967 		return;
6968 	/*
6969 	 * Use smp_send_reschedule() instead of resched_cpu().
6970 	 * This way we generate a sched IPI on the target cpu which
6971 	 * is idle. And the softirq performing nohz idle load balance
6972 	 * will be run before returning from the IPI.
6973 	 */
6974 	smp_send_reschedule(ilb_cpu);
6975 	return;
6976 }
6977 
6978 static inline void nohz_balance_exit_idle(int cpu)
6979 {
6980 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6981 		/*
6982 		 * Completely isolated CPUs don't ever set, so we must test.
6983 		 */
6984 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6985 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6986 			atomic_dec(&nohz.nr_cpus);
6987 		}
6988 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6989 	}
6990 }
6991 
6992 static inline void set_cpu_sd_state_busy(void)
6993 {
6994 	struct sched_domain *sd;
6995 	int cpu = smp_processor_id();
6996 
6997 	rcu_read_lock();
6998 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
6999 
7000 	if (!sd || !sd->nohz_idle)
7001 		goto unlock;
7002 	sd->nohz_idle = 0;
7003 
7004 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7005 unlock:
7006 	rcu_read_unlock();
7007 }
7008 
7009 void set_cpu_sd_state_idle(void)
7010 {
7011 	struct sched_domain *sd;
7012 	int cpu = smp_processor_id();
7013 
7014 	rcu_read_lock();
7015 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7016 
7017 	if (!sd || sd->nohz_idle)
7018 		goto unlock;
7019 	sd->nohz_idle = 1;
7020 
7021 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7022 unlock:
7023 	rcu_read_unlock();
7024 }
7025 
7026 /*
7027  * This routine will record that the cpu is going idle with tick stopped.
7028  * This info will be used in performing idle load balancing in the future.
7029  */
7030 void nohz_balance_enter_idle(int cpu)
7031 {
7032 	/*
7033 	 * If this cpu is going down, then nothing needs to be done.
7034 	 */
7035 	if (!cpu_active(cpu))
7036 		return;
7037 
7038 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7039 		return;
7040 
7041 	/*
7042 	 * If we're a completely isolated CPU, we don't play.
7043 	 */
7044 	if (on_null_domain(cpu_rq(cpu)))
7045 		return;
7046 
7047 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7048 	atomic_inc(&nohz.nr_cpus);
7049 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7050 }
7051 
7052 static int sched_ilb_notifier(struct notifier_block *nfb,
7053 					unsigned long action, void *hcpu)
7054 {
7055 	switch (action & ~CPU_TASKS_FROZEN) {
7056 	case CPU_DYING:
7057 		nohz_balance_exit_idle(smp_processor_id());
7058 		return NOTIFY_OK;
7059 	default:
7060 		return NOTIFY_DONE;
7061 	}
7062 }
7063 #endif
7064 
7065 static DEFINE_SPINLOCK(balancing);
7066 
7067 /*
7068  * Scale the max load_balance interval with the number of CPUs in the system.
7069  * This trades load-balance latency on larger machines for less cross talk.
7070  */
7071 void update_max_interval(void)
7072 {
7073 	max_load_balance_interval = HZ*num_online_cpus()/10;
7074 }
7075 
7076 /*
7077  * It checks each scheduling domain to see if it is due to be balanced,
7078  * and initiates a balancing operation if so.
7079  *
7080  * Balancing parameters are set up in init_sched_domains.
7081  */
7082 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7083 {
7084 	int continue_balancing = 1;
7085 	int cpu = rq->cpu;
7086 	unsigned long interval;
7087 	struct sched_domain *sd;
7088 	/* Earliest time when we have to do rebalance again */
7089 	unsigned long next_balance = jiffies + 60*HZ;
7090 	int update_next_balance = 0;
7091 	int need_serialize, need_decay = 0;
7092 	u64 max_cost = 0;
7093 
7094 	update_blocked_averages(cpu);
7095 
7096 	rcu_read_lock();
7097 	for_each_domain(cpu, sd) {
7098 		/*
7099 		 * Decay the newidle max times here because this is a regular
7100 		 * visit to all the domains. Decay ~1% per second.
7101 		 */
7102 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7103 			sd->max_newidle_lb_cost =
7104 				(sd->max_newidle_lb_cost * 253) / 256;
7105 			sd->next_decay_max_lb_cost = jiffies + HZ;
7106 			need_decay = 1;
7107 		}
7108 		max_cost += sd->max_newidle_lb_cost;
7109 
7110 		if (!(sd->flags & SD_LOAD_BALANCE))
7111 			continue;
7112 
7113 		/*
7114 		 * Stop the load balance at this level. There is another
7115 		 * CPU in our sched group which is doing load balancing more
7116 		 * actively.
7117 		 */
7118 		if (!continue_balancing) {
7119 			if (need_decay)
7120 				continue;
7121 			break;
7122 		}
7123 
7124 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7125 
7126 		need_serialize = sd->flags & SD_SERIALIZE;
7127 		if (need_serialize) {
7128 			if (!spin_trylock(&balancing))
7129 				goto out;
7130 		}
7131 
7132 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
7133 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7134 				/*
7135 				 * The LBF_DST_PINNED logic could have changed
7136 				 * env->dst_cpu, so we can't know our idle
7137 				 * state even if we migrated tasks. Update it.
7138 				 */
7139 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7140 			}
7141 			sd->last_balance = jiffies;
7142 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7143 		}
7144 		if (need_serialize)
7145 			spin_unlock(&balancing);
7146 out:
7147 		if (time_after(next_balance, sd->last_balance + interval)) {
7148 			next_balance = sd->last_balance + interval;
7149 			update_next_balance = 1;
7150 		}
7151 	}
7152 	if (need_decay) {
7153 		/*
7154 		 * Ensure the rq-wide value also decays but keep it at a
7155 		 * reasonable floor to avoid funnies with rq->avg_idle.
7156 		 */
7157 		rq->max_idle_balance_cost =
7158 			max((u64)sysctl_sched_migration_cost, max_cost);
7159 	}
7160 	rcu_read_unlock();
7161 
7162 	/*
7163 	 * next_balance will be updated only when there is a need.
7164 	 * When the cpu is attached to null domain for ex, it will not be
7165 	 * updated.
7166 	 */
7167 	if (likely(update_next_balance))
7168 		rq->next_balance = next_balance;
7169 }
7170 
7171 #ifdef CONFIG_NO_HZ_COMMON
7172 /*
7173  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7174  * rebalancing for all the cpus for whom scheduler ticks are stopped.
7175  */
7176 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7177 {
7178 	int this_cpu = this_rq->cpu;
7179 	struct rq *rq;
7180 	int balance_cpu;
7181 
7182 	if (idle != CPU_IDLE ||
7183 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7184 		goto end;
7185 
7186 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7187 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7188 			continue;
7189 
7190 		/*
7191 		 * If this cpu gets work to do, stop the load balancing
7192 		 * work being done for other cpus. Next load
7193 		 * balancing owner will pick it up.
7194 		 */
7195 		if (need_resched())
7196 			break;
7197 
7198 		rq = cpu_rq(balance_cpu);
7199 
7200 		/*
7201 		 * If time for next balance is due,
7202 		 * do the balance.
7203 		 */
7204 		if (time_after_eq(jiffies, rq->next_balance)) {
7205 			raw_spin_lock_irq(&rq->lock);
7206 			update_rq_clock(rq);
7207 			update_idle_cpu_load(rq);
7208 			raw_spin_unlock_irq(&rq->lock);
7209 			rebalance_domains(rq, CPU_IDLE);
7210 		}
7211 
7212 		if (time_after(this_rq->next_balance, rq->next_balance))
7213 			this_rq->next_balance = rq->next_balance;
7214 	}
7215 	nohz.next_balance = this_rq->next_balance;
7216 end:
7217 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7218 }
7219 
7220 /*
7221  * Current heuristic for kicking the idle load balancer in the presence
7222  * of an idle cpu is the system.
7223  *   - This rq has more than one task.
7224  *   - At any scheduler domain level, this cpu's scheduler group has multiple
7225  *     busy cpu's exceeding the group's capacity.
7226  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7227  *     domain span are idle.
7228  */
7229 static inline int nohz_kick_needed(struct rq *rq)
7230 {
7231 	unsigned long now = jiffies;
7232 	struct sched_domain *sd;
7233 	struct sched_group_capacity *sgc;
7234 	int nr_busy, cpu = rq->cpu;
7235 
7236 	if (unlikely(rq->idle_balance))
7237 		return 0;
7238 
7239        /*
7240 	* We may be recently in ticked or tickless idle mode. At the first
7241 	* busy tick after returning from idle, we will update the busy stats.
7242 	*/
7243 	set_cpu_sd_state_busy();
7244 	nohz_balance_exit_idle(cpu);
7245 
7246 	/*
7247 	 * None are in tickless mode and hence no need for NOHZ idle load
7248 	 * balancing.
7249 	 */
7250 	if (likely(!atomic_read(&nohz.nr_cpus)))
7251 		return 0;
7252 
7253 	if (time_before(now, nohz.next_balance))
7254 		return 0;
7255 
7256 	if (rq->nr_running >= 2)
7257 		goto need_kick;
7258 
7259 	rcu_read_lock();
7260 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7261 
7262 	if (sd) {
7263 		sgc = sd->groups->sgc;
7264 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
7265 
7266 		if (nr_busy > 1)
7267 			goto need_kick_unlock;
7268 	}
7269 
7270 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
7271 
7272 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7273 				  sched_domain_span(sd)) < cpu))
7274 		goto need_kick_unlock;
7275 
7276 	rcu_read_unlock();
7277 	return 0;
7278 
7279 need_kick_unlock:
7280 	rcu_read_unlock();
7281 need_kick:
7282 	return 1;
7283 }
7284 #else
7285 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7286 #endif
7287 
7288 /*
7289  * run_rebalance_domains is triggered when needed from the scheduler tick.
7290  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7291  */
7292 static void run_rebalance_domains(struct softirq_action *h)
7293 {
7294 	struct rq *this_rq = this_rq();
7295 	enum cpu_idle_type idle = this_rq->idle_balance ?
7296 						CPU_IDLE : CPU_NOT_IDLE;
7297 
7298 	rebalance_domains(this_rq, idle);
7299 
7300 	/*
7301 	 * If this cpu has a pending nohz_balance_kick, then do the
7302 	 * balancing on behalf of the other idle cpus whose ticks are
7303 	 * stopped.
7304 	 */
7305 	nohz_idle_balance(this_rq, idle);
7306 }
7307 
7308 /*
7309  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7310  */
7311 void trigger_load_balance(struct rq *rq)
7312 {
7313 	/* Don't need to rebalance while attached to NULL domain */
7314 	if (unlikely(on_null_domain(rq)))
7315 		return;
7316 
7317 	if (time_after_eq(jiffies, rq->next_balance))
7318 		raise_softirq(SCHED_SOFTIRQ);
7319 #ifdef CONFIG_NO_HZ_COMMON
7320 	if (nohz_kick_needed(rq))
7321 		nohz_balancer_kick();
7322 #endif
7323 }
7324 
7325 static void rq_online_fair(struct rq *rq)
7326 {
7327 	update_sysctl();
7328 }
7329 
7330 static void rq_offline_fair(struct rq *rq)
7331 {
7332 	update_sysctl();
7333 
7334 	/* Ensure any throttled groups are reachable by pick_next_task */
7335 	unthrottle_offline_cfs_rqs(rq);
7336 }
7337 
7338 #endif /* CONFIG_SMP */
7339 
7340 /*
7341  * scheduler tick hitting a task of our scheduling class:
7342  */
7343 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7344 {
7345 	struct cfs_rq *cfs_rq;
7346 	struct sched_entity *se = &curr->se;
7347 
7348 	for_each_sched_entity(se) {
7349 		cfs_rq = cfs_rq_of(se);
7350 		entity_tick(cfs_rq, se, queued);
7351 	}
7352 
7353 	if (numabalancing_enabled)
7354 		task_tick_numa(rq, curr);
7355 
7356 	update_rq_runnable_avg(rq, 1);
7357 }
7358 
7359 /*
7360  * called on fork with the child task as argument from the parent's context
7361  *  - child not yet on the tasklist
7362  *  - preemption disabled
7363  */
7364 static void task_fork_fair(struct task_struct *p)
7365 {
7366 	struct cfs_rq *cfs_rq;
7367 	struct sched_entity *se = &p->se, *curr;
7368 	int this_cpu = smp_processor_id();
7369 	struct rq *rq = this_rq();
7370 	unsigned long flags;
7371 
7372 	raw_spin_lock_irqsave(&rq->lock, flags);
7373 
7374 	update_rq_clock(rq);
7375 
7376 	cfs_rq = task_cfs_rq(current);
7377 	curr = cfs_rq->curr;
7378 
7379 	/*
7380 	 * Not only the cpu but also the task_group of the parent might have
7381 	 * been changed after parent->se.parent,cfs_rq were copied to
7382 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7383 	 * of child point to valid ones.
7384 	 */
7385 	rcu_read_lock();
7386 	__set_task_cpu(p, this_cpu);
7387 	rcu_read_unlock();
7388 
7389 	update_curr(cfs_rq);
7390 
7391 	if (curr)
7392 		se->vruntime = curr->vruntime;
7393 	place_entity(cfs_rq, se, 1);
7394 
7395 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7396 		/*
7397 		 * Upon rescheduling, sched_class::put_prev_task() will place
7398 		 * 'current' within the tree based on its new key value.
7399 		 */
7400 		swap(curr->vruntime, se->vruntime);
7401 		resched_task(rq->curr);
7402 	}
7403 
7404 	se->vruntime -= cfs_rq->min_vruntime;
7405 
7406 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7407 }
7408 
7409 /*
7410  * Priority of the task has changed. Check to see if we preempt
7411  * the current task.
7412  */
7413 static void
7414 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7415 {
7416 	if (!p->se.on_rq)
7417 		return;
7418 
7419 	/*
7420 	 * Reschedule if we are currently running on this runqueue and
7421 	 * our priority decreased, or if we are not currently running on
7422 	 * this runqueue and our priority is higher than the current's
7423 	 */
7424 	if (rq->curr == p) {
7425 		if (p->prio > oldprio)
7426 			resched_task(rq->curr);
7427 	} else
7428 		check_preempt_curr(rq, p, 0);
7429 }
7430 
7431 static void switched_from_fair(struct rq *rq, struct task_struct *p)
7432 {
7433 	struct sched_entity *se = &p->se;
7434 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
7435 
7436 	/*
7437 	 * Ensure the task's vruntime is normalized, so that when it's
7438 	 * switched back to the fair class the enqueue_entity(.flags=0) will
7439 	 * do the right thing.
7440 	 *
7441 	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7442 	 * have normalized the vruntime, if it's !on_rq, then only when
7443 	 * the task is sleeping will it still have non-normalized vruntime.
7444 	 */
7445 	if (!p->on_rq && p->state != TASK_RUNNING) {
7446 		/*
7447 		 * Fix up our vruntime so that the current sleep doesn't
7448 		 * cause 'unlimited' sleep bonus.
7449 		 */
7450 		place_entity(cfs_rq, se, 0);
7451 		se->vruntime -= cfs_rq->min_vruntime;
7452 	}
7453 
7454 #ifdef CONFIG_SMP
7455 	/*
7456 	* Remove our load from contribution when we leave sched_fair
7457 	* and ensure we don't carry in an old decay_count if we
7458 	* switch back.
7459 	*/
7460 	if (se->avg.decay_count) {
7461 		__synchronize_entity_decay(se);
7462 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
7463 	}
7464 #endif
7465 }
7466 
7467 /*
7468  * We switched to the sched_fair class.
7469  */
7470 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7471 {
7472 	struct sched_entity *se = &p->se;
7473 #ifdef CONFIG_FAIR_GROUP_SCHED
7474 	/*
7475 	 * Since the real-depth could have been changed (only FAIR
7476 	 * class maintain depth value), reset depth properly.
7477 	 */
7478 	se->depth = se->parent ? se->parent->depth + 1 : 0;
7479 #endif
7480 	if (!se->on_rq)
7481 		return;
7482 
7483 	/*
7484 	 * We were most likely switched from sched_rt, so
7485 	 * kick off the schedule if running, otherwise just see
7486 	 * if we can still preempt the current task.
7487 	 */
7488 	if (rq->curr == p)
7489 		resched_task(rq->curr);
7490 	else
7491 		check_preempt_curr(rq, p, 0);
7492 }
7493 
7494 /* Account for a task changing its policy or group.
7495  *
7496  * This routine is mostly called to set cfs_rq->curr field when a task
7497  * migrates between groups/classes.
7498  */
7499 static void set_curr_task_fair(struct rq *rq)
7500 {
7501 	struct sched_entity *se = &rq->curr->se;
7502 
7503 	for_each_sched_entity(se) {
7504 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
7505 
7506 		set_next_entity(cfs_rq, se);
7507 		/* ensure bandwidth has been allocated on our new cfs_rq */
7508 		account_cfs_rq_runtime(cfs_rq, 0);
7509 	}
7510 }
7511 
7512 void init_cfs_rq(struct cfs_rq *cfs_rq)
7513 {
7514 	cfs_rq->tasks_timeline = RB_ROOT;
7515 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7516 #ifndef CONFIG_64BIT
7517 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7518 #endif
7519 #ifdef CONFIG_SMP
7520 	atomic64_set(&cfs_rq->decay_counter, 1);
7521 	atomic_long_set(&cfs_rq->removed_load, 0);
7522 #endif
7523 }
7524 
7525 #ifdef CONFIG_FAIR_GROUP_SCHED
7526 static void task_move_group_fair(struct task_struct *p, int on_rq)
7527 {
7528 	struct sched_entity *se = &p->se;
7529 	struct cfs_rq *cfs_rq;
7530 
7531 	/*
7532 	 * If the task was not on the rq at the time of this cgroup movement
7533 	 * it must have been asleep, sleeping tasks keep their ->vruntime
7534 	 * absolute on their old rq until wakeup (needed for the fair sleeper
7535 	 * bonus in place_entity()).
7536 	 *
7537 	 * If it was on the rq, we've just 'preempted' it, which does convert
7538 	 * ->vruntime to a relative base.
7539 	 *
7540 	 * Make sure both cases convert their relative position when migrating
7541 	 * to another cgroup's rq. This does somewhat interfere with the
7542 	 * fair sleeper stuff for the first placement, but who cares.
7543 	 */
7544 	/*
7545 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
7546 	 * But there are some cases where it has already been normalized:
7547 	 *
7548 	 * - Moving a forked child which is waiting for being woken up by
7549 	 *   wake_up_new_task().
7550 	 * - Moving a task which has been woken up by try_to_wake_up() and
7551 	 *   waiting for actually being woken up by sched_ttwu_pending().
7552 	 *
7553 	 * To prevent boost or penalty in the new cfs_rq caused by delta
7554 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7555 	 */
7556 	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7557 		on_rq = 1;
7558 
7559 	if (!on_rq)
7560 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
7561 	set_task_rq(p, task_cpu(p));
7562 	se->depth = se->parent ? se->parent->depth + 1 : 0;
7563 	if (!on_rq) {
7564 		cfs_rq = cfs_rq_of(se);
7565 		se->vruntime += cfs_rq->min_vruntime;
7566 #ifdef CONFIG_SMP
7567 		/*
7568 		 * migrate_task_rq_fair() will have removed our previous
7569 		 * contribution, but we must synchronize for ongoing future
7570 		 * decay.
7571 		 */
7572 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7573 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7574 #endif
7575 	}
7576 }
7577 
7578 void free_fair_sched_group(struct task_group *tg)
7579 {
7580 	int i;
7581 
7582 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7583 
7584 	for_each_possible_cpu(i) {
7585 		if (tg->cfs_rq)
7586 			kfree(tg->cfs_rq[i]);
7587 		if (tg->se)
7588 			kfree(tg->se[i]);
7589 	}
7590 
7591 	kfree(tg->cfs_rq);
7592 	kfree(tg->se);
7593 }
7594 
7595 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7596 {
7597 	struct cfs_rq *cfs_rq;
7598 	struct sched_entity *se;
7599 	int i;
7600 
7601 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7602 	if (!tg->cfs_rq)
7603 		goto err;
7604 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7605 	if (!tg->se)
7606 		goto err;
7607 
7608 	tg->shares = NICE_0_LOAD;
7609 
7610 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7611 
7612 	for_each_possible_cpu(i) {
7613 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7614 				      GFP_KERNEL, cpu_to_node(i));
7615 		if (!cfs_rq)
7616 			goto err;
7617 
7618 		se = kzalloc_node(sizeof(struct sched_entity),
7619 				  GFP_KERNEL, cpu_to_node(i));
7620 		if (!se)
7621 			goto err_free_rq;
7622 
7623 		init_cfs_rq(cfs_rq);
7624 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7625 	}
7626 
7627 	return 1;
7628 
7629 err_free_rq:
7630 	kfree(cfs_rq);
7631 err:
7632 	return 0;
7633 }
7634 
7635 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7636 {
7637 	struct rq *rq = cpu_rq(cpu);
7638 	unsigned long flags;
7639 
7640 	/*
7641 	* Only empty task groups can be destroyed; so we can speculatively
7642 	* check on_list without danger of it being re-added.
7643 	*/
7644 	if (!tg->cfs_rq[cpu]->on_list)
7645 		return;
7646 
7647 	raw_spin_lock_irqsave(&rq->lock, flags);
7648 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7649 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7650 }
7651 
7652 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7653 			struct sched_entity *se, int cpu,
7654 			struct sched_entity *parent)
7655 {
7656 	struct rq *rq = cpu_rq(cpu);
7657 
7658 	cfs_rq->tg = tg;
7659 	cfs_rq->rq = rq;
7660 	init_cfs_rq_runtime(cfs_rq);
7661 
7662 	tg->cfs_rq[cpu] = cfs_rq;
7663 	tg->se[cpu] = se;
7664 
7665 	/* se could be NULL for root_task_group */
7666 	if (!se)
7667 		return;
7668 
7669 	if (!parent) {
7670 		se->cfs_rq = &rq->cfs;
7671 		se->depth = 0;
7672 	} else {
7673 		se->cfs_rq = parent->my_q;
7674 		se->depth = parent->depth + 1;
7675 	}
7676 
7677 	se->my_q = cfs_rq;
7678 	/* guarantee group entities always have weight */
7679 	update_load_set(&se->load, NICE_0_LOAD);
7680 	se->parent = parent;
7681 }
7682 
7683 static DEFINE_MUTEX(shares_mutex);
7684 
7685 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7686 {
7687 	int i;
7688 	unsigned long flags;
7689 
7690 	/*
7691 	 * We can't change the weight of the root cgroup.
7692 	 */
7693 	if (!tg->se[0])
7694 		return -EINVAL;
7695 
7696 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7697 
7698 	mutex_lock(&shares_mutex);
7699 	if (tg->shares == shares)
7700 		goto done;
7701 
7702 	tg->shares = shares;
7703 	for_each_possible_cpu(i) {
7704 		struct rq *rq = cpu_rq(i);
7705 		struct sched_entity *se;
7706 
7707 		se = tg->se[i];
7708 		/* Propagate contribution to hierarchy */
7709 		raw_spin_lock_irqsave(&rq->lock, flags);
7710 
7711 		/* Possible calls to update_curr() need rq clock */
7712 		update_rq_clock(rq);
7713 		for_each_sched_entity(se)
7714 			update_cfs_shares(group_cfs_rq(se));
7715 		raw_spin_unlock_irqrestore(&rq->lock, flags);
7716 	}
7717 
7718 done:
7719 	mutex_unlock(&shares_mutex);
7720 	return 0;
7721 }
7722 #else /* CONFIG_FAIR_GROUP_SCHED */
7723 
7724 void free_fair_sched_group(struct task_group *tg) { }
7725 
7726 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7727 {
7728 	return 1;
7729 }
7730 
7731 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7732 
7733 #endif /* CONFIG_FAIR_GROUP_SCHED */
7734 
7735 
7736 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7737 {
7738 	struct sched_entity *se = &task->se;
7739 	unsigned int rr_interval = 0;
7740 
7741 	/*
7742 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
7743 	 * idle runqueue:
7744 	 */
7745 	if (rq->cfs.load.weight)
7746 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7747 
7748 	return rr_interval;
7749 }
7750 
7751 /*
7752  * All the scheduling class methods:
7753  */
7754 const struct sched_class fair_sched_class = {
7755 	.next			= &idle_sched_class,
7756 	.enqueue_task		= enqueue_task_fair,
7757 	.dequeue_task		= dequeue_task_fair,
7758 	.yield_task		= yield_task_fair,
7759 	.yield_to_task		= yield_to_task_fair,
7760 
7761 	.check_preempt_curr	= check_preempt_wakeup,
7762 
7763 	.pick_next_task		= pick_next_task_fair,
7764 	.put_prev_task		= put_prev_task_fair,
7765 
7766 #ifdef CONFIG_SMP
7767 	.select_task_rq		= select_task_rq_fair,
7768 	.migrate_task_rq	= migrate_task_rq_fair,
7769 
7770 	.rq_online		= rq_online_fair,
7771 	.rq_offline		= rq_offline_fair,
7772 
7773 	.task_waking		= task_waking_fair,
7774 #endif
7775 
7776 	.set_curr_task          = set_curr_task_fair,
7777 	.task_tick		= task_tick_fair,
7778 	.task_fork		= task_fork_fair,
7779 
7780 	.prio_changed		= prio_changed_fair,
7781 	.switched_from		= switched_from_fair,
7782 	.switched_to		= switched_to_fair,
7783 
7784 	.get_rr_interval	= get_rr_interval_fair,
7785 
7786 #ifdef CONFIG_FAIR_GROUP_SCHED
7787 	.task_move_group	= task_move_group_fair,
7788 #endif
7789 };
7790 
7791 #ifdef CONFIG_SCHED_DEBUG
7792 void print_cfs_stats(struct seq_file *m, int cpu)
7793 {
7794 	struct cfs_rq *cfs_rq;
7795 
7796 	rcu_read_lock();
7797 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7798 		print_cfs_rq(m, cpu, cfs_rq);
7799 	rcu_read_unlock();
7800 }
7801 #endif
7802 
7803 __init void init_sched_fair_class(void)
7804 {
7805 #ifdef CONFIG_SMP
7806 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7807 
7808 #ifdef CONFIG_NO_HZ_COMMON
7809 	nohz.next_balance = jiffies;
7810 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7811 	cpu_notifier(sched_ilb_notifier, 0);
7812 #endif
7813 #endif /* SMP */
7814 
7815 }
7816