xref: /openbmc/linux/kernel/sched/fair.c (revision 179dd8c0)
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 
34 #include <trace/events/sched.h>
35 
36 #include "sched.h"
37 
38 /*
39  * Targeted preemption latency for CPU-bound tasks:
40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41  *
42  * NOTE: this latency value is not the same as the concept of
43  * 'timeslice length' - timeslices in CFS are of variable length
44  * and have no persistent notion like in traditional, time-slice
45  * based scheduling concepts.
46  *
47  * (to see the precise effective timeslice length of your workload,
48  *  run vmstat and monitor the context-switches (cs) field)
49  */
50 unsigned int sysctl_sched_latency = 6000000ULL;
51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
52 
53 /*
54  * The initial- and re-scaling of tunables is configurable
55  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
56  *
57  * Options are:
58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
59  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
60  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
61  */
62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
63 	= SCHED_TUNABLESCALING_LOG;
64 
65 /*
66  * Minimal preemption granularity for CPU-bound tasks:
67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68  */
69 unsigned int sysctl_sched_min_granularity = 750000ULL;
70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
71 
72 /*
73  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
74  */
75 static unsigned int sched_nr_latency = 8;
76 
77 /*
78  * After fork, child runs first. If set to 0 (default) then
79  * parent will (try to) run first.
80  */
81 unsigned int sysctl_sched_child_runs_first __read_mostly;
82 
83 /*
84  * SCHED_OTHER wake-up granularity.
85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86  *
87  * This option delays the preemption effects of decoupled workloads
88  * and reduces their over-scheduling. Synchronous workloads will still
89  * have immediate wakeup/sleep latencies.
90  */
91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 
94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95 
96 /*
97  * The exponential sliding  window over which load is averaged for shares
98  * distribution.
99  * (default: 10msec)
100  */
101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 
103 #ifdef CONFIG_CFS_BANDWIDTH
104 /*
105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
106  * each time a cfs_rq requests quota.
107  *
108  * Note: in the case that the slice exceeds the runtime remaining (either due
109  * to consumption or the quota being specified to be smaller than the slice)
110  * we will always only issue the remaining available time.
111  *
112  * default: 5 msec, units: microseconds
113   */
114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115 #endif
116 
117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118 {
119 	lw->weight += inc;
120 	lw->inv_weight = 0;
121 }
122 
123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
124 {
125 	lw->weight -= dec;
126 	lw->inv_weight = 0;
127 }
128 
129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
130 {
131 	lw->weight = w;
132 	lw->inv_weight = 0;
133 }
134 
135 /*
136  * Increase the granularity value when there are more CPUs,
137  * because with more CPUs the 'effective latency' as visible
138  * to users decreases. But the relationship is not linear,
139  * so pick a second-best guess by going with the log2 of the
140  * number of CPUs.
141  *
142  * This idea comes from the SD scheduler of Con Kolivas:
143  */
144 static unsigned int get_update_sysctl_factor(void)
145 {
146 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
147 	unsigned int factor;
148 
149 	switch (sysctl_sched_tunable_scaling) {
150 	case SCHED_TUNABLESCALING_NONE:
151 		factor = 1;
152 		break;
153 	case SCHED_TUNABLESCALING_LINEAR:
154 		factor = cpus;
155 		break;
156 	case SCHED_TUNABLESCALING_LOG:
157 	default:
158 		factor = 1 + ilog2(cpus);
159 		break;
160 	}
161 
162 	return factor;
163 }
164 
165 static void update_sysctl(void)
166 {
167 	unsigned int factor = get_update_sysctl_factor();
168 
169 #define SET_SYSCTL(name) \
170 	(sysctl_##name = (factor) * normalized_sysctl_##name)
171 	SET_SYSCTL(sched_min_granularity);
172 	SET_SYSCTL(sched_latency);
173 	SET_SYSCTL(sched_wakeup_granularity);
174 #undef SET_SYSCTL
175 }
176 
177 void sched_init_granularity(void)
178 {
179 	update_sysctl();
180 }
181 
182 #define WMULT_CONST	(~0U)
183 #define WMULT_SHIFT	32
184 
185 static void __update_inv_weight(struct load_weight *lw)
186 {
187 	unsigned long w;
188 
189 	if (likely(lw->inv_weight))
190 		return;
191 
192 	w = scale_load_down(lw->weight);
193 
194 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
195 		lw->inv_weight = 1;
196 	else if (unlikely(!w))
197 		lw->inv_weight = WMULT_CONST;
198 	else
199 		lw->inv_weight = WMULT_CONST / w;
200 }
201 
202 /*
203  * delta_exec * weight / lw.weight
204  *   OR
205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
206  *
207  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
210  *
211  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
213  */
214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
215 {
216 	u64 fact = scale_load_down(weight);
217 	int shift = WMULT_SHIFT;
218 
219 	__update_inv_weight(lw);
220 
221 	if (unlikely(fact >> 32)) {
222 		while (fact >> 32) {
223 			fact >>= 1;
224 			shift--;
225 		}
226 	}
227 
228 	/* hint to use a 32x32->64 mul */
229 	fact = (u64)(u32)fact * lw->inv_weight;
230 
231 	while (fact >> 32) {
232 		fact >>= 1;
233 		shift--;
234 	}
235 
236 	return mul_u64_u32_shr(delta_exec, fact, shift);
237 }
238 
239 
240 const struct sched_class fair_sched_class;
241 
242 /**************************************************************
243  * CFS operations on generic schedulable entities:
244  */
245 
246 #ifdef CONFIG_FAIR_GROUP_SCHED
247 
248 /* cpu runqueue to which this cfs_rq is attached */
249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
250 {
251 	return cfs_rq->rq;
252 }
253 
254 /* An entity is a task if it doesn't "own" a runqueue */
255 #define entity_is_task(se)	(!se->my_q)
256 
257 static inline struct task_struct *task_of(struct sched_entity *se)
258 {
259 #ifdef CONFIG_SCHED_DEBUG
260 	WARN_ON_ONCE(!entity_is_task(se));
261 #endif
262 	return container_of(se, struct task_struct, se);
263 }
264 
265 /* Walk up scheduling entities hierarchy */
266 #define for_each_sched_entity(se) \
267 		for (; se; se = se->parent)
268 
269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
270 {
271 	return p->se.cfs_rq;
272 }
273 
274 /* runqueue on which this entity is (to be) queued */
275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
276 {
277 	return se->cfs_rq;
278 }
279 
280 /* runqueue "owned" by this group */
281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 {
283 	return grp->my_q;
284 }
285 
286 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
287 				       int force_update);
288 
289 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290 {
291 	if (!cfs_rq->on_list) {
292 		/*
293 		 * Ensure we either appear before our parent (if already
294 		 * enqueued) or force our parent to appear after us when it is
295 		 * enqueued.  The fact that we always enqueue bottom-up
296 		 * reduces this to two cases.
297 		 */
298 		if (cfs_rq->tg->parent &&
299 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
300 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
301 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
302 		} else {
303 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
304 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
305 		}
306 
307 		cfs_rq->on_list = 1;
308 		/* We should have no load, but we need to update last_decay. */
309 		update_cfs_rq_blocked_load(cfs_rq, 0);
310 	}
311 }
312 
313 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
314 {
315 	if (cfs_rq->on_list) {
316 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
317 		cfs_rq->on_list = 0;
318 	}
319 }
320 
321 /* Iterate thr' all leaf cfs_rq's on a runqueue */
322 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
323 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
324 
325 /* Do the two (enqueued) entities belong to the same group ? */
326 static inline struct cfs_rq *
327 is_same_group(struct sched_entity *se, struct sched_entity *pse)
328 {
329 	if (se->cfs_rq == pse->cfs_rq)
330 		return se->cfs_rq;
331 
332 	return NULL;
333 }
334 
335 static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 {
337 	return se->parent;
338 }
339 
340 static void
341 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
342 {
343 	int se_depth, pse_depth;
344 
345 	/*
346 	 * preemption test can be made between sibling entities who are in the
347 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
348 	 * both tasks until we find their ancestors who are siblings of common
349 	 * parent.
350 	 */
351 
352 	/* First walk up until both entities are at same depth */
353 	se_depth = (*se)->depth;
354 	pse_depth = (*pse)->depth;
355 
356 	while (se_depth > pse_depth) {
357 		se_depth--;
358 		*se = parent_entity(*se);
359 	}
360 
361 	while (pse_depth > se_depth) {
362 		pse_depth--;
363 		*pse = parent_entity(*pse);
364 	}
365 
366 	while (!is_same_group(*se, *pse)) {
367 		*se = parent_entity(*se);
368 		*pse = parent_entity(*pse);
369 	}
370 }
371 
372 #else	/* !CONFIG_FAIR_GROUP_SCHED */
373 
374 static inline struct task_struct *task_of(struct sched_entity *se)
375 {
376 	return container_of(se, struct task_struct, se);
377 }
378 
379 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
380 {
381 	return container_of(cfs_rq, struct rq, cfs);
382 }
383 
384 #define entity_is_task(se)	1
385 
386 #define for_each_sched_entity(se) \
387 		for (; se; se = NULL)
388 
389 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
390 {
391 	return &task_rq(p)->cfs;
392 }
393 
394 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
395 {
396 	struct task_struct *p = task_of(se);
397 	struct rq *rq = task_rq(p);
398 
399 	return &rq->cfs;
400 }
401 
402 /* runqueue "owned" by this group */
403 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
404 {
405 	return NULL;
406 }
407 
408 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
409 {
410 }
411 
412 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
413 {
414 }
415 
416 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
417 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
418 
419 static inline struct sched_entity *parent_entity(struct sched_entity *se)
420 {
421 	return NULL;
422 }
423 
424 static inline void
425 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
426 {
427 }
428 
429 #endif	/* CONFIG_FAIR_GROUP_SCHED */
430 
431 static __always_inline
432 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
433 
434 /**************************************************************
435  * Scheduling class tree data structure manipulation methods:
436  */
437 
438 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
439 {
440 	s64 delta = (s64)(vruntime - max_vruntime);
441 	if (delta > 0)
442 		max_vruntime = vruntime;
443 
444 	return max_vruntime;
445 }
446 
447 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
448 {
449 	s64 delta = (s64)(vruntime - min_vruntime);
450 	if (delta < 0)
451 		min_vruntime = vruntime;
452 
453 	return min_vruntime;
454 }
455 
456 static inline int entity_before(struct sched_entity *a,
457 				struct sched_entity *b)
458 {
459 	return (s64)(a->vruntime - b->vruntime) < 0;
460 }
461 
462 static void update_min_vruntime(struct cfs_rq *cfs_rq)
463 {
464 	u64 vruntime = cfs_rq->min_vruntime;
465 
466 	if (cfs_rq->curr)
467 		vruntime = cfs_rq->curr->vruntime;
468 
469 	if (cfs_rq->rb_leftmost) {
470 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
471 						   struct sched_entity,
472 						   run_node);
473 
474 		if (!cfs_rq->curr)
475 			vruntime = se->vruntime;
476 		else
477 			vruntime = min_vruntime(vruntime, se->vruntime);
478 	}
479 
480 	/* ensure we never gain time by being placed backwards. */
481 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
482 #ifndef CONFIG_64BIT
483 	smp_wmb();
484 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
485 #endif
486 }
487 
488 /*
489  * Enqueue an entity into the rb-tree:
490  */
491 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
492 {
493 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
494 	struct rb_node *parent = NULL;
495 	struct sched_entity *entry;
496 	int leftmost = 1;
497 
498 	/*
499 	 * Find the right place in the rbtree:
500 	 */
501 	while (*link) {
502 		parent = *link;
503 		entry = rb_entry(parent, struct sched_entity, run_node);
504 		/*
505 		 * We dont care about collisions. Nodes with
506 		 * the same key stay together.
507 		 */
508 		if (entity_before(se, entry)) {
509 			link = &parent->rb_left;
510 		} else {
511 			link = &parent->rb_right;
512 			leftmost = 0;
513 		}
514 	}
515 
516 	/*
517 	 * Maintain a cache of leftmost tree entries (it is frequently
518 	 * used):
519 	 */
520 	if (leftmost)
521 		cfs_rq->rb_leftmost = &se->run_node;
522 
523 	rb_link_node(&se->run_node, parent, link);
524 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
525 }
526 
527 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
528 {
529 	if (cfs_rq->rb_leftmost == &se->run_node) {
530 		struct rb_node *next_node;
531 
532 		next_node = rb_next(&se->run_node);
533 		cfs_rq->rb_leftmost = next_node;
534 	}
535 
536 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
537 }
538 
539 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
540 {
541 	struct rb_node *left = cfs_rq->rb_leftmost;
542 
543 	if (!left)
544 		return NULL;
545 
546 	return rb_entry(left, struct sched_entity, run_node);
547 }
548 
549 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
550 {
551 	struct rb_node *next = rb_next(&se->run_node);
552 
553 	if (!next)
554 		return NULL;
555 
556 	return rb_entry(next, struct sched_entity, run_node);
557 }
558 
559 #ifdef CONFIG_SCHED_DEBUG
560 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
561 {
562 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
563 
564 	if (!last)
565 		return NULL;
566 
567 	return rb_entry(last, struct sched_entity, run_node);
568 }
569 
570 /**************************************************************
571  * Scheduling class statistics methods:
572  */
573 
574 int sched_proc_update_handler(struct ctl_table *table, int write,
575 		void __user *buffer, size_t *lenp,
576 		loff_t *ppos)
577 {
578 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
579 	unsigned int factor = get_update_sysctl_factor();
580 
581 	if (ret || !write)
582 		return ret;
583 
584 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
585 					sysctl_sched_min_granularity);
586 
587 #define WRT_SYSCTL(name) \
588 	(normalized_sysctl_##name = sysctl_##name / (factor))
589 	WRT_SYSCTL(sched_min_granularity);
590 	WRT_SYSCTL(sched_latency);
591 	WRT_SYSCTL(sched_wakeup_granularity);
592 #undef WRT_SYSCTL
593 
594 	return 0;
595 }
596 #endif
597 
598 /*
599  * delta /= w
600  */
601 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
602 {
603 	if (unlikely(se->load.weight != NICE_0_LOAD))
604 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
605 
606 	return delta;
607 }
608 
609 /*
610  * The idea is to set a period in which each task runs once.
611  *
612  * When there are too many tasks (sched_nr_latency) we have to stretch
613  * this period because otherwise the slices get too small.
614  *
615  * p = (nr <= nl) ? l : l*nr/nl
616  */
617 static u64 __sched_period(unsigned long nr_running)
618 {
619 	u64 period = sysctl_sched_latency;
620 	unsigned long nr_latency = sched_nr_latency;
621 
622 	if (unlikely(nr_running > nr_latency)) {
623 		period = sysctl_sched_min_granularity;
624 		period *= nr_running;
625 	}
626 
627 	return period;
628 }
629 
630 /*
631  * We calculate the wall-time slice from the period by taking a part
632  * proportional to the weight.
633  *
634  * s = p*P[w/rw]
635  */
636 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
637 {
638 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
639 
640 	for_each_sched_entity(se) {
641 		struct load_weight *load;
642 		struct load_weight lw;
643 
644 		cfs_rq = cfs_rq_of(se);
645 		load = &cfs_rq->load;
646 
647 		if (unlikely(!se->on_rq)) {
648 			lw = cfs_rq->load;
649 
650 			update_load_add(&lw, se->load.weight);
651 			load = &lw;
652 		}
653 		slice = __calc_delta(slice, se->load.weight, load);
654 	}
655 	return slice;
656 }
657 
658 /*
659  * We calculate the vruntime slice of a to-be-inserted task.
660  *
661  * vs = s/w
662  */
663 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
664 {
665 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
666 }
667 
668 #ifdef CONFIG_SMP
669 static int select_idle_sibling(struct task_struct *p, int cpu);
670 static unsigned long task_h_load(struct task_struct *p);
671 
672 static inline void __update_task_entity_contrib(struct sched_entity *se);
673 static inline void __update_task_entity_utilization(struct sched_entity *se);
674 
675 /* Give new task start runnable values to heavy its load in infant time */
676 void init_task_runnable_average(struct task_struct *p)
677 {
678 	u32 slice;
679 
680 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
681 	p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
682 	p->se.avg.avg_period = slice;
683 	__update_task_entity_contrib(&p->se);
684 	__update_task_entity_utilization(&p->se);
685 }
686 #else
687 void init_task_runnable_average(struct task_struct *p)
688 {
689 }
690 #endif
691 
692 /*
693  * Update the current task's runtime statistics.
694  */
695 static void update_curr(struct cfs_rq *cfs_rq)
696 {
697 	struct sched_entity *curr = cfs_rq->curr;
698 	u64 now = rq_clock_task(rq_of(cfs_rq));
699 	u64 delta_exec;
700 
701 	if (unlikely(!curr))
702 		return;
703 
704 	delta_exec = now - curr->exec_start;
705 	if (unlikely((s64)delta_exec <= 0))
706 		return;
707 
708 	curr->exec_start = now;
709 
710 	schedstat_set(curr->statistics.exec_max,
711 		      max(delta_exec, curr->statistics.exec_max));
712 
713 	curr->sum_exec_runtime += delta_exec;
714 	schedstat_add(cfs_rq, exec_clock, delta_exec);
715 
716 	curr->vruntime += calc_delta_fair(delta_exec, curr);
717 	update_min_vruntime(cfs_rq);
718 
719 	if (entity_is_task(curr)) {
720 		struct task_struct *curtask = task_of(curr);
721 
722 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
723 		cpuacct_charge(curtask, delta_exec);
724 		account_group_exec_runtime(curtask, delta_exec);
725 	}
726 
727 	account_cfs_rq_runtime(cfs_rq, delta_exec);
728 }
729 
730 static void update_curr_fair(struct rq *rq)
731 {
732 	update_curr(cfs_rq_of(&rq->curr->se));
733 }
734 
735 static inline void
736 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
737 {
738 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
739 }
740 
741 /*
742  * Task is being enqueued - update stats:
743  */
744 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
745 {
746 	/*
747 	 * Are we enqueueing a waiting task? (for current tasks
748 	 * a dequeue/enqueue event is a NOP)
749 	 */
750 	if (se != cfs_rq->curr)
751 		update_stats_wait_start(cfs_rq, se);
752 }
753 
754 static void
755 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
756 {
757 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
758 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
759 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
760 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
761 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
762 #ifdef CONFIG_SCHEDSTATS
763 	if (entity_is_task(se)) {
764 		trace_sched_stat_wait(task_of(se),
765 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
766 	}
767 #endif
768 	schedstat_set(se->statistics.wait_start, 0);
769 }
770 
771 static inline void
772 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
773 {
774 	/*
775 	 * Mark the end of the wait period if dequeueing a
776 	 * waiting task:
777 	 */
778 	if (se != cfs_rq->curr)
779 		update_stats_wait_end(cfs_rq, se);
780 }
781 
782 /*
783  * We are picking a new current task - update its stats:
784  */
785 static inline void
786 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
787 {
788 	/*
789 	 * We are starting a new run period:
790 	 */
791 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
792 }
793 
794 /**************************************************
795  * Scheduling class queueing methods:
796  */
797 
798 #ifdef CONFIG_NUMA_BALANCING
799 /*
800  * Approximate time to scan a full NUMA task in ms. The task scan period is
801  * calculated based on the tasks virtual memory size and
802  * numa_balancing_scan_size.
803  */
804 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
805 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
806 
807 /* Portion of address space to scan in MB */
808 unsigned int sysctl_numa_balancing_scan_size = 256;
809 
810 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
811 unsigned int sysctl_numa_balancing_scan_delay = 1000;
812 
813 static unsigned int task_nr_scan_windows(struct task_struct *p)
814 {
815 	unsigned long rss = 0;
816 	unsigned long nr_scan_pages;
817 
818 	/*
819 	 * Calculations based on RSS as non-present and empty pages are skipped
820 	 * by the PTE scanner and NUMA hinting faults should be trapped based
821 	 * on resident pages
822 	 */
823 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
824 	rss = get_mm_rss(p->mm);
825 	if (!rss)
826 		rss = nr_scan_pages;
827 
828 	rss = round_up(rss, nr_scan_pages);
829 	return rss / nr_scan_pages;
830 }
831 
832 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
833 #define MAX_SCAN_WINDOW 2560
834 
835 static unsigned int task_scan_min(struct task_struct *p)
836 {
837 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
838 	unsigned int scan, floor;
839 	unsigned int windows = 1;
840 
841 	if (scan_size < MAX_SCAN_WINDOW)
842 		windows = MAX_SCAN_WINDOW / scan_size;
843 	floor = 1000 / windows;
844 
845 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
846 	return max_t(unsigned int, floor, scan);
847 }
848 
849 static unsigned int task_scan_max(struct task_struct *p)
850 {
851 	unsigned int smin = task_scan_min(p);
852 	unsigned int smax;
853 
854 	/* Watch for min being lower than max due to floor calculations */
855 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
856 	return max(smin, smax);
857 }
858 
859 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
860 {
861 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
862 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
863 }
864 
865 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
866 {
867 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
868 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
869 }
870 
871 struct numa_group {
872 	atomic_t refcount;
873 
874 	spinlock_t lock; /* nr_tasks, tasks */
875 	int nr_tasks;
876 	pid_t gid;
877 
878 	struct rcu_head rcu;
879 	nodemask_t active_nodes;
880 	unsigned long total_faults;
881 	/*
882 	 * Faults_cpu is used to decide whether memory should move
883 	 * towards the CPU. As a consequence, these stats are weighted
884 	 * more by CPU use than by memory faults.
885 	 */
886 	unsigned long *faults_cpu;
887 	unsigned long faults[0];
888 };
889 
890 /* Shared or private faults. */
891 #define NR_NUMA_HINT_FAULT_TYPES 2
892 
893 /* Memory and CPU locality */
894 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
895 
896 /* Averaged statistics, and temporary buffers. */
897 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
898 
899 pid_t task_numa_group_id(struct task_struct *p)
900 {
901 	return p->numa_group ? p->numa_group->gid : 0;
902 }
903 
904 /*
905  * The averaged statistics, shared & private, memory & cpu,
906  * occupy the first half of the array. The second half of the
907  * array is for current counters, which are averaged into the
908  * first set by task_numa_placement.
909  */
910 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
911 {
912 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
913 }
914 
915 static inline unsigned long task_faults(struct task_struct *p, int nid)
916 {
917 	if (!p->numa_faults)
918 		return 0;
919 
920 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
921 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
922 }
923 
924 static inline unsigned long group_faults(struct task_struct *p, int nid)
925 {
926 	if (!p->numa_group)
927 		return 0;
928 
929 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
930 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
931 }
932 
933 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
934 {
935 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
936 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
937 }
938 
939 /* Handle placement on systems where not all nodes are directly connected. */
940 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
941 					int maxdist, bool task)
942 {
943 	unsigned long score = 0;
944 	int node;
945 
946 	/*
947 	 * All nodes are directly connected, and the same distance
948 	 * from each other. No need for fancy placement algorithms.
949 	 */
950 	if (sched_numa_topology_type == NUMA_DIRECT)
951 		return 0;
952 
953 	/*
954 	 * This code is called for each node, introducing N^2 complexity,
955 	 * which should be ok given the number of nodes rarely exceeds 8.
956 	 */
957 	for_each_online_node(node) {
958 		unsigned long faults;
959 		int dist = node_distance(nid, node);
960 
961 		/*
962 		 * The furthest away nodes in the system are not interesting
963 		 * for placement; nid was already counted.
964 		 */
965 		if (dist == sched_max_numa_distance || node == nid)
966 			continue;
967 
968 		/*
969 		 * On systems with a backplane NUMA topology, compare groups
970 		 * of nodes, and move tasks towards the group with the most
971 		 * memory accesses. When comparing two nodes at distance
972 		 * "hoplimit", only nodes closer by than "hoplimit" are part
973 		 * of each group. Skip other nodes.
974 		 */
975 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
976 					dist > maxdist)
977 			continue;
978 
979 		/* Add up the faults from nearby nodes. */
980 		if (task)
981 			faults = task_faults(p, node);
982 		else
983 			faults = group_faults(p, node);
984 
985 		/*
986 		 * On systems with a glueless mesh NUMA topology, there are
987 		 * no fixed "groups of nodes". Instead, nodes that are not
988 		 * directly connected bounce traffic through intermediate
989 		 * nodes; a numa_group can occupy any set of nodes.
990 		 * The further away a node is, the less the faults count.
991 		 * This seems to result in good task placement.
992 		 */
993 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
994 			faults *= (sched_max_numa_distance - dist);
995 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
996 		}
997 
998 		score += faults;
999 	}
1000 
1001 	return score;
1002 }
1003 
1004 /*
1005  * These return the fraction of accesses done by a particular task, or
1006  * task group, on a particular numa node.  The group weight is given a
1007  * larger multiplier, in order to group tasks together that are almost
1008  * evenly spread out between numa nodes.
1009  */
1010 static inline unsigned long task_weight(struct task_struct *p, int nid,
1011 					int dist)
1012 {
1013 	unsigned long faults, total_faults;
1014 
1015 	if (!p->numa_faults)
1016 		return 0;
1017 
1018 	total_faults = p->total_numa_faults;
1019 
1020 	if (!total_faults)
1021 		return 0;
1022 
1023 	faults = task_faults(p, nid);
1024 	faults += score_nearby_nodes(p, nid, dist, true);
1025 
1026 	return 1000 * faults / total_faults;
1027 }
1028 
1029 static inline unsigned long group_weight(struct task_struct *p, int nid,
1030 					 int dist)
1031 {
1032 	unsigned long faults, total_faults;
1033 
1034 	if (!p->numa_group)
1035 		return 0;
1036 
1037 	total_faults = p->numa_group->total_faults;
1038 
1039 	if (!total_faults)
1040 		return 0;
1041 
1042 	faults = group_faults(p, nid);
1043 	faults += score_nearby_nodes(p, nid, dist, false);
1044 
1045 	return 1000 * faults / total_faults;
1046 }
1047 
1048 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1049 				int src_nid, int dst_cpu)
1050 {
1051 	struct numa_group *ng = p->numa_group;
1052 	int dst_nid = cpu_to_node(dst_cpu);
1053 	int last_cpupid, this_cpupid;
1054 
1055 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1056 
1057 	/*
1058 	 * Multi-stage node selection is used in conjunction with a periodic
1059 	 * migration fault to build a temporal task<->page relation. By using
1060 	 * a two-stage filter we remove short/unlikely relations.
1061 	 *
1062 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1063 	 * a task's usage of a particular page (n_p) per total usage of this
1064 	 * page (n_t) (in a given time-span) to a probability.
1065 	 *
1066 	 * Our periodic faults will sample this probability and getting the
1067 	 * same result twice in a row, given these samples are fully
1068 	 * independent, is then given by P(n)^2, provided our sample period
1069 	 * is sufficiently short compared to the usage pattern.
1070 	 *
1071 	 * This quadric squishes small probabilities, making it less likely we
1072 	 * act on an unlikely task<->page relation.
1073 	 */
1074 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1075 	if (!cpupid_pid_unset(last_cpupid) &&
1076 				cpupid_to_nid(last_cpupid) != dst_nid)
1077 		return false;
1078 
1079 	/* Always allow migrate on private faults */
1080 	if (cpupid_match_pid(p, last_cpupid))
1081 		return true;
1082 
1083 	/* A shared fault, but p->numa_group has not been set up yet. */
1084 	if (!ng)
1085 		return true;
1086 
1087 	/*
1088 	 * Do not migrate if the destination is not a node that
1089 	 * is actively used by this numa group.
1090 	 */
1091 	if (!node_isset(dst_nid, ng->active_nodes))
1092 		return false;
1093 
1094 	/*
1095 	 * Source is a node that is not actively used by this
1096 	 * numa group, while the destination is. Migrate.
1097 	 */
1098 	if (!node_isset(src_nid, ng->active_nodes))
1099 		return true;
1100 
1101 	/*
1102 	 * Both source and destination are nodes in active
1103 	 * use by this numa group. Maximize memory bandwidth
1104 	 * by migrating from more heavily used groups, to less
1105 	 * heavily used ones, spreading the load around.
1106 	 * Use a 1/4 hysteresis to avoid spurious page movement.
1107 	 */
1108 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1109 }
1110 
1111 static unsigned long weighted_cpuload(const int cpu);
1112 static unsigned long source_load(int cpu, int type);
1113 static unsigned long target_load(int cpu, int type);
1114 static unsigned long capacity_of(int cpu);
1115 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1116 
1117 /* Cached statistics for all CPUs within a node */
1118 struct numa_stats {
1119 	unsigned long nr_running;
1120 	unsigned long load;
1121 
1122 	/* Total compute capacity of CPUs on a node */
1123 	unsigned long compute_capacity;
1124 
1125 	/* Approximate capacity in terms of runnable tasks on a node */
1126 	unsigned long task_capacity;
1127 	int has_free_capacity;
1128 };
1129 
1130 /*
1131  * XXX borrowed from update_sg_lb_stats
1132  */
1133 static void update_numa_stats(struct numa_stats *ns, int nid)
1134 {
1135 	int smt, cpu, cpus = 0;
1136 	unsigned long capacity;
1137 
1138 	memset(ns, 0, sizeof(*ns));
1139 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1140 		struct rq *rq = cpu_rq(cpu);
1141 
1142 		ns->nr_running += rq->nr_running;
1143 		ns->load += weighted_cpuload(cpu);
1144 		ns->compute_capacity += capacity_of(cpu);
1145 
1146 		cpus++;
1147 	}
1148 
1149 	/*
1150 	 * If we raced with hotplug and there are no CPUs left in our mask
1151 	 * the @ns structure is NULL'ed and task_numa_compare() will
1152 	 * not find this node attractive.
1153 	 *
1154 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1155 	 * imbalance and bail there.
1156 	 */
1157 	if (!cpus)
1158 		return;
1159 
1160 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1161 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1162 	capacity = cpus / smt; /* cores */
1163 
1164 	ns->task_capacity = min_t(unsigned, capacity,
1165 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1166 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1167 }
1168 
1169 struct task_numa_env {
1170 	struct task_struct *p;
1171 
1172 	int src_cpu, src_nid;
1173 	int dst_cpu, dst_nid;
1174 
1175 	struct numa_stats src_stats, dst_stats;
1176 
1177 	int imbalance_pct;
1178 	int dist;
1179 
1180 	struct task_struct *best_task;
1181 	long best_imp;
1182 	int best_cpu;
1183 };
1184 
1185 static void task_numa_assign(struct task_numa_env *env,
1186 			     struct task_struct *p, long imp)
1187 {
1188 	if (env->best_task)
1189 		put_task_struct(env->best_task);
1190 	if (p)
1191 		get_task_struct(p);
1192 
1193 	env->best_task = p;
1194 	env->best_imp = imp;
1195 	env->best_cpu = env->dst_cpu;
1196 }
1197 
1198 static bool load_too_imbalanced(long src_load, long dst_load,
1199 				struct task_numa_env *env)
1200 {
1201 	long imb, old_imb;
1202 	long orig_src_load, orig_dst_load;
1203 	long src_capacity, dst_capacity;
1204 
1205 	/*
1206 	 * The load is corrected for the CPU capacity available on each node.
1207 	 *
1208 	 * src_load        dst_load
1209 	 * ------------ vs ---------
1210 	 * src_capacity    dst_capacity
1211 	 */
1212 	src_capacity = env->src_stats.compute_capacity;
1213 	dst_capacity = env->dst_stats.compute_capacity;
1214 
1215 	/* We care about the slope of the imbalance, not the direction. */
1216 	if (dst_load < src_load)
1217 		swap(dst_load, src_load);
1218 
1219 	/* Is the difference below the threshold? */
1220 	imb = dst_load * src_capacity * 100 -
1221 	      src_load * dst_capacity * env->imbalance_pct;
1222 	if (imb <= 0)
1223 		return false;
1224 
1225 	/*
1226 	 * The imbalance is above the allowed threshold.
1227 	 * Compare it with the old imbalance.
1228 	 */
1229 	orig_src_load = env->src_stats.load;
1230 	orig_dst_load = env->dst_stats.load;
1231 
1232 	if (orig_dst_load < orig_src_load)
1233 		swap(orig_dst_load, orig_src_load);
1234 
1235 	old_imb = orig_dst_load * src_capacity * 100 -
1236 		  orig_src_load * dst_capacity * env->imbalance_pct;
1237 
1238 	/* Would this change make things worse? */
1239 	return (imb > old_imb);
1240 }
1241 
1242 /*
1243  * This checks if the overall compute and NUMA accesses of the system would
1244  * be improved if the source tasks was migrated to the target dst_cpu taking
1245  * into account that it might be best if task running on the dst_cpu should
1246  * be exchanged with the source task
1247  */
1248 static void task_numa_compare(struct task_numa_env *env,
1249 			      long taskimp, long groupimp)
1250 {
1251 	struct rq *src_rq = cpu_rq(env->src_cpu);
1252 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1253 	struct task_struct *cur;
1254 	long src_load, dst_load;
1255 	long load;
1256 	long imp = env->p->numa_group ? groupimp : taskimp;
1257 	long moveimp = imp;
1258 	int dist = env->dist;
1259 
1260 	rcu_read_lock();
1261 
1262 	raw_spin_lock_irq(&dst_rq->lock);
1263 	cur = dst_rq->curr;
1264 	/*
1265 	 * No need to move the exiting task, and this ensures that ->curr
1266 	 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1267 	 * is safe under RCU read lock.
1268 	 * Note that rcu_read_lock() itself can't protect from the final
1269 	 * put_task_struct() after the last schedule().
1270 	 */
1271 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1272 		cur = NULL;
1273 	raw_spin_unlock_irq(&dst_rq->lock);
1274 
1275 	/*
1276 	 * Because we have preemption enabled we can get migrated around and
1277 	 * end try selecting ourselves (current == env->p) as a swap candidate.
1278 	 */
1279 	if (cur == env->p)
1280 		goto unlock;
1281 
1282 	/*
1283 	 * "imp" is the fault differential for the source task between the
1284 	 * source and destination node. Calculate the total differential for
1285 	 * the source task and potential destination task. The more negative
1286 	 * the value is, the more rmeote accesses that would be expected to
1287 	 * be incurred if the tasks were swapped.
1288 	 */
1289 	if (cur) {
1290 		/* Skip this swap candidate if cannot move to the source cpu */
1291 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1292 			goto unlock;
1293 
1294 		/*
1295 		 * If dst and source tasks are in the same NUMA group, or not
1296 		 * in any group then look only at task weights.
1297 		 */
1298 		if (cur->numa_group == env->p->numa_group) {
1299 			imp = taskimp + task_weight(cur, env->src_nid, dist) -
1300 			      task_weight(cur, env->dst_nid, dist);
1301 			/*
1302 			 * Add some hysteresis to prevent swapping the
1303 			 * tasks within a group over tiny differences.
1304 			 */
1305 			if (cur->numa_group)
1306 				imp -= imp/16;
1307 		} else {
1308 			/*
1309 			 * Compare the group weights. If a task is all by
1310 			 * itself (not part of a group), use the task weight
1311 			 * instead.
1312 			 */
1313 			if (cur->numa_group)
1314 				imp += group_weight(cur, env->src_nid, dist) -
1315 				       group_weight(cur, env->dst_nid, dist);
1316 			else
1317 				imp += task_weight(cur, env->src_nid, dist) -
1318 				       task_weight(cur, env->dst_nid, dist);
1319 		}
1320 	}
1321 
1322 	if (imp <= env->best_imp && moveimp <= env->best_imp)
1323 		goto unlock;
1324 
1325 	if (!cur) {
1326 		/* Is there capacity at our destination? */
1327 		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1328 		    !env->dst_stats.has_free_capacity)
1329 			goto unlock;
1330 
1331 		goto balance;
1332 	}
1333 
1334 	/* Balance doesn't matter much if we're running a task per cpu */
1335 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
1336 			dst_rq->nr_running == 1)
1337 		goto assign;
1338 
1339 	/*
1340 	 * In the overloaded case, try and keep the load balanced.
1341 	 */
1342 balance:
1343 	load = task_h_load(env->p);
1344 	dst_load = env->dst_stats.load + load;
1345 	src_load = env->src_stats.load - load;
1346 
1347 	if (moveimp > imp && moveimp > env->best_imp) {
1348 		/*
1349 		 * If the improvement from just moving env->p direction is
1350 		 * better than swapping tasks around, check if a move is
1351 		 * possible. Store a slightly smaller score than moveimp,
1352 		 * so an actually idle CPU will win.
1353 		 */
1354 		if (!load_too_imbalanced(src_load, dst_load, env)) {
1355 			imp = moveimp - 1;
1356 			cur = NULL;
1357 			goto assign;
1358 		}
1359 	}
1360 
1361 	if (imp <= env->best_imp)
1362 		goto unlock;
1363 
1364 	if (cur) {
1365 		load = task_h_load(cur);
1366 		dst_load -= load;
1367 		src_load += load;
1368 	}
1369 
1370 	if (load_too_imbalanced(src_load, dst_load, env))
1371 		goto unlock;
1372 
1373 	/*
1374 	 * One idle CPU per node is evaluated for a task numa move.
1375 	 * Call select_idle_sibling to maybe find a better one.
1376 	 */
1377 	if (!cur)
1378 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1379 
1380 assign:
1381 	task_numa_assign(env, cur, imp);
1382 unlock:
1383 	rcu_read_unlock();
1384 }
1385 
1386 static void task_numa_find_cpu(struct task_numa_env *env,
1387 				long taskimp, long groupimp)
1388 {
1389 	int cpu;
1390 
1391 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1392 		/* Skip this CPU if the source task cannot migrate */
1393 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1394 			continue;
1395 
1396 		env->dst_cpu = cpu;
1397 		task_numa_compare(env, taskimp, groupimp);
1398 	}
1399 }
1400 
1401 /* Only move tasks to a NUMA node less busy than the current node. */
1402 static bool numa_has_capacity(struct task_numa_env *env)
1403 {
1404 	struct numa_stats *src = &env->src_stats;
1405 	struct numa_stats *dst = &env->dst_stats;
1406 
1407 	if (src->has_free_capacity && !dst->has_free_capacity)
1408 		return false;
1409 
1410 	/*
1411 	 * Only consider a task move if the source has a higher load
1412 	 * than the destination, corrected for CPU capacity on each node.
1413 	 *
1414 	 *      src->load                dst->load
1415 	 * --------------------- vs ---------------------
1416 	 * src->compute_capacity    dst->compute_capacity
1417 	 */
1418 	if (src->load * dst->compute_capacity >
1419 	    dst->load * src->compute_capacity)
1420 		return true;
1421 
1422 	return false;
1423 }
1424 
1425 static int task_numa_migrate(struct task_struct *p)
1426 {
1427 	struct task_numa_env env = {
1428 		.p = p,
1429 
1430 		.src_cpu = task_cpu(p),
1431 		.src_nid = task_node(p),
1432 
1433 		.imbalance_pct = 112,
1434 
1435 		.best_task = NULL,
1436 		.best_imp = 0,
1437 		.best_cpu = -1
1438 	};
1439 	struct sched_domain *sd;
1440 	unsigned long taskweight, groupweight;
1441 	int nid, ret, dist;
1442 	long taskimp, groupimp;
1443 
1444 	/*
1445 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1446 	 * imbalance and would be the first to start moving tasks about.
1447 	 *
1448 	 * And we want to avoid any moving of tasks about, as that would create
1449 	 * random movement of tasks -- counter the numa conditions we're trying
1450 	 * to satisfy here.
1451 	 */
1452 	rcu_read_lock();
1453 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1454 	if (sd)
1455 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1456 	rcu_read_unlock();
1457 
1458 	/*
1459 	 * Cpusets can break the scheduler domain tree into smaller
1460 	 * balance domains, some of which do not cross NUMA boundaries.
1461 	 * Tasks that are "trapped" in such domains cannot be migrated
1462 	 * elsewhere, so there is no point in (re)trying.
1463 	 */
1464 	if (unlikely(!sd)) {
1465 		p->numa_preferred_nid = task_node(p);
1466 		return -EINVAL;
1467 	}
1468 
1469 	env.dst_nid = p->numa_preferred_nid;
1470 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1471 	taskweight = task_weight(p, env.src_nid, dist);
1472 	groupweight = group_weight(p, env.src_nid, dist);
1473 	update_numa_stats(&env.src_stats, env.src_nid);
1474 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1475 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1476 	update_numa_stats(&env.dst_stats, env.dst_nid);
1477 
1478 	/* Try to find a spot on the preferred nid. */
1479 	if (numa_has_capacity(&env))
1480 		task_numa_find_cpu(&env, taskimp, groupimp);
1481 
1482 	/*
1483 	 * Look at other nodes in these cases:
1484 	 * - there is no space available on the preferred_nid
1485 	 * - the task is part of a numa_group that is interleaved across
1486 	 *   multiple NUMA nodes; in order to better consolidate the group,
1487 	 *   we need to check other locations.
1488 	 */
1489 	if (env.best_cpu == -1 || (p->numa_group &&
1490 			nodes_weight(p->numa_group->active_nodes) > 1)) {
1491 		for_each_online_node(nid) {
1492 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1493 				continue;
1494 
1495 			dist = node_distance(env.src_nid, env.dst_nid);
1496 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1497 						dist != env.dist) {
1498 				taskweight = task_weight(p, env.src_nid, dist);
1499 				groupweight = group_weight(p, env.src_nid, dist);
1500 			}
1501 
1502 			/* Only consider nodes where both task and groups benefit */
1503 			taskimp = task_weight(p, nid, dist) - taskweight;
1504 			groupimp = group_weight(p, nid, dist) - groupweight;
1505 			if (taskimp < 0 && groupimp < 0)
1506 				continue;
1507 
1508 			env.dist = dist;
1509 			env.dst_nid = nid;
1510 			update_numa_stats(&env.dst_stats, env.dst_nid);
1511 			if (numa_has_capacity(&env))
1512 				task_numa_find_cpu(&env, taskimp, groupimp);
1513 		}
1514 	}
1515 
1516 	/*
1517 	 * If the task is part of a workload that spans multiple NUMA nodes,
1518 	 * and is migrating into one of the workload's active nodes, remember
1519 	 * this node as the task's preferred numa node, so the workload can
1520 	 * settle down.
1521 	 * A task that migrated to a second choice node will be better off
1522 	 * trying for a better one later. Do not set the preferred node here.
1523 	 */
1524 	if (p->numa_group) {
1525 		if (env.best_cpu == -1)
1526 			nid = env.src_nid;
1527 		else
1528 			nid = env.dst_nid;
1529 
1530 		if (node_isset(nid, p->numa_group->active_nodes))
1531 			sched_setnuma(p, env.dst_nid);
1532 	}
1533 
1534 	/* No better CPU than the current one was found. */
1535 	if (env.best_cpu == -1)
1536 		return -EAGAIN;
1537 
1538 	/*
1539 	 * Reset the scan period if the task is being rescheduled on an
1540 	 * alternative node to recheck if the tasks is now properly placed.
1541 	 */
1542 	p->numa_scan_period = task_scan_min(p);
1543 
1544 	if (env.best_task == NULL) {
1545 		ret = migrate_task_to(p, env.best_cpu);
1546 		if (ret != 0)
1547 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1548 		return ret;
1549 	}
1550 
1551 	ret = migrate_swap(p, env.best_task);
1552 	if (ret != 0)
1553 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1554 	put_task_struct(env.best_task);
1555 	return ret;
1556 }
1557 
1558 /* Attempt to migrate a task to a CPU on the preferred node. */
1559 static void numa_migrate_preferred(struct task_struct *p)
1560 {
1561 	unsigned long interval = HZ;
1562 
1563 	/* This task has no NUMA fault statistics yet */
1564 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1565 		return;
1566 
1567 	/* Periodically retry migrating the task to the preferred node */
1568 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1569 	p->numa_migrate_retry = jiffies + interval;
1570 
1571 	/* Success if task is already running on preferred CPU */
1572 	if (task_node(p) == p->numa_preferred_nid)
1573 		return;
1574 
1575 	/* Otherwise, try migrate to a CPU on the preferred node */
1576 	task_numa_migrate(p);
1577 }
1578 
1579 /*
1580  * Find the nodes on which the workload is actively running. We do this by
1581  * tracking the nodes from which NUMA hinting faults are triggered. This can
1582  * be different from the set of nodes where the workload's memory is currently
1583  * located.
1584  *
1585  * The bitmask is used to make smarter decisions on when to do NUMA page
1586  * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1587  * are added when they cause over 6/16 of the maximum number of faults, but
1588  * only removed when they drop below 3/16.
1589  */
1590 static void update_numa_active_node_mask(struct numa_group *numa_group)
1591 {
1592 	unsigned long faults, max_faults = 0;
1593 	int nid;
1594 
1595 	for_each_online_node(nid) {
1596 		faults = group_faults_cpu(numa_group, nid);
1597 		if (faults > max_faults)
1598 			max_faults = faults;
1599 	}
1600 
1601 	for_each_online_node(nid) {
1602 		faults = group_faults_cpu(numa_group, nid);
1603 		if (!node_isset(nid, numa_group->active_nodes)) {
1604 			if (faults > max_faults * 6 / 16)
1605 				node_set(nid, numa_group->active_nodes);
1606 		} else if (faults < max_faults * 3 / 16)
1607 			node_clear(nid, numa_group->active_nodes);
1608 	}
1609 }
1610 
1611 /*
1612  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1613  * increments. The more local the fault statistics are, the higher the scan
1614  * period will be for the next scan window. If local/(local+remote) ratio is
1615  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1616  * the scan period will decrease. Aim for 70% local accesses.
1617  */
1618 #define NUMA_PERIOD_SLOTS 10
1619 #define NUMA_PERIOD_THRESHOLD 7
1620 
1621 /*
1622  * Increase the scan period (slow down scanning) if the majority of
1623  * our memory is already on our local node, or if the majority of
1624  * the page accesses are shared with other processes.
1625  * Otherwise, decrease the scan period.
1626  */
1627 static void update_task_scan_period(struct task_struct *p,
1628 			unsigned long shared, unsigned long private)
1629 {
1630 	unsigned int period_slot;
1631 	int ratio;
1632 	int diff;
1633 
1634 	unsigned long remote = p->numa_faults_locality[0];
1635 	unsigned long local = p->numa_faults_locality[1];
1636 
1637 	/*
1638 	 * If there were no record hinting faults then either the task is
1639 	 * completely idle or all activity is areas that are not of interest
1640 	 * to automatic numa balancing. Related to that, if there were failed
1641 	 * migration then it implies we are migrating too quickly or the local
1642 	 * node is overloaded. In either case, scan slower
1643 	 */
1644 	if (local + shared == 0 || p->numa_faults_locality[2]) {
1645 		p->numa_scan_period = min(p->numa_scan_period_max,
1646 			p->numa_scan_period << 1);
1647 
1648 		p->mm->numa_next_scan = jiffies +
1649 			msecs_to_jiffies(p->numa_scan_period);
1650 
1651 		return;
1652 	}
1653 
1654 	/*
1655 	 * Prepare to scale scan period relative to the current period.
1656 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1657 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1658 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1659 	 */
1660 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1661 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1662 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1663 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1664 		if (!slot)
1665 			slot = 1;
1666 		diff = slot * period_slot;
1667 	} else {
1668 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1669 
1670 		/*
1671 		 * Scale scan rate increases based on sharing. There is an
1672 		 * inverse relationship between the degree of sharing and
1673 		 * the adjustment made to the scanning period. Broadly
1674 		 * speaking the intent is that there is little point
1675 		 * scanning faster if shared accesses dominate as it may
1676 		 * simply bounce migrations uselessly
1677 		 */
1678 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1679 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1680 	}
1681 
1682 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1683 			task_scan_min(p), task_scan_max(p));
1684 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1685 }
1686 
1687 /*
1688  * Get the fraction of time the task has been running since the last
1689  * NUMA placement cycle. The scheduler keeps similar statistics, but
1690  * decays those on a 32ms period, which is orders of magnitude off
1691  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1692  * stats only if the task is so new there are no NUMA statistics yet.
1693  */
1694 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1695 {
1696 	u64 runtime, delta, now;
1697 	/* Use the start of this time slice to avoid calculations. */
1698 	now = p->se.exec_start;
1699 	runtime = p->se.sum_exec_runtime;
1700 
1701 	if (p->last_task_numa_placement) {
1702 		delta = runtime - p->last_sum_exec_runtime;
1703 		*period = now - p->last_task_numa_placement;
1704 	} else {
1705 		delta = p->se.avg.runnable_avg_sum;
1706 		*period = p->se.avg.avg_period;
1707 	}
1708 
1709 	p->last_sum_exec_runtime = runtime;
1710 	p->last_task_numa_placement = now;
1711 
1712 	return delta;
1713 }
1714 
1715 /*
1716  * Determine the preferred nid for a task in a numa_group. This needs to
1717  * be done in a way that produces consistent results with group_weight,
1718  * otherwise workloads might not converge.
1719  */
1720 static int preferred_group_nid(struct task_struct *p, int nid)
1721 {
1722 	nodemask_t nodes;
1723 	int dist;
1724 
1725 	/* Direct connections between all NUMA nodes. */
1726 	if (sched_numa_topology_type == NUMA_DIRECT)
1727 		return nid;
1728 
1729 	/*
1730 	 * On a system with glueless mesh NUMA topology, group_weight
1731 	 * scores nodes according to the number of NUMA hinting faults on
1732 	 * both the node itself, and on nearby nodes.
1733 	 */
1734 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1735 		unsigned long score, max_score = 0;
1736 		int node, max_node = nid;
1737 
1738 		dist = sched_max_numa_distance;
1739 
1740 		for_each_online_node(node) {
1741 			score = group_weight(p, node, dist);
1742 			if (score > max_score) {
1743 				max_score = score;
1744 				max_node = node;
1745 			}
1746 		}
1747 		return max_node;
1748 	}
1749 
1750 	/*
1751 	 * Finding the preferred nid in a system with NUMA backplane
1752 	 * interconnect topology is more involved. The goal is to locate
1753 	 * tasks from numa_groups near each other in the system, and
1754 	 * untangle workloads from different sides of the system. This requires
1755 	 * searching down the hierarchy of node groups, recursively searching
1756 	 * inside the highest scoring group of nodes. The nodemask tricks
1757 	 * keep the complexity of the search down.
1758 	 */
1759 	nodes = node_online_map;
1760 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1761 		unsigned long max_faults = 0;
1762 		nodemask_t max_group = NODE_MASK_NONE;
1763 		int a, b;
1764 
1765 		/* Are there nodes at this distance from each other? */
1766 		if (!find_numa_distance(dist))
1767 			continue;
1768 
1769 		for_each_node_mask(a, nodes) {
1770 			unsigned long faults = 0;
1771 			nodemask_t this_group;
1772 			nodes_clear(this_group);
1773 
1774 			/* Sum group's NUMA faults; includes a==b case. */
1775 			for_each_node_mask(b, nodes) {
1776 				if (node_distance(a, b) < dist) {
1777 					faults += group_faults(p, b);
1778 					node_set(b, this_group);
1779 					node_clear(b, nodes);
1780 				}
1781 			}
1782 
1783 			/* Remember the top group. */
1784 			if (faults > max_faults) {
1785 				max_faults = faults;
1786 				max_group = this_group;
1787 				/*
1788 				 * subtle: at the smallest distance there is
1789 				 * just one node left in each "group", the
1790 				 * winner is the preferred nid.
1791 				 */
1792 				nid = a;
1793 			}
1794 		}
1795 		/* Next round, evaluate the nodes within max_group. */
1796 		if (!max_faults)
1797 			break;
1798 		nodes = max_group;
1799 	}
1800 	return nid;
1801 }
1802 
1803 static void task_numa_placement(struct task_struct *p)
1804 {
1805 	int seq, nid, max_nid = -1, max_group_nid = -1;
1806 	unsigned long max_faults = 0, max_group_faults = 0;
1807 	unsigned long fault_types[2] = { 0, 0 };
1808 	unsigned long total_faults;
1809 	u64 runtime, period;
1810 	spinlock_t *group_lock = NULL;
1811 
1812 	/*
1813 	 * The p->mm->numa_scan_seq field gets updated without
1814 	 * exclusive access. Use READ_ONCE() here to ensure
1815 	 * that the field is read in a single access:
1816 	 */
1817 	seq = READ_ONCE(p->mm->numa_scan_seq);
1818 	if (p->numa_scan_seq == seq)
1819 		return;
1820 	p->numa_scan_seq = seq;
1821 	p->numa_scan_period_max = task_scan_max(p);
1822 
1823 	total_faults = p->numa_faults_locality[0] +
1824 		       p->numa_faults_locality[1];
1825 	runtime = numa_get_avg_runtime(p, &period);
1826 
1827 	/* If the task is part of a group prevent parallel updates to group stats */
1828 	if (p->numa_group) {
1829 		group_lock = &p->numa_group->lock;
1830 		spin_lock_irq(group_lock);
1831 	}
1832 
1833 	/* Find the node with the highest number of faults */
1834 	for_each_online_node(nid) {
1835 		/* Keep track of the offsets in numa_faults array */
1836 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1837 		unsigned long faults = 0, group_faults = 0;
1838 		int priv;
1839 
1840 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1841 			long diff, f_diff, f_weight;
1842 
1843 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1844 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1845 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1846 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1847 
1848 			/* Decay existing window, copy faults since last scan */
1849 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1850 			fault_types[priv] += p->numa_faults[membuf_idx];
1851 			p->numa_faults[membuf_idx] = 0;
1852 
1853 			/*
1854 			 * Normalize the faults_from, so all tasks in a group
1855 			 * count according to CPU use, instead of by the raw
1856 			 * number of faults. Tasks with little runtime have
1857 			 * little over-all impact on throughput, and thus their
1858 			 * faults are less important.
1859 			 */
1860 			f_weight = div64_u64(runtime << 16, period + 1);
1861 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1862 				   (total_faults + 1);
1863 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1864 			p->numa_faults[cpubuf_idx] = 0;
1865 
1866 			p->numa_faults[mem_idx] += diff;
1867 			p->numa_faults[cpu_idx] += f_diff;
1868 			faults += p->numa_faults[mem_idx];
1869 			p->total_numa_faults += diff;
1870 			if (p->numa_group) {
1871 				/*
1872 				 * safe because we can only change our own group
1873 				 *
1874 				 * mem_idx represents the offset for a given
1875 				 * nid and priv in a specific region because it
1876 				 * is at the beginning of the numa_faults array.
1877 				 */
1878 				p->numa_group->faults[mem_idx] += diff;
1879 				p->numa_group->faults_cpu[mem_idx] += f_diff;
1880 				p->numa_group->total_faults += diff;
1881 				group_faults += p->numa_group->faults[mem_idx];
1882 			}
1883 		}
1884 
1885 		if (faults > max_faults) {
1886 			max_faults = faults;
1887 			max_nid = nid;
1888 		}
1889 
1890 		if (group_faults > max_group_faults) {
1891 			max_group_faults = group_faults;
1892 			max_group_nid = nid;
1893 		}
1894 	}
1895 
1896 	update_task_scan_period(p, fault_types[0], fault_types[1]);
1897 
1898 	if (p->numa_group) {
1899 		update_numa_active_node_mask(p->numa_group);
1900 		spin_unlock_irq(group_lock);
1901 		max_nid = preferred_group_nid(p, max_group_nid);
1902 	}
1903 
1904 	if (max_faults) {
1905 		/* Set the new preferred node */
1906 		if (max_nid != p->numa_preferred_nid)
1907 			sched_setnuma(p, max_nid);
1908 
1909 		if (task_node(p) != p->numa_preferred_nid)
1910 			numa_migrate_preferred(p);
1911 	}
1912 }
1913 
1914 static inline int get_numa_group(struct numa_group *grp)
1915 {
1916 	return atomic_inc_not_zero(&grp->refcount);
1917 }
1918 
1919 static inline void put_numa_group(struct numa_group *grp)
1920 {
1921 	if (atomic_dec_and_test(&grp->refcount))
1922 		kfree_rcu(grp, rcu);
1923 }
1924 
1925 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1926 			int *priv)
1927 {
1928 	struct numa_group *grp, *my_grp;
1929 	struct task_struct *tsk;
1930 	bool join = false;
1931 	int cpu = cpupid_to_cpu(cpupid);
1932 	int i;
1933 
1934 	if (unlikely(!p->numa_group)) {
1935 		unsigned int size = sizeof(struct numa_group) +
1936 				    4*nr_node_ids*sizeof(unsigned long);
1937 
1938 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1939 		if (!grp)
1940 			return;
1941 
1942 		atomic_set(&grp->refcount, 1);
1943 		spin_lock_init(&grp->lock);
1944 		grp->gid = p->pid;
1945 		/* Second half of the array tracks nids where faults happen */
1946 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1947 						nr_node_ids;
1948 
1949 		node_set(task_node(current), grp->active_nodes);
1950 
1951 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1952 			grp->faults[i] = p->numa_faults[i];
1953 
1954 		grp->total_faults = p->total_numa_faults;
1955 
1956 		grp->nr_tasks++;
1957 		rcu_assign_pointer(p->numa_group, grp);
1958 	}
1959 
1960 	rcu_read_lock();
1961 	tsk = READ_ONCE(cpu_rq(cpu)->curr);
1962 
1963 	if (!cpupid_match_pid(tsk, cpupid))
1964 		goto no_join;
1965 
1966 	grp = rcu_dereference(tsk->numa_group);
1967 	if (!grp)
1968 		goto no_join;
1969 
1970 	my_grp = p->numa_group;
1971 	if (grp == my_grp)
1972 		goto no_join;
1973 
1974 	/*
1975 	 * Only join the other group if its bigger; if we're the bigger group,
1976 	 * the other task will join us.
1977 	 */
1978 	if (my_grp->nr_tasks > grp->nr_tasks)
1979 		goto no_join;
1980 
1981 	/*
1982 	 * Tie-break on the grp address.
1983 	 */
1984 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1985 		goto no_join;
1986 
1987 	/* Always join threads in the same process. */
1988 	if (tsk->mm == current->mm)
1989 		join = true;
1990 
1991 	/* Simple filter to avoid false positives due to PID collisions */
1992 	if (flags & TNF_SHARED)
1993 		join = true;
1994 
1995 	/* Update priv based on whether false sharing was detected */
1996 	*priv = !join;
1997 
1998 	if (join && !get_numa_group(grp))
1999 		goto no_join;
2000 
2001 	rcu_read_unlock();
2002 
2003 	if (!join)
2004 		return;
2005 
2006 	BUG_ON(irqs_disabled());
2007 	double_lock_irq(&my_grp->lock, &grp->lock);
2008 
2009 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2010 		my_grp->faults[i] -= p->numa_faults[i];
2011 		grp->faults[i] += p->numa_faults[i];
2012 	}
2013 	my_grp->total_faults -= p->total_numa_faults;
2014 	grp->total_faults += p->total_numa_faults;
2015 
2016 	my_grp->nr_tasks--;
2017 	grp->nr_tasks++;
2018 
2019 	spin_unlock(&my_grp->lock);
2020 	spin_unlock_irq(&grp->lock);
2021 
2022 	rcu_assign_pointer(p->numa_group, grp);
2023 
2024 	put_numa_group(my_grp);
2025 	return;
2026 
2027 no_join:
2028 	rcu_read_unlock();
2029 	return;
2030 }
2031 
2032 void task_numa_free(struct task_struct *p)
2033 {
2034 	struct numa_group *grp = p->numa_group;
2035 	void *numa_faults = p->numa_faults;
2036 	unsigned long flags;
2037 	int i;
2038 
2039 	if (grp) {
2040 		spin_lock_irqsave(&grp->lock, flags);
2041 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2042 			grp->faults[i] -= p->numa_faults[i];
2043 		grp->total_faults -= p->total_numa_faults;
2044 
2045 		grp->nr_tasks--;
2046 		spin_unlock_irqrestore(&grp->lock, flags);
2047 		RCU_INIT_POINTER(p->numa_group, NULL);
2048 		put_numa_group(grp);
2049 	}
2050 
2051 	p->numa_faults = NULL;
2052 	kfree(numa_faults);
2053 }
2054 
2055 /*
2056  * Got a PROT_NONE fault for a page on @node.
2057  */
2058 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2059 {
2060 	struct task_struct *p = current;
2061 	bool migrated = flags & TNF_MIGRATED;
2062 	int cpu_node = task_node(current);
2063 	int local = !!(flags & TNF_FAULT_LOCAL);
2064 	int priv;
2065 
2066 	if (!numabalancing_enabled)
2067 		return;
2068 
2069 	/* for example, ksmd faulting in a user's mm */
2070 	if (!p->mm)
2071 		return;
2072 
2073 	/* Allocate buffer to track faults on a per-node basis */
2074 	if (unlikely(!p->numa_faults)) {
2075 		int size = sizeof(*p->numa_faults) *
2076 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2077 
2078 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2079 		if (!p->numa_faults)
2080 			return;
2081 
2082 		p->total_numa_faults = 0;
2083 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2084 	}
2085 
2086 	/*
2087 	 * First accesses are treated as private, otherwise consider accesses
2088 	 * to be private if the accessing pid has not changed
2089 	 */
2090 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2091 		priv = 1;
2092 	} else {
2093 		priv = cpupid_match_pid(p, last_cpupid);
2094 		if (!priv && !(flags & TNF_NO_GROUP))
2095 			task_numa_group(p, last_cpupid, flags, &priv);
2096 	}
2097 
2098 	/*
2099 	 * If a workload spans multiple NUMA nodes, a shared fault that
2100 	 * occurs wholly within the set of nodes that the workload is
2101 	 * actively using should be counted as local. This allows the
2102 	 * scan rate to slow down when a workload has settled down.
2103 	 */
2104 	if (!priv && !local && p->numa_group &&
2105 			node_isset(cpu_node, p->numa_group->active_nodes) &&
2106 			node_isset(mem_node, p->numa_group->active_nodes))
2107 		local = 1;
2108 
2109 	task_numa_placement(p);
2110 
2111 	/*
2112 	 * Retry task to preferred node migration periodically, in case it
2113 	 * case it previously failed, or the scheduler moved us.
2114 	 */
2115 	if (time_after(jiffies, p->numa_migrate_retry))
2116 		numa_migrate_preferred(p);
2117 
2118 	if (migrated)
2119 		p->numa_pages_migrated += pages;
2120 	if (flags & TNF_MIGRATE_FAIL)
2121 		p->numa_faults_locality[2] += pages;
2122 
2123 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2124 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2125 	p->numa_faults_locality[local] += pages;
2126 }
2127 
2128 static void reset_ptenuma_scan(struct task_struct *p)
2129 {
2130 	/*
2131 	 * We only did a read acquisition of the mmap sem, so
2132 	 * p->mm->numa_scan_seq is written to without exclusive access
2133 	 * and the update is not guaranteed to be atomic. That's not
2134 	 * much of an issue though, since this is just used for
2135 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2136 	 * expensive, to avoid any form of compiler optimizations:
2137 	 */
2138 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2139 	p->mm->numa_scan_offset = 0;
2140 }
2141 
2142 /*
2143  * The expensive part of numa migration is done from task_work context.
2144  * Triggered from task_tick_numa().
2145  */
2146 void task_numa_work(struct callback_head *work)
2147 {
2148 	unsigned long migrate, next_scan, now = jiffies;
2149 	struct task_struct *p = current;
2150 	struct mm_struct *mm = p->mm;
2151 	struct vm_area_struct *vma;
2152 	unsigned long start, end;
2153 	unsigned long nr_pte_updates = 0;
2154 	long pages;
2155 
2156 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2157 
2158 	work->next = work; /* protect against double add */
2159 	/*
2160 	 * Who cares about NUMA placement when they're dying.
2161 	 *
2162 	 * NOTE: make sure not to dereference p->mm before this check,
2163 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2164 	 * without p->mm even though we still had it when we enqueued this
2165 	 * work.
2166 	 */
2167 	if (p->flags & PF_EXITING)
2168 		return;
2169 
2170 	if (!mm->numa_next_scan) {
2171 		mm->numa_next_scan = now +
2172 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2173 	}
2174 
2175 	/*
2176 	 * Enforce maximal scan/migration frequency..
2177 	 */
2178 	migrate = mm->numa_next_scan;
2179 	if (time_before(now, migrate))
2180 		return;
2181 
2182 	if (p->numa_scan_period == 0) {
2183 		p->numa_scan_period_max = task_scan_max(p);
2184 		p->numa_scan_period = task_scan_min(p);
2185 	}
2186 
2187 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2188 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2189 		return;
2190 
2191 	/*
2192 	 * Delay this task enough that another task of this mm will likely win
2193 	 * the next time around.
2194 	 */
2195 	p->node_stamp += 2 * TICK_NSEC;
2196 
2197 	start = mm->numa_scan_offset;
2198 	pages = sysctl_numa_balancing_scan_size;
2199 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2200 	if (!pages)
2201 		return;
2202 
2203 	down_read(&mm->mmap_sem);
2204 	vma = find_vma(mm, start);
2205 	if (!vma) {
2206 		reset_ptenuma_scan(p);
2207 		start = 0;
2208 		vma = mm->mmap;
2209 	}
2210 	for (; vma; vma = vma->vm_next) {
2211 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2212 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2213 			continue;
2214 		}
2215 
2216 		/*
2217 		 * Shared library pages mapped by multiple processes are not
2218 		 * migrated as it is expected they are cache replicated. Avoid
2219 		 * hinting faults in read-only file-backed mappings or the vdso
2220 		 * as migrating the pages will be of marginal benefit.
2221 		 */
2222 		if (!vma->vm_mm ||
2223 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2224 			continue;
2225 
2226 		/*
2227 		 * Skip inaccessible VMAs to avoid any confusion between
2228 		 * PROT_NONE and NUMA hinting ptes
2229 		 */
2230 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2231 			continue;
2232 
2233 		do {
2234 			start = max(start, vma->vm_start);
2235 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2236 			end = min(end, vma->vm_end);
2237 			nr_pte_updates += change_prot_numa(vma, start, end);
2238 
2239 			/*
2240 			 * Scan sysctl_numa_balancing_scan_size but ensure that
2241 			 * at least one PTE is updated so that unused virtual
2242 			 * address space is quickly skipped.
2243 			 */
2244 			if (nr_pte_updates)
2245 				pages -= (end - start) >> PAGE_SHIFT;
2246 
2247 			start = end;
2248 			if (pages <= 0)
2249 				goto out;
2250 
2251 			cond_resched();
2252 		} while (end != vma->vm_end);
2253 	}
2254 
2255 out:
2256 	/*
2257 	 * It is possible to reach the end of the VMA list but the last few
2258 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2259 	 * would find the !migratable VMA on the next scan but not reset the
2260 	 * scanner to the start so check it now.
2261 	 */
2262 	if (vma)
2263 		mm->numa_scan_offset = start;
2264 	else
2265 		reset_ptenuma_scan(p);
2266 	up_read(&mm->mmap_sem);
2267 }
2268 
2269 /*
2270  * Drive the periodic memory faults..
2271  */
2272 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2273 {
2274 	struct callback_head *work = &curr->numa_work;
2275 	u64 period, now;
2276 
2277 	/*
2278 	 * We don't care about NUMA placement if we don't have memory.
2279 	 */
2280 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2281 		return;
2282 
2283 	/*
2284 	 * Using runtime rather than walltime has the dual advantage that
2285 	 * we (mostly) drive the selection from busy threads and that the
2286 	 * task needs to have done some actual work before we bother with
2287 	 * NUMA placement.
2288 	 */
2289 	now = curr->se.sum_exec_runtime;
2290 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2291 
2292 	if (now - curr->node_stamp > period) {
2293 		if (!curr->node_stamp)
2294 			curr->numa_scan_period = task_scan_min(curr);
2295 		curr->node_stamp += period;
2296 
2297 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2298 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2299 			task_work_add(curr, work, true);
2300 		}
2301 	}
2302 }
2303 #else
2304 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2305 {
2306 }
2307 
2308 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2309 {
2310 }
2311 
2312 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2313 {
2314 }
2315 #endif /* CONFIG_NUMA_BALANCING */
2316 
2317 static void
2318 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2319 {
2320 	update_load_add(&cfs_rq->load, se->load.weight);
2321 	if (!parent_entity(se))
2322 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2323 #ifdef CONFIG_SMP
2324 	if (entity_is_task(se)) {
2325 		struct rq *rq = rq_of(cfs_rq);
2326 
2327 		account_numa_enqueue(rq, task_of(se));
2328 		list_add(&se->group_node, &rq->cfs_tasks);
2329 	}
2330 #endif
2331 	cfs_rq->nr_running++;
2332 }
2333 
2334 static void
2335 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2336 {
2337 	update_load_sub(&cfs_rq->load, se->load.weight);
2338 	if (!parent_entity(se))
2339 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2340 	if (entity_is_task(se)) {
2341 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2342 		list_del_init(&se->group_node);
2343 	}
2344 	cfs_rq->nr_running--;
2345 }
2346 
2347 #ifdef CONFIG_FAIR_GROUP_SCHED
2348 # ifdef CONFIG_SMP
2349 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2350 {
2351 	long tg_weight;
2352 
2353 	/*
2354 	 * Use this CPU's actual weight instead of the last load_contribution
2355 	 * to gain a more accurate current total weight. See
2356 	 * update_cfs_rq_load_contribution().
2357 	 */
2358 	tg_weight = atomic_long_read(&tg->load_avg);
2359 	tg_weight -= cfs_rq->tg_load_contrib;
2360 	tg_weight += cfs_rq->load.weight;
2361 
2362 	return tg_weight;
2363 }
2364 
2365 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2366 {
2367 	long tg_weight, load, shares;
2368 
2369 	tg_weight = calc_tg_weight(tg, cfs_rq);
2370 	load = cfs_rq->load.weight;
2371 
2372 	shares = (tg->shares * load);
2373 	if (tg_weight)
2374 		shares /= tg_weight;
2375 
2376 	if (shares < MIN_SHARES)
2377 		shares = MIN_SHARES;
2378 	if (shares > tg->shares)
2379 		shares = tg->shares;
2380 
2381 	return shares;
2382 }
2383 # else /* CONFIG_SMP */
2384 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2385 {
2386 	return tg->shares;
2387 }
2388 # endif /* CONFIG_SMP */
2389 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2390 			    unsigned long weight)
2391 {
2392 	if (se->on_rq) {
2393 		/* commit outstanding execution time */
2394 		if (cfs_rq->curr == se)
2395 			update_curr(cfs_rq);
2396 		account_entity_dequeue(cfs_rq, se);
2397 	}
2398 
2399 	update_load_set(&se->load, weight);
2400 
2401 	if (se->on_rq)
2402 		account_entity_enqueue(cfs_rq, se);
2403 }
2404 
2405 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2406 
2407 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2408 {
2409 	struct task_group *tg;
2410 	struct sched_entity *se;
2411 	long shares;
2412 
2413 	tg = cfs_rq->tg;
2414 	se = tg->se[cpu_of(rq_of(cfs_rq))];
2415 	if (!se || throttled_hierarchy(cfs_rq))
2416 		return;
2417 #ifndef CONFIG_SMP
2418 	if (likely(se->load.weight == tg->shares))
2419 		return;
2420 #endif
2421 	shares = calc_cfs_shares(cfs_rq, tg);
2422 
2423 	reweight_entity(cfs_rq_of(se), se, shares);
2424 }
2425 #else /* CONFIG_FAIR_GROUP_SCHED */
2426 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2427 {
2428 }
2429 #endif /* CONFIG_FAIR_GROUP_SCHED */
2430 
2431 #ifdef CONFIG_SMP
2432 /*
2433  * We choose a half-life close to 1 scheduling period.
2434  * Note: The tables below are dependent on this value.
2435  */
2436 #define LOAD_AVG_PERIOD 32
2437 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2438 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
2439 
2440 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2441 static const u32 runnable_avg_yN_inv[] = {
2442 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2443 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2444 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2445 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2446 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2447 	0x85aac367, 0x82cd8698,
2448 };
2449 
2450 /*
2451  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2452  * over-estimates when re-combining.
2453  */
2454 static const u32 runnable_avg_yN_sum[] = {
2455 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2456 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2457 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2458 };
2459 
2460 /*
2461  * Approximate:
2462  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2463  */
2464 static __always_inline u64 decay_load(u64 val, u64 n)
2465 {
2466 	unsigned int local_n;
2467 
2468 	if (!n)
2469 		return val;
2470 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2471 		return 0;
2472 
2473 	/* after bounds checking we can collapse to 32-bit */
2474 	local_n = n;
2475 
2476 	/*
2477 	 * As y^PERIOD = 1/2, we can combine
2478 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2479 	 * With a look-up table which covers y^n (n<PERIOD)
2480 	 *
2481 	 * To achieve constant time decay_load.
2482 	 */
2483 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2484 		val >>= local_n / LOAD_AVG_PERIOD;
2485 		local_n %= LOAD_AVG_PERIOD;
2486 	}
2487 
2488 	val *= runnable_avg_yN_inv[local_n];
2489 	/* We don't use SRR here since we always want to round down. */
2490 	return val >> 32;
2491 }
2492 
2493 /*
2494  * For updates fully spanning n periods, the contribution to runnable
2495  * average will be: \Sum 1024*y^n
2496  *
2497  * We can compute this reasonably efficiently by combining:
2498  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2499  */
2500 static u32 __compute_runnable_contrib(u64 n)
2501 {
2502 	u32 contrib = 0;
2503 
2504 	if (likely(n <= LOAD_AVG_PERIOD))
2505 		return runnable_avg_yN_sum[n];
2506 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2507 		return LOAD_AVG_MAX;
2508 
2509 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2510 	do {
2511 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2512 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2513 
2514 		n -= LOAD_AVG_PERIOD;
2515 	} while (n > LOAD_AVG_PERIOD);
2516 
2517 	contrib = decay_load(contrib, n);
2518 	return contrib + runnable_avg_yN_sum[n];
2519 }
2520 
2521 /*
2522  * We can represent the historical contribution to runnable average as the
2523  * coefficients of a geometric series.  To do this we sub-divide our runnable
2524  * history into segments of approximately 1ms (1024us); label the segment that
2525  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2526  *
2527  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2528  *      p0            p1           p2
2529  *     (now)       (~1ms ago)  (~2ms ago)
2530  *
2531  * Let u_i denote the fraction of p_i that the entity was runnable.
2532  *
2533  * We then designate the fractions u_i as our co-efficients, yielding the
2534  * following representation of historical load:
2535  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2536  *
2537  * We choose y based on the with of a reasonably scheduling period, fixing:
2538  *   y^32 = 0.5
2539  *
2540  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2541  * approximately half as much as the contribution to load within the last ms
2542  * (u_0).
2543  *
2544  * When a period "rolls over" and we have new u_0`, multiplying the previous
2545  * sum again by y is sufficient to update:
2546  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2547  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2548  */
2549 static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2550 							struct sched_avg *sa,
2551 							int runnable,
2552 							int running)
2553 {
2554 	u64 delta, periods;
2555 	u32 runnable_contrib;
2556 	int delta_w, decayed = 0;
2557 	unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
2558 
2559 	delta = now - sa->last_runnable_update;
2560 	/*
2561 	 * This should only happen when time goes backwards, which it
2562 	 * unfortunately does during sched clock init when we swap over to TSC.
2563 	 */
2564 	if ((s64)delta < 0) {
2565 		sa->last_runnable_update = now;
2566 		return 0;
2567 	}
2568 
2569 	/*
2570 	 * Use 1024ns as the unit of measurement since it's a reasonable
2571 	 * approximation of 1us and fast to compute.
2572 	 */
2573 	delta >>= 10;
2574 	if (!delta)
2575 		return 0;
2576 	sa->last_runnable_update = now;
2577 
2578 	/* delta_w is the amount already accumulated against our next period */
2579 	delta_w = sa->avg_period % 1024;
2580 	if (delta + delta_w >= 1024) {
2581 		/* period roll-over */
2582 		decayed = 1;
2583 
2584 		/*
2585 		 * Now that we know we're crossing a period boundary, figure
2586 		 * out how much from delta we need to complete the current
2587 		 * period and accrue it.
2588 		 */
2589 		delta_w = 1024 - delta_w;
2590 		if (runnable)
2591 			sa->runnable_avg_sum += delta_w;
2592 		if (running)
2593 			sa->running_avg_sum += delta_w * scale_freq
2594 				>> SCHED_CAPACITY_SHIFT;
2595 		sa->avg_period += delta_w;
2596 
2597 		delta -= delta_w;
2598 
2599 		/* Figure out how many additional periods this update spans */
2600 		periods = delta / 1024;
2601 		delta %= 1024;
2602 
2603 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
2604 						  periods + 1);
2605 		sa->running_avg_sum = decay_load(sa->running_avg_sum,
2606 						  periods + 1);
2607 		sa->avg_period = decay_load(sa->avg_period,
2608 						     periods + 1);
2609 
2610 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2611 		runnable_contrib = __compute_runnable_contrib(periods);
2612 		if (runnable)
2613 			sa->runnable_avg_sum += runnable_contrib;
2614 		if (running)
2615 			sa->running_avg_sum += runnable_contrib * scale_freq
2616 				>> SCHED_CAPACITY_SHIFT;
2617 		sa->avg_period += runnable_contrib;
2618 	}
2619 
2620 	/* Remainder of delta accrued against u_0` */
2621 	if (runnable)
2622 		sa->runnable_avg_sum += delta;
2623 	if (running)
2624 		sa->running_avg_sum += delta * scale_freq
2625 			>> SCHED_CAPACITY_SHIFT;
2626 	sa->avg_period += delta;
2627 
2628 	return decayed;
2629 }
2630 
2631 /* Synchronize an entity's decay with its parenting cfs_rq.*/
2632 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2633 {
2634 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2635 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
2636 
2637 	decays -= se->avg.decay_count;
2638 	se->avg.decay_count = 0;
2639 	if (!decays)
2640 		return 0;
2641 
2642 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
2643 	se->avg.utilization_avg_contrib =
2644 		decay_load(se->avg.utilization_avg_contrib, decays);
2645 
2646 	return decays;
2647 }
2648 
2649 #ifdef CONFIG_FAIR_GROUP_SCHED
2650 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2651 						 int force_update)
2652 {
2653 	struct task_group *tg = cfs_rq->tg;
2654 	long tg_contrib;
2655 
2656 	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2657 	tg_contrib -= cfs_rq->tg_load_contrib;
2658 
2659 	if (!tg_contrib)
2660 		return;
2661 
2662 	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2663 		atomic_long_add(tg_contrib, &tg->load_avg);
2664 		cfs_rq->tg_load_contrib += tg_contrib;
2665 	}
2666 }
2667 
2668 /*
2669  * Aggregate cfs_rq runnable averages into an equivalent task_group
2670  * representation for computing load contributions.
2671  */
2672 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2673 						  struct cfs_rq *cfs_rq)
2674 {
2675 	struct task_group *tg = cfs_rq->tg;
2676 	long contrib;
2677 
2678 	/* The fraction of a cpu used by this cfs_rq */
2679 	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
2680 			  sa->avg_period + 1);
2681 	contrib -= cfs_rq->tg_runnable_contrib;
2682 
2683 	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2684 		atomic_add(contrib, &tg->runnable_avg);
2685 		cfs_rq->tg_runnable_contrib += contrib;
2686 	}
2687 }
2688 
2689 static inline void __update_group_entity_contrib(struct sched_entity *se)
2690 {
2691 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2692 	struct task_group *tg = cfs_rq->tg;
2693 	int runnable_avg;
2694 
2695 	u64 contrib;
2696 
2697 	contrib = cfs_rq->tg_load_contrib * tg->shares;
2698 	se->avg.load_avg_contrib = div_u64(contrib,
2699 				     atomic_long_read(&tg->load_avg) + 1);
2700 
2701 	/*
2702 	 * For group entities we need to compute a correction term in the case
2703 	 * that they are consuming <1 cpu so that we would contribute the same
2704 	 * load as a task of equal weight.
2705 	 *
2706 	 * Explicitly co-ordinating this measurement would be expensive, but
2707 	 * fortunately the sum of each cpus contribution forms a usable
2708 	 * lower-bound on the true value.
2709 	 *
2710 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
2711 	 * (and the sum represents true value) or they are disjoint and we are
2712 	 * understating by the aggregate of their overlap.
2713 	 *
2714 	 * Extending this to N cpus, for a given overlap, the maximum amount we
2715 	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2716 	 * cpus that overlap for this interval and w_i is the interval width.
2717 	 *
2718 	 * On a small machine; the first term is well-bounded which bounds the
2719 	 * total error since w_i is a subset of the period.  Whereas on a
2720 	 * larger machine, while this first term can be larger, if w_i is the
2721 	 * of consequential size guaranteed to see n_i*w_i quickly converge to
2722 	 * our upper bound of 1-cpu.
2723 	 */
2724 	runnable_avg = atomic_read(&tg->runnable_avg);
2725 	if (runnable_avg < NICE_0_LOAD) {
2726 		se->avg.load_avg_contrib *= runnable_avg;
2727 		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2728 	}
2729 }
2730 
2731 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2732 {
2733 	__update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
2734 			runnable, runnable);
2735 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
2736 }
2737 #else /* CONFIG_FAIR_GROUP_SCHED */
2738 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2739 						 int force_update) {}
2740 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2741 						  struct cfs_rq *cfs_rq) {}
2742 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2743 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2744 #endif /* CONFIG_FAIR_GROUP_SCHED */
2745 
2746 static inline void __update_task_entity_contrib(struct sched_entity *se)
2747 {
2748 	u32 contrib;
2749 
2750 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2751 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2752 	contrib /= (se->avg.avg_period + 1);
2753 	se->avg.load_avg_contrib = scale_load(contrib);
2754 }
2755 
2756 /* Compute the current contribution to load_avg by se, return any delta */
2757 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2758 {
2759 	long old_contrib = se->avg.load_avg_contrib;
2760 
2761 	if (entity_is_task(se)) {
2762 		__update_task_entity_contrib(se);
2763 	} else {
2764 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2765 		__update_group_entity_contrib(se);
2766 	}
2767 
2768 	return se->avg.load_avg_contrib - old_contrib;
2769 }
2770 
2771 
2772 static inline void __update_task_entity_utilization(struct sched_entity *se)
2773 {
2774 	u32 contrib;
2775 
2776 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2777 	contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
2778 	contrib /= (se->avg.avg_period + 1);
2779 	se->avg.utilization_avg_contrib = scale_load(contrib);
2780 }
2781 
2782 static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
2783 {
2784 	long old_contrib = se->avg.utilization_avg_contrib;
2785 
2786 	if (entity_is_task(se))
2787 		__update_task_entity_utilization(se);
2788 	else
2789 		se->avg.utilization_avg_contrib =
2790 					group_cfs_rq(se)->utilization_load_avg;
2791 
2792 	return se->avg.utilization_avg_contrib - old_contrib;
2793 }
2794 
2795 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2796 						 long load_contrib)
2797 {
2798 	if (likely(load_contrib < cfs_rq->blocked_load_avg))
2799 		cfs_rq->blocked_load_avg -= load_contrib;
2800 	else
2801 		cfs_rq->blocked_load_avg = 0;
2802 }
2803 
2804 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2805 
2806 /* Update a sched_entity's runnable average */
2807 static inline void update_entity_load_avg(struct sched_entity *se,
2808 					  int update_cfs_rq)
2809 {
2810 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
2811 	long contrib_delta, utilization_delta;
2812 	int cpu = cpu_of(rq_of(cfs_rq));
2813 	u64 now;
2814 
2815 	/*
2816 	 * For a group entity we need to use their owned cfs_rq_clock_task() in
2817 	 * case they are the parent of a throttled hierarchy.
2818 	 */
2819 	if (entity_is_task(se))
2820 		now = cfs_rq_clock_task(cfs_rq);
2821 	else
2822 		now = cfs_rq_clock_task(group_cfs_rq(se));
2823 
2824 	if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
2825 					cfs_rq->curr == se))
2826 		return;
2827 
2828 	contrib_delta = __update_entity_load_avg_contrib(se);
2829 	utilization_delta = __update_entity_utilization_avg_contrib(se);
2830 
2831 	if (!update_cfs_rq)
2832 		return;
2833 
2834 	if (se->on_rq) {
2835 		cfs_rq->runnable_load_avg += contrib_delta;
2836 		cfs_rq->utilization_load_avg += utilization_delta;
2837 	} else {
2838 		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2839 	}
2840 }
2841 
2842 /*
2843  * Decay the load contributed by all blocked children and account this so that
2844  * their contribution may appropriately discounted when they wake up.
2845  */
2846 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2847 {
2848 	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2849 	u64 decays;
2850 
2851 	decays = now - cfs_rq->last_decay;
2852 	if (!decays && !force_update)
2853 		return;
2854 
2855 	if (atomic_long_read(&cfs_rq->removed_load)) {
2856 		unsigned long removed_load;
2857 		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2858 		subtract_blocked_load_contrib(cfs_rq, removed_load);
2859 	}
2860 
2861 	if (decays) {
2862 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2863 						      decays);
2864 		atomic64_add(decays, &cfs_rq->decay_counter);
2865 		cfs_rq->last_decay = now;
2866 	}
2867 
2868 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2869 }
2870 
2871 /* Add the load generated by se into cfs_rq's child load-average */
2872 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2873 						  struct sched_entity *se,
2874 						  int wakeup)
2875 {
2876 	/*
2877 	 * We track migrations using entity decay_count <= 0, on a wake-up
2878 	 * migration we use a negative decay count to track the remote decays
2879 	 * accumulated while sleeping.
2880 	 *
2881 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2882 	 * are seen by enqueue_entity_load_avg() as a migration with an already
2883 	 * constructed load_avg_contrib.
2884 	 */
2885 	if (unlikely(se->avg.decay_count <= 0)) {
2886 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2887 		if (se->avg.decay_count) {
2888 			/*
2889 			 * In a wake-up migration we have to approximate the
2890 			 * time sleeping.  This is because we can't synchronize
2891 			 * clock_task between the two cpus, and it is not
2892 			 * guaranteed to be read-safe.  Instead, we can
2893 			 * approximate this using our carried decays, which are
2894 			 * explicitly atomically readable.
2895 			 */
2896 			se->avg.last_runnable_update -= (-se->avg.decay_count)
2897 							<< 20;
2898 			update_entity_load_avg(se, 0);
2899 			/* Indicate that we're now synchronized and on-rq */
2900 			se->avg.decay_count = 0;
2901 		}
2902 		wakeup = 0;
2903 	} else {
2904 		__synchronize_entity_decay(se);
2905 	}
2906 
2907 	/* migrated tasks did not contribute to our blocked load */
2908 	if (wakeup) {
2909 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2910 		update_entity_load_avg(se, 0);
2911 	}
2912 
2913 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2914 	cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
2915 	/* we force update consideration on load-balancer moves */
2916 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2917 }
2918 
2919 /*
2920  * Remove se's load from this cfs_rq child load-average, if the entity is
2921  * transitioning to a blocked state we track its projected decay using
2922  * blocked_load_avg.
2923  */
2924 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2925 						  struct sched_entity *se,
2926 						  int sleep)
2927 {
2928 	update_entity_load_avg(se, 1);
2929 	/* we force update consideration on load-balancer moves */
2930 	update_cfs_rq_blocked_load(cfs_rq, !sleep);
2931 
2932 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2933 	cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
2934 	if (sleep) {
2935 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2936 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2937 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2938 }
2939 
2940 /*
2941  * Update the rq's load with the elapsed running time before entering
2942  * idle. if the last scheduled task is not a CFS task, idle_enter will
2943  * be the only way to update the runnable statistic.
2944  */
2945 void idle_enter_fair(struct rq *this_rq)
2946 {
2947 	update_rq_runnable_avg(this_rq, 1);
2948 }
2949 
2950 /*
2951  * Update the rq's load with the elapsed idle time before a task is
2952  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2953  * be the only way to update the runnable statistic.
2954  */
2955 void idle_exit_fair(struct rq *this_rq)
2956 {
2957 	update_rq_runnable_avg(this_rq, 0);
2958 }
2959 
2960 static int idle_balance(struct rq *this_rq);
2961 
2962 #else /* CONFIG_SMP */
2963 
2964 static inline void update_entity_load_avg(struct sched_entity *se,
2965 					  int update_cfs_rq) {}
2966 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2967 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2968 					   struct sched_entity *se,
2969 					   int wakeup) {}
2970 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2971 					   struct sched_entity *se,
2972 					   int sleep) {}
2973 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2974 					      int force_update) {}
2975 
2976 static inline int idle_balance(struct rq *rq)
2977 {
2978 	return 0;
2979 }
2980 
2981 #endif /* CONFIG_SMP */
2982 
2983 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2984 {
2985 #ifdef CONFIG_SCHEDSTATS
2986 	struct task_struct *tsk = NULL;
2987 
2988 	if (entity_is_task(se))
2989 		tsk = task_of(se);
2990 
2991 	if (se->statistics.sleep_start) {
2992 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2993 
2994 		if ((s64)delta < 0)
2995 			delta = 0;
2996 
2997 		if (unlikely(delta > se->statistics.sleep_max))
2998 			se->statistics.sleep_max = delta;
2999 
3000 		se->statistics.sleep_start = 0;
3001 		se->statistics.sum_sleep_runtime += delta;
3002 
3003 		if (tsk) {
3004 			account_scheduler_latency(tsk, delta >> 10, 1);
3005 			trace_sched_stat_sleep(tsk, delta);
3006 		}
3007 	}
3008 	if (se->statistics.block_start) {
3009 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3010 
3011 		if ((s64)delta < 0)
3012 			delta = 0;
3013 
3014 		if (unlikely(delta > se->statistics.block_max))
3015 			se->statistics.block_max = delta;
3016 
3017 		se->statistics.block_start = 0;
3018 		se->statistics.sum_sleep_runtime += delta;
3019 
3020 		if (tsk) {
3021 			if (tsk->in_iowait) {
3022 				se->statistics.iowait_sum += delta;
3023 				se->statistics.iowait_count++;
3024 				trace_sched_stat_iowait(tsk, delta);
3025 			}
3026 
3027 			trace_sched_stat_blocked(tsk, delta);
3028 
3029 			/*
3030 			 * Blocking time is in units of nanosecs, so shift by
3031 			 * 20 to get a milliseconds-range estimation of the
3032 			 * amount of time that the task spent sleeping:
3033 			 */
3034 			if (unlikely(prof_on == SLEEP_PROFILING)) {
3035 				profile_hits(SLEEP_PROFILING,
3036 						(void *)get_wchan(tsk),
3037 						delta >> 20);
3038 			}
3039 			account_scheduler_latency(tsk, delta >> 10, 0);
3040 		}
3041 	}
3042 #endif
3043 }
3044 
3045 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3046 {
3047 #ifdef CONFIG_SCHED_DEBUG
3048 	s64 d = se->vruntime - cfs_rq->min_vruntime;
3049 
3050 	if (d < 0)
3051 		d = -d;
3052 
3053 	if (d > 3*sysctl_sched_latency)
3054 		schedstat_inc(cfs_rq, nr_spread_over);
3055 #endif
3056 }
3057 
3058 static void
3059 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3060 {
3061 	u64 vruntime = cfs_rq->min_vruntime;
3062 
3063 	/*
3064 	 * The 'current' period is already promised to the current tasks,
3065 	 * however the extra weight of the new task will slow them down a
3066 	 * little, place the new task so that it fits in the slot that
3067 	 * stays open at the end.
3068 	 */
3069 	if (initial && sched_feat(START_DEBIT))
3070 		vruntime += sched_vslice(cfs_rq, se);
3071 
3072 	/* sleeps up to a single latency don't count. */
3073 	if (!initial) {
3074 		unsigned long thresh = sysctl_sched_latency;
3075 
3076 		/*
3077 		 * Halve their sleep time's effect, to allow
3078 		 * for a gentler effect of sleepers:
3079 		 */
3080 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
3081 			thresh >>= 1;
3082 
3083 		vruntime -= thresh;
3084 	}
3085 
3086 	/* ensure we never gain time by being placed backwards. */
3087 	se->vruntime = max_vruntime(se->vruntime, vruntime);
3088 }
3089 
3090 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3091 
3092 static void
3093 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3094 {
3095 	/*
3096 	 * Update the normalized vruntime before updating min_vruntime
3097 	 * through calling update_curr().
3098 	 */
3099 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3100 		se->vruntime += cfs_rq->min_vruntime;
3101 
3102 	/*
3103 	 * Update run-time statistics of the 'current'.
3104 	 */
3105 	update_curr(cfs_rq);
3106 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
3107 	account_entity_enqueue(cfs_rq, se);
3108 	update_cfs_shares(cfs_rq);
3109 
3110 	if (flags & ENQUEUE_WAKEUP) {
3111 		place_entity(cfs_rq, se, 0);
3112 		enqueue_sleeper(cfs_rq, se);
3113 	}
3114 
3115 	update_stats_enqueue(cfs_rq, se);
3116 	check_spread(cfs_rq, se);
3117 	if (se != cfs_rq->curr)
3118 		__enqueue_entity(cfs_rq, se);
3119 	se->on_rq = 1;
3120 
3121 	if (cfs_rq->nr_running == 1) {
3122 		list_add_leaf_cfs_rq(cfs_rq);
3123 		check_enqueue_throttle(cfs_rq);
3124 	}
3125 }
3126 
3127 static void __clear_buddies_last(struct sched_entity *se)
3128 {
3129 	for_each_sched_entity(se) {
3130 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3131 		if (cfs_rq->last != se)
3132 			break;
3133 
3134 		cfs_rq->last = NULL;
3135 	}
3136 }
3137 
3138 static void __clear_buddies_next(struct sched_entity *se)
3139 {
3140 	for_each_sched_entity(se) {
3141 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3142 		if (cfs_rq->next != se)
3143 			break;
3144 
3145 		cfs_rq->next = NULL;
3146 	}
3147 }
3148 
3149 static void __clear_buddies_skip(struct sched_entity *se)
3150 {
3151 	for_each_sched_entity(se) {
3152 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3153 		if (cfs_rq->skip != se)
3154 			break;
3155 
3156 		cfs_rq->skip = NULL;
3157 	}
3158 }
3159 
3160 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3161 {
3162 	if (cfs_rq->last == se)
3163 		__clear_buddies_last(se);
3164 
3165 	if (cfs_rq->next == se)
3166 		__clear_buddies_next(se);
3167 
3168 	if (cfs_rq->skip == se)
3169 		__clear_buddies_skip(se);
3170 }
3171 
3172 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3173 
3174 static void
3175 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3176 {
3177 	/*
3178 	 * Update run-time statistics of the 'current'.
3179 	 */
3180 	update_curr(cfs_rq);
3181 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
3182 
3183 	update_stats_dequeue(cfs_rq, se);
3184 	if (flags & DEQUEUE_SLEEP) {
3185 #ifdef CONFIG_SCHEDSTATS
3186 		if (entity_is_task(se)) {
3187 			struct task_struct *tsk = task_of(se);
3188 
3189 			if (tsk->state & TASK_INTERRUPTIBLE)
3190 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3191 			if (tsk->state & TASK_UNINTERRUPTIBLE)
3192 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3193 		}
3194 #endif
3195 	}
3196 
3197 	clear_buddies(cfs_rq, se);
3198 
3199 	if (se != cfs_rq->curr)
3200 		__dequeue_entity(cfs_rq, se);
3201 	se->on_rq = 0;
3202 	account_entity_dequeue(cfs_rq, se);
3203 
3204 	/*
3205 	 * Normalize the entity after updating the min_vruntime because the
3206 	 * update can refer to the ->curr item and we need to reflect this
3207 	 * movement in our normalized position.
3208 	 */
3209 	if (!(flags & DEQUEUE_SLEEP))
3210 		se->vruntime -= cfs_rq->min_vruntime;
3211 
3212 	/* return excess runtime on last dequeue */
3213 	return_cfs_rq_runtime(cfs_rq);
3214 
3215 	update_min_vruntime(cfs_rq);
3216 	update_cfs_shares(cfs_rq);
3217 }
3218 
3219 /*
3220  * Preempt the current task with a newly woken task if needed:
3221  */
3222 static void
3223 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3224 {
3225 	unsigned long ideal_runtime, delta_exec;
3226 	struct sched_entity *se;
3227 	s64 delta;
3228 
3229 	ideal_runtime = sched_slice(cfs_rq, curr);
3230 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3231 	if (delta_exec > ideal_runtime) {
3232 		resched_curr(rq_of(cfs_rq));
3233 		/*
3234 		 * The current task ran long enough, ensure it doesn't get
3235 		 * re-elected due to buddy favours.
3236 		 */
3237 		clear_buddies(cfs_rq, curr);
3238 		return;
3239 	}
3240 
3241 	/*
3242 	 * Ensure that a task that missed wakeup preemption by a
3243 	 * narrow margin doesn't have to wait for a full slice.
3244 	 * This also mitigates buddy induced latencies under load.
3245 	 */
3246 	if (delta_exec < sysctl_sched_min_granularity)
3247 		return;
3248 
3249 	se = __pick_first_entity(cfs_rq);
3250 	delta = curr->vruntime - se->vruntime;
3251 
3252 	if (delta < 0)
3253 		return;
3254 
3255 	if (delta > ideal_runtime)
3256 		resched_curr(rq_of(cfs_rq));
3257 }
3258 
3259 static void
3260 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3261 {
3262 	/* 'current' is not kept within the tree. */
3263 	if (se->on_rq) {
3264 		/*
3265 		 * Any task has to be enqueued before it get to execute on
3266 		 * a CPU. So account for the time it spent waiting on the
3267 		 * runqueue.
3268 		 */
3269 		update_stats_wait_end(cfs_rq, se);
3270 		__dequeue_entity(cfs_rq, se);
3271 		update_entity_load_avg(se, 1);
3272 	}
3273 
3274 	update_stats_curr_start(cfs_rq, se);
3275 	cfs_rq->curr = se;
3276 #ifdef CONFIG_SCHEDSTATS
3277 	/*
3278 	 * Track our maximum slice length, if the CPU's load is at
3279 	 * least twice that of our own weight (i.e. dont track it
3280 	 * when there are only lesser-weight tasks around):
3281 	 */
3282 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3283 		se->statistics.slice_max = max(se->statistics.slice_max,
3284 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3285 	}
3286 #endif
3287 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3288 }
3289 
3290 static int
3291 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3292 
3293 /*
3294  * Pick the next process, keeping these things in mind, in this order:
3295  * 1) keep things fair between processes/task groups
3296  * 2) pick the "next" process, since someone really wants that to run
3297  * 3) pick the "last" process, for cache locality
3298  * 4) do not run the "skip" process, if something else is available
3299  */
3300 static struct sched_entity *
3301 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3302 {
3303 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3304 	struct sched_entity *se;
3305 
3306 	/*
3307 	 * If curr is set we have to see if its left of the leftmost entity
3308 	 * still in the tree, provided there was anything in the tree at all.
3309 	 */
3310 	if (!left || (curr && entity_before(curr, left)))
3311 		left = curr;
3312 
3313 	se = left; /* ideally we run the leftmost entity */
3314 
3315 	/*
3316 	 * Avoid running the skip buddy, if running something else can
3317 	 * be done without getting too unfair.
3318 	 */
3319 	if (cfs_rq->skip == se) {
3320 		struct sched_entity *second;
3321 
3322 		if (se == curr) {
3323 			second = __pick_first_entity(cfs_rq);
3324 		} else {
3325 			second = __pick_next_entity(se);
3326 			if (!second || (curr && entity_before(curr, second)))
3327 				second = curr;
3328 		}
3329 
3330 		if (second && wakeup_preempt_entity(second, left) < 1)
3331 			se = second;
3332 	}
3333 
3334 	/*
3335 	 * Prefer last buddy, try to return the CPU to a preempted task.
3336 	 */
3337 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3338 		se = cfs_rq->last;
3339 
3340 	/*
3341 	 * Someone really wants this to run. If it's not unfair, run it.
3342 	 */
3343 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3344 		se = cfs_rq->next;
3345 
3346 	clear_buddies(cfs_rq, se);
3347 
3348 	return se;
3349 }
3350 
3351 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3352 
3353 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3354 {
3355 	/*
3356 	 * If still on the runqueue then deactivate_task()
3357 	 * was not called and update_curr() has to be done:
3358 	 */
3359 	if (prev->on_rq)
3360 		update_curr(cfs_rq);
3361 
3362 	/* throttle cfs_rqs exceeding runtime */
3363 	check_cfs_rq_runtime(cfs_rq);
3364 
3365 	check_spread(cfs_rq, prev);
3366 	if (prev->on_rq) {
3367 		update_stats_wait_start(cfs_rq, prev);
3368 		/* Put 'current' back into the tree. */
3369 		__enqueue_entity(cfs_rq, prev);
3370 		/* in !on_rq case, update occurred at dequeue */
3371 		update_entity_load_avg(prev, 1);
3372 	}
3373 	cfs_rq->curr = NULL;
3374 }
3375 
3376 static void
3377 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3378 {
3379 	/*
3380 	 * Update run-time statistics of the 'current'.
3381 	 */
3382 	update_curr(cfs_rq);
3383 
3384 	/*
3385 	 * Ensure that runnable average is periodically updated.
3386 	 */
3387 	update_entity_load_avg(curr, 1);
3388 	update_cfs_rq_blocked_load(cfs_rq, 1);
3389 	update_cfs_shares(cfs_rq);
3390 
3391 #ifdef CONFIG_SCHED_HRTICK
3392 	/*
3393 	 * queued ticks are scheduled to match the slice, so don't bother
3394 	 * validating it and just reschedule.
3395 	 */
3396 	if (queued) {
3397 		resched_curr(rq_of(cfs_rq));
3398 		return;
3399 	}
3400 	/*
3401 	 * don't let the period tick interfere with the hrtick preemption
3402 	 */
3403 	if (!sched_feat(DOUBLE_TICK) &&
3404 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3405 		return;
3406 #endif
3407 
3408 	if (cfs_rq->nr_running > 1)
3409 		check_preempt_tick(cfs_rq, curr);
3410 }
3411 
3412 
3413 /**************************************************
3414  * CFS bandwidth control machinery
3415  */
3416 
3417 #ifdef CONFIG_CFS_BANDWIDTH
3418 
3419 #ifdef HAVE_JUMP_LABEL
3420 static struct static_key __cfs_bandwidth_used;
3421 
3422 static inline bool cfs_bandwidth_used(void)
3423 {
3424 	return static_key_false(&__cfs_bandwidth_used);
3425 }
3426 
3427 void cfs_bandwidth_usage_inc(void)
3428 {
3429 	static_key_slow_inc(&__cfs_bandwidth_used);
3430 }
3431 
3432 void cfs_bandwidth_usage_dec(void)
3433 {
3434 	static_key_slow_dec(&__cfs_bandwidth_used);
3435 }
3436 #else /* HAVE_JUMP_LABEL */
3437 static bool cfs_bandwidth_used(void)
3438 {
3439 	return true;
3440 }
3441 
3442 void cfs_bandwidth_usage_inc(void) {}
3443 void cfs_bandwidth_usage_dec(void) {}
3444 #endif /* HAVE_JUMP_LABEL */
3445 
3446 /*
3447  * default period for cfs group bandwidth.
3448  * default: 0.1s, units: nanoseconds
3449  */
3450 static inline u64 default_cfs_period(void)
3451 {
3452 	return 100000000ULL;
3453 }
3454 
3455 static inline u64 sched_cfs_bandwidth_slice(void)
3456 {
3457 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3458 }
3459 
3460 /*
3461  * Replenish runtime according to assigned quota and update expiration time.
3462  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3463  * additional synchronization around rq->lock.
3464  *
3465  * requires cfs_b->lock
3466  */
3467 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3468 {
3469 	u64 now;
3470 
3471 	if (cfs_b->quota == RUNTIME_INF)
3472 		return;
3473 
3474 	now = sched_clock_cpu(smp_processor_id());
3475 	cfs_b->runtime = cfs_b->quota;
3476 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3477 }
3478 
3479 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3480 {
3481 	return &tg->cfs_bandwidth;
3482 }
3483 
3484 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3485 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3486 {
3487 	if (unlikely(cfs_rq->throttle_count))
3488 		return cfs_rq->throttled_clock_task;
3489 
3490 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3491 }
3492 
3493 /* returns 0 on failure to allocate runtime */
3494 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3495 {
3496 	struct task_group *tg = cfs_rq->tg;
3497 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3498 	u64 amount = 0, min_amount, expires;
3499 
3500 	/* note: this is a positive sum as runtime_remaining <= 0 */
3501 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3502 
3503 	raw_spin_lock(&cfs_b->lock);
3504 	if (cfs_b->quota == RUNTIME_INF)
3505 		amount = min_amount;
3506 	else {
3507 		start_cfs_bandwidth(cfs_b);
3508 
3509 		if (cfs_b->runtime > 0) {
3510 			amount = min(cfs_b->runtime, min_amount);
3511 			cfs_b->runtime -= amount;
3512 			cfs_b->idle = 0;
3513 		}
3514 	}
3515 	expires = cfs_b->runtime_expires;
3516 	raw_spin_unlock(&cfs_b->lock);
3517 
3518 	cfs_rq->runtime_remaining += amount;
3519 	/*
3520 	 * we may have advanced our local expiration to account for allowed
3521 	 * spread between our sched_clock and the one on which runtime was
3522 	 * issued.
3523 	 */
3524 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3525 		cfs_rq->runtime_expires = expires;
3526 
3527 	return cfs_rq->runtime_remaining > 0;
3528 }
3529 
3530 /*
3531  * Note: This depends on the synchronization provided by sched_clock and the
3532  * fact that rq->clock snapshots this value.
3533  */
3534 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3535 {
3536 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3537 
3538 	/* if the deadline is ahead of our clock, nothing to do */
3539 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3540 		return;
3541 
3542 	if (cfs_rq->runtime_remaining < 0)
3543 		return;
3544 
3545 	/*
3546 	 * If the local deadline has passed we have to consider the
3547 	 * possibility that our sched_clock is 'fast' and the global deadline
3548 	 * has not truly expired.
3549 	 *
3550 	 * Fortunately we can check determine whether this the case by checking
3551 	 * whether the global deadline has advanced. It is valid to compare
3552 	 * cfs_b->runtime_expires without any locks since we only care about
3553 	 * exact equality, so a partial write will still work.
3554 	 */
3555 
3556 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3557 		/* extend local deadline, drift is bounded above by 2 ticks */
3558 		cfs_rq->runtime_expires += TICK_NSEC;
3559 	} else {
3560 		/* global deadline is ahead, expiration has passed */
3561 		cfs_rq->runtime_remaining = 0;
3562 	}
3563 }
3564 
3565 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3566 {
3567 	/* dock delta_exec before expiring quota (as it could span periods) */
3568 	cfs_rq->runtime_remaining -= delta_exec;
3569 	expire_cfs_rq_runtime(cfs_rq);
3570 
3571 	if (likely(cfs_rq->runtime_remaining > 0))
3572 		return;
3573 
3574 	/*
3575 	 * if we're unable to extend our runtime we resched so that the active
3576 	 * hierarchy can be throttled
3577 	 */
3578 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3579 		resched_curr(rq_of(cfs_rq));
3580 }
3581 
3582 static __always_inline
3583 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3584 {
3585 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3586 		return;
3587 
3588 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
3589 }
3590 
3591 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3592 {
3593 	return cfs_bandwidth_used() && cfs_rq->throttled;
3594 }
3595 
3596 /* check whether cfs_rq, or any parent, is throttled */
3597 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3598 {
3599 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
3600 }
3601 
3602 /*
3603  * Ensure that neither of the group entities corresponding to src_cpu or
3604  * dest_cpu are members of a throttled hierarchy when performing group
3605  * load-balance operations.
3606  */
3607 static inline int throttled_lb_pair(struct task_group *tg,
3608 				    int src_cpu, int dest_cpu)
3609 {
3610 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3611 
3612 	src_cfs_rq = tg->cfs_rq[src_cpu];
3613 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
3614 
3615 	return throttled_hierarchy(src_cfs_rq) ||
3616 	       throttled_hierarchy(dest_cfs_rq);
3617 }
3618 
3619 /* updated child weight may affect parent so we have to do this bottom up */
3620 static int tg_unthrottle_up(struct task_group *tg, void *data)
3621 {
3622 	struct rq *rq = data;
3623 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3624 
3625 	cfs_rq->throttle_count--;
3626 #ifdef CONFIG_SMP
3627 	if (!cfs_rq->throttle_count) {
3628 		/* adjust cfs_rq_clock_task() */
3629 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3630 					     cfs_rq->throttled_clock_task;
3631 	}
3632 #endif
3633 
3634 	return 0;
3635 }
3636 
3637 static int tg_throttle_down(struct task_group *tg, void *data)
3638 {
3639 	struct rq *rq = data;
3640 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3641 
3642 	/* group is entering throttled state, stop time */
3643 	if (!cfs_rq->throttle_count)
3644 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
3645 	cfs_rq->throttle_count++;
3646 
3647 	return 0;
3648 }
3649 
3650 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3651 {
3652 	struct rq *rq = rq_of(cfs_rq);
3653 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3654 	struct sched_entity *se;
3655 	long task_delta, dequeue = 1;
3656 	bool empty;
3657 
3658 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3659 
3660 	/* freeze hierarchy runnable averages while throttled */
3661 	rcu_read_lock();
3662 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3663 	rcu_read_unlock();
3664 
3665 	task_delta = cfs_rq->h_nr_running;
3666 	for_each_sched_entity(se) {
3667 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3668 		/* throttled entity or throttle-on-deactivate */
3669 		if (!se->on_rq)
3670 			break;
3671 
3672 		if (dequeue)
3673 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3674 		qcfs_rq->h_nr_running -= task_delta;
3675 
3676 		if (qcfs_rq->load.weight)
3677 			dequeue = 0;
3678 	}
3679 
3680 	if (!se)
3681 		sub_nr_running(rq, task_delta);
3682 
3683 	cfs_rq->throttled = 1;
3684 	cfs_rq->throttled_clock = rq_clock(rq);
3685 	raw_spin_lock(&cfs_b->lock);
3686 	empty = list_empty(&cfs_rq->throttled_list);
3687 
3688 	/*
3689 	 * Add to the _head_ of the list, so that an already-started
3690 	 * distribute_cfs_runtime will not see us
3691 	 */
3692 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3693 
3694 	/*
3695 	 * If we're the first throttled task, make sure the bandwidth
3696 	 * timer is running.
3697 	 */
3698 	if (empty)
3699 		start_cfs_bandwidth(cfs_b);
3700 
3701 	raw_spin_unlock(&cfs_b->lock);
3702 }
3703 
3704 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3705 {
3706 	struct rq *rq = rq_of(cfs_rq);
3707 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3708 	struct sched_entity *se;
3709 	int enqueue = 1;
3710 	long task_delta;
3711 
3712 	se = cfs_rq->tg->se[cpu_of(rq)];
3713 
3714 	cfs_rq->throttled = 0;
3715 
3716 	update_rq_clock(rq);
3717 
3718 	raw_spin_lock(&cfs_b->lock);
3719 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3720 	list_del_rcu(&cfs_rq->throttled_list);
3721 	raw_spin_unlock(&cfs_b->lock);
3722 
3723 	/* update hierarchical throttle state */
3724 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3725 
3726 	if (!cfs_rq->load.weight)
3727 		return;
3728 
3729 	task_delta = cfs_rq->h_nr_running;
3730 	for_each_sched_entity(se) {
3731 		if (se->on_rq)
3732 			enqueue = 0;
3733 
3734 		cfs_rq = cfs_rq_of(se);
3735 		if (enqueue)
3736 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3737 		cfs_rq->h_nr_running += task_delta;
3738 
3739 		if (cfs_rq_throttled(cfs_rq))
3740 			break;
3741 	}
3742 
3743 	if (!se)
3744 		add_nr_running(rq, task_delta);
3745 
3746 	/* determine whether we need to wake up potentially idle cpu */
3747 	if (rq->curr == rq->idle && rq->cfs.nr_running)
3748 		resched_curr(rq);
3749 }
3750 
3751 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3752 		u64 remaining, u64 expires)
3753 {
3754 	struct cfs_rq *cfs_rq;
3755 	u64 runtime;
3756 	u64 starting_runtime = remaining;
3757 
3758 	rcu_read_lock();
3759 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3760 				throttled_list) {
3761 		struct rq *rq = rq_of(cfs_rq);
3762 
3763 		raw_spin_lock(&rq->lock);
3764 		if (!cfs_rq_throttled(cfs_rq))
3765 			goto next;
3766 
3767 		runtime = -cfs_rq->runtime_remaining + 1;
3768 		if (runtime > remaining)
3769 			runtime = remaining;
3770 		remaining -= runtime;
3771 
3772 		cfs_rq->runtime_remaining += runtime;
3773 		cfs_rq->runtime_expires = expires;
3774 
3775 		/* we check whether we're throttled above */
3776 		if (cfs_rq->runtime_remaining > 0)
3777 			unthrottle_cfs_rq(cfs_rq);
3778 
3779 next:
3780 		raw_spin_unlock(&rq->lock);
3781 
3782 		if (!remaining)
3783 			break;
3784 	}
3785 	rcu_read_unlock();
3786 
3787 	return starting_runtime - remaining;
3788 }
3789 
3790 /*
3791  * Responsible for refilling a task_group's bandwidth and unthrottling its
3792  * cfs_rqs as appropriate. If there has been no activity within the last
3793  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3794  * used to track this state.
3795  */
3796 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3797 {
3798 	u64 runtime, runtime_expires;
3799 	int throttled;
3800 
3801 	/* no need to continue the timer with no bandwidth constraint */
3802 	if (cfs_b->quota == RUNTIME_INF)
3803 		goto out_deactivate;
3804 
3805 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3806 	cfs_b->nr_periods += overrun;
3807 
3808 	/*
3809 	 * idle depends on !throttled (for the case of a large deficit), and if
3810 	 * we're going inactive then everything else can be deferred
3811 	 */
3812 	if (cfs_b->idle && !throttled)
3813 		goto out_deactivate;
3814 
3815 	__refill_cfs_bandwidth_runtime(cfs_b);
3816 
3817 	if (!throttled) {
3818 		/* mark as potentially idle for the upcoming period */
3819 		cfs_b->idle = 1;
3820 		return 0;
3821 	}
3822 
3823 	/* account preceding periods in which throttling occurred */
3824 	cfs_b->nr_throttled += overrun;
3825 
3826 	runtime_expires = cfs_b->runtime_expires;
3827 
3828 	/*
3829 	 * This check is repeated as we are holding onto the new bandwidth while
3830 	 * we unthrottle. This can potentially race with an unthrottled group
3831 	 * trying to acquire new bandwidth from the global pool. This can result
3832 	 * in us over-using our runtime if it is all used during this loop, but
3833 	 * only by limited amounts in that extreme case.
3834 	 */
3835 	while (throttled && cfs_b->runtime > 0) {
3836 		runtime = cfs_b->runtime;
3837 		raw_spin_unlock(&cfs_b->lock);
3838 		/* we can't nest cfs_b->lock while distributing bandwidth */
3839 		runtime = distribute_cfs_runtime(cfs_b, runtime,
3840 						 runtime_expires);
3841 		raw_spin_lock(&cfs_b->lock);
3842 
3843 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3844 
3845 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3846 	}
3847 
3848 	/*
3849 	 * While we are ensured activity in the period following an
3850 	 * unthrottle, this also covers the case in which the new bandwidth is
3851 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
3852 	 * timer to remain active while there are any throttled entities.)
3853 	 */
3854 	cfs_b->idle = 0;
3855 
3856 	return 0;
3857 
3858 out_deactivate:
3859 	return 1;
3860 }
3861 
3862 /* a cfs_rq won't donate quota below this amount */
3863 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3864 /* minimum remaining period time to redistribute slack quota */
3865 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3866 /* how long we wait to gather additional slack before distributing */
3867 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3868 
3869 /*
3870  * Are we near the end of the current quota period?
3871  *
3872  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3873  * hrtimer base being cleared by hrtimer_start. In the case of
3874  * migrate_hrtimers, base is never cleared, so we are fine.
3875  */
3876 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3877 {
3878 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
3879 	u64 remaining;
3880 
3881 	/* if the call-back is running a quota refresh is already occurring */
3882 	if (hrtimer_callback_running(refresh_timer))
3883 		return 1;
3884 
3885 	/* is a quota refresh about to occur? */
3886 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3887 	if (remaining < min_expire)
3888 		return 1;
3889 
3890 	return 0;
3891 }
3892 
3893 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3894 {
3895 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3896 
3897 	/* if there's a quota refresh soon don't bother with slack */
3898 	if (runtime_refresh_within(cfs_b, min_left))
3899 		return;
3900 
3901 	hrtimer_start(&cfs_b->slack_timer,
3902 			ns_to_ktime(cfs_bandwidth_slack_period),
3903 			HRTIMER_MODE_REL);
3904 }
3905 
3906 /* we know any runtime found here is valid as update_curr() precedes return */
3907 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3908 {
3909 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3910 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3911 
3912 	if (slack_runtime <= 0)
3913 		return;
3914 
3915 	raw_spin_lock(&cfs_b->lock);
3916 	if (cfs_b->quota != RUNTIME_INF &&
3917 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3918 		cfs_b->runtime += slack_runtime;
3919 
3920 		/* we are under rq->lock, defer unthrottling using a timer */
3921 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3922 		    !list_empty(&cfs_b->throttled_cfs_rq))
3923 			start_cfs_slack_bandwidth(cfs_b);
3924 	}
3925 	raw_spin_unlock(&cfs_b->lock);
3926 
3927 	/* even if it's not valid for return we don't want to try again */
3928 	cfs_rq->runtime_remaining -= slack_runtime;
3929 }
3930 
3931 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3932 {
3933 	if (!cfs_bandwidth_used())
3934 		return;
3935 
3936 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3937 		return;
3938 
3939 	__return_cfs_rq_runtime(cfs_rq);
3940 }
3941 
3942 /*
3943  * This is done with a timer (instead of inline with bandwidth return) since
3944  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3945  */
3946 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3947 {
3948 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3949 	u64 expires;
3950 
3951 	/* confirm we're still not at a refresh boundary */
3952 	raw_spin_lock(&cfs_b->lock);
3953 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3954 		raw_spin_unlock(&cfs_b->lock);
3955 		return;
3956 	}
3957 
3958 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3959 		runtime = cfs_b->runtime;
3960 
3961 	expires = cfs_b->runtime_expires;
3962 	raw_spin_unlock(&cfs_b->lock);
3963 
3964 	if (!runtime)
3965 		return;
3966 
3967 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3968 
3969 	raw_spin_lock(&cfs_b->lock);
3970 	if (expires == cfs_b->runtime_expires)
3971 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
3972 	raw_spin_unlock(&cfs_b->lock);
3973 }
3974 
3975 /*
3976  * When a group wakes up we want to make sure that its quota is not already
3977  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3978  * runtime as update_curr() throttling can not not trigger until it's on-rq.
3979  */
3980 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3981 {
3982 	if (!cfs_bandwidth_used())
3983 		return;
3984 
3985 	/* an active group must be handled by the update_curr()->put() path */
3986 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3987 		return;
3988 
3989 	/* ensure the group is not already throttled */
3990 	if (cfs_rq_throttled(cfs_rq))
3991 		return;
3992 
3993 	/* update runtime allocation */
3994 	account_cfs_rq_runtime(cfs_rq, 0);
3995 	if (cfs_rq->runtime_remaining <= 0)
3996 		throttle_cfs_rq(cfs_rq);
3997 }
3998 
3999 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4000 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4001 {
4002 	if (!cfs_bandwidth_used())
4003 		return false;
4004 
4005 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4006 		return false;
4007 
4008 	/*
4009 	 * it's possible for a throttled entity to be forced into a running
4010 	 * state (e.g. set_curr_task), in this case we're finished.
4011 	 */
4012 	if (cfs_rq_throttled(cfs_rq))
4013 		return true;
4014 
4015 	throttle_cfs_rq(cfs_rq);
4016 	return true;
4017 }
4018 
4019 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4020 {
4021 	struct cfs_bandwidth *cfs_b =
4022 		container_of(timer, struct cfs_bandwidth, slack_timer);
4023 
4024 	do_sched_cfs_slack_timer(cfs_b);
4025 
4026 	return HRTIMER_NORESTART;
4027 }
4028 
4029 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4030 {
4031 	struct cfs_bandwidth *cfs_b =
4032 		container_of(timer, struct cfs_bandwidth, period_timer);
4033 	int overrun;
4034 	int idle = 0;
4035 
4036 	raw_spin_lock(&cfs_b->lock);
4037 	for (;;) {
4038 		overrun = hrtimer_forward_now(timer, cfs_b->period);
4039 		if (!overrun)
4040 			break;
4041 
4042 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
4043 	}
4044 	if (idle)
4045 		cfs_b->period_active = 0;
4046 	raw_spin_unlock(&cfs_b->lock);
4047 
4048 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4049 }
4050 
4051 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4052 {
4053 	raw_spin_lock_init(&cfs_b->lock);
4054 	cfs_b->runtime = 0;
4055 	cfs_b->quota = RUNTIME_INF;
4056 	cfs_b->period = ns_to_ktime(default_cfs_period());
4057 
4058 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4059 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4060 	cfs_b->period_timer.function = sched_cfs_period_timer;
4061 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4062 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
4063 }
4064 
4065 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4066 {
4067 	cfs_rq->runtime_enabled = 0;
4068 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
4069 }
4070 
4071 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4072 {
4073 	lockdep_assert_held(&cfs_b->lock);
4074 
4075 	if (!cfs_b->period_active) {
4076 		cfs_b->period_active = 1;
4077 		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4078 		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4079 	}
4080 }
4081 
4082 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4083 {
4084 	/* init_cfs_bandwidth() was not called */
4085 	if (!cfs_b->throttled_cfs_rq.next)
4086 		return;
4087 
4088 	hrtimer_cancel(&cfs_b->period_timer);
4089 	hrtimer_cancel(&cfs_b->slack_timer);
4090 }
4091 
4092 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4093 {
4094 	struct cfs_rq *cfs_rq;
4095 
4096 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4097 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4098 
4099 		raw_spin_lock(&cfs_b->lock);
4100 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4101 		raw_spin_unlock(&cfs_b->lock);
4102 	}
4103 }
4104 
4105 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4106 {
4107 	struct cfs_rq *cfs_rq;
4108 
4109 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4110 		if (!cfs_rq->runtime_enabled)
4111 			continue;
4112 
4113 		/*
4114 		 * clock_task is not advancing so we just need to make sure
4115 		 * there's some valid quota amount
4116 		 */
4117 		cfs_rq->runtime_remaining = 1;
4118 		/*
4119 		 * Offline rq is schedulable till cpu is completely disabled
4120 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4121 		 */
4122 		cfs_rq->runtime_enabled = 0;
4123 
4124 		if (cfs_rq_throttled(cfs_rq))
4125 			unthrottle_cfs_rq(cfs_rq);
4126 	}
4127 }
4128 
4129 #else /* CONFIG_CFS_BANDWIDTH */
4130 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4131 {
4132 	return rq_clock_task(rq_of(cfs_rq));
4133 }
4134 
4135 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4136 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4137 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4138 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4139 
4140 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4141 {
4142 	return 0;
4143 }
4144 
4145 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4146 {
4147 	return 0;
4148 }
4149 
4150 static inline int throttled_lb_pair(struct task_group *tg,
4151 				    int src_cpu, int dest_cpu)
4152 {
4153 	return 0;
4154 }
4155 
4156 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4157 
4158 #ifdef CONFIG_FAIR_GROUP_SCHED
4159 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4160 #endif
4161 
4162 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4163 {
4164 	return NULL;
4165 }
4166 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4167 static inline void update_runtime_enabled(struct rq *rq) {}
4168 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4169 
4170 #endif /* CONFIG_CFS_BANDWIDTH */
4171 
4172 /**************************************************
4173  * CFS operations on tasks:
4174  */
4175 
4176 #ifdef CONFIG_SCHED_HRTICK
4177 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4178 {
4179 	struct sched_entity *se = &p->se;
4180 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4181 
4182 	WARN_ON(task_rq(p) != rq);
4183 
4184 	if (cfs_rq->nr_running > 1) {
4185 		u64 slice = sched_slice(cfs_rq, se);
4186 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4187 		s64 delta = slice - ran;
4188 
4189 		if (delta < 0) {
4190 			if (rq->curr == p)
4191 				resched_curr(rq);
4192 			return;
4193 		}
4194 		hrtick_start(rq, delta);
4195 	}
4196 }
4197 
4198 /*
4199  * called from enqueue/dequeue and updates the hrtick when the
4200  * current task is from our class and nr_running is low enough
4201  * to matter.
4202  */
4203 static void hrtick_update(struct rq *rq)
4204 {
4205 	struct task_struct *curr = rq->curr;
4206 
4207 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4208 		return;
4209 
4210 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4211 		hrtick_start_fair(rq, curr);
4212 }
4213 #else /* !CONFIG_SCHED_HRTICK */
4214 static inline void
4215 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4216 {
4217 }
4218 
4219 static inline void hrtick_update(struct rq *rq)
4220 {
4221 }
4222 #endif
4223 
4224 /*
4225  * The enqueue_task method is called before nr_running is
4226  * increased. Here we update the fair scheduling stats and
4227  * then put the task into the rbtree:
4228  */
4229 static void
4230 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4231 {
4232 	struct cfs_rq *cfs_rq;
4233 	struct sched_entity *se = &p->se;
4234 
4235 	for_each_sched_entity(se) {
4236 		if (se->on_rq)
4237 			break;
4238 		cfs_rq = cfs_rq_of(se);
4239 		enqueue_entity(cfs_rq, se, flags);
4240 
4241 		/*
4242 		 * end evaluation on encountering a throttled cfs_rq
4243 		 *
4244 		 * note: in the case of encountering a throttled cfs_rq we will
4245 		 * post the final h_nr_running increment below.
4246 		*/
4247 		if (cfs_rq_throttled(cfs_rq))
4248 			break;
4249 		cfs_rq->h_nr_running++;
4250 
4251 		flags = ENQUEUE_WAKEUP;
4252 	}
4253 
4254 	for_each_sched_entity(se) {
4255 		cfs_rq = cfs_rq_of(se);
4256 		cfs_rq->h_nr_running++;
4257 
4258 		if (cfs_rq_throttled(cfs_rq))
4259 			break;
4260 
4261 		update_cfs_shares(cfs_rq);
4262 		update_entity_load_avg(se, 1);
4263 	}
4264 
4265 	if (!se) {
4266 		update_rq_runnable_avg(rq, rq->nr_running);
4267 		add_nr_running(rq, 1);
4268 	}
4269 	hrtick_update(rq);
4270 }
4271 
4272 static void set_next_buddy(struct sched_entity *se);
4273 
4274 /*
4275  * The dequeue_task method is called before nr_running is
4276  * decreased. We remove the task from the rbtree and
4277  * update the fair scheduling stats:
4278  */
4279 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4280 {
4281 	struct cfs_rq *cfs_rq;
4282 	struct sched_entity *se = &p->se;
4283 	int task_sleep = flags & DEQUEUE_SLEEP;
4284 
4285 	for_each_sched_entity(se) {
4286 		cfs_rq = cfs_rq_of(se);
4287 		dequeue_entity(cfs_rq, se, flags);
4288 
4289 		/*
4290 		 * end evaluation on encountering a throttled cfs_rq
4291 		 *
4292 		 * note: in the case of encountering a throttled cfs_rq we will
4293 		 * post the final h_nr_running decrement below.
4294 		*/
4295 		if (cfs_rq_throttled(cfs_rq))
4296 			break;
4297 		cfs_rq->h_nr_running--;
4298 
4299 		/* Don't dequeue parent if it has other entities besides us */
4300 		if (cfs_rq->load.weight) {
4301 			/*
4302 			 * Bias pick_next to pick a task from this cfs_rq, as
4303 			 * p is sleeping when it is within its sched_slice.
4304 			 */
4305 			if (task_sleep && parent_entity(se))
4306 				set_next_buddy(parent_entity(se));
4307 
4308 			/* avoid re-evaluating load for this entity */
4309 			se = parent_entity(se);
4310 			break;
4311 		}
4312 		flags |= DEQUEUE_SLEEP;
4313 	}
4314 
4315 	for_each_sched_entity(se) {
4316 		cfs_rq = cfs_rq_of(se);
4317 		cfs_rq->h_nr_running--;
4318 
4319 		if (cfs_rq_throttled(cfs_rq))
4320 			break;
4321 
4322 		update_cfs_shares(cfs_rq);
4323 		update_entity_load_avg(se, 1);
4324 	}
4325 
4326 	if (!se) {
4327 		sub_nr_running(rq, 1);
4328 		update_rq_runnable_avg(rq, 1);
4329 	}
4330 	hrtick_update(rq);
4331 }
4332 
4333 #ifdef CONFIG_SMP
4334 
4335 /*
4336  * per rq 'load' arrray crap; XXX kill this.
4337  */
4338 
4339 /*
4340  * The exact cpuload at various idx values, calculated at every tick would be
4341  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4342  *
4343  * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4344  * on nth tick when cpu may be busy, then we have:
4345  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4346  * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4347  *
4348  * decay_load_missed() below does efficient calculation of
4349  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4350  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4351  *
4352  * The calculation is approximated on a 128 point scale.
4353  * degrade_zero_ticks is the number of ticks after which load at any
4354  * particular idx is approximated to be zero.
4355  * degrade_factor is a precomputed table, a row for each load idx.
4356  * Each column corresponds to degradation factor for a power of two ticks,
4357  * based on 128 point scale.
4358  * Example:
4359  * row 2, col 3 (=12) says that the degradation at load idx 2 after
4360  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4361  *
4362  * With this power of 2 load factors, we can degrade the load n times
4363  * by looking at 1 bits in n and doing as many mult/shift instead of
4364  * n mult/shifts needed by the exact degradation.
4365  */
4366 #define DEGRADE_SHIFT		7
4367 static const unsigned char
4368 		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4369 static const unsigned char
4370 		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4371 					{0, 0, 0, 0, 0, 0, 0, 0},
4372 					{64, 32, 8, 0, 0, 0, 0, 0},
4373 					{96, 72, 40, 12, 1, 0, 0},
4374 					{112, 98, 75, 43, 15, 1, 0},
4375 					{120, 112, 98, 76, 45, 16, 2} };
4376 
4377 /*
4378  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4379  * would be when CPU is idle and so we just decay the old load without
4380  * adding any new load.
4381  */
4382 static unsigned long
4383 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4384 {
4385 	int j = 0;
4386 
4387 	if (!missed_updates)
4388 		return load;
4389 
4390 	if (missed_updates >= degrade_zero_ticks[idx])
4391 		return 0;
4392 
4393 	if (idx == 1)
4394 		return load >> missed_updates;
4395 
4396 	while (missed_updates) {
4397 		if (missed_updates % 2)
4398 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4399 
4400 		missed_updates >>= 1;
4401 		j++;
4402 	}
4403 	return load;
4404 }
4405 
4406 /*
4407  * Update rq->cpu_load[] statistics. This function is usually called every
4408  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4409  * every tick. We fix it up based on jiffies.
4410  */
4411 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4412 			      unsigned long pending_updates)
4413 {
4414 	int i, scale;
4415 
4416 	this_rq->nr_load_updates++;
4417 
4418 	/* Update our load: */
4419 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4420 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4421 		unsigned long old_load, new_load;
4422 
4423 		/* scale is effectively 1 << i now, and >> i divides by scale */
4424 
4425 		old_load = this_rq->cpu_load[i];
4426 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
4427 		new_load = this_load;
4428 		/*
4429 		 * Round up the averaging division if load is increasing. This
4430 		 * prevents us from getting stuck on 9 if the load is 10, for
4431 		 * example.
4432 		 */
4433 		if (new_load > old_load)
4434 			new_load += scale - 1;
4435 
4436 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4437 	}
4438 
4439 	sched_avg_update(this_rq);
4440 }
4441 
4442 #ifdef CONFIG_NO_HZ_COMMON
4443 /*
4444  * There is no sane way to deal with nohz on smp when using jiffies because the
4445  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4446  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4447  *
4448  * Therefore we cannot use the delta approach from the regular tick since that
4449  * would seriously skew the load calculation. However we'll make do for those
4450  * updates happening while idle (nohz_idle_balance) or coming out of idle
4451  * (tick_nohz_idle_exit).
4452  *
4453  * This means we might still be one tick off for nohz periods.
4454  */
4455 
4456 /*
4457  * Called from nohz_idle_balance() to update the load ratings before doing the
4458  * idle balance.
4459  */
4460 static void update_idle_cpu_load(struct rq *this_rq)
4461 {
4462 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4463 	unsigned long load = this_rq->cfs.runnable_load_avg;
4464 	unsigned long pending_updates;
4465 
4466 	/*
4467 	 * bail if there's load or we're actually up-to-date.
4468 	 */
4469 	if (load || curr_jiffies == this_rq->last_load_update_tick)
4470 		return;
4471 
4472 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4473 	this_rq->last_load_update_tick = curr_jiffies;
4474 
4475 	__update_cpu_load(this_rq, load, pending_updates);
4476 }
4477 
4478 /*
4479  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4480  */
4481 void update_cpu_load_nohz(void)
4482 {
4483 	struct rq *this_rq = this_rq();
4484 	unsigned long curr_jiffies = READ_ONCE(jiffies);
4485 	unsigned long pending_updates;
4486 
4487 	if (curr_jiffies == this_rq->last_load_update_tick)
4488 		return;
4489 
4490 	raw_spin_lock(&this_rq->lock);
4491 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4492 	if (pending_updates) {
4493 		this_rq->last_load_update_tick = curr_jiffies;
4494 		/*
4495 		 * We were idle, this means load 0, the current load might be
4496 		 * !0 due to remote wakeups and the sort.
4497 		 */
4498 		__update_cpu_load(this_rq, 0, pending_updates);
4499 	}
4500 	raw_spin_unlock(&this_rq->lock);
4501 }
4502 #endif /* CONFIG_NO_HZ */
4503 
4504 /*
4505  * Called from scheduler_tick()
4506  */
4507 void update_cpu_load_active(struct rq *this_rq)
4508 {
4509 	unsigned long load = this_rq->cfs.runnable_load_avg;
4510 	/*
4511 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4512 	 */
4513 	this_rq->last_load_update_tick = jiffies;
4514 	__update_cpu_load(this_rq, load, 1);
4515 }
4516 
4517 /* Used instead of source_load when we know the type == 0 */
4518 static unsigned long weighted_cpuload(const int cpu)
4519 {
4520 	return cpu_rq(cpu)->cfs.runnable_load_avg;
4521 }
4522 
4523 /*
4524  * Return a low guess at the load of a migration-source cpu weighted
4525  * according to the scheduling class and "nice" value.
4526  *
4527  * We want to under-estimate the load of migration sources, to
4528  * balance conservatively.
4529  */
4530 static unsigned long source_load(int cpu, int type)
4531 {
4532 	struct rq *rq = cpu_rq(cpu);
4533 	unsigned long total = weighted_cpuload(cpu);
4534 
4535 	if (type == 0 || !sched_feat(LB_BIAS))
4536 		return total;
4537 
4538 	return min(rq->cpu_load[type-1], total);
4539 }
4540 
4541 /*
4542  * Return a high guess at the load of a migration-target cpu weighted
4543  * according to the scheduling class and "nice" value.
4544  */
4545 static unsigned long target_load(int cpu, int type)
4546 {
4547 	struct rq *rq = cpu_rq(cpu);
4548 	unsigned long total = weighted_cpuload(cpu);
4549 
4550 	if (type == 0 || !sched_feat(LB_BIAS))
4551 		return total;
4552 
4553 	return max(rq->cpu_load[type-1], total);
4554 }
4555 
4556 static unsigned long capacity_of(int cpu)
4557 {
4558 	return cpu_rq(cpu)->cpu_capacity;
4559 }
4560 
4561 static unsigned long capacity_orig_of(int cpu)
4562 {
4563 	return cpu_rq(cpu)->cpu_capacity_orig;
4564 }
4565 
4566 static unsigned long cpu_avg_load_per_task(int cpu)
4567 {
4568 	struct rq *rq = cpu_rq(cpu);
4569 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4570 	unsigned long load_avg = rq->cfs.runnable_load_avg;
4571 
4572 	if (nr_running)
4573 		return load_avg / nr_running;
4574 
4575 	return 0;
4576 }
4577 
4578 static void record_wakee(struct task_struct *p)
4579 {
4580 	/*
4581 	 * Rough decay (wiping) for cost saving, don't worry
4582 	 * about the boundary, really active task won't care
4583 	 * about the loss.
4584 	 */
4585 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4586 		current->wakee_flips >>= 1;
4587 		current->wakee_flip_decay_ts = jiffies;
4588 	}
4589 
4590 	if (current->last_wakee != p) {
4591 		current->last_wakee = p;
4592 		current->wakee_flips++;
4593 	}
4594 }
4595 
4596 static void task_waking_fair(struct task_struct *p)
4597 {
4598 	struct sched_entity *se = &p->se;
4599 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4600 	u64 min_vruntime;
4601 
4602 #ifndef CONFIG_64BIT
4603 	u64 min_vruntime_copy;
4604 
4605 	do {
4606 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
4607 		smp_rmb();
4608 		min_vruntime = cfs_rq->min_vruntime;
4609 	} while (min_vruntime != min_vruntime_copy);
4610 #else
4611 	min_vruntime = cfs_rq->min_vruntime;
4612 #endif
4613 
4614 	se->vruntime -= min_vruntime;
4615 	record_wakee(p);
4616 }
4617 
4618 #ifdef CONFIG_FAIR_GROUP_SCHED
4619 /*
4620  * effective_load() calculates the load change as seen from the root_task_group
4621  *
4622  * Adding load to a group doesn't make a group heavier, but can cause movement
4623  * of group shares between cpus. Assuming the shares were perfectly aligned one
4624  * can calculate the shift in shares.
4625  *
4626  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4627  * on this @cpu and results in a total addition (subtraction) of @wg to the
4628  * total group weight.
4629  *
4630  * Given a runqueue weight distribution (rw_i) we can compute a shares
4631  * distribution (s_i) using:
4632  *
4633  *   s_i = rw_i / \Sum rw_j						(1)
4634  *
4635  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4636  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4637  * shares distribution (s_i):
4638  *
4639  *   rw_i = {   2,   4,   1,   0 }
4640  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4641  *
4642  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4643  * task used to run on and the CPU the waker is running on), we need to
4644  * compute the effect of waking a task on either CPU and, in case of a sync
4645  * wakeup, compute the effect of the current task going to sleep.
4646  *
4647  * So for a change of @wl to the local @cpu with an overall group weight change
4648  * of @wl we can compute the new shares distribution (s'_i) using:
4649  *
4650  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
4651  *
4652  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4653  * differences in waking a task to CPU 0. The additional task changes the
4654  * weight and shares distributions like:
4655  *
4656  *   rw'_i = {   3,   4,   1,   0 }
4657  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4658  *
4659  * We can then compute the difference in effective weight by using:
4660  *
4661  *   dw_i = S * (s'_i - s_i)						(3)
4662  *
4663  * Where 'S' is the group weight as seen by its parent.
4664  *
4665  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4666  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4667  * 4/7) times the weight of the group.
4668  */
4669 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4670 {
4671 	struct sched_entity *se = tg->se[cpu];
4672 
4673 	if (!tg->parent)	/* the trivial, non-cgroup case */
4674 		return wl;
4675 
4676 	for_each_sched_entity(se) {
4677 		long w, W;
4678 
4679 		tg = se->my_q->tg;
4680 
4681 		/*
4682 		 * W = @wg + \Sum rw_j
4683 		 */
4684 		W = wg + calc_tg_weight(tg, se->my_q);
4685 
4686 		/*
4687 		 * w = rw_i + @wl
4688 		 */
4689 		w = se->my_q->load.weight + wl;
4690 
4691 		/*
4692 		 * wl = S * s'_i; see (2)
4693 		 */
4694 		if (W > 0 && w < W)
4695 			wl = (w * (long)tg->shares) / W;
4696 		else
4697 			wl = tg->shares;
4698 
4699 		/*
4700 		 * Per the above, wl is the new se->load.weight value; since
4701 		 * those are clipped to [MIN_SHARES, ...) do so now. See
4702 		 * calc_cfs_shares().
4703 		 */
4704 		if (wl < MIN_SHARES)
4705 			wl = MIN_SHARES;
4706 
4707 		/*
4708 		 * wl = dw_i = S * (s'_i - s_i); see (3)
4709 		 */
4710 		wl -= se->load.weight;
4711 
4712 		/*
4713 		 * Recursively apply this logic to all parent groups to compute
4714 		 * the final effective load change on the root group. Since
4715 		 * only the @tg group gets extra weight, all parent groups can
4716 		 * only redistribute existing shares. @wl is the shift in shares
4717 		 * resulting from this level per the above.
4718 		 */
4719 		wg = 0;
4720 	}
4721 
4722 	return wl;
4723 }
4724 #else
4725 
4726 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4727 {
4728 	return wl;
4729 }
4730 
4731 #endif
4732 
4733 static int wake_wide(struct task_struct *p)
4734 {
4735 	int factor = this_cpu_read(sd_llc_size);
4736 
4737 	/*
4738 	 * Yeah, it's the switching-frequency, could means many wakee or
4739 	 * rapidly switch, use factor here will just help to automatically
4740 	 * adjust the loose-degree, so bigger node will lead to more pull.
4741 	 */
4742 	if (p->wakee_flips > factor) {
4743 		/*
4744 		 * wakee is somewhat hot, it needs certain amount of cpu
4745 		 * resource, so if waker is far more hot, prefer to leave
4746 		 * it alone.
4747 		 */
4748 		if (current->wakee_flips > (factor * p->wakee_flips))
4749 			return 1;
4750 	}
4751 
4752 	return 0;
4753 }
4754 
4755 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4756 {
4757 	s64 this_load, load;
4758 	s64 this_eff_load, prev_eff_load;
4759 	int idx, this_cpu, prev_cpu;
4760 	struct task_group *tg;
4761 	unsigned long weight;
4762 	int balanced;
4763 
4764 	/*
4765 	 * If we wake multiple tasks be careful to not bounce
4766 	 * ourselves around too much.
4767 	 */
4768 	if (wake_wide(p))
4769 		return 0;
4770 
4771 	idx	  = sd->wake_idx;
4772 	this_cpu  = smp_processor_id();
4773 	prev_cpu  = task_cpu(p);
4774 	load	  = source_load(prev_cpu, idx);
4775 	this_load = target_load(this_cpu, idx);
4776 
4777 	/*
4778 	 * If sync wakeup then subtract the (maximum possible)
4779 	 * effect of the currently running task from the load
4780 	 * of the current CPU:
4781 	 */
4782 	if (sync) {
4783 		tg = task_group(current);
4784 		weight = current->se.load.weight;
4785 
4786 		this_load += effective_load(tg, this_cpu, -weight, -weight);
4787 		load += effective_load(tg, prev_cpu, 0, -weight);
4788 	}
4789 
4790 	tg = task_group(p);
4791 	weight = p->se.load.weight;
4792 
4793 	/*
4794 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4795 	 * due to the sync cause above having dropped this_load to 0, we'll
4796 	 * always have an imbalance, but there's really nothing you can do
4797 	 * about that, so that's good too.
4798 	 *
4799 	 * Otherwise check if either cpus are near enough in load to allow this
4800 	 * task to be woken on this_cpu.
4801 	 */
4802 	this_eff_load = 100;
4803 	this_eff_load *= capacity_of(prev_cpu);
4804 
4805 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4806 	prev_eff_load *= capacity_of(this_cpu);
4807 
4808 	if (this_load > 0) {
4809 		this_eff_load *= this_load +
4810 			effective_load(tg, this_cpu, weight, weight);
4811 
4812 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4813 	}
4814 
4815 	balanced = this_eff_load <= prev_eff_load;
4816 
4817 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4818 
4819 	if (!balanced)
4820 		return 0;
4821 
4822 	schedstat_inc(sd, ttwu_move_affine);
4823 	schedstat_inc(p, se.statistics.nr_wakeups_affine);
4824 
4825 	return 1;
4826 }
4827 
4828 /*
4829  * find_idlest_group finds and returns the least busy CPU group within the
4830  * domain.
4831  */
4832 static struct sched_group *
4833 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4834 		  int this_cpu, int sd_flag)
4835 {
4836 	struct sched_group *idlest = NULL, *group = sd->groups;
4837 	unsigned long min_load = ULONG_MAX, this_load = 0;
4838 	int load_idx = sd->forkexec_idx;
4839 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
4840 
4841 	if (sd_flag & SD_BALANCE_WAKE)
4842 		load_idx = sd->wake_idx;
4843 
4844 	do {
4845 		unsigned long load, avg_load;
4846 		int local_group;
4847 		int i;
4848 
4849 		/* Skip over this group if it has no CPUs allowed */
4850 		if (!cpumask_intersects(sched_group_cpus(group),
4851 					tsk_cpus_allowed(p)))
4852 			continue;
4853 
4854 		local_group = cpumask_test_cpu(this_cpu,
4855 					       sched_group_cpus(group));
4856 
4857 		/* Tally up the load of all CPUs in the group */
4858 		avg_load = 0;
4859 
4860 		for_each_cpu(i, sched_group_cpus(group)) {
4861 			/* Bias balancing toward cpus of our domain */
4862 			if (local_group)
4863 				load = source_load(i, load_idx);
4864 			else
4865 				load = target_load(i, load_idx);
4866 
4867 			avg_load += load;
4868 		}
4869 
4870 		/* Adjust by relative CPU capacity of the group */
4871 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4872 
4873 		if (local_group) {
4874 			this_load = avg_load;
4875 		} else if (avg_load < min_load) {
4876 			min_load = avg_load;
4877 			idlest = group;
4878 		}
4879 	} while (group = group->next, group != sd->groups);
4880 
4881 	if (!idlest || 100*this_load < imbalance*min_load)
4882 		return NULL;
4883 	return idlest;
4884 }
4885 
4886 /*
4887  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4888  */
4889 static int
4890 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4891 {
4892 	unsigned long load, min_load = ULONG_MAX;
4893 	unsigned int min_exit_latency = UINT_MAX;
4894 	u64 latest_idle_timestamp = 0;
4895 	int least_loaded_cpu = this_cpu;
4896 	int shallowest_idle_cpu = -1;
4897 	int i;
4898 
4899 	/* Traverse only the allowed CPUs */
4900 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4901 		if (idle_cpu(i)) {
4902 			struct rq *rq = cpu_rq(i);
4903 			struct cpuidle_state *idle = idle_get_state(rq);
4904 			if (idle && idle->exit_latency < min_exit_latency) {
4905 				/*
4906 				 * We give priority to a CPU whose idle state
4907 				 * has the smallest exit latency irrespective
4908 				 * of any idle timestamp.
4909 				 */
4910 				min_exit_latency = idle->exit_latency;
4911 				latest_idle_timestamp = rq->idle_stamp;
4912 				shallowest_idle_cpu = i;
4913 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
4914 				   rq->idle_stamp > latest_idle_timestamp) {
4915 				/*
4916 				 * If equal or no active idle state, then
4917 				 * the most recently idled CPU might have
4918 				 * a warmer cache.
4919 				 */
4920 				latest_idle_timestamp = rq->idle_stamp;
4921 				shallowest_idle_cpu = i;
4922 			}
4923 		} else if (shallowest_idle_cpu == -1) {
4924 			load = weighted_cpuload(i);
4925 			if (load < min_load || (load == min_load && i == this_cpu)) {
4926 				min_load = load;
4927 				least_loaded_cpu = i;
4928 			}
4929 		}
4930 	}
4931 
4932 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4933 }
4934 
4935 /*
4936  * Try and locate an idle CPU in the sched_domain.
4937  */
4938 static int select_idle_sibling(struct task_struct *p, int target)
4939 {
4940 	struct sched_domain *sd;
4941 	struct sched_group *sg;
4942 	int i = task_cpu(p);
4943 
4944 	if (idle_cpu(target))
4945 		return target;
4946 
4947 	/*
4948 	 * If the prevous cpu is cache affine and idle, don't be stupid.
4949 	 */
4950 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4951 		return i;
4952 
4953 	/*
4954 	 * Otherwise, iterate the domains and find an elegible idle cpu.
4955 	 */
4956 	sd = rcu_dereference(per_cpu(sd_llc, target));
4957 	for_each_lower_domain(sd) {
4958 		sg = sd->groups;
4959 		do {
4960 			if (!cpumask_intersects(sched_group_cpus(sg),
4961 						tsk_cpus_allowed(p)))
4962 				goto next;
4963 
4964 			for_each_cpu(i, sched_group_cpus(sg)) {
4965 				if (i == target || !idle_cpu(i))
4966 					goto next;
4967 			}
4968 
4969 			target = cpumask_first_and(sched_group_cpus(sg),
4970 					tsk_cpus_allowed(p));
4971 			goto done;
4972 next:
4973 			sg = sg->next;
4974 		} while (sg != sd->groups);
4975 	}
4976 done:
4977 	return target;
4978 }
4979 /*
4980  * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
4981  * tasks. The unit of the return value must be the one of capacity so we can
4982  * compare the usage with the capacity of the CPU that is available for CFS
4983  * task (ie cpu_capacity).
4984  * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
4985  * CPU. It represents the amount of utilization of a CPU in the range
4986  * [0..SCHED_LOAD_SCALE].  The usage of a CPU can't be higher than the full
4987  * capacity of the CPU because it's about the running time on this CPU.
4988  * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
4989  * because of unfortunate rounding in avg_period and running_load_avg or just
4990  * after migrating tasks until the average stabilizes with the new running
4991  * time. So we need to check that the usage stays into the range
4992  * [0..cpu_capacity_orig] and cap if necessary.
4993  * Without capping the usage, a group could be seen as overloaded (CPU0 usage
4994  * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
4995  */
4996 static int get_cpu_usage(int cpu)
4997 {
4998 	unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
4999 	unsigned long capacity = capacity_orig_of(cpu);
5000 
5001 	if (usage >= SCHED_LOAD_SCALE)
5002 		return capacity;
5003 
5004 	return (usage * capacity) >> SCHED_LOAD_SHIFT;
5005 }
5006 
5007 /*
5008  * select_task_rq_fair: Select target runqueue for the waking task in domains
5009  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
5010  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
5011  *
5012  * Balances load by selecting the idlest cpu in the idlest group, or under
5013  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
5014  *
5015  * Returns the target cpu number.
5016  *
5017  * preempt must be disabled.
5018  */
5019 static int
5020 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
5021 {
5022 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5023 	int cpu = smp_processor_id();
5024 	int new_cpu = cpu;
5025 	int want_affine = 0;
5026 	int sync = wake_flags & WF_SYNC;
5027 
5028 	if (sd_flag & SD_BALANCE_WAKE)
5029 		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5030 
5031 	rcu_read_lock();
5032 	for_each_domain(cpu, tmp) {
5033 		if (!(tmp->flags & SD_LOAD_BALANCE))
5034 			continue;
5035 
5036 		/*
5037 		 * If both cpu and prev_cpu are part of this domain,
5038 		 * cpu is a valid SD_WAKE_AFFINE target.
5039 		 */
5040 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5041 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5042 			affine_sd = tmp;
5043 			break;
5044 		}
5045 
5046 		if (tmp->flags & sd_flag)
5047 			sd = tmp;
5048 	}
5049 
5050 	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5051 		prev_cpu = cpu;
5052 
5053 	if (sd_flag & SD_BALANCE_WAKE) {
5054 		new_cpu = select_idle_sibling(p, prev_cpu);
5055 		goto unlock;
5056 	}
5057 
5058 	while (sd) {
5059 		struct sched_group *group;
5060 		int weight;
5061 
5062 		if (!(sd->flags & sd_flag)) {
5063 			sd = sd->child;
5064 			continue;
5065 		}
5066 
5067 		group = find_idlest_group(sd, p, cpu, sd_flag);
5068 		if (!group) {
5069 			sd = sd->child;
5070 			continue;
5071 		}
5072 
5073 		new_cpu = find_idlest_cpu(group, p, cpu);
5074 		if (new_cpu == -1 || new_cpu == cpu) {
5075 			/* Now try balancing at a lower domain level of cpu */
5076 			sd = sd->child;
5077 			continue;
5078 		}
5079 
5080 		/* Now try balancing at a lower domain level of new_cpu */
5081 		cpu = new_cpu;
5082 		weight = sd->span_weight;
5083 		sd = NULL;
5084 		for_each_domain(cpu, tmp) {
5085 			if (weight <= tmp->span_weight)
5086 				break;
5087 			if (tmp->flags & sd_flag)
5088 				sd = tmp;
5089 		}
5090 		/* while loop will break here if sd == NULL */
5091 	}
5092 unlock:
5093 	rcu_read_unlock();
5094 
5095 	return new_cpu;
5096 }
5097 
5098 /*
5099  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5100  * cfs_rq_of(p) references at time of call are still valid and identify the
5101  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
5102  * other assumptions, including the state of rq->lock, should be made.
5103  */
5104 static void
5105 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5106 {
5107 	struct sched_entity *se = &p->se;
5108 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5109 
5110 	/*
5111 	 * Load tracking: accumulate removed load so that it can be processed
5112 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
5113 	 * to blocked load iff they have a positive decay-count.  It can never
5114 	 * be negative here since on-rq tasks have decay-count == 0.
5115 	 */
5116 	if (se->avg.decay_count) {
5117 		se->avg.decay_count = -__synchronize_entity_decay(se);
5118 		atomic_long_add(se->avg.load_avg_contrib,
5119 						&cfs_rq->removed_load);
5120 	}
5121 
5122 	/* We have migrated, no longer consider this task hot */
5123 	se->exec_start = 0;
5124 }
5125 #endif /* CONFIG_SMP */
5126 
5127 static unsigned long
5128 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5129 {
5130 	unsigned long gran = sysctl_sched_wakeup_granularity;
5131 
5132 	/*
5133 	 * Since its curr running now, convert the gran from real-time
5134 	 * to virtual-time in his units.
5135 	 *
5136 	 * By using 'se' instead of 'curr' we penalize light tasks, so
5137 	 * they get preempted easier. That is, if 'se' < 'curr' then
5138 	 * the resulting gran will be larger, therefore penalizing the
5139 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5140 	 * be smaller, again penalizing the lighter task.
5141 	 *
5142 	 * This is especially important for buddies when the leftmost
5143 	 * task is higher priority than the buddy.
5144 	 */
5145 	return calc_delta_fair(gran, se);
5146 }
5147 
5148 /*
5149  * Should 'se' preempt 'curr'.
5150  *
5151  *             |s1
5152  *        |s2
5153  *   |s3
5154  *         g
5155  *      |<--->|c
5156  *
5157  *  w(c, s1) = -1
5158  *  w(c, s2) =  0
5159  *  w(c, s3) =  1
5160  *
5161  */
5162 static int
5163 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5164 {
5165 	s64 gran, vdiff = curr->vruntime - se->vruntime;
5166 
5167 	if (vdiff <= 0)
5168 		return -1;
5169 
5170 	gran = wakeup_gran(curr, se);
5171 	if (vdiff > gran)
5172 		return 1;
5173 
5174 	return 0;
5175 }
5176 
5177 static void set_last_buddy(struct sched_entity *se)
5178 {
5179 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5180 		return;
5181 
5182 	for_each_sched_entity(se)
5183 		cfs_rq_of(se)->last = se;
5184 }
5185 
5186 static void set_next_buddy(struct sched_entity *se)
5187 {
5188 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5189 		return;
5190 
5191 	for_each_sched_entity(se)
5192 		cfs_rq_of(se)->next = se;
5193 }
5194 
5195 static void set_skip_buddy(struct sched_entity *se)
5196 {
5197 	for_each_sched_entity(se)
5198 		cfs_rq_of(se)->skip = se;
5199 }
5200 
5201 /*
5202  * Preempt the current task with a newly woken task if needed:
5203  */
5204 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5205 {
5206 	struct task_struct *curr = rq->curr;
5207 	struct sched_entity *se = &curr->se, *pse = &p->se;
5208 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5209 	int scale = cfs_rq->nr_running >= sched_nr_latency;
5210 	int next_buddy_marked = 0;
5211 
5212 	if (unlikely(se == pse))
5213 		return;
5214 
5215 	/*
5216 	 * This is possible from callers such as attach_tasks(), in which we
5217 	 * unconditionally check_prempt_curr() after an enqueue (which may have
5218 	 * lead to a throttle).  This both saves work and prevents false
5219 	 * next-buddy nomination below.
5220 	 */
5221 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5222 		return;
5223 
5224 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5225 		set_next_buddy(pse);
5226 		next_buddy_marked = 1;
5227 	}
5228 
5229 	/*
5230 	 * We can come here with TIF_NEED_RESCHED already set from new task
5231 	 * wake up path.
5232 	 *
5233 	 * Note: this also catches the edge-case of curr being in a throttled
5234 	 * group (e.g. via set_curr_task), since update_curr() (in the
5235 	 * enqueue of curr) will have resulted in resched being set.  This
5236 	 * prevents us from potentially nominating it as a false LAST_BUDDY
5237 	 * below.
5238 	 */
5239 	if (test_tsk_need_resched(curr))
5240 		return;
5241 
5242 	/* Idle tasks are by definition preempted by non-idle tasks. */
5243 	if (unlikely(curr->policy == SCHED_IDLE) &&
5244 	    likely(p->policy != SCHED_IDLE))
5245 		goto preempt;
5246 
5247 	/*
5248 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5249 	 * is driven by the tick):
5250 	 */
5251 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5252 		return;
5253 
5254 	find_matching_se(&se, &pse);
5255 	update_curr(cfs_rq_of(se));
5256 	BUG_ON(!pse);
5257 	if (wakeup_preempt_entity(se, pse) == 1) {
5258 		/*
5259 		 * Bias pick_next to pick the sched entity that is
5260 		 * triggering this preemption.
5261 		 */
5262 		if (!next_buddy_marked)
5263 			set_next_buddy(pse);
5264 		goto preempt;
5265 	}
5266 
5267 	return;
5268 
5269 preempt:
5270 	resched_curr(rq);
5271 	/*
5272 	 * Only set the backward buddy when the current task is still
5273 	 * on the rq. This can happen when a wakeup gets interleaved
5274 	 * with schedule on the ->pre_schedule() or idle_balance()
5275 	 * point, either of which can * drop the rq lock.
5276 	 *
5277 	 * Also, during early boot the idle thread is in the fair class,
5278 	 * for obvious reasons its a bad idea to schedule back to it.
5279 	 */
5280 	if (unlikely(!se->on_rq || curr == rq->idle))
5281 		return;
5282 
5283 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5284 		set_last_buddy(se);
5285 }
5286 
5287 static struct task_struct *
5288 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5289 {
5290 	struct cfs_rq *cfs_rq = &rq->cfs;
5291 	struct sched_entity *se;
5292 	struct task_struct *p;
5293 	int new_tasks;
5294 
5295 again:
5296 #ifdef CONFIG_FAIR_GROUP_SCHED
5297 	if (!cfs_rq->nr_running)
5298 		goto idle;
5299 
5300 	if (prev->sched_class != &fair_sched_class)
5301 		goto simple;
5302 
5303 	/*
5304 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5305 	 * likely that a next task is from the same cgroup as the current.
5306 	 *
5307 	 * Therefore attempt to avoid putting and setting the entire cgroup
5308 	 * hierarchy, only change the part that actually changes.
5309 	 */
5310 
5311 	do {
5312 		struct sched_entity *curr = cfs_rq->curr;
5313 
5314 		/*
5315 		 * Since we got here without doing put_prev_entity() we also
5316 		 * have to consider cfs_rq->curr. If it is still a runnable
5317 		 * entity, update_curr() will update its vruntime, otherwise
5318 		 * forget we've ever seen it.
5319 		 */
5320 		if (curr) {
5321 			if (curr->on_rq)
5322 				update_curr(cfs_rq);
5323 			else
5324 				curr = NULL;
5325 
5326 			/*
5327 			 * This call to check_cfs_rq_runtime() will do the
5328 			 * throttle and dequeue its entity in the parent(s).
5329 			 * Therefore the 'simple' nr_running test will indeed
5330 			 * be correct.
5331 			 */
5332 			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5333 				goto simple;
5334 		}
5335 
5336 		se = pick_next_entity(cfs_rq, curr);
5337 		cfs_rq = group_cfs_rq(se);
5338 	} while (cfs_rq);
5339 
5340 	p = task_of(se);
5341 
5342 	/*
5343 	 * Since we haven't yet done put_prev_entity and if the selected task
5344 	 * is a different task than we started out with, try and touch the
5345 	 * least amount of cfs_rqs.
5346 	 */
5347 	if (prev != p) {
5348 		struct sched_entity *pse = &prev->se;
5349 
5350 		while (!(cfs_rq = is_same_group(se, pse))) {
5351 			int se_depth = se->depth;
5352 			int pse_depth = pse->depth;
5353 
5354 			if (se_depth <= pse_depth) {
5355 				put_prev_entity(cfs_rq_of(pse), pse);
5356 				pse = parent_entity(pse);
5357 			}
5358 			if (se_depth >= pse_depth) {
5359 				set_next_entity(cfs_rq_of(se), se);
5360 				se = parent_entity(se);
5361 			}
5362 		}
5363 
5364 		put_prev_entity(cfs_rq, pse);
5365 		set_next_entity(cfs_rq, se);
5366 	}
5367 
5368 	if (hrtick_enabled(rq))
5369 		hrtick_start_fair(rq, p);
5370 
5371 	return p;
5372 simple:
5373 	cfs_rq = &rq->cfs;
5374 #endif
5375 
5376 	if (!cfs_rq->nr_running)
5377 		goto idle;
5378 
5379 	put_prev_task(rq, prev);
5380 
5381 	do {
5382 		se = pick_next_entity(cfs_rq, NULL);
5383 		set_next_entity(cfs_rq, se);
5384 		cfs_rq = group_cfs_rq(se);
5385 	} while (cfs_rq);
5386 
5387 	p = task_of(se);
5388 
5389 	if (hrtick_enabled(rq))
5390 		hrtick_start_fair(rq, p);
5391 
5392 	return p;
5393 
5394 idle:
5395 	/*
5396 	 * This is OK, because current is on_cpu, which avoids it being picked
5397 	 * for load-balance and preemption/IRQs are still disabled avoiding
5398 	 * further scheduler activity on it and we're being very careful to
5399 	 * re-start the picking loop.
5400 	 */
5401 	lockdep_unpin_lock(&rq->lock);
5402 	new_tasks = idle_balance(rq);
5403 	lockdep_pin_lock(&rq->lock);
5404 	/*
5405 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
5406 	 * possible for any higher priority task to appear. In that case we
5407 	 * must re-start the pick_next_entity() loop.
5408 	 */
5409 	if (new_tasks < 0)
5410 		return RETRY_TASK;
5411 
5412 	if (new_tasks > 0)
5413 		goto again;
5414 
5415 	return NULL;
5416 }
5417 
5418 /*
5419  * Account for a descheduled task:
5420  */
5421 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5422 {
5423 	struct sched_entity *se = &prev->se;
5424 	struct cfs_rq *cfs_rq;
5425 
5426 	for_each_sched_entity(se) {
5427 		cfs_rq = cfs_rq_of(se);
5428 		put_prev_entity(cfs_rq, se);
5429 	}
5430 }
5431 
5432 /*
5433  * sched_yield() is very simple
5434  *
5435  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5436  */
5437 static void yield_task_fair(struct rq *rq)
5438 {
5439 	struct task_struct *curr = rq->curr;
5440 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5441 	struct sched_entity *se = &curr->se;
5442 
5443 	/*
5444 	 * Are we the only task in the tree?
5445 	 */
5446 	if (unlikely(rq->nr_running == 1))
5447 		return;
5448 
5449 	clear_buddies(cfs_rq, se);
5450 
5451 	if (curr->policy != SCHED_BATCH) {
5452 		update_rq_clock(rq);
5453 		/*
5454 		 * Update run-time statistics of the 'current'.
5455 		 */
5456 		update_curr(cfs_rq);
5457 		/*
5458 		 * Tell update_rq_clock() that we've just updated,
5459 		 * so we don't do microscopic update in schedule()
5460 		 * and double the fastpath cost.
5461 		 */
5462 		rq_clock_skip_update(rq, true);
5463 	}
5464 
5465 	set_skip_buddy(se);
5466 }
5467 
5468 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5469 {
5470 	struct sched_entity *se = &p->se;
5471 
5472 	/* throttled hierarchies are not runnable */
5473 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5474 		return false;
5475 
5476 	/* Tell the scheduler that we'd really like pse to run next. */
5477 	set_next_buddy(se);
5478 
5479 	yield_task_fair(rq);
5480 
5481 	return true;
5482 }
5483 
5484 #ifdef CONFIG_SMP
5485 /**************************************************
5486  * Fair scheduling class load-balancing methods.
5487  *
5488  * BASICS
5489  *
5490  * The purpose of load-balancing is to achieve the same basic fairness the
5491  * per-cpu scheduler provides, namely provide a proportional amount of compute
5492  * time to each task. This is expressed in the following equation:
5493  *
5494  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5495  *
5496  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5497  * W_i,0 is defined as:
5498  *
5499  *   W_i,0 = \Sum_j w_i,j                                             (2)
5500  *
5501  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5502  * is derived from the nice value as per prio_to_weight[].
5503  *
5504  * The weight average is an exponential decay average of the instantaneous
5505  * weight:
5506  *
5507  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5508  *
5509  * C_i is the compute capacity of cpu i, typically it is the
5510  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5511  * can also include other factors [XXX].
5512  *
5513  * To achieve this balance we define a measure of imbalance which follows
5514  * directly from (1):
5515  *
5516  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5517  *
5518  * We them move tasks around to minimize the imbalance. In the continuous
5519  * function space it is obvious this converges, in the discrete case we get
5520  * a few fun cases generally called infeasible weight scenarios.
5521  *
5522  * [XXX expand on:
5523  *     - infeasible weights;
5524  *     - local vs global optima in the discrete case. ]
5525  *
5526  *
5527  * SCHED DOMAINS
5528  *
5529  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5530  * for all i,j solution, we create a tree of cpus that follows the hardware
5531  * topology where each level pairs two lower groups (or better). This results
5532  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5533  * tree to only the first of the previous level and we decrease the frequency
5534  * of load-balance at each level inv. proportional to the number of cpus in
5535  * the groups.
5536  *
5537  * This yields:
5538  *
5539  *     log_2 n     1     n
5540  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5541  *     i = 0      2^i   2^i
5542  *                               `- size of each group
5543  *         |         |     `- number of cpus doing load-balance
5544  *         |         `- freq
5545  *         `- sum over all levels
5546  *
5547  * Coupled with a limit on how many tasks we can migrate every balance pass,
5548  * this makes (5) the runtime complexity of the balancer.
5549  *
5550  * An important property here is that each CPU is still (indirectly) connected
5551  * to every other cpu in at most O(log n) steps:
5552  *
5553  * The adjacency matrix of the resulting graph is given by:
5554  *
5555  *             log_2 n
5556  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5557  *             k = 0
5558  *
5559  * And you'll find that:
5560  *
5561  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5562  *
5563  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5564  * The task movement gives a factor of O(m), giving a convergence complexity
5565  * of:
5566  *
5567  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5568  *
5569  *
5570  * WORK CONSERVING
5571  *
5572  * In order to avoid CPUs going idle while there's still work to do, new idle
5573  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5574  * tree itself instead of relying on other CPUs to bring it work.
5575  *
5576  * This adds some complexity to both (5) and (8) but it reduces the total idle
5577  * time.
5578  *
5579  * [XXX more?]
5580  *
5581  *
5582  * CGROUPS
5583  *
5584  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5585  *
5586  *                                s_k,i
5587  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5588  *                                 S_k
5589  *
5590  * Where
5591  *
5592  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5593  *
5594  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5595  *
5596  * The big problem is S_k, its a global sum needed to compute a local (W_i)
5597  * property.
5598  *
5599  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5600  *      rewrite all of this once again.]
5601  */
5602 
5603 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5604 
5605 enum fbq_type { regular, remote, all };
5606 
5607 #define LBF_ALL_PINNED	0x01
5608 #define LBF_NEED_BREAK	0x02
5609 #define LBF_DST_PINNED  0x04
5610 #define LBF_SOME_PINNED	0x08
5611 
5612 struct lb_env {
5613 	struct sched_domain	*sd;
5614 
5615 	struct rq		*src_rq;
5616 	int			src_cpu;
5617 
5618 	int			dst_cpu;
5619 	struct rq		*dst_rq;
5620 
5621 	struct cpumask		*dst_grpmask;
5622 	int			new_dst_cpu;
5623 	enum cpu_idle_type	idle;
5624 	long			imbalance;
5625 	/* The set of CPUs under consideration for load-balancing */
5626 	struct cpumask		*cpus;
5627 
5628 	unsigned int		flags;
5629 
5630 	unsigned int		loop;
5631 	unsigned int		loop_break;
5632 	unsigned int		loop_max;
5633 
5634 	enum fbq_type		fbq_type;
5635 	struct list_head	tasks;
5636 };
5637 
5638 /*
5639  * Is this task likely cache-hot:
5640  */
5641 static int task_hot(struct task_struct *p, struct lb_env *env)
5642 {
5643 	s64 delta;
5644 
5645 	lockdep_assert_held(&env->src_rq->lock);
5646 
5647 	if (p->sched_class != &fair_sched_class)
5648 		return 0;
5649 
5650 	if (unlikely(p->policy == SCHED_IDLE))
5651 		return 0;
5652 
5653 	/*
5654 	 * Buddy candidates are cache hot:
5655 	 */
5656 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5657 			(&p->se == cfs_rq_of(&p->se)->next ||
5658 			 &p->se == cfs_rq_of(&p->se)->last))
5659 		return 1;
5660 
5661 	if (sysctl_sched_migration_cost == -1)
5662 		return 1;
5663 	if (sysctl_sched_migration_cost == 0)
5664 		return 0;
5665 
5666 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5667 
5668 	return delta < (s64)sysctl_sched_migration_cost;
5669 }
5670 
5671 #ifdef CONFIG_NUMA_BALANCING
5672 /*
5673  * Returns true if the destination node is the preferred node.
5674  * Needs to match fbq_classify_rq(): if there is a runnable task
5675  * that is not on its preferred node, we should identify it.
5676  */
5677 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5678 {
5679 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5680 	unsigned long src_faults, dst_faults;
5681 	int src_nid, dst_nid;
5682 
5683 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5684 	    !(env->sd->flags & SD_NUMA)) {
5685 		return false;
5686 	}
5687 
5688 	src_nid = cpu_to_node(env->src_cpu);
5689 	dst_nid = cpu_to_node(env->dst_cpu);
5690 
5691 	if (src_nid == dst_nid)
5692 		return false;
5693 
5694 	/* Encourage migration to the preferred node. */
5695 	if (dst_nid == p->numa_preferred_nid)
5696 		return true;
5697 
5698 	/* Migrating away from the preferred node is bad. */
5699 	if (src_nid == p->numa_preferred_nid)
5700 		return false;
5701 
5702 	if (numa_group) {
5703 		src_faults = group_faults(p, src_nid);
5704 		dst_faults = group_faults(p, dst_nid);
5705 	} else {
5706 		src_faults = task_faults(p, src_nid);
5707 		dst_faults = task_faults(p, dst_nid);
5708 	}
5709 
5710 	return dst_faults > src_faults;
5711 }
5712 
5713 
5714 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5715 {
5716 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
5717 	unsigned long src_faults, dst_faults;
5718 	int src_nid, dst_nid;
5719 
5720 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5721 		return false;
5722 
5723 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5724 		return false;
5725 
5726 	src_nid = cpu_to_node(env->src_cpu);
5727 	dst_nid = cpu_to_node(env->dst_cpu);
5728 
5729 	if (src_nid == dst_nid)
5730 		return false;
5731 
5732 	/* Migrating away from the preferred node is bad. */
5733 	if (src_nid == p->numa_preferred_nid)
5734 		return true;
5735 
5736 	/* Encourage migration to the preferred node. */
5737 	if (dst_nid == p->numa_preferred_nid)
5738 		return false;
5739 
5740 	if (numa_group) {
5741 		src_faults = group_faults(p, src_nid);
5742 		dst_faults = group_faults(p, dst_nid);
5743 	} else {
5744 		src_faults = task_faults(p, src_nid);
5745 		dst_faults = task_faults(p, dst_nid);
5746 	}
5747 
5748 	return dst_faults < src_faults;
5749 }
5750 
5751 #else
5752 static inline bool migrate_improves_locality(struct task_struct *p,
5753 					     struct lb_env *env)
5754 {
5755 	return false;
5756 }
5757 
5758 static inline bool migrate_degrades_locality(struct task_struct *p,
5759 					     struct lb_env *env)
5760 {
5761 	return false;
5762 }
5763 #endif
5764 
5765 /*
5766  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5767  */
5768 static
5769 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5770 {
5771 	int tsk_cache_hot = 0;
5772 
5773 	lockdep_assert_held(&env->src_rq->lock);
5774 
5775 	/*
5776 	 * We do not migrate tasks that are:
5777 	 * 1) throttled_lb_pair, or
5778 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5779 	 * 3) running (obviously), or
5780 	 * 4) are cache-hot on their current CPU.
5781 	 */
5782 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5783 		return 0;
5784 
5785 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5786 		int cpu;
5787 
5788 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5789 
5790 		env->flags |= LBF_SOME_PINNED;
5791 
5792 		/*
5793 		 * Remember if this task can be migrated to any other cpu in
5794 		 * our sched_group. We may want to revisit it if we couldn't
5795 		 * meet load balance goals by pulling other tasks on src_cpu.
5796 		 *
5797 		 * Also avoid computing new_dst_cpu if we have already computed
5798 		 * one in current iteration.
5799 		 */
5800 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5801 			return 0;
5802 
5803 		/* Prevent to re-select dst_cpu via env's cpus */
5804 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5805 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5806 				env->flags |= LBF_DST_PINNED;
5807 				env->new_dst_cpu = cpu;
5808 				break;
5809 			}
5810 		}
5811 
5812 		return 0;
5813 	}
5814 
5815 	/* Record that we found atleast one task that could run on dst_cpu */
5816 	env->flags &= ~LBF_ALL_PINNED;
5817 
5818 	if (task_running(env->src_rq, p)) {
5819 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5820 		return 0;
5821 	}
5822 
5823 	/*
5824 	 * Aggressive migration if:
5825 	 * 1) destination numa is preferred
5826 	 * 2) task is cache cold, or
5827 	 * 3) too many balance attempts have failed.
5828 	 */
5829 	tsk_cache_hot = task_hot(p, env);
5830 	if (!tsk_cache_hot)
5831 		tsk_cache_hot = migrate_degrades_locality(p, env);
5832 
5833 	if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5834 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5835 		if (tsk_cache_hot) {
5836 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5837 			schedstat_inc(p, se.statistics.nr_forced_migrations);
5838 		}
5839 		return 1;
5840 	}
5841 
5842 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5843 	return 0;
5844 }
5845 
5846 /*
5847  * detach_task() -- detach the task for the migration specified in env
5848  */
5849 static void detach_task(struct task_struct *p, struct lb_env *env)
5850 {
5851 	lockdep_assert_held(&env->src_rq->lock);
5852 
5853 	deactivate_task(env->src_rq, p, 0);
5854 	p->on_rq = TASK_ON_RQ_MIGRATING;
5855 	set_task_cpu(p, env->dst_cpu);
5856 }
5857 
5858 /*
5859  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5860  * part of active balancing operations within "domain".
5861  *
5862  * Returns a task if successful and NULL otherwise.
5863  */
5864 static struct task_struct *detach_one_task(struct lb_env *env)
5865 {
5866 	struct task_struct *p, *n;
5867 
5868 	lockdep_assert_held(&env->src_rq->lock);
5869 
5870 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5871 		if (!can_migrate_task(p, env))
5872 			continue;
5873 
5874 		detach_task(p, env);
5875 
5876 		/*
5877 		 * Right now, this is only the second place where
5878 		 * lb_gained[env->idle] is updated (other is detach_tasks)
5879 		 * so we can safely collect stats here rather than
5880 		 * inside detach_tasks().
5881 		 */
5882 		schedstat_inc(env->sd, lb_gained[env->idle]);
5883 		return p;
5884 	}
5885 	return NULL;
5886 }
5887 
5888 static const unsigned int sched_nr_migrate_break = 32;
5889 
5890 /*
5891  * detach_tasks() -- tries to detach up to imbalance weighted load from
5892  * busiest_rq, as part of a balancing operation within domain "sd".
5893  *
5894  * Returns number of detached tasks if successful and 0 otherwise.
5895  */
5896 static int detach_tasks(struct lb_env *env)
5897 {
5898 	struct list_head *tasks = &env->src_rq->cfs_tasks;
5899 	struct task_struct *p;
5900 	unsigned long load;
5901 	int detached = 0;
5902 
5903 	lockdep_assert_held(&env->src_rq->lock);
5904 
5905 	if (env->imbalance <= 0)
5906 		return 0;
5907 
5908 	while (!list_empty(tasks)) {
5909 		p = list_first_entry(tasks, struct task_struct, se.group_node);
5910 
5911 		env->loop++;
5912 		/* We've more or less seen every task there is, call it quits */
5913 		if (env->loop > env->loop_max)
5914 			break;
5915 
5916 		/* take a breather every nr_migrate tasks */
5917 		if (env->loop > env->loop_break) {
5918 			env->loop_break += sched_nr_migrate_break;
5919 			env->flags |= LBF_NEED_BREAK;
5920 			break;
5921 		}
5922 
5923 		if (!can_migrate_task(p, env))
5924 			goto next;
5925 
5926 		load = task_h_load(p);
5927 
5928 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5929 			goto next;
5930 
5931 		if ((load / 2) > env->imbalance)
5932 			goto next;
5933 
5934 		detach_task(p, env);
5935 		list_add(&p->se.group_node, &env->tasks);
5936 
5937 		detached++;
5938 		env->imbalance -= load;
5939 
5940 #ifdef CONFIG_PREEMPT
5941 		/*
5942 		 * NEWIDLE balancing is a source of latency, so preemptible
5943 		 * kernels will stop after the first task is detached to minimize
5944 		 * the critical section.
5945 		 */
5946 		if (env->idle == CPU_NEWLY_IDLE)
5947 			break;
5948 #endif
5949 
5950 		/*
5951 		 * We only want to steal up to the prescribed amount of
5952 		 * weighted load.
5953 		 */
5954 		if (env->imbalance <= 0)
5955 			break;
5956 
5957 		continue;
5958 next:
5959 		list_move_tail(&p->se.group_node, tasks);
5960 	}
5961 
5962 	/*
5963 	 * Right now, this is one of only two places we collect this stat
5964 	 * so we can safely collect detach_one_task() stats here rather
5965 	 * than inside detach_one_task().
5966 	 */
5967 	schedstat_add(env->sd, lb_gained[env->idle], detached);
5968 
5969 	return detached;
5970 }
5971 
5972 /*
5973  * attach_task() -- attach the task detached by detach_task() to its new rq.
5974  */
5975 static void attach_task(struct rq *rq, struct task_struct *p)
5976 {
5977 	lockdep_assert_held(&rq->lock);
5978 
5979 	BUG_ON(task_rq(p) != rq);
5980 	p->on_rq = TASK_ON_RQ_QUEUED;
5981 	activate_task(rq, p, 0);
5982 	check_preempt_curr(rq, p, 0);
5983 }
5984 
5985 /*
5986  * attach_one_task() -- attaches the task returned from detach_one_task() to
5987  * its new rq.
5988  */
5989 static void attach_one_task(struct rq *rq, struct task_struct *p)
5990 {
5991 	raw_spin_lock(&rq->lock);
5992 	attach_task(rq, p);
5993 	raw_spin_unlock(&rq->lock);
5994 }
5995 
5996 /*
5997  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5998  * new rq.
5999  */
6000 static void attach_tasks(struct lb_env *env)
6001 {
6002 	struct list_head *tasks = &env->tasks;
6003 	struct task_struct *p;
6004 
6005 	raw_spin_lock(&env->dst_rq->lock);
6006 
6007 	while (!list_empty(tasks)) {
6008 		p = list_first_entry(tasks, struct task_struct, se.group_node);
6009 		list_del_init(&p->se.group_node);
6010 
6011 		attach_task(env->dst_rq, p);
6012 	}
6013 
6014 	raw_spin_unlock(&env->dst_rq->lock);
6015 }
6016 
6017 #ifdef CONFIG_FAIR_GROUP_SCHED
6018 /*
6019  * update tg->load_weight by folding this cpu's load_avg
6020  */
6021 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6022 {
6023 	struct sched_entity *se = tg->se[cpu];
6024 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6025 
6026 	/* throttled entities do not contribute to load */
6027 	if (throttled_hierarchy(cfs_rq))
6028 		return;
6029 
6030 	update_cfs_rq_blocked_load(cfs_rq, 1);
6031 
6032 	if (se) {
6033 		update_entity_load_avg(se, 1);
6034 		/*
6035 		 * We pivot on our runnable average having decayed to zero for
6036 		 * list removal.  This generally implies that all our children
6037 		 * have also been removed (modulo rounding error or bandwidth
6038 		 * control); however, such cases are rare and we can fix these
6039 		 * at enqueue.
6040 		 *
6041 		 * TODO: fix up out-of-order children on enqueue.
6042 		 */
6043 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6044 			list_del_leaf_cfs_rq(cfs_rq);
6045 	} else {
6046 		struct rq *rq = rq_of(cfs_rq);
6047 		update_rq_runnable_avg(rq, rq->nr_running);
6048 	}
6049 }
6050 
6051 static void update_blocked_averages(int cpu)
6052 {
6053 	struct rq *rq = cpu_rq(cpu);
6054 	struct cfs_rq *cfs_rq;
6055 	unsigned long flags;
6056 
6057 	raw_spin_lock_irqsave(&rq->lock, flags);
6058 	update_rq_clock(rq);
6059 	/*
6060 	 * Iterates the task_group tree in a bottom up fashion, see
6061 	 * list_add_leaf_cfs_rq() for details.
6062 	 */
6063 	for_each_leaf_cfs_rq(rq, cfs_rq) {
6064 		/*
6065 		 * Note: We may want to consider periodically releasing
6066 		 * rq->lock about these updates so that creating many task
6067 		 * groups does not result in continually extending hold time.
6068 		 */
6069 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6070 	}
6071 
6072 	raw_spin_unlock_irqrestore(&rq->lock, flags);
6073 }
6074 
6075 /*
6076  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
6077  * This needs to be done in a top-down fashion because the load of a child
6078  * group is a fraction of its parents load.
6079  */
6080 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6081 {
6082 	struct rq *rq = rq_of(cfs_rq);
6083 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6084 	unsigned long now = jiffies;
6085 	unsigned long load;
6086 
6087 	if (cfs_rq->last_h_load_update == now)
6088 		return;
6089 
6090 	cfs_rq->h_load_next = NULL;
6091 	for_each_sched_entity(se) {
6092 		cfs_rq = cfs_rq_of(se);
6093 		cfs_rq->h_load_next = se;
6094 		if (cfs_rq->last_h_load_update == now)
6095 			break;
6096 	}
6097 
6098 	if (!se) {
6099 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
6100 		cfs_rq->last_h_load_update = now;
6101 	}
6102 
6103 	while ((se = cfs_rq->h_load_next) != NULL) {
6104 		load = cfs_rq->h_load;
6105 		load = div64_ul(load * se->avg.load_avg_contrib,
6106 				cfs_rq->runnable_load_avg + 1);
6107 		cfs_rq = group_cfs_rq(se);
6108 		cfs_rq->h_load = load;
6109 		cfs_rq->last_h_load_update = now;
6110 	}
6111 }
6112 
6113 static unsigned long task_h_load(struct task_struct *p)
6114 {
6115 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6116 
6117 	update_cfs_rq_h_load(cfs_rq);
6118 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
6119 			cfs_rq->runnable_load_avg + 1);
6120 }
6121 #else
6122 static inline void update_blocked_averages(int cpu)
6123 {
6124 }
6125 
6126 static unsigned long task_h_load(struct task_struct *p)
6127 {
6128 	return p->se.avg.load_avg_contrib;
6129 }
6130 #endif
6131 
6132 /********** Helpers for find_busiest_group ************************/
6133 
6134 enum group_type {
6135 	group_other = 0,
6136 	group_imbalanced,
6137 	group_overloaded,
6138 };
6139 
6140 /*
6141  * sg_lb_stats - stats of a sched_group required for load_balancing
6142  */
6143 struct sg_lb_stats {
6144 	unsigned long avg_load; /*Avg load across the CPUs of the group */
6145 	unsigned long group_load; /* Total load over the CPUs of the group */
6146 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6147 	unsigned long load_per_task;
6148 	unsigned long group_capacity;
6149 	unsigned long group_usage; /* Total usage of the group */
6150 	unsigned int sum_nr_running; /* Nr tasks running in the group */
6151 	unsigned int idle_cpus;
6152 	unsigned int group_weight;
6153 	enum group_type group_type;
6154 	int group_no_capacity;
6155 #ifdef CONFIG_NUMA_BALANCING
6156 	unsigned int nr_numa_running;
6157 	unsigned int nr_preferred_running;
6158 #endif
6159 };
6160 
6161 /*
6162  * sd_lb_stats - Structure to store the statistics of a sched_domain
6163  *		 during load balancing.
6164  */
6165 struct sd_lb_stats {
6166 	struct sched_group *busiest;	/* Busiest group in this sd */
6167 	struct sched_group *local;	/* Local group in this sd */
6168 	unsigned long total_load;	/* Total load of all groups in sd */
6169 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
6170 	unsigned long avg_load;	/* Average load across all groups in sd */
6171 
6172 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6173 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
6174 };
6175 
6176 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6177 {
6178 	/*
6179 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6180 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
6181 	 * We must however clear busiest_stat::avg_load because
6182 	 * update_sd_pick_busiest() reads this before assignment.
6183 	 */
6184 	*sds = (struct sd_lb_stats){
6185 		.busiest = NULL,
6186 		.local = NULL,
6187 		.total_load = 0UL,
6188 		.total_capacity = 0UL,
6189 		.busiest_stat = {
6190 			.avg_load = 0UL,
6191 			.sum_nr_running = 0,
6192 			.group_type = group_other,
6193 		},
6194 	};
6195 }
6196 
6197 /**
6198  * get_sd_load_idx - Obtain the load index for a given sched domain.
6199  * @sd: The sched_domain whose load_idx is to be obtained.
6200  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6201  *
6202  * Return: The load index.
6203  */
6204 static inline int get_sd_load_idx(struct sched_domain *sd,
6205 					enum cpu_idle_type idle)
6206 {
6207 	int load_idx;
6208 
6209 	switch (idle) {
6210 	case CPU_NOT_IDLE:
6211 		load_idx = sd->busy_idx;
6212 		break;
6213 
6214 	case CPU_NEWLY_IDLE:
6215 		load_idx = sd->newidle_idx;
6216 		break;
6217 	default:
6218 		load_idx = sd->idle_idx;
6219 		break;
6220 	}
6221 
6222 	return load_idx;
6223 }
6224 
6225 static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6226 {
6227 	if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
6228 		return sd->smt_gain / sd->span_weight;
6229 
6230 	return SCHED_CAPACITY_SCALE;
6231 }
6232 
6233 unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6234 {
6235 	return default_scale_cpu_capacity(sd, cpu);
6236 }
6237 
6238 static unsigned long scale_rt_capacity(int cpu)
6239 {
6240 	struct rq *rq = cpu_rq(cpu);
6241 	u64 total, used, age_stamp, avg;
6242 	s64 delta;
6243 
6244 	/*
6245 	 * Since we're reading these variables without serialization make sure
6246 	 * we read them once before doing sanity checks on them.
6247 	 */
6248 	age_stamp = READ_ONCE(rq->age_stamp);
6249 	avg = READ_ONCE(rq->rt_avg);
6250 	delta = __rq_clock_broken(rq) - age_stamp;
6251 
6252 	if (unlikely(delta < 0))
6253 		delta = 0;
6254 
6255 	total = sched_avg_period() + delta;
6256 
6257 	used = div_u64(avg, total);
6258 
6259 	if (likely(used < SCHED_CAPACITY_SCALE))
6260 		return SCHED_CAPACITY_SCALE - used;
6261 
6262 	return 1;
6263 }
6264 
6265 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6266 {
6267 	unsigned long capacity = SCHED_CAPACITY_SCALE;
6268 	struct sched_group *sdg = sd->groups;
6269 
6270 	if (sched_feat(ARCH_CAPACITY))
6271 		capacity *= arch_scale_cpu_capacity(sd, cpu);
6272 	else
6273 		capacity *= default_scale_cpu_capacity(sd, cpu);
6274 
6275 	capacity >>= SCHED_CAPACITY_SHIFT;
6276 
6277 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
6278 
6279 	capacity *= scale_rt_capacity(cpu);
6280 	capacity >>= SCHED_CAPACITY_SHIFT;
6281 
6282 	if (!capacity)
6283 		capacity = 1;
6284 
6285 	cpu_rq(cpu)->cpu_capacity = capacity;
6286 	sdg->sgc->capacity = capacity;
6287 }
6288 
6289 void update_group_capacity(struct sched_domain *sd, int cpu)
6290 {
6291 	struct sched_domain *child = sd->child;
6292 	struct sched_group *group, *sdg = sd->groups;
6293 	unsigned long capacity;
6294 	unsigned long interval;
6295 
6296 	interval = msecs_to_jiffies(sd->balance_interval);
6297 	interval = clamp(interval, 1UL, max_load_balance_interval);
6298 	sdg->sgc->next_update = jiffies + interval;
6299 
6300 	if (!child) {
6301 		update_cpu_capacity(sd, cpu);
6302 		return;
6303 	}
6304 
6305 	capacity = 0;
6306 
6307 	if (child->flags & SD_OVERLAP) {
6308 		/*
6309 		 * SD_OVERLAP domains cannot assume that child groups
6310 		 * span the current group.
6311 		 */
6312 
6313 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
6314 			struct sched_group_capacity *sgc;
6315 			struct rq *rq = cpu_rq(cpu);
6316 
6317 			/*
6318 			 * build_sched_domains() -> init_sched_groups_capacity()
6319 			 * gets here before we've attached the domains to the
6320 			 * runqueues.
6321 			 *
6322 			 * Use capacity_of(), which is set irrespective of domains
6323 			 * in update_cpu_capacity().
6324 			 *
6325 			 * This avoids capacity from being 0 and
6326 			 * causing divide-by-zero issues on boot.
6327 			 */
6328 			if (unlikely(!rq->sd)) {
6329 				capacity += capacity_of(cpu);
6330 				continue;
6331 			}
6332 
6333 			sgc = rq->sd->groups->sgc;
6334 			capacity += sgc->capacity;
6335 		}
6336 	} else  {
6337 		/*
6338 		 * !SD_OVERLAP domains can assume that child groups
6339 		 * span the current group.
6340 		 */
6341 
6342 		group = child->groups;
6343 		do {
6344 			capacity += group->sgc->capacity;
6345 			group = group->next;
6346 		} while (group != child->groups);
6347 	}
6348 
6349 	sdg->sgc->capacity = capacity;
6350 }
6351 
6352 /*
6353  * Check whether the capacity of the rq has been noticeably reduced by side
6354  * activity. The imbalance_pct is used for the threshold.
6355  * Return true is the capacity is reduced
6356  */
6357 static inline int
6358 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6359 {
6360 	return ((rq->cpu_capacity * sd->imbalance_pct) <
6361 				(rq->cpu_capacity_orig * 100));
6362 }
6363 
6364 /*
6365  * Group imbalance indicates (and tries to solve) the problem where balancing
6366  * groups is inadequate due to tsk_cpus_allowed() constraints.
6367  *
6368  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6369  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6370  * Something like:
6371  *
6372  * 	{ 0 1 2 3 } { 4 5 6 7 }
6373  * 	        *     * * *
6374  *
6375  * If we were to balance group-wise we'd place two tasks in the first group and
6376  * two tasks in the second group. Clearly this is undesired as it will overload
6377  * cpu 3 and leave one of the cpus in the second group unused.
6378  *
6379  * The current solution to this issue is detecting the skew in the first group
6380  * by noticing the lower domain failed to reach balance and had difficulty
6381  * moving tasks due to affinity constraints.
6382  *
6383  * When this is so detected; this group becomes a candidate for busiest; see
6384  * update_sd_pick_busiest(). And calculate_imbalance() and
6385  * find_busiest_group() avoid some of the usual balance conditions to allow it
6386  * to create an effective group imbalance.
6387  *
6388  * This is a somewhat tricky proposition since the next run might not find the
6389  * group imbalance and decide the groups need to be balanced again. A most
6390  * subtle and fragile situation.
6391  */
6392 
6393 static inline int sg_imbalanced(struct sched_group *group)
6394 {
6395 	return group->sgc->imbalance;
6396 }
6397 
6398 /*
6399  * group_has_capacity returns true if the group has spare capacity that could
6400  * be used by some tasks.
6401  * We consider that a group has spare capacity if the  * number of task is
6402  * smaller than the number of CPUs or if the usage is lower than the available
6403  * capacity for CFS tasks.
6404  * For the latter, we use a threshold to stabilize the state, to take into
6405  * account the variance of the tasks' load and to return true if the available
6406  * capacity in meaningful for the load balancer.
6407  * As an example, an available capacity of 1% can appear but it doesn't make
6408  * any benefit for the load balance.
6409  */
6410 static inline bool
6411 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6412 {
6413 	if (sgs->sum_nr_running < sgs->group_weight)
6414 		return true;
6415 
6416 	if ((sgs->group_capacity * 100) >
6417 			(sgs->group_usage * env->sd->imbalance_pct))
6418 		return true;
6419 
6420 	return false;
6421 }
6422 
6423 /*
6424  *  group_is_overloaded returns true if the group has more tasks than it can
6425  *  handle.
6426  *  group_is_overloaded is not equals to !group_has_capacity because a group
6427  *  with the exact right number of tasks, has no more spare capacity but is not
6428  *  overloaded so both group_has_capacity and group_is_overloaded return
6429  *  false.
6430  */
6431 static inline bool
6432 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6433 {
6434 	if (sgs->sum_nr_running <= sgs->group_weight)
6435 		return false;
6436 
6437 	if ((sgs->group_capacity * 100) <
6438 			(sgs->group_usage * env->sd->imbalance_pct))
6439 		return true;
6440 
6441 	return false;
6442 }
6443 
6444 static enum group_type group_classify(struct lb_env *env,
6445 		struct sched_group *group,
6446 		struct sg_lb_stats *sgs)
6447 {
6448 	if (sgs->group_no_capacity)
6449 		return group_overloaded;
6450 
6451 	if (sg_imbalanced(group))
6452 		return group_imbalanced;
6453 
6454 	return group_other;
6455 }
6456 
6457 /**
6458  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6459  * @env: The load balancing environment.
6460  * @group: sched_group whose statistics are to be updated.
6461  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6462  * @local_group: Does group contain this_cpu.
6463  * @sgs: variable to hold the statistics for this group.
6464  * @overload: Indicate more than one runnable task for any CPU.
6465  */
6466 static inline void update_sg_lb_stats(struct lb_env *env,
6467 			struct sched_group *group, int load_idx,
6468 			int local_group, struct sg_lb_stats *sgs,
6469 			bool *overload)
6470 {
6471 	unsigned long load;
6472 	int i;
6473 
6474 	memset(sgs, 0, sizeof(*sgs));
6475 
6476 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6477 		struct rq *rq = cpu_rq(i);
6478 
6479 		/* Bias balancing toward cpus of our domain */
6480 		if (local_group)
6481 			load = target_load(i, load_idx);
6482 		else
6483 			load = source_load(i, load_idx);
6484 
6485 		sgs->group_load += load;
6486 		sgs->group_usage += get_cpu_usage(i);
6487 		sgs->sum_nr_running += rq->cfs.h_nr_running;
6488 
6489 		if (rq->nr_running > 1)
6490 			*overload = true;
6491 
6492 #ifdef CONFIG_NUMA_BALANCING
6493 		sgs->nr_numa_running += rq->nr_numa_running;
6494 		sgs->nr_preferred_running += rq->nr_preferred_running;
6495 #endif
6496 		sgs->sum_weighted_load += weighted_cpuload(i);
6497 		if (idle_cpu(i))
6498 			sgs->idle_cpus++;
6499 	}
6500 
6501 	/* Adjust by relative CPU capacity of the group */
6502 	sgs->group_capacity = group->sgc->capacity;
6503 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6504 
6505 	if (sgs->sum_nr_running)
6506 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6507 
6508 	sgs->group_weight = group->group_weight;
6509 
6510 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
6511 	sgs->group_type = group_classify(env, group, sgs);
6512 }
6513 
6514 /**
6515  * update_sd_pick_busiest - return 1 on busiest group
6516  * @env: The load balancing environment.
6517  * @sds: sched_domain statistics
6518  * @sg: sched_group candidate to be checked for being the busiest
6519  * @sgs: sched_group statistics
6520  *
6521  * Determine if @sg is a busier group than the previously selected
6522  * busiest group.
6523  *
6524  * Return: %true if @sg is a busier group than the previously selected
6525  * busiest group. %false otherwise.
6526  */
6527 static bool update_sd_pick_busiest(struct lb_env *env,
6528 				   struct sd_lb_stats *sds,
6529 				   struct sched_group *sg,
6530 				   struct sg_lb_stats *sgs)
6531 {
6532 	struct sg_lb_stats *busiest = &sds->busiest_stat;
6533 
6534 	if (sgs->group_type > busiest->group_type)
6535 		return true;
6536 
6537 	if (sgs->group_type < busiest->group_type)
6538 		return false;
6539 
6540 	if (sgs->avg_load <= busiest->avg_load)
6541 		return false;
6542 
6543 	/* This is the busiest node in its class. */
6544 	if (!(env->sd->flags & SD_ASYM_PACKING))
6545 		return true;
6546 
6547 	/*
6548 	 * ASYM_PACKING needs to move all the work to the lowest
6549 	 * numbered CPUs in the group, therefore mark all groups
6550 	 * higher than ourself as busy.
6551 	 */
6552 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6553 		if (!sds->busiest)
6554 			return true;
6555 
6556 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6557 			return true;
6558 	}
6559 
6560 	return false;
6561 }
6562 
6563 #ifdef CONFIG_NUMA_BALANCING
6564 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6565 {
6566 	if (sgs->sum_nr_running > sgs->nr_numa_running)
6567 		return regular;
6568 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
6569 		return remote;
6570 	return all;
6571 }
6572 
6573 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6574 {
6575 	if (rq->nr_running > rq->nr_numa_running)
6576 		return regular;
6577 	if (rq->nr_running > rq->nr_preferred_running)
6578 		return remote;
6579 	return all;
6580 }
6581 #else
6582 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6583 {
6584 	return all;
6585 }
6586 
6587 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6588 {
6589 	return regular;
6590 }
6591 #endif /* CONFIG_NUMA_BALANCING */
6592 
6593 /**
6594  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6595  * @env: The load balancing environment.
6596  * @sds: variable to hold the statistics for this sched_domain.
6597  */
6598 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6599 {
6600 	struct sched_domain *child = env->sd->child;
6601 	struct sched_group *sg = env->sd->groups;
6602 	struct sg_lb_stats tmp_sgs;
6603 	int load_idx, prefer_sibling = 0;
6604 	bool overload = false;
6605 
6606 	if (child && child->flags & SD_PREFER_SIBLING)
6607 		prefer_sibling = 1;
6608 
6609 	load_idx = get_sd_load_idx(env->sd, env->idle);
6610 
6611 	do {
6612 		struct sg_lb_stats *sgs = &tmp_sgs;
6613 		int local_group;
6614 
6615 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6616 		if (local_group) {
6617 			sds->local = sg;
6618 			sgs = &sds->local_stat;
6619 
6620 			if (env->idle != CPU_NEWLY_IDLE ||
6621 			    time_after_eq(jiffies, sg->sgc->next_update))
6622 				update_group_capacity(env->sd, env->dst_cpu);
6623 		}
6624 
6625 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6626 						&overload);
6627 
6628 		if (local_group)
6629 			goto next_group;
6630 
6631 		/*
6632 		 * In case the child domain prefers tasks go to siblings
6633 		 * first, lower the sg capacity so that we'll try
6634 		 * and move all the excess tasks away. We lower the capacity
6635 		 * of a group only if the local group has the capacity to fit
6636 		 * these excess tasks. The extra check prevents the case where
6637 		 * you always pull from the heaviest group when it is already
6638 		 * under-utilized (possible with a large weight task outweighs
6639 		 * the tasks on the system).
6640 		 */
6641 		if (prefer_sibling && sds->local &&
6642 		    group_has_capacity(env, &sds->local_stat) &&
6643 		    (sgs->sum_nr_running > 1)) {
6644 			sgs->group_no_capacity = 1;
6645 			sgs->group_type = group_overloaded;
6646 		}
6647 
6648 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6649 			sds->busiest = sg;
6650 			sds->busiest_stat = *sgs;
6651 		}
6652 
6653 next_group:
6654 		/* Now, start updating sd_lb_stats */
6655 		sds->total_load += sgs->group_load;
6656 		sds->total_capacity += sgs->group_capacity;
6657 
6658 		sg = sg->next;
6659 	} while (sg != env->sd->groups);
6660 
6661 	if (env->sd->flags & SD_NUMA)
6662 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6663 
6664 	if (!env->sd->parent) {
6665 		/* update overload indicator if we are at root domain */
6666 		if (env->dst_rq->rd->overload != overload)
6667 			env->dst_rq->rd->overload = overload;
6668 	}
6669 
6670 }
6671 
6672 /**
6673  * check_asym_packing - Check to see if the group is packed into the
6674  *			sched doman.
6675  *
6676  * This is primarily intended to used at the sibling level.  Some
6677  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6678  * case of POWER7, it can move to lower SMT modes only when higher
6679  * threads are idle.  When in lower SMT modes, the threads will
6680  * perform better since they share less core resources.  Hence when we
6681  * have idle threads, we want them to be the higher ones.
6682  *
6683  * This packing function is run on idle threads.  It checks to see if
6684  * the busiest CPU in this domain (core in the P7 case) has a higher
6685  * CPU number than the packing function is being run on.  Here we are
6686  * assuming lower CPU number will be equivalent to lower a SMT thread
6687  * number.
6688  *
6689  * Return: 1 when packing is required and a task should be moved to
6690  * this CPU.  The amount of the imbalance is returned in *imbalance.
6691  *
6692  * @env: The load balancing environment.
6693  * @sds: Statistics of the sched_domain which is to be packed
6694  */
6695 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6696 {
6697 	int busiest_cpu;
6698 
6699 	if (!(env->sd->flags & SD_ASYM_PACKING))
6700 		return 0;
6701 
6702 	if (!sds->busiest)
6703 		return 0;
6704 
6705 	busiest_cpu = group_first_cpu(sds->busiest);
6706 	if (env->dst_cpu > busiest_cpu)
6707 		return 0;
6708 
6709 	env->imbalance = DIV_ROUND_CLOSEST(
6710 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6711 		SCHED_CAPACITY_SCALE);
6712 
6713 	return 1;
6714 }
6715 
6716 /**
6717  * fix_small_imbalance - Calculate the minor imbalance that exists
6718  *			amongst the groups of a sched_domain, during
6719  *			load balancing.
6720  * @env: The load balancing environment.
6721  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6722  */
6723 static inline
6724 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6725 {
6726 	unsigned long tmp, capa_now = 0, capa_move = 0;
6727 	unsigned int imbn = 2;
6728 	unsigned long scaled_busy_load_per_task;
6729 	struct sg_lb_stats *local, *busiest;
6730 
6731 	local = &sds->local_stat;
6732 	busiest = &sds->busiest_stat;
6733 
6734 	if (!local->sum_nr_running)
6735 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6736 	else if (busiest->load_per_task > local->load_per_task)
6737 		imbn = 1;
6738 
6739 	scaled_busy_load_per_task =
6740 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6741 		busiest->group_capacity;
6742 
6743 	if (busiest->avg_load + scaled_busy_load_per_task >=
6744 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
6745 		env->imbalance = busiest->load_per_task;
6746 		return;
6747 	}
6748 
6749 	/*
6750 	 * OK, we don't have enough imbalance to justify moving tasks,
6751 	 * however we may be able to increase total CPU capacity used by
6752 	 * moving them.
6753 	 */
6754 
6755 	capa_now += busiest->group_capacity *
6756 			min(busiest->load_per_task, busiest->avg_load);
6757 	capa_now += local->group_capacity *
6758 			min(local->load_per_task, local->avg_load);
6759 	capa_now /= SCHED_CAPACITY_SCALE;
6760 
6761 	/* Amount of load we'd subtract */
6762 	if (busiest->avg_load > scaled_busy_load_per_task) {
6763 		capa_move += busiest->group_capacity *
6764 			    min(busiest->load_per_task,
6765 				busiest->avg_load - scaled_busy_load_per_task);
6766 	}
6767 
6768 	/* Amount of load we'd add */
6769 	if (busiest->avg_load * busiest->group_capacity <
6770 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6771 		tmp = (busiest->avg_load * busiest->group_capacity) /
6772 		      local->group_capacity;
6773 	} else {
6774 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6775 		      local->group_capacity;
6776 	}
6777 	capa_move += local->group_capacity *
6778 		    min(local->load_per_task, local->avg_load + tmp);
6779 	capa_move /= SCHED_CAPACITY_SCALE;
6780 
6781 	/* Move if we gain throughput */
6782 	if (capa_move > capa_now)
6783 		env->imbalance = busiest->load_per_task;
6784 }
6785 
6786 /**
6787  * calculate_imbalance - Calculate the amount of imbalance present within the
6788  *			 groups of a given sched_domain during load balance.
6789  * @env: load balance environment
6790  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6791  */
6792 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6793 {
6794 	unsigned long max_pull, load_above_capacity = ~0UL;
6795 	struct sg_lb_stats *local, *busiest;
6796 
6797 	local = &sds->local_stat;
6798 	busiest = &sds->busiest_stat;
6799 
6800 	if (busiest->group_type == group_imbalanced) {
6801 		/*
6802 		 * In the group_imb case we cannot rely on group-wide averages
6803 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
6804 		 */
6805 		busiest->load_per_task =
6806 			min(busiest->load_per_task, sds->avg_load);
6807 	}
6808 
6809 	/*
6810 	 * In the presence of smp nice balancing, certain scenarios can have
6811 	 * max load less than avg load(as we skip the groups at or below
6812 	 * its cpu_capacity, while calculating max_load..)
6813 	 */
6814 	if (busiest->avg_load <= sds->avg_load ||
6815 	    local->avg_load >= sds->avg_load) {
6816 		env->imbalance = 0;
6817 		return fix_small_imbalance(env, sds);
6818 	}
6819 
6820 	/*
6821 	 * If there aren't any idle cpus, avoid creating some.
6822 	 */
6823 	if (busiest->group_type == group_overloaded &&
6824 	    local->group_type   == group_overloaded) {
6825 		load_above_capacity = busiest->sum_nr_running *
6826 					SCHED_LOAD_SCALE;
6827 		if (load_above_capacity > busiest->group_capacity)
6828 			load_above_capacity -= busiest->group_capacity;
6829 		else
6830 			load_above_capacity = ~0UL;
6831 	}
6832 
6833 	/*
6834 	 * We're trying to get all the cpus to the average_load, so we don't
6835 	 * want to push ourselves above the average load, nor do we wish to
6836 	 * reduce the max loaded cpu below the average load. At the same time,
6837 	 * we also don't want to reduce the group load below the group capacity
6838 	 * (so that we can implement power-savings policies etc). Thus we look
6839 	 * for the minimum possible imbalance.
6840 	 */
6841 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6842 
6843 	/* How much load to actually move to equalise the imbalance */
6844 	env->imbalance = min(
6845 		max_pull * busiest->group_capacity,
6846 		(sds->avg_load - local->avg_load) * local->group_capacity
6847 	) / SCHED_CAPACITY_SCALE;
6848 
6849 	/*
6850 	 * if *imbalance is less than the average load per runnable task
6851 	 * there is no guarantee that any tasks will be moved so we'll have
6852 	 * a think about bumping its value to force at least one task to be
6853 	 * moved
6854 	 */
6855 	if (env->imbalance < busiest->load_per_task)
6856 		return fix_small_imbalance(env, sds);
6857 }
6858 
6859 /******* find_busiest_group() helpers end here *********************/
6860 
6861 /**
6862  * find_busiest_group - Returns the busiest group within the sched_domain
6863  * if there is an imbalance. If there isn't an imbalance, and
6864  * the user has opted for power-savings, it returns a group whose
6865  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6866  * such a group exists.
6867  *
6868  * Also calculates the amount of weighted load which should be moved
6869  * to restore balance.
6870  *
6871  * @env: The load balancing environment.
6872  *
6873  * Return:	- The busiest group if imbalance exists.
6874  *		- If no imbalance and user has opted for power-savings balance,
6875  *		   return the least loaded group whose CPUs can be
6876  *		   put to idle by rebalancing its tasks onto our group.
6877  */
6878 static struct sched_group *find_busiest_group(struct lb_env *env)
6879 {
6880 	struct sg_lb_stats *local, *busiest;
6881 	struct sd_lb_stats sds;
6882 
6883 	init_sd_lb_stats(&sds);
6884 
6885 	/*
6886 	 * Compute the various statistics relavent for load balancing at
6887 	 * this level.
6888 	 */
6889 	update_sd_lb_stats(env, &sds);
6890 	local = &sds.local_stat;
6891 	busiest = &sds.busiest_stat;
6892 
6893 	/* ASYM feature bypasses nice load balance check */
6894 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6895 	    check_asym_packing(env, &sds))
6896 		return sds.busiest;
6897 
6898 	/* There is no busy sibling group to pull tasks from */
6899 	if (!sds.busiest || busiest->sum_nr_running == 0)
6900 		goto out_balanced;
6901 
6902 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6903 						/ sds.total_capacity;
6904 
6905 	/*
6906 	 * If the busiest group is imbalanced the below checks don't
6907 	 * work because they assume all things are equal, which typically
6908 	 * isn't true due to cpus_allowed constraints and the like.
6909 	 */
6910 	if (busiest->group_type == group_imbalanced)
6911 		goto force_balance;
6912 
6913 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6914 	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6915 	    busiest->group_no_capacity)
6916 		goto force_balance;
6917 
6918 	/*
6919 	 * If the local group is busier than the selected busiest group
6920 	 * don't try and pull any tasks.
6921 	 */
6922 	if (local->avg_load >= busiest->avg_load)
6923 		goto out_balanced;
6924 
6925 	/*
6926 	 * Don't pull any tasks if this group is already above the domain
6927 	 * average load.
6928 	 */
6929 	if (local->avg_load >= sds.avg_load)
6930 		goto out_balanced;
6931 
6932 	if (env->idle == CPU_IDLE) {
6933 		/*
6934 		 * This cpu is idle. If the busiest group is not overloaded
6935 		 * and there is no imbalance between this and busiest group
6936 		 * wrt idle cpus, it is balanced. The imbalance becomes
6937 		 * significant if the diff is greater than 1 otherwise we
6938 		 * might end up to just move the imbalance on another group
6939 		 */
6940 		if ((busiest->group_type != group_overloaded) &&
6941 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
6942 			goto out_balanced;
6943 	} else {
6944 		/*
6945 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6946 		 * imbalance_pct to be conservative.
6947 		 */
6948 		if (100 * busiest->avg_load <=
6949 				env->sd->imbalance_pct * local->avg_load)
6950 			goto out_balanced;
6951 	}
6952 
6953 force_balance:
6954 	/* Looks like there is an imbalance. Compute it */
6955 	calculate_imbalance(env, &sds);
6956 	return sds.busiest;
6957 
6958 out_balanced:
6959 	env->imbalance = 0;
6960 	return NULL;
6961 }
6962 
6963 /*
6964  * find_busiest_queue - find the busiest runqueue among the cpus in group.
6965  */
6966 static struct rq *find_busiest_queue(struct lb_env *env,
6967 				     struct sched_group *group)
6968 {
6969 	struct rq *busiest = NULL, *rq;
6970 	unsigned long busiest_load = 0, busiest_capacity = 1;
6971 	int i;
6972 
6973 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6974 		unsigned long capacity, wl;
6975 		enum fbq_type rt;
6976 
6977 		rq = cpu_rq(i);
6978 		rt = fbq_classify_rq(rq);
6979 
6980 		/*
6981 		 * We classify groups/runqueues into three groups:
6982 		 *  - regular: there are !numa tasks
6983 		 *  - remote:  there are numa tasks that run on the 'wrong' node
6984 		 *  - all:     there is no distinction
6985 		 *
6986 		 * In order to avoid migrating ideally placed numa tasks,
6987 		 * ignore those when there's better options.
6988 		 *
6989 		 * If we ignore the actual busiest queue to migrate another
6990 		 * task, the next balance pass can still reduce the busiest
6991 		 * queue by moving tasks around inside the node.
6992 		 *
6993 		 * If we cannot move enough load due to this classification
6994 		 * the next pass will adjust the group classification and
6995 		 * allow migration of more tasks.
6996 		 *
6997 		 * Both cases only affect the total convergence complexity.
6998 		 */
6999 		if (rt > env->fbq_type)
7000 			continue;
7001 
7002 		capacity = capacity_of(i);
7003 
7004 		wl = weighted_cpuload(i);
7005 
7006 		/*
7007 		 * When comparing with imbalance, use weighted_cpuload()
7008 		 * which is not scaled with the cpu capacity.
7009 		 */
7010 
7011 		if (rq->nr_running == 1 && wl > env->imbalance &&
7012 		    !check_cpu_capacity(rq, env->sd))
7013 			continue;
7014 
7015 		/*
7016 		 * For the load comparisons with the other cpu's, consider
7017 		 * the weighted_cpuload() scaled with the cpu capacity, so
7018 		 * that the load can be moved away from the cpu that is
7019 		 * potentially running at a lower capacity.
7020 		 *
7021 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
7022 		 * multiplication to rid ourselves of the division works out
7023 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
7024 		 * our previous maximum.
7025 		 */
7026 		if (wl * busiest_capacity > busiest_load * capacity) {
7027 			busiest_load = wl;
7028 			busiest_capacity = capacity;
7029 			busiest = rq;
7030 		}
7031 	}
7032 
7033 	return busiest;
7034 }
7035 
7036 /*
7037  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7038  * so long as it is large enough.
7039  */
7040 #define MAX_PINNED_INTERVAL	512
7041 
7042 /* Working cpumask for load_balance and load_balance_newidle. */
7043 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7044 
7045 static int need_active_balance(struct lb_env *env)
7046 {
7047 	struct sched_domain *sd = env->sd;
7048 
7049 	if (env->idle == CPU_NEWLY_IDLE) {
7050 
7051 		/*
7052 		 * ASYM_PACKING needs to force migrate tasks from busy but
7053 		 * higher numbered CPUs in order to pack all tasks in the
7054 		 * lowest numbered CPUs.
7055 		 */
7056 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7057 			return 1;
7058 	}
7059 
7060 	/*
7061 	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
7062 	 * It's worth migrating the task if the src_cpu's capacity is reduced
7063 	 * because of other sched_class or IRQs if more capacity stays
7064 	 * available on dst_cpu.
7065 	 */
7066 	if ((env->idle != CPU_NOT_IDLE) &&
7067 	    (env->src_rq->cfs.h_nr_running == 1)) {
7068 		if ((check_cpu_capacity(env->src_rq, sd)) &&
7069 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
7070 			return 1;
7071 	}
7072 
7073 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7074 }
7075 
7076 static int active_load_balance_cpu_stop(void *data);
7077 
7078 static int should_we_balance(struct lb_env *env)
7079 {
7080 	struct sched_group *sg = env->sd->groups;
7081 	struct cpumask *sg_cpus, *sg_mask;
7082 	int cpu, balance_cpu = -1;
7083 
7084 	/*
7085 	 * In the newly idle case, we will allow all the cpu's
7086 	 * to do the newly idle load balance.
7087 	 */
7088 	if (env->idle == CPU_NEWLY_IDLE)
7089 		return 1;
7090 
7091 	sg_cpus = sched_group_cpus(sg);
7092 	sg_mask = sched_group_mask(sg);
7093 	/* Try to find first idle cpu */
7094 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
7095 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
7096 			continue;
7097 
7098 		balance_cpu = cpu;
7099 		break;
7100 	}
7101 
7102 	if (balance_cpu == -1)
7103 		balance_cpu = group_balance_cpu(sg);
7104 
7105 	/*
7106 	 * First idle cpu or the first cpu(busiest) in this sched group
7107 	 * is eligible for doing load balancing at this and above domains.
7108 	 */
7109 	return balance_cpu == env->dst_cpu;
7110 }
7111 
7112 /*
7113  * Check this_cpu to ensure it is balanced within domain. Attempt to move
7114  * tasks if there is an imbalance.
7115  */
7116 static int load_balance(int this_cpu, struct rq *this_rq,
7117 			struct sched_domain *sd, enum cpu_idle_type idle,
7118 			int *continue_balancing)
7119 {
7120 	int ld_moved, cur_ld_moved, active_balance = 0;
7121 	struct sched_domain *sd_parent = sd->parent;
7122 	struct sched_group *group;
7123 	struct rq *busiest;
7124 	unsigned long flags;
7125 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
7126 
7127 	struct lb_env env = {
7128 		.sd		= sd,
7129 		.dst_cpu	= this_cpu,
7130 		.dst_rq		= this_rq,
7131 		.dst_grpmask    = sched_group_cpus(sd->groups),
7132 		.idle		= idle,
7133 		.loop_break	= sched_nr_migrate_break,
7134 		.cpus		= cpus,
7135 		.fbq_type	= all,
7136 		.tasks		= LIST_HEAD_INIT(env.tasks),
7137 	};
7138 
7139 	/*
7140 	 * For NEWLY_IDLE load_balancing, we don't need to consider
7141 	 * other cpus in our group
7142 	 */
7143 	if (idle == CPU_NEWLY_IDLE)
7144 		env.dst_grpmask = NULL;
7145 
7146 	cpumask_copy(cpus, cpu_active_mask);
7147 
7148 	schedstat_inc(sd, lb_count[idle]);
7149 
7150 redo:
7151 	if (!should_we_balance(&env)) {
7152 		*continue_balancing = 0;
7153 		goto out_balanced;
7154 	}
7155 
7156 	group = find_busiest_group(&env);
7157 	if (!group) {
7158 		schedstat_inc(sd, lb_nobusyg[idle]);
7159 		goto out_balanced;
7160 	}
7161 
7162 	busiest = find_busiest_queue(&env, group);
7163 	if (!busiest) {
7164 		schedstat_inc(sd, lb_nobusyq[idle]);
7165 		goto out_balanced;
7166 	}
7167 
7168 	BUG_ON(busiest == env.dst_rq);
7169 
7170 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7171 
7172 	env.src_cpu = busiest->cpu;
7173 	env.src_rq = busiest;
7174 
7175 	ld_moved = 0;
7176 	if (busiest->nr_running > 1) {
7177 		/*
7178 		 * Attempt to move tasks. If find_busiest_group has found
7179 		 * an imbalance but busiest->nr_running <= 1, the group is
7180 		 * still unbalanced. ld_moved simply stays zero, so it is
7181 		 * correctly treated as an imbalance.
7182 		 */
7183 		env.flags |= LBF_ALL_PINNED;
7184 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7185 
7186 more_balance:
7187 		raw_spin_lock_irqsave(&busiest->lock, flags);
7188 
7189 		/*
7190 		 * cur_ld_moved - load moved in current iteration
7191 		 * ld_moved     - cumulative load moved across iterations
7192 		 */
7193 		cur_ld_moved = detach_tasks(&env);
7194 
7195 		/*
7196 		 * We've detached some tasks from busiest_rq. Every
7197 		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7198 		 * unlock busiest->lock, and we are able to be sure
7199 		 * that nobody can manipulate the tasks in parallel.
7200 		 * See task_rq_lock() family for the details.
7201 		 */
7202 
7203 		raw_spin_unlock(&busiest->lock);
7204 
7205 		if (cur_ld_moved) {
7206 			attach_tasks(&env);
7207 			ld_moved += cur_ld_moved;
7208 		}
7209 
7210 		local_irq_restore(flags);
7211 
7212 		if (env.flags & LBF_NEED_BREAK) {
7213 			env.flags &= ~LBF_NEED_BREAK;
7214 			goto more_balance;
7215 		}
7216 
7217 		/*
7218 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7219 		 * us and move them to an alternate dst_cpu in our sched_group
7220 		 * where they can run. The upper limit on how many times we
7221 		 * iterate on same src_cpu is dependent on number of cpus in our
7222 		 * sched_group.
7223 		 *
7224 		 * This changes load balance semantics a bit on who can move
7225 		 * load to a given_cpu. In addition to the given_cpu itself
7226 		 * (or a ilb_cpu acting on its behalf where given_cpu is
7227 		 * nohz-idle), we now have balance_cpu in a position to move
7228 		 * load to given_cpu. In rare situations, this may cause
7229 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7230 		 * _independently_ and at _same_ time to move some load to
7231 		 * given_cpu) causing exceess load to be moved to given_cpu.
7232 		 * This however should not happen so much in practice and
7233 		 * moreover subsequent load balance cycles should correct the
7234 		 * excess load moved.
7235 		 */
7236 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7237 
7238 			/* Prevent to re-select dst_cpu via env's cpus */
7239 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
7240 
7241 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
7242 			env.dst_cpu	 = env.new_dst_cpu;
7243 			env.flags	&= ~LBF_DST_PINNED;
7244 			env.loop	 = 0;
7245 			env.loop_break	 = sched_nr_migrate_break;
7246 
7247 			/*
7248 			 * Go back to "more_balance" rather than "redo" since we
7249 			 * need to continue with same src_cpu.
7250 			 */
7251 			goto more_balance;
7252 		}
7253 
7254 		/*
7255 		 * We failed to reach balance because of affinity.
7256 		 */
7257 		if (sd_parent) {
7258 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7259 
7260 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7261 				*group_imbalance = 1;
7262 		}
7263 
7264 		/* All tasks on this runqueue were pinned by CPU affinity */
7265 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
7266 			cpumask_clear_cpu(cpu_of(busiest), cpus);
7267 			if (!cpumask_empty(cpus)) {
7268 				env.loop = 0;
7269 				env.loop_break = sched_nr_migrate_break;
7270 				goto redo;
7271 			}
7272 			goto out_all_pinned;
7273 		}
7274 	}
7275 
7276 	if (!ld_moved) {
7277 		schedstat_inc(sd, lb_failed[idle]);
7278 		/*
7279 		 * Increment the failure counter only on periodic balance.
7280 		 * We do not want newidle balance, which can be very
7281 		 * frequent, pollute the failure counter causing
7282 		 * excessive cache_hot migrations and active balances.
7283 		 */
7284 		if (idle != CPU_NEWLY_IDLE)
7285 			sd->nr_balance_failed++;
7286 
7287 		if (need_active_balance(&env)) {
7288 			raw_spin_lock_irqsave(&busiest->lock, flags);
7289 
7290 			/* don't kick the active_load_balance_cpu_stop,
7291 			 * if the curr task on busiest cpu can't be
7292 			 * moved to this_cpu
7293 			 */
7294 			if (!cpumask_test_cpu(this_cpu,
7295 					tsk_cpus_allowed(busiest->curr))) {
7296 				raw_spin_unlock_irqrestore(&busiest->lock,
7297 							    flags);
7298 				env.flags |= LBF_ALL_PINNED;
7299 				goto out_one_pinned;
7300 			}
7301 
7302 			/*
7303 			 * ->active_balance synchronizes accesses to
7304 			 * ->active_balance_work.  Once set, it's cleared
7305 			 * only after active load balance is finished.
7306 			 */
7307 			if (!busiest->active_balance) {
7308 				busiest->active_balance = 1;
7309 				busiest->push_cpu = this_cpu;
7310 				active_balance = 1;
7311 			}
7312 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
7313 
7314 			if (active_balance) {
7315 				stop_one_cpu_nowait(cpu_of(busiest),
7316 					active_load_balance_cpu_stop, busiest,
7317 					&busiest->active_balance_work);
7318 			}
7319 
7320 			/*
7321 			 * We've kicked active balancing, reset the failure
7322 			 * counter.
7323 			 */
7324 			sd->nr_balance_failed = sd->cache_nice_tries+1;
7325 		}
7326 	} else
7327 		sd->nr_balance_failed = 0;
7328 
7329 	if (likely(!active_balance)) {
7330 		/* We were unbalanced, so reset the balancing interval */
7331 		sd->balance_interval = sd->min_interval;
7332 	} else {
7333 		/*
7334 		 * If we've begun active balancing, start to back off. This
7335 		 * case may not be covered by the all_pinned logic if there
7336 		 * is only 1 task on the busy runqueue (because we don't call
7337 		 * detach_tasks).
7338 		 */
7339 		if (sd->balance_interval < sd->max_interval)
7340 			sd->balance_interval *= 2;
7341 	}
7342 
7343 	goto out;
7344 
7345 out_balanced:
7346 	/*
7347 	 * We reach balance although we may have faced some affinity
7348 	 * constraints. Clear the imbalance flag if it was set.
7349 	 */
7350 	if (sd_parent) {
7351 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7352 
7353 		if (*group_imbalance)
7354 			*group_imbalance = 0;
7355 	}
7356 
7357 out_all_pinned:
7358 	/*
7359 	 * We reach balance because all tasks are pinned at this level so
7360 	 * we can't migrate them. Let the imbalance flag set so parent level
7361 	 * can try to migrate them.
7362 	 */
7363 	schedstat_inc(sd, lb_balanced[idle]);
7364 
7365 	sd->nr_balance_failed = 0;
7366 
7367 out_one_pinned:
7368 	/* tune up the balancing interval */
7369 	if (((env.flags & LBF_ALL_PINNED) &&
7370 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
7371 			(sd->balance_interval < sd->max_interval))
7372 		sd->balance_interval *= 2;
7373 
7374 	ld_moved = 0;
7375 out:
7376 	return ld_moved;
7377 }
7378 
7379 static inline unsigned long
7380 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7381 {
7382 	unsigned long interval = sd->balance_interval;
7383 
7384 	if (cpu_busy)
7385 		interval *= sd->busy_factor;
7386 
7387 	/* scale ms to jiffies */
7388 	interval = msecs_to_jiffies(interval);
7389 	interval = clamp(interval, 1UL, max_load_balance_interval);
7390 
7391 	return interval;
7392 }
7393 
7394 static inline void
7395 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7396 {
7397 	unsigned long interval, next;
7398 
7399 	interval = get_sd_balance_interval(sd, cpu_busy);
7400 	next = sd->last_balance + interval;
7401 
7402 	if (time_after(*next_balance, next))
7403 		*next_balance = next;
7404 }
7405 
7406 /*
7407  * idle_balance is called by schedule() if this_cpu is about to become
7408  * idle. Attempts to pull tasks from other CPUs.
7409  */
7410 static int idle_balance(struct rq *this_rq)
7411 {
7412 	unsigned long next_balance = jiffies + HZ;
7413 	int this_cpu = this_rq->cpu;
7414 	struct sched_domain *sd;
7415 	int pulled_task = 0;
7416 	u64 curr_cost = 0;
7417 
7418 	idle_enter_fair(this_rq);
7419 
7420 	/*
7421 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
7422 	 * measure the duration of idle_balance() as idle time.
7423 	 */
7424 	this_rq->idle_stamp = rq_clock(this_rq);
7425 
7426 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7427 	    !this_rq->rd->overload) {
7428 		rcu_read_lock();
7429 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
7430 		if (sd)
7431 			update_next_balance(sd, 0, &next_balance);
7432 		rcu_read_unlock();
7433 
7434 		goto out;
7435 	}
7436 
7437 	raw_spin_unlock(&this_rq->lock);
7438 
7439 	update_blocked_averages(this_cpu);
7440 	rcu_read_lock();
7441 	for_each_domain(this_cpu, sd) {
7442 		int continue_balancing = 1;
7443 		u64 t0, domain_cost;
7444 
7445 		if (!(sd->flags & SD_LOAD_BALANCE))
7446 			continue;
7447 
7448 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7449 			update_next_balance(sd, 0, &next_balance);
7450 			break;
7451 		}
7452 
7453 		if (sd->flags & SD_BALANCE_NEWIDLE) {
7454 			t0 = sched_clock_cpu(this_cpu);
7455 
7456 			pulled_task = load_balance(this_cpu, this_rq,
7457 						   sd, CPU_NEWLY_IDLE,
7458 						   &continue_balancing);
7459 
7460 			domain_cost = sched_clock_cpu(this_cpu) - t0;
7461 			if (domain_cost > sd->max_newidle_lb_cost)
7462 				sd->max_newidle_lb_cost = domain_cost;
7463 
7464 			curr_cost += domain_cost;
7465 		}
7466 
7467 		update_next_balance(sd, 0, &next_balance);
7468 
7469 		/*
7470 		 * Stop searching for tasks to pull if there are
7471 		 * now runnable tasks on this rq.
7472 		 */
7473 		if (pulled_task || this_rq->nr_running > 0)
7474 			break;
7475 	}
7476 	rcu_read_unlock();
7477 
7478 	raw_spin_lock(&this_rq->lock);
7479 
7480 	if (curr_cost > this_rq->max_idle_balance_cost)
7481 		this_rq->max_idle_balance_cost = curr_cost;
7482 
7483 	/*
7484 	 * While browsing the domains, we released the rq lock, a task could
7485 	 * have been enqueued in the meantime. Since we're not going idle,
7486 	 * pretend we pulled a task.
7487 	 */
7488 	if (this_rq->cfs.h_nr_running && !pulled_task)
7489 		pulled_task = 1;
7490 
7491 out:
7492 	/* Move the next balance forward */
7493 	if (time_after(this_rq->next_balance, next_balance))
7494 		this_rq->next_balance = next_balance;
7495 
7496 	/* Is there a task of a high priority class? */
7497 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7498 		pulled_task = -1;
7499 
7500 	if (pulled_task) {
7501 		idle_exit_fair(this_rq);
7502 		this_rq->idle_stamp = 0;
7503 	}
7504 
7505 	return pulled_task;
7506 }
7507 
7508 /*
7509  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7510  * running tasks off the busiest CPU onto idle CPUs. It requires at
7511  * least 1 task to be running on each physical CPU where possible, and
7512  * avoids physical / logical imbalances.
7513  */
7514 static int active_load_balance_cpu_stop(void *data)
7515 {
7516 	struct rq *busiest_rq = data;
7517 	int busiest_cpu = cpu_of(busiest_rq);
7518 	int target_cpu = busiest_rq->push_cpu;
7519 	struct rq *target_rq = cpu_rq(target_cpu);
7520 	struct sched_domain *sd;
7521 	struct task_struct *p = NULL;
7522 
7523 	raw_spin_lock_irq(&busiest_rq->lock);
7524 
7525 	/* make sure the requested cpu hasn't gone down in the meantime */
7526 	if (unlikely(busiest_cpu != smp_processor_id() ||
7527 		     !busiest_rq->active_balance))
7528 		goto out_unlock;
7529 
7530 	/* Is there any task to move? */
7531 	if (busiest_rq->nr_running <= 1)
7532 		goto out_unlock;
7533 
7534 	/*
7535 	 * This condition is "impossible", if it occurs
7536 	 * we need to fix it. Originally reported by
7537 	 * Bjorn Helgaas on a 128-cpu setup.
7538 	 */
7539 	BUG_ON(busiest_rq == target_rq);
7540 
7541 	/* Search for an sd spanning us and the target CPU. */
7542 	rcu_read_lock();
7543 	for_each_domain(target_cpu, sd) {
7544 		if ((sd->flags & SD_LOAD_BALANCE) &&
7545 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7546 				break;
7547 	}
7548 
7549 	if (likely(sd)) {
7550 		struct lb_env env = {
7551 			.sd		= sd,
7552 			.dst_cpu	= target_cpu,
7553 			.dst_rq		= target_rq,
7554 			.src_cpu	= busiest_rq->cpu,
7555 			.src_rq		= busiest_rq,
7556 			.idle		= CPU_IDLE,
7557 		};
7558 
7559 		schedstat_inc(sd, alb_count);
7560 
7561 		p = detach_one_task(&env);
7562 		if (p)
7563 			schedstat_inc(sd, alb_pushed);
7564 		else
7565 			schedstat_inc(sd, alb_failed);
7566 	}
7567 	rcu_read_unlock();
7568 out_unlock:
7569 	busiest_rq->active_balance = 0;
7570 	raw_spin_unlock(&busiest_rq->lock);
7571 
7572 	if (p)
7573 		attach_one_task(target_rq, p);
7574 
7575 	local_irq_enable();
7576 
7577 	return 0;
7578 }
7579 
7580 static inline int on_null_domain(struct rq *rq)
7581 {
7582 	return unlikely(!rcu_dereference_sched(rq->sd));
7583 }
7584 
7585 #ifdef CONFIG_NO_HZ_COMMON
7586 /*
7587  * idle load balancing details
7588  * - When one of the busy CPUs notice that there may be an idle rebalancing
7589  *   needed, they will kick the idle load balancer, which then does idle
7590  *   load balancing for all the idle CPUs.
7591  */
7592 static struct {
7593 	cpumask_var_t idle_cpus_mask;
7594 	atomic_t nr_cpus;
7595 	unsigned long next_balance;     /* in jiffy units */
7596 } nohz ____cacheline_aligned;
7597 
7598 static inline int find_new_ilb(void)
7599 {
7600 	int ilb = cpumask_first(nohz.idle_cpus_mask);
7601 
7602 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
7603 		return ilb;
7604 
7605 	return nr_cpu_ids;
7606 }
7607 
7608 /*
7609  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7610  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7611  * CPU (if there is one).
7612  */
7613 static void nohz_balancer_kick(void)
7614 {
7615 	int ilb_cpu;
7616 
7617 	nohz.next_balance++;
7618 
7619 	ilb_cpu = find_new_ilb();
7620 
7621 	if (ilb_cpu >= nr_cpu_ids)
7622 		return;
7623 
7624 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7625 		return;
7626 	/*
7627 	 * Use smp_send_reschedule() instead of resched_cpu().
7628 	 * This way we generate a sched IPI on the target cpu which
7629 	 * is idle. And the softirq performing nohz idle load balance
7630 	 * will be run before returning from the IPI.
7631 	 */
7632 	smp_send_reschedule(ilb_cpu);
7633 	return;
7634 }
7635 
7636 static inline void nohz_balance_exit_idle(int cpu)
7637 {
7638 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7639 		/*
7640 		 * Completely isolated CPUs don't ever set, so we must test.
7641 		 */
7642 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7643 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7644 			atomic_dec(&nohz.nr_cpus);
7645 		}
7646 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7647 	}
7648 }
7649 
7650 static inline void set_cpu_sd_state_busy(void)
7651 {
7652 	struct sched_domain *sd;
7653 	int cpu = smp_processor_id();
7654 
7655 	rcu_read_lock();
7656 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7657 
7658 	if (!sd || !sd->nohz_idle)
7659 		goto unlock;
7660 	sd->nohz_idle = 0;
7661 
7662 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7663 unlock:
7664 	rcu_read_unlock();
7665 }
7666 
7667 void set_cpu_sd_state_idle(void)
7668 {
7669 	struct sched_domain *sd;
7670 	int cpu = smp_processor_id();
7671 
7672 	rcu_read_lock();
7673 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7674 
7675 	if (!sd || sd->nohz_idle)
7676 		goto unlock;
7677 	sd->nohz_idle = 1;
7678 
7679 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7680 unlock:
7681 	rcu_read_unlock();
7682 }
7683 
7684 /*
7685  * This routine will record that the cpu is going idle with tick stopped.
7686  * This info will be used in performing idle load balancing in the future.
7687  */
7688 void nohz_balance_enter_idle(int cpu)
7689 {
7690 	/*
7691 	 * If this cpu is going down, then nothing needs to be done.
7692 	 */
7693 	if (!cpu_active(cpu))
7694 		return;
7695 
7696 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7697 		return;
7698 
7699 	/*
7700 	 * If we're a completely isolated CPU, we don't play.
7701 	 */
7702 	if (on_null_domain(cpu_rq(cpu)))
7703 		return;
7704 
7705 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7706 	atomic_inc(&nohz.nr_cpus);
7707 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7708 }
7709 
7710 static int sched_ilb_notifier(struct notifier_block *nfb,
7711 					unsigned long action, void *hcpu)
7712 {
7713 	switch (action & ~CPU_TASKS_FROZEN) {
7714 	case CPU_DYING:
7715 		nohz_balance_exit_idle(smp_processor_id());
7716 		return NOTIFY_OK;
7717 	default:
7718 		return NOTIFY_DONE;
7719 	}
7720 }
7721 #endif
7722 
7723 static DEFINE_SPINLOCK(balancing);
7724 
7725 /*
7726  * Scale the max load_balance interval with the number of CPUs in the system.
7727  * This trades load-balance latency on larger machines for less cross talk.
7728  */
7729 void update_max_interval(void)
7730 {
7731 	max_load_balance_interval = HZ*num_online_cpus()/10;
7732 }
7733 
7734 /*
7735  * It checks each scheduling domain to see if it is due to be balanced,
7736  * and initiates a balancing operation if so.
7737  *
7738  * Balancing parameters are set up in init_sched_domains.
7739  */
7740 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7741 {
7742 	int continue_balancing = 1;
7743 	int cpu = rq->cpu;
7744 	unsigned long interval;
7745 	struct sched_domain *sd;
7746 	/* Earliest time when we have to do rebalance again */
7747 	unsigned long next_balance = jiffies + 60*HZ;
7748 	int update_next_balance = 0;
7749 	int need_serialize, need_decay = 0;
7750 	u64 max_cost = 0;
7751 
7752 	update_blocked_averages(cpu);
7753 
7754 	rcu_read_lock();
7755 	for_each_domain(cpu, sd) {
7756 		/*
7757 		 * Decay the newidle max times here because this is a regular
7758 		 * visit to all the domains. Decay ~1% per second.
7759 		 */
7760 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7761 			sd->max_newidle_lb_cost =
7762 				(sd->max_newidle_lb_cost * 253) / 256;
7763 			sd->next_decay_max_lb_cost = jiffies + HZ;
7764 			need_decay = 1;
7765 		}
7766 		max_cost += sd->max_newidle_lb_cost;
7767 
7768 		if (!(sd->flags & SD_LOAD_BALANCE))
7769 			continue;
7770 
7771 		/*
7772 		 * Stop the load balance at this level. There is another
7773 		 * CPU in our sched group which is doing load balancing more
7774 		 * actively.
7775 		 */
7776 		if (!continue_balancing) {
7777 			if (need_decay)
7778 				continue;
7779 			break;
7780 		}
7781 
7782 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7783 
7784 		need_serialize = sd->flags & SD_SERIALIZE;
7785 		if (need_serialize) {
7786 			if (!spin_trylock(&balancing))
7787 				goto out;
7788 		}
7789 
7790 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
7791 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7792 				/*
7793 				 * The LBF_DST_PINNED logic could have changed
7794 				 * env->dst_cpu, so we can't know our idle
7795 				 * state even if we migrated tasks. Update it.
7796 				 */
7797 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7798 			}
7799 			sd->last_balance = jiffies;
7800 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7801 		}
7802 		if (need_serialize)
7803 			spin_unlock(&balancing);
7804 out:
7805 		if (time_after(next_balance, sd->last_balance + interval)) {
7806 			next_balance = sd->last_balance + interval;
7807 			update_next_balance = 1;
7808 		}
7809 	}
7810 	if (need_decay) {
7811 		/*
7812 		 * Ensure the rq-wide value also decays but keep it at a
7813 		 * reasonable floor to avoid funnies with rq->avg_idle.
7814 		 */
7815 		rq->max_idle_balance_cost =
7816 			max((u64)sysctl_sched_migration_cost, max_cost);
7817 	}
7818 	rcu_read_unlock();
7819 
7820 	/*
7821 	 * next_balance will be updated only when there is a need.
7822 	 * When the cpu is attached to null domain for ex, it will not be
7823 	 * updated.
7824 	 */
7825 	if (likely(update_next_balance))
7826 		rq->next_balance = next_balance;
7827 }
7828 
7829 #ifdef CONFIG_NO_HZ_COMMON
7830 /*
7831  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7832  * rebalancing for all the cpus for whom scheduler ticks are stopped.
7833  */
7834 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7835 {
7836 	int this_cpu = this_rq->cpu;
7837 	struct rq *rq;
7838 	int balance_cpu;
7839 
7840 	if (idle != CPU_IDLE ||
7841 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7842 		goto end;
7843 
7844 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7845 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7846 			continue;
7847 
7848 		/*
7849 		 * If this cpu gets work to do, stop the load balancing
7850 		 * work being done for other cpus. Next load
7851 		 * balancing owner will pick it up.
7852 		 */
7853 		if (need_resched())
7854 			break;
7855 
7856 		rq = cpu_rq(balance_cpu);
7857 
7858 		/*
7859 		 * If time for next balance is due,
7860 		 * do the balance.
7861 		 */
7862 		if (time_after_eq(jiffies, rq->next_balance)) {
7863 			raw_spin_lock_irq(&rq->lock);
7864 			update_rq_clock(rq);
7865 			update_idle_cpu_load(rq);
7866 			raw_spin_unlock_irq(&rq->lock);
7867 			rebalance_domains(rq, CPU_IDLE);
7868 		}
7869 
7870 		if (time_after(this_rq->next_balance, rq->next_balance))
7871 			this_rq->next_balance = rq->next_balance;
7872 	}
7873 	nohz.next_balance = this_rq->next_balance;
7874 end:
7875 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7876 }
7877 
7878 /*
7879  * Current heuristic for kicking the idle load balancer in the presence
7880  * of an idle cpu in the system.
7881  *   - This rq has more than one task.
7882  *   - This rq has at least one CFS task and the capacity of the CPU is
7883  *     significantly reduced because of RT tasks or IRQs.
7884  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
7885  *     multiple busy cpu.
7886  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7887  *     domain span are idle.
7888  */
7889 static inline bool nohz_kick_needed(struct rq *rq)
7890 {
7891 	unsigned long now = jiffies;
7892 	struct sched_domain *sd;
7893 	struct sched_group_capacity *sgc;
7894 	int nr_busy, cpu = rq->cpu;
7895 	bool kick = false;
7896 
7897 	if (unlikely(rq->idle_balance))
7898 		return false;
7899 
7900        /*
7901 	* We may be recently in ticked or tickless idle mode. At the first
7902 	* busy tick after returning from idle, we will update the busy stats.
7903 	*/
7904 	set_cpu_sd_state_busy();
7905 	nohz_balance_exit_idle(cpu);
7906 
7907 	/*
7908 	 * None are in tickless mode and hence no need for NOHZ idle load
7909 	 * balancing.
7910 	 */
7911 	if (likely(!atomic_read(&nohz.nr_cpus)))
7912 		return false;
7913 
7914 	if (time_before(now, nohz.next_balance))
7915 		return false;
7916 
7917 	if (rq->nr_running >= 2)
7918 		return true;
7919 
7920 	rcu_read_lock();
7921 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
7922 	if (sd) {
7923 		sgc = sd->groups->sgc;
7924 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
7925 
7926 		if (nr_busy > 1) {
7927 			kick = true;
7928 			goto unlock;
7929 		}
7930 
7931 	}
7932 
7933 	sd = rcu_dereference(rq->sd);
7934 	if (sd) {
7935 		if ((rq->cfs.h_nr_running >= 1) &&
7936 				check_cpu_capacity(rq, sd)) {
7937 			kick = true;
7938 			goto unlock;
7939 		}
7940 	}
7941 
7942 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
7943 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7944 				  sched_domain_span(sd)) < cpu)) {
7945 		kick = true;
7946 		goto unlock;
7947 	}
7948 
7949 unlock:
7950 	rcu_read_unlock();
7951 	return kick;
7952 }
7953 #else
7954 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7955 #endif
7956 
7957 /*
7958  * run_rebalance_domains is triggered when needed from the scheduler tick.
7959  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7960  */
7961 static void run_rebalance_domains(struct softirq_action *h)
7962 {
7963 	struct rq *this_rq = this_rq();
7964 	enum cpu_idle_type idle = this_rq->idle_balance ?
7965 						CPU_IDLE : CPU_NOT_IDLE;
7966 
7967 	/*
7968 	 * If this cpu has a pending nohz_balance_kick, then do the
7969 	 * balancing on behalf of the other idle cpus whose ticks are
7970 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
7971 	 * give the idle cpus a chance to load balance. Else we may
7972 	 * load balance only within the local sched_domain hierarchy
7973 	 * and abort nohz_idle_balance altogether if we pull some load.
7974 	 */
7975 	nohz_idle_balance(this_rq, idle);
7976 	rebalance_domains(this_rq, idle);
7977 }
7978 
7979 /*
7980  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7981  */
7982 void trigger_load_balance(struct rq *rq)
7983 {
7984 	/* Don't need to rebalance while attached to NULL domain */
7985 	if (unlikely(on_null_domain(rq)))
7986 		return;
7987 
7988 	if (time_after_eq(jiffies, rq->next_balance))
7989 		raise_softirq(SCHED_SOFTIRQ);
7990 #ifdef CONFIG_NO_HZ_COMMON
7991 	if (nohz_kick_needed(rq))
7992 		nohz_balancer_kick();
7993 #endif
7994 }
7995 
7996 static void rq_online_fair(struct rq *rq)
7997 {
7998 	update_sysctl();
7999 
8000 	update_runtime_enabled(rq);
8001 }
8002 
8003 static void rq_offline_fair(struct rq *rq)
8004 {
8005 	update_sysctl();
8006 
8007 	/* Ensure any throttled groups are reachable by pick_next_task */
8008 	unthrottle_offline_cfs_rqs(rq);
8009 }
8010 
8011 #endif /* CONFIG_SMP */
8012 
8013 /*
8014  * scheduler tick hitting a task of our scheduling class:
8015  */
8016 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8017 {
8018 	struct cfs_rq *cfs_rq;
8019 	struct sched_entity *se = &curr->se;
8020 
8021 	for_each_sched_entity(se) {
8022 		cfs_rq = cfs_rq_of(se);
8023 		entity_tick(cfs_rq, se, queued);
8024 	}
8025 
8026 	if (numabalancing_enabled)
8027 		task_tick_numa(rq, curr);
8028 
8029 	update_rq_runnable_avg(rq, 1);
8030 }
8031 
8032 /*
8033  * called on fork with the child task as argument from the parent's context
8034  *  - child not yet on the tasklist
8035  *  - preemption disabled
8036  */
8037 static void task_fork_fair(struct task_struct *p)
8038 {
8039 	struct cfs_rq *cfs_rq;
8040 	struct sched_entity *se = &p->se, *curr;
8041 	int this_cpu = smp_processor_id();
8042 	struct rq *rq = this_rq();
8043 	unsigned long flags;
8044 
8045 	raw_spin_lock_irqsave(&rq->lock, flags);
8046 
8047 	update_rq_clock(rq);
8048 
8049 	cfs_rq = task_cfs_rq(current);
8050 	curr = cfs_rq->curr;
8051 
8052 	/*
8053 	 * Not only the cpu but also the task_group of the parent might have
8054 	 * been changed after parent->se.parent,cfs_rq were copied to
8055 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
8056 	 * of child point to valid ones.
8057 	 */
8058 	rcu_read_lock();
8059 	__set_task_cpu(p, this_cpu);
8060 	rcu_read_unlock();
8061 
8062 	update_curr(cfs_rq);
8063 
8064 	if (curr)
8065 		se->vruntime = curr->vruntime;
8066 	place_entity(cfs_rq, se, 1);
8067 
8068 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
8069 		/*
8070 		 * Upon rescheduling, sched_class::put_prev_task() will place
8071 		 * 'current' within the tree based on its new key value.
8072 		 */
8073 		swap(curr->vruntime, se->vruntime);
8074 		resched_curr(rq);
8075 	}
8076 
8077 	se->vruntime -= cfs_rq->min_vruntime;
8078 
8079 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8080 }
8081 
8082 /*
8083  * Priority of the task has changed. Check to see if we preempt
8084  * the current task.
8085  */
8086 static void
8087 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
8088 {
8089 	if (!task_on_rq_queued(p))
8090 		return;
8091 
8092 	/*
8093 	 * Reschedule if we are currently running on this runqueue and
8094 	 * our priority decreased, or if we are not currently running on
8095 	 * this runqueue and our priority is higher than the current's
8096 	 */
8097 	if (rq->curr == p) {
8098 		if (p->prio > oldprio)
8099 			resched_curr(rq);
8100 	} else
8101 		check_preempt_curr(rq, p, 0);
8102 }
8103 
8104 static void switched_from_fair(struct rq *rq, struct task_struct *p)
8105 {
8106 	struct sched_entity *se = &p->se;
8107 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
8108 
8109 	/*
8110 	 * Ensure the task's vruntime is normalized, so that when it's
8111 	 * switched back to the fair class the enqueue_entity(.flags=0) will
8112 	 * do the right thing.
8113 	 *
8114 	 * If it's queued, then the dequeue_entity(.flags=0) will already
8115 	 * have normalized the vruntime, if it's !queued, then only when
8116 	 * the task is sleeping will it still have non-normalized vruntime.
8117 	 */
8118 	if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
8119 		/*
8120 		 * Fix up our vruntime so that the current sleep doesn't
8121 		 * cause 'unlimited' sleep bonus.
8122 		 */
8123 		place_entity(cfs_rq, se, 0);
8124 		se->vruntime -= cfs_rq->min_vruntime;
8125 	}
8126 
8127 #ifdef CONFIG_SMP
8128 	/*
8129 	* Remove our load from contribution when we leave sched_fair
8130 	* and ensure we don't carry in an old decay_count if we
8131 	* switch back.
8132 	*/
8133 	if (se->avg.decay_count) {
8134 		__synchronize_entity_decay(se);
8135 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
8136 	}
8137 #endif
8138 }
8139 
8140 /*
8141  * We switched to the sched_fair class.
8142  */
8143 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8144 {
8145 #ifdef CONFIG_FAIR_GROUP_SCHED
8146 	struct sched_entity *se = &p->se;
8147 	/*
8148 	 * Since the real-depth could have been changed (only FAIR
8149 	 * class maintain depth value), reset depth properly.
8150 	 */
8151 	se->depth = se->parent ? se->parent->depth + 1 : 0;
8152 #endif
8153 	if (!task_on_rq_queued(p))
8154 		return;
8155 
8156 	/*
8157 	 * We were most likely switched from sched_rt, so
8158 	 * kick off the schedule if running, otherwise just see
8159 	 * if we can still preempt the current task.
8160 	 */
8161 	if (rq->curr == p)
8162 		resched_curr(rq);
8163 	else
8164 		check_preempt_curr(rq, p, 0);
8165 }
8166 
8167 /* Account for a task changing its policy or group.
8168  *
8169  * This routine is mostly called to set cfs_rq->curr field when a task
8170  * migrates between groups/classes.
8171  */
8172 static void set_curr_task_fair(struct rq *rq)
8173 {
8174 	struct sched_entity *se = &rq->curr->se;
8175 
8176 	for_each_sched_entity(se) {
8177 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
8178 
8179 		set_next_entity(cfs_rq, se);
8180 		/* ensure bandwidth has been allocated on our new cfs_rq */
8181 		account_cfs_rq_runtime(cfs_rq, 0);
8182 	}
8183 }
8184 
8185 void init_cfs_rq(struct cfs_rq *cfs_rq)
8186 {
8187 	cfs_rq->tasks_timeline = RB_ROOT;
8188 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8189 #ifndef CONFIG_64BIT
8190 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8191 #endif
8192 #ifdef CONFIG_SMP
8193 	atomic64_set(&cfs_rq->decay_counter, 1);
8194 	atomic_long_set(&cfs_rq->removed_load, 0);
8195 #endif
8196 }
8197 
8198 #ifdef CONFIG_FAIR_GROUP_SCHED
8199 static void task_move_group_fair(struct task_struct *p, int queued)
8200 {
8201 	struct sched_entity *se = &p->se;
8202 	struct cfs_rq *cfs_rq;
8203 
8204 	/*
8205 	 * If the task was not on the rq at the time of this cgroup movement
8206 	 * it must have been asleep, sleeping tasks keep their ->vruntime
8207 	 * absolute on their old rq until wakeup (needed for the fair sleeper
8208 	 * bonus in place_entity()).
8209 	 *
8210 	 * If it was on the rq, we've just 'preempted' it, which does convert
8211 	 * ->vruntime to a relative base.
8212 	 *
8213 	 * Make sure both cases convert their relative position when migrating
8214 	 * to another cgroup's rq. This does somewhat interfere with the
8215 	 * fair sleeper stuff for the first placement, but who cares.
8216 	 */
8217 	/*
8218 	 * When !queued, vruntime of the task has usually NOT been normalized.
8219 	 * But there are some cases where it has already been normalized:
8220 	 *
8221 	 * - Moving a forked child which is waiting for being woken up by
8222 	 *   wake_up_new_task().
8223 	 * - Moving a task which has been woken up by try_to_wake_up() and
8224 	 *   waiting for actually being woken up by sched_ttwu_pending().
8225 	 *
8226 	 * To prevent boost or penalty in the new cfs_rq caused by delta
8227 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
8228 	 */
8229 	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
8230 		queued = 1;
8231 
8232 	if (!queued)
8233 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
8234 	set_task_rq(p, task_cpu(p));
8235 	se->depth = se->parent ? se->parent->depth + 1 : 0;
8236 	if (!queued) {
8237 		cfs_rq = cfs_rq_of(se);
8238 		se->vruntime += cfs_rq->min_vruntime;
8239 #ifdef CONFIG_SMP
8240 		/*
8241 		 * migrate_task_rq_fair() will have removed our previous
8242 		 * contribution, but we must synchronize for ongoing future
8243 		 * decay.
8244 		 */
8245 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
8246 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
8247 #endif
8248 	}
8249 }
8250 
8251 void free_fair_sched_group(struct task_group *tg)
8252 {
8253 	int i;
8254 
8255 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8256 
8257 	for_each_possible_cpu(i) {
8258 		if (tg->cfs_rq)
8259 			kfree(tg->cfs_rq[i]);
8260 		if (tg->se)
8261 			kfree(tg->se[i]);
8262 	}
8263 
8264 	kfree(tg->cfs_rq);
8265 	kfree(tg->se);
8266 }
8267 
8268 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8269 {
8270 	struct cfs_rq *cfs_rq;
8271 	struct sched_entity *se;
8272 	int i;
8273 
8274 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8275 	if (!tg->cfs_rq)
8276 		goto err;
8277 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8278 	if (!tg->se)
8279 		goto err;
8280 
8281 	tg->shares = NICE_0_LOAD;
8282 
8283 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8284 
8285 	for_each_possible_cpu(i) {
8286 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8287 				      GFP_KERNEL, cpu_to_node(i));
8288 		if (!cfs_rq)
8289 			goto err;
8290 
8291 		se = kzalloc_node(sizeof(struct sched_entity),
8292 				  GFP_KERNEL, cpu_to_node(i));
8293 		if (!se)
8294 			goto err_free_rq;
8295 
8296 		init_cfs_rq(cfs_rq);
8297 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8298 	}
8299 
8300 	return 1;
8301 
8302 err_free_rq:
8303 	kfree(cfs_rq);
8304 err:
8305 	return 0;
8306 }
8307 
8308 void unregister_fair_sched_group(struct task_group *tg, int cpu)
8309 {
8310 	struct rq *rq = cpu_rq(cpu);
8311 	unsigned long flags;
8312 
8313 	/*
8314 	* Only empty task groups can be destroyed; so we can speculatively
8315 	* check on_list without danger of it being re-added.
8316 	*/
8317 	if (!tg->cfs_rq[cpu]->on_list)
8318 		return;
8319 
8320 	raw_spin_lock_irqsave(&rq->lock, flags);
8321 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8322 	raw_spin_unlock_irqrestore(&rq->lock, flags);
8323 }
8324 
8325 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8326 			struct sched_entity *se, int cpu,
8327 			struct sched_entity *parent)
8328 {
8329 	struct rq *rq = cpu_rq(cpu);
8330 
8331 	cfs_rq->tg = tg;
8332 	cfs_rq->rq = rq;
8333 	init_cfs_rq_runtime(cfs_rq);
8334 
8335 	tg->cfs_rq[cpu] = cfs_rq;
8336 	tg->se[cpu] = se;
8337 
8338 	/* se could be NULL for root_task_group */
8339 	if (!se)
8340 		return;
8341 
8342 	if (!parent) {
8343 		se->cfs_rq = &rq->cfs;
8344 		se->depth = 0;
8345 	} else {
8346 		se->cfs_rq = parent->my_q;
8347 		se->depth = parent->depth + 1;
8348 	}
8349 
8350 	se->my_q = cfs_rq;
8351 	/* guarantee group entities always have weight */
8352 	update_load_set(&se->load, NICE_0_LOAD);
8353 	se->parent = parent;
8354 }
8355 
8356 static DEFINE_MUTEX(shares_mutex);
8357 
8358 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8359 {
8360 	int i;
8361 	unsigned long flags;
8362 
8363 	/*
8364 	 * We can't change the weight of the root cgroup.
8365 	 */
8366 	if (!tg->se[0])
8367 		return -EINVAL;
8368 
8369 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8370 
8371 	mutex_lock(&shares_mutex);
8372 	if (tg->shares == shares)
8373 		goto done;
8374 
8375 	tg->shares = shares;
8376 	for_each_possible_cpu(i) {
8377 		struct rq *rq = cpu_rq(i);
8378 		struct sched_entity *se;
8379 
8380 		se = tg->se[i];
8381 		/* Propagate contribution to hierarchy */
8382 		raw_spin_lock_irqsave(&rq->lock, flags);
8383 
8384 		/* Possible calls to update_curr() need rq clock */
8385 		update_rq_clock(rq);
8386 		for_each_sched_entity(se)
8387 			update_cfs_shares(group_cfs_rq(se));
8388 		raw_spin_unlock_irqrestore(&rq->lock, flags);
8389 	}
8390 
8391 done:
8392 	mutex_unlock(&shares_mutex);
8393 	return 0;
8394 }
8395 #else /* CONFIG_FAIR_GROUP_SCHED */
8396 
8397 void free_fair_sched_group(struct task_group *tg) { }
8398 
8399 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8400 {
8401 	return 1;
8402 }
8403 
8404 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
8405 
8406 #endif /* CONFIG_FAIR_GROUP_SCHED */
8407 
8408 
8409 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8410 {
8411 	struct sched_entity *se = &task->se;
8412 	unsigned int rr_interval = 0;
8413 
8414 	/*
8415 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8416 	 * idle runqueue:
8417 	 */
8418 	if (rq->cfs.load.weight)
8419 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8420 
8421 	return rr_interval;
8422 }
8423 
8424 /*
8425  * All the scheduling class methods:
8426  */
8427 const struct sched_class fair_sched_class = {
8428 	.next			= &idle_sched_class,
8429 	.enqueue_task		= enqueue_task_fair,
8430 	.dequeue_task		= dequeue_task_fair,
8431 	.yield_task		= yield_task_fair,
8432 	.yield_to_task		= yield_to_task_fair,
8433 
8434 	.check_preempt_curr	= check_preempt_wakeup,
8435 
8436 	.pick_next_task		= pick_next_task_fair,
8437 	.put_prev_task		= put_prev_task_fair,
8438 
8439 #ifdef CONFIG_SMP
8440 	.select_task_rq		= select_task_rq_fair,
8441 	.migrate_task_rq	= migrate_task_rq_fair,
8442 
8443 	.rq_online		= rq_online_fair,
8444 	.rq_offline		= rq_offline_fair,
8445 
8446 	.task_waking		= task_waking_fair,
8447 #endif
8448 
8449 	.set_curr_task          = set_curr_task_fair,
8450 	.task_tick		= task_tick_fair,
8451 	.task_fork		= task_fork_fair,
8452 
8453 	.prio_changed		= prio_changed_fair,
8454 	.switched_from		= switched_from_fair,
8455 	.switched_to		= switched_to_fair,
8456 
8457 	.get_rr_interval	= get_rr_interval_fair,
8458 
8459 	.update_curr		= update_curr_fair,
8460 
8461 #ifdef CONFIG_FAIR_GROUP_SCHED
8462 	.task_move_group	= task_move_group_fair,
8463 #endif
8464 };
8465 
8466 #ifdef CONFIG_SCHED_DEBUG
8467 void print_cfs_stats(struct seq_file *m, int cpu)
8468 {
8469 	struct cfs_rq *cfs_rq;
8470 
8471 	rcu_read_lock();
8472 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8473 		print_cfs_rq(m, cpu, cfs_rq);
8474 	rcu_read_unlock();
8475 }
8476 
8477 #ifdef CONFIG_NUMA_BALANCING
8478 void show_numa_stats(struct task_struct *p, struct seq_file *m)
8479 {
8480 	int node;
8481 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8482 
8483 	for_each_online_node(node) {
8484 		if (p->numa_faults) {
8485 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8486 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8487 		}
8488 		if (p->numa_group) {
8489 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8490 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8491 		}
8492 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8493 	}
8494 }
8495 #endif /* CONFIG_NUMA_BALANCING */
8496 #endif /* CONFIG_SCHED_DEBUG */
8497 
8498 __init void init_sched_fair_class(void)
8499 {
8500 #ifdef CONFIG_SMP
8501 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8502 
8503 #ifdef CONFIG_NO_HZ_COMMON
8504 	nohz.next_balance = jiffies;
8505 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8506 	cpu_notifier(sched_ilb_notifier, 0);
8507 #endif
8508 #endif /* SMP */
8509 
8510 }
8511