/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Scheduler internal types and methods:
 */
#include <linux/sched.h>

#include <linux/sched/autogroup.h>
#include <linux/sched/clock.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/xacct.h>

#include <uapi/linux/sched/types.h>

#include <linux/binfmts.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stop_machine.h>
#include <linux/suspend.h>
#include <linux/swait.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/tsacct_kern.h>

#include <asm/tlb.h>

#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
#endif

#include "cpupri.h"
#include "cpudeadline.h"

#ifdef CONFIG_SCHED_DEBUG
# define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
#else
# define SCHED_WARN_ON(x)	({ (void)(x), 0; })
#endif

struct rq;
struct cpuidle_state;

/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED	1
#define TASK_ON_RQ_MIGRATING	2

extern __read_mostly int scheduler_running;

extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;

extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);

#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
#else
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/*
 * Increase resolution of nice-level calculations for 64-bit architectures.
 * The extra resolution improves shares distribution and load balancing of
 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
 * hierarchies, especially on larger systems. This is not a user-visible change
 * and does not change the user-interface for setting shares/weights.
 *
 * We increase resolution only if we have enough bits to allow this increased
 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
 * are pretty high and the returns do not justify the increased costs.
 *
 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
 * increase coverage and consistency always enable it on 64-bit platforms.
 */
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
# define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
#else
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w)		(w)
# define scale_load_down(w)	(w)
#endif

/*
 * Task weight (visible to users) and its load (invisible to users) have
 * independent resolution, but they should be well calibrated. We use
 * scale_load() and scale_load_down(w) to convert between them. The
 * following must be true:
 *
 *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
 *
 */
#define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)
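
/*
 * Worked example of the calibration rule above (illustrative only, not
 * used by any code): SCHED_FIXEDPOINT_SHIFT is 10, so on 64-bit
 * NICE_0_LOAD_SHIFT is 20 and NICE_0_LOAD == 1 << 20 == 1048576.  The
 * nice-0 weight visible to users is sched_prio_to_weight[20] == 1024
 * (USER_PRIO(NICE_TO_PRIO(0)) == 20), and indeed:
 *
 *	scale_load(1024) == 1024 << SCHED_FIXEDPOINT_SHIFT
 *			 == 1024 << 10 == 1048576 == NICE_0_LOAD
 *
 * On 32-bit, scale_load()/scale_load_down() are no-ops and
 * NICE_0_LOAD == 1024.
 */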

/*
 * Single value that decides SCHED_DEADLINE internal math precision.
 * 10 -> just above 1us
 * 9  -> just above 0.5us
 */
#define DL_SCALE		10

/*
 * Single value that denotes runtime == period, i.e. unlimited time.
 */
#define RUNTIME_INF		((u64)~0ULL)

static inline int idle_policy(int policy)
{
	return policy == SCHED_IDLE;
}
static inline int fair_policy(int policy)
{
	return policy == SCHED_NORMAL || policy == SCHED_BATCH;
}

static inline int rt_policy(int policy)
{
	return policy == SCHED_FIFO || policy == SCHED_RR;
}

static inline int dl_policy(int policy)
{
	return policy == SCHED_DEADLINE;
}
static inline bool valid_policy(int policy)
{
	return idle_policy(policy) || fair_policy(policy) ||
		rt_policy(policy) || dl_policy(policy);
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

static inline int task_has_dl_policy(struct task_struct *p)
{
	return dl_policy(p->policy);
}

#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

/*
 * !! For sched_setattr_nocheck() (kernel) only !!
 *
 * This is actually gross. :(
 *
 * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
 * tasks, but still be able to sleep. We need this on platforms that cannot
 * atomically change clock frequency. Remove once fast switching is available
 * on such platforms.
 *
 * SUGOV stands for SchedUtil GOVernor.
 */
#define SCHED_FLAG_SUGOV	0x10000000

static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
#else
	return false;
#endif
}

/*
 * Tells if entity @a should preempt entity @b.
 */
static inline bool
dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
{
	return dl_entity_is_special(a) ||
	       dl_time_before(a->deadline, b->deadline);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;
	ktime_t			rt_period;
	u64			rt_runtime;
	struct hrtimer		rt_period_timer;
	unsigned int		rt_period_active;
};

void __dl_clear_params(struct task_struct *p);

/*
 * To keep the bandwidth of -deadline tasks and groups under control
 * we need some place where we can:
 *  - store the maximum -deadline bandwidth of the system (the group);
 *  - cache the fraction of that bandwidth that is currently allocated.
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", nor do we
 * need a timer to replenish it.
 *
 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 * meaning that:
 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 *  - dl_total_bw array contains, in the i-th element, the currently
 *    allocated bandwidth on the i-th CPU.
 * Moreover, groups consume bandwidth on each CPU, while tasks only
 * consume bandwidth on the CPU they're running on.
 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 * that will be shown the next time the proc or cgroup controls are
 * read. It can, in turn, be changed by writing to its own control.
 */
struct dl_bandwidth {
	raw_spinlock_t		dl_runtime_lock;
	u64			dl_runtime;
	u64			dl_period;
};

static inline int dl_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

struct dl_bw {
	raw_spinlock_t		lock;
	u64			bw;
	u64			total_bw;
};

static inline void __dl_update(struct dl_bw *dl_b, s64 bw);

static inline
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
	dl_b->total_bw -= tsk_bw;
	__dl_update(dl_b, (s32)tsk_bw / cpus);
}

static inline
void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
	dl_b->total_bw += tsk_bw;
	__dl_update(dl_b, -((s32)tsk_bw / cpus));
}

static inline
bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
{
	return dl_b->bw != -1 &&
	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}
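
/*
 * Illustrative example of the admission test above (numbers are made up,
 * this is not kernel code): with the default rt bandwidth of
 * sched_rt_runtime_us/sched_rt_period_us = 950000/1000000, dl_b->bw is
 * roughly 95% of a CPU (scaled by BW_UNIT, defined further below).  On a
 * 4-CPU root domain a request for new_bw worth 40% of a CPU is admitted
 * only if:
 *
 *	dl_b->bw * 4 >= dl_b->total_bw - old_bw + new_bw
 *
 * i.e. only if the bandwidth already reserved (minus whatever the task
 * gives back as old_bw) still leaves 0.4 CPU free out of the
 * 4 * 0.95 = 3.8 CPUs worth of reservable bandwidth.
 */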

extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
extern void init_dl_bw(struct dl_bw *dl_b);
extern int  sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern bool dl_cpu_busy(unsigned int cpu);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>
#include <linux/psi.h>

struct cfs_rq;
struct rt_rq;

extern struct list_head task_groups;

struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	raw_spinlock_t		lock;
	ktime_t			period;
	u64			quota;
	u64			runtime;
	s64			hierarchical_quota;
	u64			runtime_expires;
	int			expires_seq;

	short			idle;
	short			period_active;
	struct hrtimer		period_timer;
	struct hrtimer		slack_timer;
	struct list_head	throttled_cfs_rq;

	/* Statistics: */
	int			nr_periods;
	int			nr_throttled;
	u64			throttled_time;

	bool			distribute_running;
#endif
};

/* Task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each CPU */
	struct sched_entity	**se;
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq		**cfs_rq;
	unsigned long		shares;

#ifdef CONFIG_SMP
	/*
	 * load_avg can be heavily contended at clock tick time, so put
	 * it in its own cacheline separated from the fields above which
	 * will also be accessed at each tick.
	 */
	atomic_long_t		load_avg ____cacheline_aligned;
#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity	**rt_se;
	struct rt_rq		**rt_rq;

	struct rt_bandwidth	rt_bandwidth;
#endif

	struct rcu_head		rcu;
	struct list_head	list;

	struct task_group	*parent;
	struct list_head	siblings;
	struct list_head	children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup	*autogroup;
#endif

	struct cfs_bandwidth	cfs_bandwidth;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so the weight of an entity should not be too large,
 * and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES		(1UL <<  1)
#define MAX_SHARES		(1UL << 18)
#endif

typedef int (*tg_visitor)(struct task_group *, void *);

extern int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data);

/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	return walk_tg_tree_from(&root_task_group, down, up, data);
}

extern int tg_nop(struct task_group *tg, void *data);
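
/*
 * Illustrative sketch of a tg_visitor (not kernel code; the callback name
 * is made up).  @down is called top-down and @up bottom-up; a caller that
 * only cares about one direction passes tg_nop() for the other, and a
 * non-zero return value aborts the walk:
 *
 *	static int tg_count_one(struct task_group *tg, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int nr_groups = 0;
 *
 *	rcu_read_lock();
 *	walk_tg_tree(tg_count_one, tg_nop, &nr_groups);
 *	rcu_read_unlock();
 */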

extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);

extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);

extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent);
extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
extern long sched_group_rt_runtime(struct task_group *tg);
extern long sched_group_rt_period(struct task_group *tg);
extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);

extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
			       struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_offline_group(struct task_group *tg);

extern void sched_move_task(struct task_struct *tsk);

#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);

#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
			     struct cfs_rq *prev, struct cfs_rq *next);
#else /* !CONFIG_SMP */
static inline void set_task_rq_fair(struct sched_entity *se,
			     struct cfs_rq *prev, struct cfs_rq *next) { }
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#else /* CONFIG_CGROUP_SCHED */

struct cfs_bandwidth { };

#endif /* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight	load;
	unsigned long		runnable_weight;
	unsigned int		nr_running;
	unsigned int		h_nr_running;

	u64			exec_clock;
	u64			min_vruntime;
#ifndef CONFIG_64BIT
	u64			min_vruntime_copy;
#endif

	struct rb_root_cached	tasks_timeline;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity	*curr;
	struct sched_entity	*next;
	struct sched_entity	*last;
	struct sched_entity	*skip;

#ifdef CONFIG_SCHED_DEBUG
	unsigned int		nr_spread_over;
#endif

#ifdef CONFIG_SMP
	/*
	 * CFS load tracking
	 */
	struct sched_avg	avg;
#ifndef CONFIG_64BIT
	u64			load_last_update_time_copy;
#endif
	struct {
		raw_spinlock_t	lock ____cacheline_aligned;
		int		nr;
		unsigned long	load_avg;
		unsigned long	util_avg;
		unsigned long	runnable_sum;
	} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
	unsigned long		tg_load_avg_contrib;
	long			propagate;
	long			prop_runnable_sum;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long		h_load;
	u64			last_h_load_update;
	struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
	 * This list is used during load balance.
	 */
	int			on_list;
	struct list_head	leaf_cfs_rq_list;
	struct task_group	*tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
	int			runtime_enabled;
	int			expires_seq;
	u64			runtime_expires;
	s64			runtime_remaining;

	u64			throttled_clock;
	u64			throttled_clock_task;
	u64			throttled_clock_task_time;
	int			throttled;
	int			throttle_count;
	struct list_head	throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

/* RT IPI pull logic requires IRQ_WORK */
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array	active;
	unsigned int		rt_nr_running;
	unsigned int		rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int		curr;	/* highest queued rt task prio */
#ifdef CONFIG_SMP
		int		next;	/* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long		rt_nr_migratory;
	unsigned long		rt_nr_total;
	int			overloaded;
	struct plist_head	pushable_tasks;

#endif /* CONFIG_SMP */
	int			rt_queued;

	int			rt_throttled;
	u64			rt_time;
	u64			rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long		rt_nr_boosted;

	struct rq		*rq;
	struct task_group	*tg;
#endif
};

static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
{
	return rt_rq->rt_queued && rt_rq->rt_nr_running;
}

/* Deadline class' related fields in a runqueue */
struct dl_rq {
	/* runqueue is an rbtree, ordered by deadline */
	struct rb_root_cached	root;

	unsigned long		dl_nr_running;

#ifdef CONFIG_SMP
	/*
	 * Deadline values of the currently executing and the
	 * earliest ready task on this rq. Caching these facilitates
	 * the decision whether or not a ready but not running task
	 * should migrate somewhere else.
	 */
	struct {
		u64		curr;
		u64		next;
	} earliest_dl;

	unsigned long		dl_nr_migratory;
	int			overloaded;

	/*
	 * Tasks on this rq that can be pushed away. They are kept in
	 * an rb-tree, ordered by tasks' deadlines, with caching
	 * of the leftmost (earliest deadline) element.
	 */
	struct rb_root_cached	pushable_dl_tasks_root;
#else
	struct dl_bw		dl_bw;
#endif
	/*
	 * "Active utilization" for this runqueue: increased when a
	 * task wakes up (becomes TASK_RUNNING) and decreased when a
	 * task blocks.
	 */
	u64			running_bw;

	/*
	 * Utilization of the tasks "assigned" to this runqueue (including
	 * the tasks that are in runqueue and the tasks that executed on this
	 * CPU and blocked). Increased when a task moves to this runqueue, and
	 * decreased when the task moves away (migrates, changes scheduling
	 * policy, or terminates).
	 * This is needed to compute the "inactive utilization" for the
	 * runqueue (inactive utilization = this_bw - running_bw).
	 */
	u64			this_bw;
	u64			extra_bw;

	/*
	 * Inverse of the fraction of CPU utilization that can be reclaimed
	 * by the GRUB algorithm.
	 */
	u64			bw_ratio;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)
#else
#define entity_is_task(se)	1
#endif

#ifdef CONFIG_SMP
/*
 * XXX we want to get rid of these helpers and use the full load resolution.
 */
static inline long se_weight(struct sched_entity *se)
{
	return scale_load_down(se->load.weight);
}

static inline long se_runnable(struct sched_entity *se)
{
	return scale_load_down(se->runnable_weight);
}

static inline bool sched_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member CPUs from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t		refcount;
	atomic_t		rto_count;
	struct rcu_head		rcu;
	cpumask_var_t		span;
	cpumask_var_t		online;

	/*
	 * Indicate pullable load on at least one CPU, e.g:
	 *   - More than one runnable task
	 *   - Running task is misfit
	 */
	int			overload;

	/*
	 * The bit corresponding to a CPU gets set here if such CPU has more
	 * than one runnable -deadline task (as it is below for RT tasks).
	 */
	cpumask_var_t		dlo_mask;
	atomic_t		dlo_count;
	struct dl_bw		dl_bw;
	struct cpudl		cpudl;

#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.
	 */
	struct irq_work		rto_push_work;
	raw_spinlock_t		rto_lock;
	/* These are only updated and read within rto_lock */
	int			rto_loop;
	int			rto_cpu;
	/* These atomics are updated outside of a lock */
	atomic_t		rto_loop_next;
	atomic_t		rto_loop_start;
#endif
	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t		rto_mask;
	struct cpupri		cpupri;

	unsigned long		max_cpu_capacity;
};

extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;

extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);

#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: code that needs to lock multiple runqueues (such as the
 * load balancing or the thread migration code) must acquire the locks in
 * ascending &runqueue order.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t		lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned int		nr_running;
#ifdef CONFIG_NUMA_BALANCING
	unsigned int		nr_numa_running;
	unsigned int		nr_preferred_running;
	unsigned int		numa_migrate_on;
#endif
	#define CPU_LOAD_IDX_MAX 5
	unsigned long		cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
	unsigned long		last_load_update_tick;
	unsigned long		last_blocked_load_update_tick;
	unsigned int		has_blocked_load;
#endif /* CONFIG_SMP */
	unsigned int		nohz_tick_stopped;
	atomic_t		nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */

	/* capture load from *all* tasks on this CPU: */
	struct load_weight	load;
	unsigned long		nr_load_updates;
	u64			nr_switches;

	struct cfs_rq		cfs;
	struct rt_rq		rt;
	struct dl_rq		dl;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this CPU: */
	struct list_head	leaf_cfs_rq_list;
	struct list_head	*tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long		nr_uninterruptible;

	struct task_struct	*curr;
	struct task_struct	*idle;
	struct task_struct	*stop;
	unsigned long		next_balance;
	struct mm_struct	*prev_mm;

	unsigned int		clock_update_flags;
	u64			clock;
	u64			clock_task;

	atomic_t		nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain	*rd;
	struct sched_domain	*sd;

	unsigned long		cpu_capacity;
	unsigned long		cpu_capacity_orig;

	struct callback_head	*balance_callback;

	unsigned char		idle_balance;

	unsigned long		misfit_task_load;

	/* For active balancing */
	int			active_balance;
	int			push_cpu;
	struct cpu_stop_work	active_balance_work;

	/* CPU of this runqueue: */
	int			cpu;
	int			online;

	struct list_head	cfs_tasks;

	struct sched_avg	avg_rt;
	struct sched_avg	avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	struct sched_avg	avg_irq;
#endif
	u64			idle_stamp;
	u64			avg_idle;

	/* This is used to determine avg_idle's max value */
	u64			max_idle_balance_cost;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64			prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
	u64			prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64			prev_steal_time_rq;
#endif

	/* calc_load related fields */
	unsigned long		calc_load_update;
	long			calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int			hrtick_csd_pending;
	call_single_data_t	hrtick_csd;
#endif
	struct hrtimer		hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info	rq_sched_info;
	unsigned long long	rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int		yld_count;

	/* schedule() stats */
	unsigned int		sched_count;
	unsigned int		sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int		ttwu_count;
	unsigned int		ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct llist_head	wake_list;
#endif

#ifdef CONFIG_CPU_IDLE
	/* Must be inspected within a rcu lock section */
	struct cpuidle_state	*idle_state;
#endif
};

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}


#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);

static inline void update_idle_core(struct rq *rq)
{
	if (static_branch_unlikely(&sched_smt_present))
		__update_idle_core(rq);
}

#else
static inline void update_idle_core(struct rq *rq) { }
#endif

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		this_cpu_ptr(&runqueues)
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		raw_cpu_ptr(&runqueues)

extern void update_rq_clock(struct rq *rq);

static inline u64 __rq_clock_broken(struct rq *rq)
{
	return READ_ONCE(rq->clock);
}

/*
 * rq::clock_update_flags bits
 *
 * %RQCF_REQ_SKIP - will request skipping of clock update on the next
 *   call to __schedule(). This is an optimisation to avoid
 *   neighbouring rq clock updates.
 *
 * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
 *   in effect and calls to update_rq_clock() are being ignored.
 *
 * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
 *   made to update_rq_clock() since the last time rq::lock was pinned.
 *
 * If inside of __schedule(), clock_update_flags will have been
 * shifted left (a left shift is a cheap operation for the fast path
 * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
 *
 *	if (rq->clock_update_flags >= RQCF_UPDATED)
 *
 * to check if %RQCF_UPDATED is set. It'll never be shifted more than
 * one position though, because the next rq_unpin_lock() will shift it
 * back.
 */
#define RQCF_REQ_SKIP		0x01
#define RQCF_ACT_SKIP		0x02
#define RQCF_UPDATED		0x04

static inline void assert_clock_updated(struct rq *rq)
{
	/*
	 * The only reason for not seeing a clock update since the
	 * last rq_pin_lock() is if we're currently skipping updates.
	 */
	SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
}

static inline u64 rq_clock(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	assert_clock_updated(rq);

	return rq->clock;
}

static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	assert_clock_updated(rq);

	return rq->clock_task;
}

static inline void rq_clock_skip_update(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	rq->clock_update_flags |= RQCF_REQ_SKIP;
}

/*
 * See rt task throttling, which is the only time a skip
 * request is cancelled.
 */
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}

struct rq_flags {
	unsigned long flags;
	struct pin_cookie cookie;
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
	 * current pin context is stashed here in case it needs to be
	 * restored in rq_repin_lock().
	 */
	unsigned int clock_update_flags;
#endif
};

static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
	rf->cookie = lockdep_pin_lock(&rq->lock);

#ifdef CONFIG_SCHED_DEBUG
	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
	rf->clock_update_flags = 0;
#endif
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
#ifdef CONFIG_SCHED_DEBUG
	if (rq->clock_update_flags > RQCF_ACT_SKIP)
		rf->clock_update_flags = RQCF_UPDATED;
#endif

	lockdep_unpin_lock(&rq->lock, rf->cookie);
}

static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
	lockdep_repin_lock(&rq->lock, rf->cookie);

#ifdef CONFIG_SCHED_DEBUG
	/*
	 * Restore the value we stashed in @rf for this pin context.
	 */
	rq->clock_update_flags |= rf->clock_update_flags;
#endif
}

struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock);

struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock);

static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
}

static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}

static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock_irqsave(&rq->lock, rf->flags);
	rq_pin_lock(rq, rf);
}

static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock_irq(&rq->lock);
	rq_pin_lock(rq, rf);
}

static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock(&rq->lock);
	rq_pin_lock(rq, rf);
}

static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
	__acquires(rq->lock)
{
	raw_spin_lock(&rq->lock);
	rq_repin_lock(rq, rf);
}

static inline void
rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
}

static inline void
rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock_irq(&rq->lock);
}

static inline void
rq_unlock(struct rq *rq, struct rq_flags *rf)
	__releases(rq->lock)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
}

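/*
 * Illustrative usage of the rq_lock()/rq_pin_lock() family above (a
 * sketch, not a real call site): the pin cookie stashed in struct
 * rq_flags lets lockdep catch the lock being dropped behind the
 * caller's back, and calling update_rq_clock() once after taking the
 * lock satisfies assert_clock_updated() in rq_clock():
 *
 *	struct rq *rq = cpu_rq(cpu);	// for some valid cpu
 *	struct rq_flags rf;
 *	u64 now;
 *
 *	rq_lock_irqsave(rq, &rf);
 *	update_rq_clock(rq);
 *	now = rq_clock(rq);
 *	rq_unlock_irqrestore(rq, &rf);
 */
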
static inline struct rq *
this_rq_lock_irq(struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	rq_lock(rq, rf);
	return rq;
}

#ifdef CONFIG_NUMA
enum numa_topology_type {
	NUMA_DIRECT,
	NUMA_GLUELESS_MESH,
	NUMA_BACKPLANE,
};
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
#endif

#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
#else
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
#endif

#ifdef CONFIG_NUMA_BALANCING
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
	NUMA_MEM = 0,
	NUMA_CPU,
	NUMA_MEMBUF,
	NUMA_CPUBUF
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
			int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_SMP

static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	if (unlikely(head->next))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

extern void sched_ttwu_pending(void);

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			      lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
			__sd; __sd = __sd->parent)

#define for_each_lower_domain(sd) for (; sd; sd = sd->child)

/**
 * highest_flag_domain - Return highest sched_domain containing flag.
 * @cpu:	The CPU whose highest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the highest sched_domain
 *		for the given CPU.
 *
 * Returns the highest sched_domain of a CPU which contains the given flag.
 */
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd, *hsd = NULL;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & flag))
			break;
		hsd = sd;
	}

	return hsd;
}

static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd;

	for_each_domain(cpu, sd) {
		if (sd->flags & flag)
			break;
	}

	return sd;
}

DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
extern struct static_key_false sched_asym_cpucapacity;

struct sched_group_capacity {
	atomic_t		ref;
	/*
	 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long		capacity;
	unsigned long		min_capacity;	/* Min per-CPU capacity in group */
	unsigned long		max_capacity;	/* Max per-CPU capacity in group */
	unsigned long		next_update;
	int			imbalance;	/* XXX unrelated to capacity but shared group state */

#ifdef CONFIG_SCHED_DEBUG
	int			id;
#endif

	unsigned long		cpumask[0];	/* Balance mask */
};

struct sched_group {
	struct sched_group	*next;		/* Must be a circular list */
	atomic_t		ref;

	unsigned int		group_weight;
	struct sched_group_capacity *sgc;
	int			asym_prefer_cpu;	/* CPU of highest priority in group */

	/*
	 * The CPUs this group covers.
	 *
	 * NOTE: this field is variable length. (Allocated dynamically
	 * by attaching extra space to the end of the structure,
	 * depending on how many CPUs the kernel has booted up with)
	 */
	unsigned long		cpumask[0];
};

static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
	return to_cpumask(sg->cpumask);
}

/*
 * See build_balance_mask().
 */
static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
	return to_cpumask(sg->sgc->cpumask);
}

/**
 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
 * @group: The group whose first CPU is to be returned.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	return cpumask_first(sched_group_span(group));
}

extern int group_balance_cpu(struct sched_group *sg);

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
static inline void dirty_sched_domain_sysctl(int cpu)
{
}
static inline void unregister_sched_domain_sysctl(void)
{
}
#endif

#else

static inline void sched_ttwu_pending(void) { }

#endif /* CONFIG_SMP */

#include "stats.h"
#include "autogroup.h"

#ifdef CONFIG_CGROUP_SCHED

/*
 * Return the group to which this task belongs.
 *
 * We cannot use task_css() and friends because the cgroup subsystem
 * changes that value before the cgroup_subsys::attach() method is called,
 * therefore we cannot pin it and might observe the wrong value.
 *
 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
 * core changes this before calling sched_move_task().
 *
 * Instead we use a 'copy' which is updated from sched_move_task() while
 * holding both task_struct::pi_lock and rq::lock.
 */
static inline struct task_group *task_group(struct task_struct *p)
{
	return p->sched_task_group;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}

#else /* CONFIG_CGROUP_SCHED */

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
	p->cpu = cpu;
#else
	task_thread_info(p)->cpu = cpu;
#endif
	p->wake_cpu = cpu;
#endif
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
#endif

#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "features.h"
	__SCHED_FEAT_NR,
};

#undef SCHED_FEAT

#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)

/*
 * To support run-time toggling of sched features, all the translation units
 * (except core.c) reference the sysctl_sched_features defined in core.c.
 */
extern const_debug unsigned int sysctl_sched_features;

#define SCHED_FEAT(name, enabled)					\
static __always_inline bool static_branch_##name(struct static_key *key) \
{									\
	return static_key_##enabled(key);				\
}

#include "features.h"
#undef SCHED_FEAT

extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))

#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */

/*
 * Each translation unit has its own copy of sysctl_sched_features to allow
 * constants propagation at compile time and compiler optimization based on
 * features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
static const_debug __maybe_unused unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT

#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
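
/*
 * Example of how the SCHED_FEAT() x-macro machinery above expands
 * (illustrative; HRTICK is one of the entries in features.h): including
 * features.h inside the enum creates __SCHED_FEAT_HRTICK, and a test
 * such as
 *
 *	if (sched_feat(HRTICK))
 *
 * becomes either a static-key branch (SCHED_DEBUG && HAVE_JUMP_LABEL)
 * or a test of bit __SCHED_FEAT_HRTICK in sysctl_sched_features, which
 * is const when !CONFIG_SCHED_DEBUG and so lets disabled features be
 * optimized away entirely.
 */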

extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;

static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->on_cpu;
#else
	return task_current(rq, p);
#endif
}

static inline int task_on_rq_queued(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}

/*
 * wake flags
 */
#define WF_SYNC			0x01	/* Waker goes to sleep after wakeup */
#define WF_FORK			0x02	/* Child wakeup after fork */
#define WF_MIGRATED		0x4	/* Internal use, task got migrated */

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs, the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765

extern const int		sched_prio_to_weight[40];
extern const u32		sched_prio_to_wmult[40];
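
/*
 * Worked example of the weight tables above (illustrative only):
 * sched_prio_to_weight[] is indexed by (nice + 20), adjacent entries
 * differ by roughly a factor of 1.25, and nice 0 maps to 1024:
 *
 *	nice -5  ->  sched_prio_to_weight[15] == 3121
 *	nice  0  ->  sched_prio_to_weight[20] == 1024
 *	nice +5  ->  sched_prio_to_weight[25] ==  335
 *
 * sched_prio_to_wmult[] caches 2^32 / weight so the divisions needed for
 * vruntime accounting can be done with a multiply and a shift.
 */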

/*
 * {de,en}queue flags:
 *
 * DEQUEUE_SLEEP  - task is no longer runnable
 * ENQUEUE_WAKEUP - task just became runnable
 *
 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
 *                are in a known state which allows modification. Such pairs
 *                should preserve as much state as possible.
 *
 * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
 *        in the runqueue.
 *
 * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
 * ENQUEUE_MIGRATED  - the task was migrated during wakeup
 *
 */

#define DEQUEUE_SLEEP		0x01
#define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */

#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_RESTORE		0x02
#define ENQUEUE_MOVE		0x04
#define ENQUEUE_NOCLOCK		0x08

#define ENQUEUE_HEAD		0x10
#define ENQUEUE_REPLENISH	0x20
#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED	0x40
#else
#define ENQUEUE_MIGRATED	0x00
#endif

#define RETRY_TASK		((void *)-1UL)

struct sched_class {
	const struct sched_class *next;

	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task)   (struct rq *rq);
	bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);

	/*
	 * It is the responsibility of the pick_next_task() method that will
	 * return the next task to call put_prev_task() on the @prev task or
	 * something equivalent.
	 *
	 * May return RETRY_TASK when it finds a higher prio class has runnable
	 * tasks.
	 */
	struct task_struct * (*pick_next_task)(struct rq *rq,
					       struct task_struct *prev,
					       struct rq_flags *rf);
	void (*put_prev_task)(struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

	void (*task_woken)(struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task)(struct rq *rq);
	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork)(struct task_struct *p);
	void (*task_dead)(struct task_struct *p);

	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serialized by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			      int oldprio);

	unsigned int (*get_rr_interval)(struct rq *rq,
					struct task_struct *task);

	void (*update_curr)(struct rq *rq);

#define TASK_SET_GROUP		0
#define TASK_MOVE_GROUP		1

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_change_group)(struct task_struct *p, int type);
#endif
};

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	prev->sched_class->put_prev_task(rq, prev);
}

static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
{
	curr->sched_class->set_curr_task(rq);
}

#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)

extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;


#ifdef CONFIG_SMP

extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);

#endif

#ifdef CONFIG_CPU_IDLE
static inline void idle_set_state(struct rq *rq,
				  struct cpuidle_state *idle_state)
{
	rq->idle_state = idle_state;
}

static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
	SCHED_WARN_ON(!rcu_read_lock_held());

	return rq->idle_state;
}
#else
static inline void idle_set_state(struct rq *rq,
				  struct cpuidle_state *idle_state)
{
}

static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
	return NULL;
}
#endif

extern void schedule_idle(void);

extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);

extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);

extern void reweight_task(struct task_struct *p, int prio);

extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);

extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);

extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);

#define BW_SHIFT		20
#define BW_UNIT			(1 << BW_SHIFT)
#define RATIO_SHIFT		8
unsigned long to_ratio(u64 period, u64 runtime);

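/*
 * Worked example (illustrative): deadline/RT bandwidth is stored as a
 * fixed-point fraction of one CPU with BW_SHIFT bits, i.e. scaled by
 * BW_UNIT == 1 << 20.  A reservation of runtime = 10ms out of every
 * period = 100ms therefore maps to roughly:
 *
 *	to_ratio(100 * NSEC_PER_MSEC, 10 * NSEC_PER_MSEC)
 *		== (10ms << BW_SHIFT) / 100ms
 *		== BW_UNIT / 10 == 104857	(~0.1 of a CPU)
 *
 * (to_ratio() itself is implemented in core.c.)
 */
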
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct sched_entity *se);

#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
extern int __init sched_tick_offload_init(void);

/*
 * Tick may be needed by tasks in the runqueue depending on their policy and
 * requirements. If the tick is needed, let's send the target an IPI to kick
 * it out of nohz mode if necessary.
 */
static inline void sched_update_tick_dependency(struct rq *rq)
{
	int cpu;

	if (!tick_nohz_full_enabled())
		return;

	cpu = cpu_of(rq);

	if (!tick_nohz_full_cpu(cpu))
		return;

	if (sched_can_stop_tick(rq))
		tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
	else
		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif

static inline void add_nr_running(struct rq *rq, unsigned count)
{
	unsigned prev_nr = rq->nr_running;

	rq->nr_running = prev_nr + count;

	if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP
		if (!READ_ONCE(rq->rd->overload))
			WRITE_ONCE(rq->rd->overload, 1);
#endif
	}

	sched_update_tick_dependency(rq);
}

static inline void sub_nr_running(struct rq *rq, unsigned count)
{
	rq->nr_running -= count;
	/* Check if we still need preemption */
	sched_update_tick_dependency(rq);
}

extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);

extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);

extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;

#ifdef CONFIG_SCHED_HRTICK

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

void hrtick_start(struct rq *rq, u64 delay);

#else

static inline int hrtick_enabled(struct rq *rq)
{
	return 0;
}

#endif /* CONFIG_SCHED_HRTICK */

#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
	return SCHED_CAPACITY_SCALE;
}
#endif

#ifdef CONFIG_SMP
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
		return sd->smt_gain / sd->span_weight;

	return SCHED_CAPACITY_SCALE;
}
#endif
#else
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
{
	return SCHED_CAPACITY_SCALE;
}
#endif
#endif

#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT

static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);

/*
 * fair double_lock_balance: Safely acquires both rq->locks in a fair
 * way at the expense of forcing extra atomic operations in all
 * invocations.  This assures that the double_lock is acquired using the
 * same underlying policy as the spinlock_t on this architecture, which
 * reduces latency compared to the unfair variant below.  However, it
 * also adds more overhead and therefore may reduce throughput.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	raw_spin_unlock(&this_rq->lock);
	double_rq_lock(this_rq, busiest);

	return 1;
}

#else
/*
 * Unfair double_lock_balance: Optimizes throughput at the expense of
 * latency by eliminating extra atomic operations when the locks are
 * already in proper order on entry.  This favors lower CPU-ids and will
 * grant the double lock to lower CPUs over higher ids under contention,
 * regardless of entry order into the function.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(this_rq->lock)
	__acquires(busiest->lock)
	__acquires(this_rq->lock)
{
	int ret = 0;

	if (unlikely(!raw_spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			raw_spin_unlock(&this_rq->lock);
			raw_spin_lock(&busiest->lock);
			raw_spin_lock_nested(&this_rq->lock,
					      SINGLE_DEPTH_NESTING);
			ret = 1;
		} else
			raw_spin_lock_nested(&busiest->lock,
					      SINGLE_DEPTH_NESTING);
	}
	return ret;
}

#endif /* CONFIG_PREEMPT */

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
		/* printk() doesn't work well under rq->lock */
		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}

	return _double_lock_balance(this_rq, busiest);
}

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
	__releases(busiest->lock)
{
	raw_spin_unlock(&busiest->lock);
	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}

static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	spin_lock(l1);
	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	spin_lock_irq(l1);
	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
	if (l1 > l2)
		swap(l1, l2);

	raw_spin_lock(l1);
	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}

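/*
 * Illustrative note on the helpers above (a sketch, not a real call
 * site): the swap() on lock address means both of the following acquire
 * the same lock first, so opposite calling orders on two CPUs cannot
 * ABBA deadlock:
 *
 *	double_lock(&a->lock, &b->lock);
 *	double_lock(&b->lock, &a->lock);
 *
 * double_rq_lock()/double_rq_unlock() below apply the same rule using
 * the runqueue addresses, matching the "ascending &runqueue" locking
 * rule documented above struct rq.
 */
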

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock;
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		raw_spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			raw_spin_lock(&rq1->lock);
			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			raw_spin_lock(&rq2->lock);
			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock;
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	raw_spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		raw_spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}

extern void set_rq_online (struct rq *rq);
extern void set_rq_offline(struct rq *rq);
extern bool sched_smp_initialized;
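
/*
 * Illustrative usage sketch (hypothetical helper, for documentation only):
 * because double_rq_lock()/double_rq_unlock() neither disable nor restore
 * interrupts, a caller that is not already in an IRQ-disabled section has
 * to bracket them explicitly:
 *
 *	static void example_with_both_rqs_locked(struct rq *src, struct rq *dst)
 *	{
 *		unsigned long flags;
 *
 *		local_irq_save(flags);
 *		double_rq_lock(src, dst);
 *
 *		// ... operate on both runqueues ...
 *
 *		double_rq_unlock(src, dst);
 *		local_irq_restore(flags);
 *	}
 */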

#else /* CONFIG_SMP */

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock;
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	BUG_ON(rq1 != rq2);
	raw_spin_lock(&rq1->lock);
	__acquire(rq2->lock);	/* Fake it out ;) */
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock;
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	BUG_ON(rq1 != rq2);
	raw_spin_unlock(&rq1->lock);
	__release(rq2->lock);
}

#endif

extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);

#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_enabled;

extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
			     unsigned long tpf, unsigned long gsf, unsigned long gpf);
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */

extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
extern void init_dl_rq(struct dl_rq *dl_rq);

extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);

#ifdef CONFIG_NO_HZ_COMMON
#define NOHZ_BALANCE_KICK_BIT	0
#define NOHZ_STATS_KICK_BIT	1

#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)

#define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

#define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)

extern void nohz_balance_exit_idle(struct rq *rq);
#else
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif

#ifdef CONFIG_SMP
static inline
void __dl_update(struct dl_bw *dl_b, s64 bw)
{
	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
	int i;

	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
			 "sched RCU must be held");
	for_each_cpu_and(i, rd->span, cpu_active_mask) {
		struct rq *rq = cpu_rq(i);

		rq->dl.extra_bw += bw;
	}
}
#else
static inline
void __dl_update(struct dl_bw *dl_b, s64 bw)
{
	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);

	dl->extra_bw += bw;
}
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
	u64			total;
	u64			tick_delta;
	u64			irq_start_time;
	struct u64_stats_sync	sync;
};

DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
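
/*
 * Illustrative sketch (for documentation only): the write side pairs with
 * irq_time_read() below via the u64_stats seqcount, so that 32-bit kernels
 * see a consistent 64-bit total.  The accounting code (kernel/sched/cputime.c)
 * updates the per-CPU data roughly along these lines:
 *
 *	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
 *
 *	u64_stats_update_begin(&irqtime->sync);
 *	irqtime->total += delta;
 *	irqtime->tick_delta += delta;
 *	u64_stats_update_end(&irqtime->sync);
 */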

/*
 * Returns the irqtime minus the softirq time computed by ksoftirqd.
 * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
 * subtracted from it and would never move forward.
 */
static inline u64 irq_time_read(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	unsigned int seq;
	u64 total;

	do {
		seq = __u64_stats_fetch_begin(&irqtime->sync);
		total = irqtime->total;
	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * This function is called by the scheduler on the CPU whose utilization is
 * being updated.
 *
 * It can only be called from RCU-sched read-side critical sections.
 *
 * The way cpufreq is currently arranged requires it to evaluate the CPU
 * performance state (frequency/voltage) on a regular basis to prevent it from
 * being stuck in a completely inadequate performance level for too long.
 * That is not guaranteed to happen if the updates are only triggered from CFS
 * and DL, though, because they may not be coming in if only RT tasks are
 * active all the time.
 *
 * As a workaround for that issue, this function is called periodically by the
 * RT sched class to trigger extra cpufreq updates and prevent it from
 * stalling, but that really is a band-aid.  Going forward it should be
 * replaced with solutions targeted more specifically at RT tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *data;

	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
						  cpu_of(rq)));
	if (data)
		data->func(data, rq_clock(rq), flags);
}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */

#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
#  define arch_scale_freq_invariant()	true
# endif
#else
# define arch_scale_freq_invariant()	false
#endif

#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

static inline unsigned long cpu_util_dl(struct rq *rq)
{
	return READ_ONCE(rq->avg_dl.util_avg);
}

static inline unsigned long cpu_util_cfs(struct rq *rq)
{
	unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);

	if (sched_feat(UTIL_EST)) {
		util = max_t(unsigned long, util,
			     READ_ONCE(rq->cfs.avg.util_est.enqueued));
	}

	return util;
}

static inline unsigned long cpu_util_rt(struct rq *rq)
{
	return READ_ONCE(rq->avg_rt.util_avg);
}
#endif

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	return rq->avg_irq.util_avg;
}

static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	util *= (max - irq);
	util /= max;

	return util;
}
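
/*
 * Worked example (illustrative only): with max == SCHED_CAPACITY_SCALE == 1024
 * and an IRQ/steal utilization of 256, a task utilization of 512 scales to:
 *
 *	512 * (1024 - 256) / 1024 = 384
 *
 * i.e. the capacity consumed by interrupts is removed proportionally from
 * what the rq's tasks can actually use.
 */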
#else
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	return 0;
}

static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
{
	return util;
}
#endif
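
/*
 * Illustrative aggregation sketch (hypothetical helper, for documentation
 * only; the real policy lives in kernel/sched/cpufreq_schedutil.c and is
 * more careful): a frequency-selection path could combine the per-class
 * signals above roughly like this:
 *
 *	static unsigned long example_cpu_total_util(struct rq *rq,
 *						    unsigned long max)
 *	{
 *		unsigned long irq = cpu_util_irq(rq);
 *		unsigned long util;
 *
 *		if (irq >= max)
 *			return max;
 *
 *		// CFS and RT run only outside of IRQ context, so scale
 *		// their utilization by the remaining capacity first.
 *		util  = cpu_util_cfs(rq) + cpu_util_rt(rq);
 *		util  = scale_irq_capacity(util, irq, max);
 *		util += irq;
 *
 *		// DEADLINE contributes its reserved bandwidth.
 *		util += cpu_bw_dl(rq);
 *
 *		return min(util, max);
 *	}
 */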