/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
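 *
 * A true return also lets the waker skip the resched IPI entirely: the
 * polling idle loop will notice TIF_NEED_RESCHED on its own and call
 * sched_ttwu_pending(), see ttwu_queue_remote() below.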
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* task can safely be re-inserted now */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
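 *
 * "Housekeeping" CPUs here are the ones that keep a regular tick, i.e.
 * CPUs not isolated via nohz_full; see is_housekeeping_cpu().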
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!is_housekeeping_cpu(cpu))
		cpu = housekeeping_any_cpu();
unlock:
	rcu_read_unlock();
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * FIFO realtime policy runs the highest priority task (after DEADLINE).
	 * Other runnable tasks are of a lower priority. The scheduler tick
	 * isn't needed.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * Round-robin realtime tasks time slice with other tasks at the same
	 * realtime priority.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/* Normal multitasking needs periodic preemption checks */
	if (rq->cfs.nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(sched_prio_to_weight[prio]);
	load->inv_weight = sched_prio_to_wmult[prio];
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
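 *
 * For example, absent PI boosting, a SCHED_FIFO task with rt_priority 50
 * ends up with prio = MAX_RT_PRIO-1 - 50 = 49, while a nice-0 SCHED_NORMAL
 * task keeps prio = static_prio = 120.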
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, 0);
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
	if (unlikely(!cpu_active(dest_cpu)))
		return rq;

	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		return rq;

	rq = move_queued_task(rq, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();

	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq && task_on_rq_queued(p))
		rq = __migrate_task(rq, p, arg->dest_cpu);
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
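 *
 * set_cpus_allowed_ptr() and sched_setaffinity() are the usual routes in
 * here; they differ only in @check, i.e. whether PF_NO_SETAFFINITY is
 * honoured.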
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	unsigned long flags;
	struct rq *rq;
	unsigned int dest_cpu;
	int ret = 0;

	rq = task_rq_lock(p, &flags);

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &flags);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * OK, since we're going to drop the lock immediately
		 * afterwards anyway.
		 */
		lockdep_unpin_lock(&rq->lock);
		rq = move_queued_task(rq, p, dest_cpu);
		lockdep_pin_lock(&rq->lock);
	}
out:
	task_rq_unlock(rq, p, &flags);

	return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(p->state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		p->on_rq = TASK_ON_RQ_MIGRATING;
		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		p->on_rq = TASK_ON_RQ_QUEUED;
		check_preempt_curr(dst_rq, p, 0);
	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous cpu our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);

	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = task_cpu(cur),
		.dst_task = p,
		.dst_cpu = task_cpu(p),
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, queued;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		queued = task_on_rq_queued(p);
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(queued)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		/* No more Mr. Nice Guy. */
		switch (state) {
		case cpuset:
			if (IS_ENABLED(CONFIG_CPUSETS)) {
				cpuset_cpus_allowed_fallback(p);
				state = possible;
				break;
			}
			/* fall-through */
		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1)
		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}

#else

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 const struct cpumask *new_mask, bool check)
{
	return set_cpus_allowed_ptr(p, new_mask);
}

#endif /* CONFIG_SMP */

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = TASK_ON_RQ_QUEUED;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	check_preempt_curr(rq, p, wake_flags);
	p->state = TASK_RUNNING;
	trace_sched_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		lockdep_unpin_lock(&rq->lock);
		p->sched_class->task_woken(rq, p);
		lockdep_pin_lock(&rq->lock);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
	lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
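 *
 * Note that this still runs on the waking CPU; 'remote' refers to @p's
 * runqueue, whose lock is taken below and may belong to another CPU.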
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (task_on_rq_queued(p)) {
		/* check_preempt_curr() may use rq clock */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;
	unsigned long flags;

	if (!llist)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	lockdep_pin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	preempt_fold_need_resched();

	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
		if (!set_nr_if_polling(rq->idle))
			smp_send_reschedule(cpu);
		else
			trace_sched_wake_idle_without_ipi(cpu);
	}
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	rcu_read_lock();

	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	if (set_nr_if_polling(rq->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else cpu is not in idle, do nothing here */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

out:
	rcu_read_unlock();
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	lockdep_pin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock(&rq->lock);
}

/*
 * Notes on Program-Order guarantees on SMP systems.
 *
 * MIGRATION
 *
 * The basic program-order guarantee on SMP systems is that when a task [t]
 * migrates, all its activity on its old cpu [c0] happens-before any subsequent
 * execution on its new cpu [c1].
 *
 * For migration (of runnable tasks) this is provided by the following means:
 *
 *  A) UNLOCK of the rq(c0)->lock scheduling out task t
 *  B) migration for t is required to synchronize *both* rq(c0)->lock and
 *     rq(c1)->lock (if not at the same time, then in that order).
 *  C) LOCK of the rq(c1)->lock scheduling in task
 *
 * Transitivity guarantees that B happens after A and C after B.
 * Note: we only require RCpc transitivity.
 * Note: the cpu doing B need not be c0 or c1
 *
 * Example:
 *
 *   CPU0            CPU1            CPU2
 *
 *   LOCK rq(0)->lock
 *   sched-out X
 *   sched-in Y
 *   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(0)->lock // orders against CPU0
 *                                   dequeue X
 *                                   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(1)->lock
 *                                   enqueue X
 *                                   UNLOCK rq(1)->lock
 *
 *                   LOCK rq(1)->lock // orders against CPU2
 *                   sched-out Z
 *                   sched-in X
 *                   UNLOCK rq(1)->lock
 *
 *
 * BLOCKING -- aka. SLEEP + WAKEUP
 *
 * For blocking we (obviously) need to provide the same guarantee as for
 * migration. However the means are completely different as there is no lock
 * chain to provide order. Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
 *   2) smp_cond_acquire(!X->on_cpu)
 *
 * Example:
 *
 *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
 *
 *   LOCK rq(0)->lock LOCK X->pi_lock
 *   dequeue X
 *   sched-out X
 *   smp_store_release(X->on_cpu, 0);
 *
 *                    smp_cond_acquire(!X->on_cpu);
 *                    X->state = WAKING
 *                    set_task_cpu(X,2)
 *
 *                    LOCK rq(2)->lock
 *                    enqueue X
 *                    X->state = RUNNING
 *                    UNLOCK rq(2)->lock
 *
 *                                          LOCK rq(2)->lock // orders against CPU1
 *                                          sched-out Z
 *                                          sched-in X
 *                                          UNLOCK rq(2)->lock
 *
 *                    UNLOCK X->pi_lock
 *   UNLOCK rq(0)->lock
 *
 *
 * However; for wakeups there is a second guarantee we must provide, namely we
 * must observe the state that led to our wakeup. That is, not only must our
 * task observe its own prior state, it must also observe the stores prior to
 * its wakeup.
 *
 * This means that any means of doing remote wakeups must order the CPU doing
 * the wakeup against the CPU the task is going to end up running on. This,
 * however, is already required for the regular Program-Order guarantee above,
 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
 *
 */

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	trace_sched_waking(p);

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
	 * possible to, falsely, observe p->on_cpu == 0.
	 *
	 * One must be running (->on_cpu == 1) in order to remove oneself
	 * from the runqueue.
	 *
	 *  [S] ->on_cpu = 1;	[L] ->on_rq
	 *      UNLOCK rq->lock
	 *			RMB
	 *      LOCK   rq->lock
	 *  [S] ->on_rq = 0;	[L] ->on_cpu
	 *
	 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
	 * from the consecutive calls to schedule(); the first switching to our
	 * task, the second putting it to sleep.
	 */
	smp_rmb();

	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until it's done referencing the task.
	 *
	 * Pairs with the smp_store_release() in finish_lock_switch().
	 *
	 * This ensures that tasks getting woken will be fully ordered against
	 * their previous state and preserve Program Order.
	 */
	smp_cond_acquire(!p->on_cpu);

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	if (schedstat_enabled())
		ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet picked a replacement task.
		 */
		lockdep_unpin_lock(&rq->lock);
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
		lockdep_pin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	trace_sched_waking(p);

	if (!task_on_rq_queued(p))
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	if (schedstat_enabled())
		ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
int wake_up_process(struct task_struct *p)
{
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * This function clears the sched_dl_entity static params.
 */
void __dl_clear_params(struct task_struct *p)
{
	struct sched_dl_entity *dl_se = &p->dl;

	dl_se->dl_runtime = 0;
	dl_se->dl_deadline = 0;
	dl_se->dl_period = 0;
	dl_se->flags = 0;
	dl_se->dl_bw = 0;

	dl_se->dl_throttled = 0;
	dl_se->dl_yielded = 0;
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
2053 * 2054 * __sched_fork() is basic setup used by init_idle() too: 2055 */ 2056 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 2057 { 2058 p->on_rq = 0; 2059 2060 p->se.on_rq = 0; 2061 p->se.exec_start = 0; 2062 p->se.sum_exec_runtime = 0; 2063 p->se.prev_sum_exec_runtime = 0; 2064 p->se.nr_migrations = 0; 2065 p->se.vruntime = 0; 2066 INIT_LIST_HEAD(&p->se.group_node); 2067 2068 #ifdef CONFIG_FAIR_GROUP_SCHED 2069 p->se.cfs_rq = NULL; 2070 #endif 2071 2072 #ifdef CONFIG_SCHEDSTATS 2073 /* Even if schedstat is disabled, there should not be garbage */ 2074 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2075 #endif 2076 2077 RB_CLEAR_NODE(&p->dl.rb_node); 2078 init_dl_task_timer(&p->dl); 2079 __dl_clear_params(p); 2080 2081 INIT_LIST_HEAD(&p->rt.run_list); 2082 p->rt.timeout = 0; 2083 p->rt.time_slice = sched_rr_timeslice; 2084 p->rt.on_rq = 0; 2085 p->rt.on_list = 0; 2086 2087 #ifdef CONFIG_PREEMPT_NOTIFIERS 2088 INIT_HLIST_HEAD(&p->preempt_notifiers); 2089 #endif 2090 2091 #ifdef CONFIG_NUMA_BALANCING 2092 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 2093 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 2094 p->mm->numa_scan_seq = 0; 2095 } 2096 2097 if (clone_flags & CLONE_VM) 2098 p->numa_preferred_nid = current->numa_preferred_nid; 2099 else 2100 p->numa_preferred_nid = -1; 2101 2102 p->node_stamp = 0ULL; 2103 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 2104 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 2105 p->numa_work.next = &p->numa_work; 2106 p->numa_faults = NULL; 2107 p->last_task_numa_placement = 0; 2108 p->last_sum_exec_runtime = 0; 2109 2110 p->numa_group = NULL; 2111 #endif /* CONFIG_NUMA_BALANCING */ 2112 } 2113 2114 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); 2115 2116 #ifdef CONFIG_NUMA_BALANCING 2117 2118 void set_numabalancing_state(bool enabled) 2119 { 2120 if (enabled) 2121 static_branch_enable(&sched_numa_balancing); 2122 else 2123 static_branch_disable(&sched_numa_balancing); 2124 } 2125 2126 #ifdef CONFIG_PROC_SYSCTL 2127 int sysctl_numa_balancing(struct ctl_table *table, int write, 2128 void __user *buffer, size_t *lenp, loff_t *ppos) 2129 { 2130 struct ctl_table t; 2131 int err; 2132 int state = static_branch_likely(&sched_numa_balancing); 2133 2134 if (write && !capable(CAP_SYS_ADMIN)) 2135 return -EPERM; 2136 2137 t = *table; 2138 t.data = &state; 2139 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2140 if (err < 0) 2141 return err; 2142 if (write) 2143 set_numabalancing_state(state); 2144 return err; 2145 } 2146 #endif 2147 #endif 2148 2149 DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2150 2151 #ifdef CONFIG_SCHEDSTATS 2152 static void set_schedstats(bool enabled) 2153 { 2154 if (enabled) 2155 static_branch_enable(&sched_schedstats); 2156 else 2157 static_branch_disable(&sched_schedstats); 2158 } 2159 2160 void force_schedstat_enabled(void) 2161 { 2162 if (!schedstat_enabled()) { 2163 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 2164 static_branch_enable(&sched_schedstats); 2165 } 2166 } 2167 2168 static int __init setup_schedstats(char *str) 2169 { 2170 int ret = 0; 2171 if (!str) 2172 goto out; 2173 2174 if (!strcmp(str, "enable")) { 2175 set_schedstats(true); 2176 ret = 1; 2177 } else if (!strcmp(str, "disable")) { 2178 set_schedstats(false); 2179 ret = 1; 2180 } 2181 out: 2182 if (!ret) 2183 pr_warn("Unable to parse schedstats=\n"); 2184 2185 return ret; 2186 } 2187 __setup("schedstats=", 
setup_schedstats); 2188 2189 #ifdef CONFIG_PROC_SYSCTL 2190 int sysctl_schedstats(struct ctl_table *table, int write, 2191 void __user *buffer, size_t *lenp, loff_t *ppos) 2192 { 2193 struct ctl_table t; 2194 int err; 2195 int state = static_branch_likely(&sched_schedstats); 2196 2197 if (write && !capable(CAP_SYS_ADMIN)) 2198 return -EPERM; 2199 2200 t = *table; 2201 t.data = &state; 2202 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2203 if (err < 0) 2204 return err; 2205 if (write) 2206 set_schedstats(state); 2207 return err; 2208 } 2209 #endif 2210 #endif 2211 2212 /* 2213 * fork()/clone()-time setup: 2214 */ 2215 int sched_fork(unsigned long clone_flags, struct task_struct *p) 2216 { 2217 unsigned long flags; 2218 int cpu = get_cpu(); 2219 2220 __sched_fork(clone_flags, p); 2221 /* 2222 * We mark the process as running here. This guarantees that 2223 * nobody will actually run it, and a signal or other external 2224 * event cannot wake it up and insert it on the runqueue either. 2225 */ 2226 p->state = TASK_RUNNING; 2227 2228 /* 2229 * Make sure we do not leak PI boosting priority to the child. 2230 */ 2231 p->prio = current->normal_prio; 2232 2233 /* 2234 * Revert to default priority/policy on fork if requested. 2235 */ 2236 if (unlikely(p->sched_reset_on_fork)) { 2237 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2238 p->policy = SCHED_NORMAL; 2239 p->static_prio = NICE_TO_PRIO(0); 2240 p->rt_priority = 0; 2241 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2242 p->static_prio = NICE_TO_PRIO(0); 2243 2244 p->prio = p->normal_prio = __normal_prio(p); 2245 set_load_weight(p); 2246 2247 /* 2248 * We don't need the reset flag anymore after the fork. It has 2249 * fulfilled its duty: 2250 */ 2251 p->sched_reset_on_fork = 0; 2252 } 2253 2254 if (dl_prio(p->prio)) { 2255 put_cpu(); 2256 return -EAGAIN; 2257 } else if (rt_prio(p->prio)) { 2258 p->sched_class = &rt_sched_class; 2259 } else { 2260 p->sched_class = &fair_sched_class; 2261 } 2262 2263 if (p->sched_class->task_fork) 2264 p->sched_class->task_fork(p); 2265 2266 /* 2267 * The child is not yet in the pid-hash so no cgroup attach races, 2268 * and the cgroup is pinned to this child due to cgroup_fork() 2269 * is ran before sched_fork(). 2270 * 2271 * Silence PROVE_RCU. 2272 */ 2273 raw_spin_lock_irqsave(&p->pi_lock, flags); 2274 set_task_cpu(p, cpu); 2275 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2276 2277 #ifdef CONFIG_SCHED_INFO 2278 if (likely(sched_info_on())) 2279 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2280 #endif 2281 #if defined(CONFIG_SMP) 2282 p->on_cpu = 0; 2283 #endif 2284 init_task_preempt_count(p); 2285 #ifdef CONFIG_SMP 2286 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2287 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2288 #endif 2289 2290 put_cpu(); 2291 return 0; 2292 } 2293 2294 unsigned long to_ratio(u64 period, u64 runtime) 2295 { 2296 if (runtime == RUNTIME_INF) 2297 return 1ULL << 20; 2298 2299 /* 2300 * Doing this here saves a lot of checks in all 2301 * the calling paths, and returning zero seems 2302 * safe for them anyway. 
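 *
 * As a worked example of the 1<<20 fixed point scale used here:
 * runtime = 10ms and period = 100ms give
 *
 *	div64_u64(10000000ULL << 20, 100000000ULL) == 104857
 *
 * which is roughly 0.1 * (1 << 20), i.e. a 10% utilization.
 *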
2303 */ 2304 if (period == 0) 2305 return 0; 2306 2307 return div64_u64(runtime << 20, period); 2308 } 2309 2310 #ifdef CONFIG_SMP 2311 inline struct dl_bw *dl_bw_of(int i) 2312 { 2313 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 2314 "sched RCU must be held"); 2315 return &cpu_rq(i)->rd->dl_bw; 2316 } 2317 2318 static inline int dl_bw_cpus(int i) 2319 { 2320 struct root_domain *rd = cpu_rq(i)->rd; 2321 int cpus = 0; 2322 2323 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 2324 "sched RCU must be held"); 2325 for_each_cpu_and(i, rd->span, cpu_active_mask) 2326 cpus++; 2327 2328 return cpus; 2329 } 2330 #else 2331 inline struct dl_bw *dl_bw_of(int i) 2332 { 2333 return &cpu_rq(i)->dl.dl_bw; 2334 } 2335 2336 static inline int dl_bw_cpus(int i) 2337 { 2338 return 1; 2339 } 2340 #endif 2341 2342 /* 2343 * We must be sure that accepting a new task (or allowing changing the 2344 * parameters of an existing one) is consistent with the bandwidth 2345 * constraints. If yes, this function also accordingly updates the currently 2346 * allocated bandwidth to reflect the new situation. 2347 * 2348 * This function is called while holding p's rq->lock. 2349 * 2350 * XXX we should delay bw change until the task's 0-lag point, see 2351 * __setparam_dl(). 2352 */ 2353 static int dl_overflow(struct task_struct *p, int policy, 2354 const struct sched_attr *attr) 2355 { 2356 2357 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2358 u64 period = attr->sched_period ?: attr->sched_deadline; 2359 u64 runtime = attr->sched_runtime; 2360 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 2361 int cpus, err = -1; 2362 2363 if (new_bw == p->dl.dl_bw) 2364 return 0; 2365 2366 /* 2367 * Either if a task, enters, leave, or stays -deadline but changes 2368 * its parameters, we may need to update accordingly the total 2369 * allocated bandwidth of the container. 2370 */ 2371 raw_spin_lock(&dl_b->lock); 2372 cpus = dl_bw_cpus(task_cpu(p)); 2373 if (dl_policy(policy) && !task_has_dl_policy(p) && 2374 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2375 __dl_add(dl_b, new_bw); 2376 err = 0; 2377 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2378 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 2379 __dl_clear(dl_b, p->dl.dl_bw); 2380 __dl_add(dl_b, new_bw); 2381 err = 0; 2382 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 2383 __dl_clear(dl_b, p->dl.dl_bw); 2384 err = 0; 2385 } 2386 raw_spin_unlock(&dl_b->lock); 2387 2388 return err; 2389 } 2390 2391 extern void init_dl_bw(struct dl_bw *dl_b); 2392 2393 /* 2394 * wake_up_new_task - wake up a newly created task for the first time. 2395 * 2396 * This function will do some initial scheduler statistics housekeeping 2397 * that must be done for every newly created context, then puts the task 2398 * on the runqueue and wakes it. 
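 *
 * This is called from the fork path, after copy_process() has fully set
 * up the child (in particular after sched_fork() has run for @p).
 *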
2399 */ 2400 void wake_up_new_task(struct task_struct *p) 2401 { 2402 unsigned long flags; 2403 struct rq *rq; 2404 2405 raw_spin_lock_irqsave(&p->pi_lock, flags); 2406 /* Initialize new task's runnable average */ 2407 init_entity_runnable_average(&p->se); 2408 #ifdef CONFIG_SMP 2409 /* 2410 * Fork balancing, do it here and not earlier because: 2411 * - cpus_allowed can change in the fork path 2412 * - any previously selected cpu might disappear through hotplug 2413 */ 2414 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2415 #endif 2416 2417 rq = __task_rq_lock(p); 2418 activate_task(rq, p, 0); 2419 p->on_rq = TASK_ON_RQ_QUEUED; 2420 trace_sched_wakeup_new(p); 2421 check_preempt_curr(rq, p, WF_FORK); 2422 #ifdef CONFIG_SMP 2423 if (p->sched_class->task_woken) { 2424 /* 2425 * Nothing relies on rq->lock after this, so its fine to 2426 * drop it. 2427 */ 2428 lockdep_unpin_lock(&rq->lock); 2429 p->sched_class->task_woken(rq, p); 2430 lockdep_pin_lock(&rq->lock); 2431 } 2432 #endif 2433 task_rq_unlock(rq, p, &flags); 2434 } 2435 2436 #ifdef CONFIG_PREEMPT_NOTIFIERS 2437 2438 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; 2439 2440 void preempt_notifier_inc(void) 2441 { 2442 static_key_slow_inc(&preempt_notifier_key); 2443 } 2444 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2445 2446 void preempt_notifier_dec(void) 2447 { 2448 static_key_slow_dec(&preempt_notifier_key); 2449 } 2450 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2451 2452 /** 2453 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2454 * @notifier: notifier struct to register 2455 */ 2456 void preempt_notifier_register(struct preempt_notifier *notifier) 2457 { 2458 if (!static_key_false(&preempt_notifier_key)) 2459 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2460 2461 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2462 } 2463 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2464 2465 /** 2466 * preempt_notifier_unregister - no longer interested in preemption notifications 2467 * @notifier: notifier struct to unregister 2468 * 2469 * This is *not* safe to call from within a preemption notifier. 
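 *
 * A minimal usage sketch of the notifier API (my_ops / my_sched_in /
 * my_sched_out are made-up names; KVM's vcpu notifier is the in-tree
 * user):
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu)
 *	{
 *		// current is being scheduled back in on @cpu
 *	}
 *
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next)
 *	{
 *		// current is being preempted in favour of @next
 *	}
 *
 *	static struct preempt_ops my_ops = {
 *		.sched_in	= my_sched_in,
 *		.sched_out	= my_sched_out,
 *	};
 *
 *	struct preempt_notifier pn;
 *
 *	preempt_notifier_inc();			// enable the static key
 *	preempt_notifier_init(&pn, &my_ops);
 *	preempt_notifier_register(&pn);		// current gets callbacks
 *	...
 *	preempt_notifier_unregister(&pn);
 *	preempt_notifier_dec();
 *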
2470 */ 2471 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2472 { 2473 hlist_del(¬ifier->link); 2474 } 2475 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2476 2477 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 2478 { 2479 struct preempt_notifier *notifier; 2480 2481 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2482 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2483 } 2484 2485 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2486 { 2487 if (static_key_false(&preempt_notifier_key)) 2488 __fire_sched_in_preempt_notifiers(curr); 2489 } 2490 2491 static void 2492 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 2493 struct task_struct *next) 2494 { 2495 struct preempt_notifier *notifier; 2496 2497 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2498 notifier->ops->sched_out(notifier, next); 2499 } 2500 2501 static __always_inline void 2502 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2503 struct task_struct *next) 2504 { 2505 if (static_key_false(&preempt_notifier_key)) 2506 __fire_sched_out_preempt_notifiers(curr, next); 2507 } 2508 2509 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2510 2511 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2512 { 2513 } 2514 2515 static inline void 2516 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2517 struct task_struct *next) 2518 { 2519 } 2520 2521 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2522 2523 /** 2524 * prepare_task_switch - prepare to switch tasks 2525 * @rq: the runqueue preparing to switch 2526 * @prev: the current task that is being switched out 2527 * @next: the task we are going to switch to. 2528 * 2529 * This is called with the rq lock held and interrupts off. It must 2530 * be paired with a subsequent finish_task_switch after the context 2531 * switch. 2532 * 2533 * prepare_task_switch sets up locking and calls architecture specific 2534 * hooks. 2535 */ 2536 static inline void 2537 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2538 struct task_struct *next) 2539 { 2540 sched_info_switch(rq, prev, next); 2541 perf_event_task_sched_out(prev, next); 2542 fire_sched_out_preempt_notifiers(prev, next); 2543 prepare_lock_switch(rq, next); 2544 prepare_arch_switch(next); 2545 } 2546 2547 /** 2548 * finish_task_switch - clean up after a task-switch 2549 * @prev: the thread we just switched away from. 2550 * 2551 * finish_task_switch must be called after the context switch, paired 2552 * with a prepare_task_switch call before the context switch. 2553 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2554 * and do any other architecture-specific cleanup actions. 2555 * 2556 * Note that we may have delayed dropping an mm in context_switch(). If 2557 * so, we finish that here outside of the runqueue lock. (Doing it 2558 * with the lock held can cause deadlocks; see schedule() for 2559 * details.) 2560 * 2561 * The context switch have flipped the stack from under us and restored the 2562 * local variables which were saved when this task called schedule() in the 2563 * past. prev == current is still correct but we need to recalculate this_rq 2564 * because prev may have moved to another CPU. 
2565 */ 2566 static struct rq *finish_task_switch(struct task_struct *prev) 2567 __releases(rq->lock) 2568 { 2569 struct rq *rq = this_rq(); 2570 struct mm_struct *mm = rq->prev_mm; 2571 long prev_state; 2572 2573 /* 2574 * The previous task will have left us with a preempt_count of 2 2575 * because it left us after: 2576 * 2577 * schedule() 2578 * preempt_disable(); // 1 2579 * __schedule() 2580 * raw_spin_lock_irq(&rq->lock) // 2 2581 * 2582 * Also, see FORK_PREEMPT_COUNT. 2583 */ 2584 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, 2585 "corrupted preempt_count: %s/%d/0x%x\n", 2586 current->comm, current->pid, preempt_count())) 2587 preempt_count_set(FORK_PREEMPT_COUNT); 2588 2589 rq->prev_mm = NULL; 2590 2591 /* 2592 * A task struct has one reference for the use as "current". 2593 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2594 * schedule one last time. The schedule call will never return, and 2595 * the scheduled task must drop that reference. 2596 * 2597 * We must observe prev->state before clearing prev->on_cpu (in 2598 * finish_lock_switch), otherwise a concurrent wakeup can get prev 2599 * running on another CPU and we could rave with its RUNNING -> DEAD 2600 * transition, resulting in a double drop. 2601 */ 2602 prev_state = prev->state; 2603 vtime_task_switch(prev); 2604 perf_event_task_sched_in(prev, current); 2605 finish_lock_switch(rq, prev); 2606 finish_arch_post_lock_switch(); 2607 2608 fire_sched_in_preempt_notifiers(current); 2609 if (mm) 2610 mmdrop(mm); 2611 if (unlikely(prev_state == TASK_DEAD)) { 2612 if (prev->sched_class->task_dead) 2613 prev->sched_class->task_dead(prev); 2614 2615 /* 2616 * Remove function-return probe instances associated with this 2617 * task and put them back on the free list. 2618 */ 2619 kprobe_flush_task(prev); 2620 put_task_struct(prev); 2621 } 2622 2623 tick_nohz_task_switch(); 2624 return rq; 2625 } 2626 2627 #ifdef CONFIG_SMP 2628 2629 /* rq->lock is NOT held, but preemption is disabled */ 2630 static void __balance_callback(struct rq *rq) 2631 { 2632 struct callback_head *head, *next; 2633 void (*func)(struct rq *rq); 2634 unsigned long flags; 2635 2636 raw_spin_lock_irqsave(&rq->lock, flags); 2637 head = rq->balance_callback; 2638 rq->balance_callback = NULL; 2639 while (head) { 2640 func = (void (*)(struct rq *))head->func; 2641 next = head->next; 2642 head->next = NULL; 2643 head = next; 2644 2645 func(rq); 2646 } 2647 raw_spin_unlock_irqrestore(&rq->lock, flags); 2648 } 2649 2650 static inline void balance_callback(struct rq *rq) 2651 { 2652 if (unlikely(rq->balance_callback)) 2653 __balance_callback(rq); 2654 } 2655 2656 #else 2657 2658 static inline void balance_callback(struct rq *rq) 2659 { 2660 } 2661 2662 #endif 2663 2664 /** 2665 * schedule_tail - first thing a freshly forked thread must call. 2666 * @prev: the thread we just switched away from. 2667 */ 2668 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2669 __releases(rq->lock) 2670 { 2671 struct rq *rq; 2672 2673 /* 2674 * New tasks start with FORK_PREEMPT_COUNT, see there and 2675 * finish_task_switch() for details. 2676 * 2677 * finish_task_switch() will drop rq->lock() and lower preempt_count 2678 * and the preempt_enable() will end up enabling preemption (on 2679 * PREEMPT_COUNT kernels). 
2680 */ 2681 2682 rq = finish_task_switch(prev); 2683 balance_callback(rq); 2684 preempt_enable(); 2685 2686 if (current->set_child_tid) 2687 put_user(task_pid_vnr(current), current->set_child_tid); 2688 } 2689 2690 /* 2691 * context_switch - switch to the new MM and the new thread's register state. 2692 */ 2693 static __always_inline struct rq * 2694 context_switch(struct rq *rq, struct task_struct *prev, 2695 struct task_struct *next) 2696 { 2697 struct mm_struct *mm, *oldmm; 2698 2699 prepare_task_switch(rq, prev, next); 2700 2701 mm = next->mm; 2702 oldmm = prev->active_mm; 2703 /* 2704 * For paravirt, this is coupled with an exit in switch_to to 2705 * combine the page table reload and the switch backend into 2706 * one hypercall. 2707 */ 2708 arch_start_context_switch(prev); 2709 2710 if (!mm) { 2711 next->active_mm = oldmm; 2712 atomic_inc(&oldmm->mm_count); 2713 enter_lazy_tlb(oldmm, next); 2714 } else 2715 switch_mm(oldmm, mm, next); 2716 2717 if (!prev->mm) { 2718 prev->active_mm = NULL; 2719 rq->prev_mm = oldmm; 2720 } 2721 /* 2722 * Since the runqueue lock will be released by the next 2723 * task (which is an invalid locking op but in the case 2724 * of the scheduler it's an obvious special-case), so we 2725 * do an early lockdep release here: 2726 */ 2727 lockdep_unpin_lock(&rq->lock); 2728 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2729 2730 /* Here we just switch the register state and the stack. */ 2731 switch_to(prev, next, prev); 2732 barrier(); 2733 2734 return finish_task_switch(prev); 2735 } 2736 2737 /* 2738 * nr_running and nr_context_switches: 2739 * 2740 * externally visible scheduler statistics: current number of runnable 2741 * threads, total number of context switches performed since bootup. 2742 */ 2743 unsigned long nr_running(void) 2744 { 2745 unsigned long i, sum = 0; 2746 2747 for_each_online_cpu(i) 2748 sum += cpu_rq(i)->nr_running; 2749 2750 return sum; 2751 } 2752 2753 /* 2754 * Check if only the current task is running on the cpu. 2755 * 2756 * Caution: this function does not check that the caller has disabled 2757 * preemption, thus the result might have a time-of-check-to-time-of-use 2758 * race. The caller is responsible to use it correctly, for example: 2759 * 2760 * - from a non-preemptable section (of course) 2761 * 2762 * - from a thread that is bound to a single CPU 2763 * 2764 * - in a loop with very short iterations (e.g. a polling loop) 2765 */ 2766 bool single_task_running(void) 2767 { 2768 return raw_rq()->nr_running == 1; 2769 } 2770 EXPORT_SYMBOL(single_task_running); 2771 2772 unsigned long long nr_context_switches(void) 2773 { 2774 int i; 2775 unsigned long long sum = 0; 2776 2777 for_each_possible_cpu(i) 2778 sum += cpu_rq(i)->nr_switches; 2779 2780 return sum; 2781 } 2782 2783 unsigned long nr_iowait(void) 2784 { 2785 unsigned long i, sum = 0; 2786 2787 for_each_possible_cpu(i) 2788 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2789 2790 return sum; 2791 } 2792 2793 unsigned long nr_iowait_cpu(int cpu) 2794 { 2795 struct rq *this = cpu_rq(cpu); 2796 return atomic_read(&this->nr_iowait); 2797 } 2798 2799 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2800 { 2801 struct rq *rq = this_rq(); 2802 *nr_waiters = atomic_read(&rq->nr_iowait); 2803 *load = rq->load.weight; 2804 } 2805 2806 #ifdef CONFIG_SMP 2807 2808 /* 2809 * sched_exec - execve() is a valuable balancing opportunity, because at 2810 * this point the task has the smallest effective memory and cache footprint. 
2811 */ 2812 void sched_exec(void) 2813 { 2814 struct task_struct *p = current; 2815 unsigned long flags; 2816 int dest_cpu; 2817 2818 raw_spin_lock_irqsave(&p->pi_lock, flags); 2819 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2820 if (dest_cpu == smp_processor_id()) 2821 goto unlock; 2822 2823 if (likely(cpu_active(dest_cpu))) { 2824 struct migration_arg arg = { p, dest_cpu }; 2825 2826 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2827 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2828 return; 2829 } 2830 unlock: 2831 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2832 } 2833 2834 #endif 2835 2836 DEFINE_PER_CPU(struct kernel_stat, kstat); 2837 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2838 2839 EXPORT_PER_CPU_SYMBOL(kstat); 2840 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2841 2842 /* 2843 * Return accounted runtime for the task. 2844 * In case the task is currently running, return the runtime plus current's 2845 * pending runtime that have not been accounted yet. 2846 */ 2847 unsigned long long task_sched_runtime(struct task_struct *p) 2848 { 2849 unsigned long flags; 2850 struct rq *rq; 2851 u64 ns; 2852 2853 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2854 /* 2855 * 64-bit doesn't need locks to atomically read a 64bit value. 2856 * So we have a optimization chance when the task's delta_exec is 0. 2857 * Reading ->on_cpu is racy, but this is ok. 2858 * 2859 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2860 * If we race with it entering cpu, unaccounted time is 0. This is 2861 * indistinguishable from the read occurring a few cycles earlier. 2862 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2863 * been accounted, so we're correct here as well. 2864 */ 2865 if (!p->on_cpu || !task_on_rq_queued(p)) 2866 return p->se.sum_exec_runtime; 2867 #endif 2868 2869 rq = task_rq_lock(p, &flags); 2870 /* 2871 * Must be ->curr _and_ ->on_rq. If dequeued, we would 2872 * project cycles that may never be accounted to this 2873 * thread, breaking clock_gettime(). 2874 */ 2875 if (task_current(rq, p) && task_on_rq_queued(p)) { 2876 update_rq_clock(rq); 2877 p->sched_class->update_curr(rq); 2878 } 2879 ns = p->se.sum_exec_runtime; 2880 task_rq_unlock(rq, p, &flags); 2881 2882 return ns; 2883 } 2884 2885 /* 2886 * This function gets called by the timer code, with HZ frequency. 2887 * We call it with interrupts disabled. 2888 */ 2889 void scheduler_tick(void) 2890 { 2891 int cpu = smp_processor_id(); 2892 struct rq *rq = cpu_rq(cpu); 2893 struct task_struct *curr = rq->curr; 2894 2895 sched_clock_tick(); 2896 2897 raw_spin_lock(&rq->lock); 2898 update_rq_clock(rq); 2899 curr->sched_class->task_tick(rq, curr, 0); 2900 update_cpu_load_active(rq); 2901 calc_global_load_tick(rq); 2902 raw_spin_unlock(&rq->lock); 2903 2904 perf_event_task_tick(); 2905 2906 #ifdef CONFIG_SMP 2907 rq->idle_balance = idle_cpu(cpu); 2908 trigger_load_balance(rq); 2909 #endif 2910 rq_last_tick_reset(rq); 2911 } 2912 2913 #ifdef CONFIG_NO_HZ_FULL 2914 /** 2915 * scheduler_tick_max_deferment 2916 * 2917 * Keep at least one tick per second when a single 2918 * active task is running because the scheduler doesn't 2919 * yet completely support full dynticks environment. 2920 * 2921 * This makes sure that uptime, CFS vruntime, load 2922 * balancing, etc... continue to move forward, even 2923 * with a very low granularity. 2924 * 2925 * Return: Maximum deferment in nanoseconds. 
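 *
 * For example, with HZ == 1000 and the last tick having run 250 jiffies
 * ago, the next forced tick is due in another 750 jiffies, so this
 * returns jiffies_to_nsecs(750), i.e. 750ms expressed in nanoseconds.
 *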
2926 */ 2927 u64 scheduler_tick_max_deferment(void) 2928 { 2929 struct rq *rq = this_rq(); 2930 unsigned long next, now = READ_ONCE(jiffies); 2931 2932 next = rq->last_sched_tick + HZ; 2933 2934 if (time_before_eq(next, now)) 2935 return 0; 2936 2937 return jiffies_to_nsecs(next - now); 2938 } 2939 #endif 2940 2941 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2942 defined(CONFIG_PREEMPT_TRACER)) 2943 2944 void preempt_count_add(int val) 2945 { 2946 #ifdef CONFIG_DEBUG_PREEMPT 2947 /* 2948 * Underflow? 2949 */ 2950 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2951 return; 2952 #endif 2953 __preempt_count_add(val); 2954 #ifdef CONFIG_DEBUG_PREEMPT 2955 /* 2956 * Spinlock count overflowing soon? 2957 */ 2958 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2959 PREEMPT_MASK - 10); 2960 #endif 2961 if (preempt_count() == val) { 2962 unsigned long ip = get_lock_parent_ip(); 2963 #ifdef CONFIG_DEBUG_PREEMPT 2964 current->preempt_disable_ip = ip; 2965 #endif 2966 trace_preempt_off(CALLER_ADDR0, ip); 2967 } 2968 } 2969 EXPORT_SYMBOL(preempt_count_add); 2970 NOKPROBE_SYMBOL(preempt_count_add); 2971 2972 void preempt_count_sub(int val) 2973 { 2974 #ifdef CONFIG_DEBUG_PREEMPT 2975 /* 2976 * Underflow? 2977 */ 2978 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2979 return; 2980 /* 2981 * Is the spinlock portion underflowing? 2982 */ 2983 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2984 !(preempt_count() & PREEMPT_MASK))) 2985 return; 2986 #endif 2987 2988 if (preempt_count() == val) 2989 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 2990 __preempt_count_sub(val); 2991 } 2992 EXPORT_SYMBOL(preempt_count_sub); 2993 NOKPROBE_SYMBOL(preempt_count_sub); 2994 2995 #endif 2996 2997 /* 2998 * Print scheduling while atomic bug: 2999 */ 3000 static noinline void __schedule_bug(struct task_struct *prev) 3001 { 3002 if (oops_in_progress) 3003 return; 3004 3005 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3006 prev->comm, prev->pid, preempt_count()); 3007 3008 debug_show_held_locks(prev); 3009 print_modules(); 3010 if (irqs_disabled()) 3011 print_irqtrace_events(prev); 3012 #ifdef CONFIG_DEBUG_PREEMPT 3013 if (in_atomic_preempt_off()) { 3014 pr_err("Preemption disabled at:"); 3015 print_ip_sym(current->preempt_disable_ip); 3016 pr_cont("\n"); 3017 } 3018 #endif 3019 dump_stack(); 3020 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3021 } 3022 3023 /* 3024 * Various schedule()-time debugging checks and statistics: 3025 */ 3026 static inline void schedule_debug(struct task_struct *prev) 3027 { 3028 #ifdef CONFIG_SCHED_STACK_END_CHECK 3029 BUG_ON(task_stack_end_corrupted(prev)); 3030 #endif 3031 3032 if (unlikely(in_atomic_preempt_off())) { 3033 __schedule_bug(prev); 3034 preempt_count_set(PREEMPT_DISABLED); 3035 } 3036 rcu_sleep_check(); 3037 3038 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3039 3040 schedstat_inc(this_rq(), sched_count); 3041 } 3042 3043 /* 3044 * Pick up the highest-prio task: 3045 */ 3046 static inline struct task_struct * 3047 pick_next_task(struct rq *rq, struct task_struct *prev) 3048 { 3049 const struct sched_class *class = &fair_sched_class; 3050 struct task_struct *p; 3051 3052 /* 3053 * Optimization: we know that if all tasks are in 3054 * the fair class we can call that function directly: 3055 */ 3056 if (likely(prev->sched_class == class && 3057 rq->nr_running == rq->cfs.h_nr_running)) { 3058 p = fair_sched_class.pick_next_task(rq, prev); 3059 if (unlikely(p == RETRY_TASK)) 3060 goto again; 3061 3062 /* assumes 
fair_sched_class->next == idle_sched_class */ 3063 if (unlikely(!p)) 3064 p = idle_sched_class.pick_next_task(rq, prev); 3065 3066 return p; 3067 } 3068 3069 again: 3070 for_each_class(class) { 3071 p = class->pick_next_task(rq, prev); 3072 if (p) { 3073 if (unlikely(p == RETRY_TASK)) 3074 goto again; 3075 return p; 3076 } 3077 } 3078 3079 BUG(); /* the idle class will always have a runnable task */ 3080 } 3081 3082 /* 3083 * __schedule() is the main scheduler function. 3084 * 3085 * The main means of driving the scheduler and thus entering this function are: 3086 * 3087 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 3088 * 3089 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 3090 * paths. For example, see arch/x86/entry_64.S. 3091 * 3092 * To drive preemption between tasks, the scheduler sets the flag in timer 3093 * interrupt handler scheduler_tick(). 3094 * 3095 * 3. Wakeups don't really cause entry into schedule(). They add a 3096 * task to the run-queue and that's it. 3097 * 3098 * Now, if the new task added to the run-queue preempts the current 3099 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 3100 * called on the nearest possible occasion: 3101 * 3102 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 3103 * 3104 * - in syscall or exception context, at the next outmost 3105 * preempt_enable(). (this might be as soon as the wake_up()'s 3106 * spin_unlock()!) 3107 * 3108 * - in IRQ context, return from interrupt-handler to 3109 * preemptible context 3110 * 3111 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 3112 * then at the next: 3113 * 3114 * - cond_resched() call 3115 * - explicit schedule() call 3116 * - return from syscall or exception to user-space 3117 * - return from interrupt-handler to user-space 3118 * 3119 * WARNING: must be called with preemption disabled! 3120 */ 3121 static void __sched notrace __schedule(bool preempt) 3122 { 3123 struct task_struct *prev, *next; 3124 unsigned long *switch_count; 3125 struct rq *rq; 3126 int cpu; 3127 3128 cpu = smp_processor_id(); 3129 rq = cpu_rq(cpu); 3130 prev = rq->curr; 3131 3132 /* 3133 * do_exit() calls schedule() with preemption disabled as an exception; 3134 * however we must fix that up, otherwise the next task will see an 3135 * inconsistent (higher) preempt count. 3136 * 3137 * It also avoids the below schedule_debug() test from complaining 3138 * about this. 3139 */ 3140 if (unlikely(prev->state == TASK_DEAD)) 3141 preempt_enable_no_resched_notrace(); 3142 3143 schedule_debug(prev); 3144 3145 if (sched_feat(HRTICK)) 3146 hrtick_clear(rq); 3147 3148 local_irq_disable(); 3149 rcu_note_context_switch(); 3150 3151 /* 3152 * Make sure that signal_pending_state()->signal_pending() below 3153 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3154 * done by the caller to avoid the race with signal_wake_up(). 3155 */ 3156 smp_mb__before_spinlock(); 3157 raw_spin_lock(&rq->lock); 3158 lockdep_pin_lock(&rq->lock); 3159 3160 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3161 3162 switch_count = &prev->nivcsw; 3163 if (!preempt && prev->state) { 3164 if (unlikely(signal_pending_state(prev->state, prev))) { 3165 prev->state = TASK_RUNNING; 3166 } else { 3167 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3168 prev->on_rq = 0; 3169 3170 /* 3171 * If a worker went to sleep, notify and ask workqueue 3172 * whether it wants to wake up a task to maintain 3173 * concurrency. 
3174 */ 3175 if (prev->flags & PF_WQ_WORKER) { 3176 struct task_struct *to_wakeup; 3177 3178 to_wakeup = wq_worker_sleeping(prev); 3179 if (to_wakeup) 3180 try_to_wake_up_local(to_wakeup); 3181 } 3182 } 3183 switch_count = &prev->nvcsw; 3184 } 3185 3186 if (task_on_rq_queued(prev)) 3187 update_rq_clock(rq); 3188 3189 next = pick_next_task(rq, prev); 3190 clear_tsk_need_resched(prev); 3191 clear_preempt_need_resched(); 3192 rq->clock_skip_update = 0; 3193 3194 if (likely(prev != next)) { 3195 rq->nr_switches++; 3196 rq->curr = next; 3197 ++*switch_count; 3198 3199 trace_sched_switch(preempt, prev, next); 3200 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3201 } else { 3202 lockdep_unpin_lock(&rq->lock); 3203 raw_spin_unlock_irq(&rq->lock); 3204 } 3205 3206 balance_callback(rq); 3207 } 3208 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ 3209 3210 static inline void sched_submit_work(struct task_struct *tsk) 3211 { 3212 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3213 return; 3214 /* 3215 * If we are going to sleep and we have plugged IO queued, 3216 * make sure to submit it to avoid deadlocks. 3217 */ 3218 if (blk_needs_flush_plug(tsk)) 3219 blk_schedule_flush_plug(tsk); 3220 } 3221 3222 asmlinkage __visible void __sched schedule(void) 3223 { 3224 struct task_struct *tsk = current; 3225 3226 sched_submit_work(tsk); 3227 do { 3228 preempt_disable(); 3229 __schedule(false); 3230 sched_preempt_enable_no_resched(); 3231 } while (need_resched()); 3232 } 3233 EXPORT_SYMBOL(schedule); 3234 3235 #ifdef CONFIG_CONTEXT_TRACKING 3236 asmlinkage __visible void __sched schedule_user(void) 3237 { 3238 /* 3239 * If we come here after a random call to set_need_resched(), 3240 * or we have been woken up remotely but the IPI has not yet arrived, 3241 * we haven't yet exited the RCU idle mode. Do it here manually until 3242 * we find a better solution. 3243 * 3244 * NB: There are buggy callers of this function. Ideally we 3245 * should warn if prev_state != CONTEXT_USER, but that will trigger 3246 * too frequently to make sense yet. 3247 */ 3248 enum ctx_state prev_state = exception_enter(); 3249 schedule(); 3250 exception_exit(prev_state); 3251 } 3252 #endif 3253 3254 /** 3255 * schedule_preempt_disabled - called with preemption disabled 3256 * 3257 * Returns with preemption disabled. Note: preempt_count must be 1 3258 */ 3259 void __sched schedule_preempt_disabled(void) 3260 { 3261 sched_preempt_enable_no_resched(); 3262 schedule(); 3263 preempt_disable(); 3264 } 3265 3266 static void __sched notrace preempt_schedule_common(void) 3267 { 3268 do { 3269 preempt_disable_notrace(); 3270 __schedule(true); 3271 preempt_enable_no_resched_notrace(); 3272 3273 /* 3274 * Check again in case we missed a preemption opportunity 3275 * between schedule and now. 3276 */ 3277 } while (need_resched()); 3278 } 3279 3280 #ifdef CONFIG_PREEMPT 3281 /* 3282 * this is the entry point to schedule() from in-kernel preemption 3283 * off of preempt_enable. Kernel preemptions off return from interrupt 3284 * occur there and call schedule directly. 3285 */ 3286 asmlinkage __visible void __sched notrace preempt_schedule(void) 3287 { 3288 /* 3289 * If there is a non-zero preempt_count or interrupts are disabled, 3290 * we do not want to preempt the current task. Just return.. 
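 *
 * (Both conditions are covered by the single preemptible() test below,
 * which expands to roughly: preempt_count() == 0 && !irqs_disabled().)
 *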
3291 */ 3292 if (likely(!preemptible())) 3293 return; 3294 3295 preempt_schedule_common(); 3296 } 3297 NOKPROBE_SYMBOL(preempt_schedule); 3298 EXPORT_SYMBOL(preempt_schedule); 3299 3300 /** 3301 * preempt_schedule_notrace - preempt_schedule called by tracing 3302 * 3303 * The tracing infrastructure uses preempt_enable_notrace to prevent 3304 * recursion and tracing preempt enabling caused by the tracing 3305 * infrastructure itself. But as tracing can happen in areas coming 3306 * from userspace or just about to enter userspace, a preempt enable 3307 * can occur before user_exit() is called. This will cause the scheduler 3308 * to be called when the system is still in usermode. 3309 * 3310 * To prevent this, the preempt_enable_notrace will use this function 3311 * instead of preempt_schedule() to exit user context if needed before 3312 * calling the scheduler. 3313 */ 3314 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 3315 { 3316 enum ctx_state prev_ctx; 3317 3318 if (likely(!preemptible())) 3319 return; 3320 3321 do { 3322 preempt_disable_notrace(); 3323 /* 3324 * Needs preempt disabled in case user_exit() is traced 3325 * and the tracer calls preempt_enable_notrace() causing 3326 * an infinite recursion. 3327 */ 3328 prev_ctx = exception_enter(); 3329 __schedule(true); 3330 exception_exit(prev_ctx); 3331 3332 preempt_enable_no_resched_notrace(); 3333 } while (need_resched()); 3334 } 3335 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3336 3337 #endif /* CONFIG_PREEMPT */ 3338 3339 /* 3340 * this is the entry point to schedule() from kernel preemption 3341 * off of irq context. 3342 * Note, that this is called and return with irqs disabled. This will 3343 * protect us against recursive calling from irq. 3344 */ 3345 asmlinkage __visible void __sched preempt_schedule_irq(void) 3346 { 3347 enum ctx_state prev_state; 3348 3349 /* Catch callers which need to be fixed */ 3350 BUG_ON(preempt_count() || !irqs_disabled()); 3351 3352 prev_state = exception_enter(); 3353 3354 do { 3355 preempt_disable(); 3356 local_irq_enable(); 3357 __schedule(true); 3358 local_irq_disable(); 3359 sched_preempt_enable_no_resched(); 3360 } while (need_resched()); 3361 3362 exception_exit(prev_state); 3363 } 3364 3365 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3366 void *key) 3367 { 3368 return try_to_wake_up(curr->private, mode, wake_flags); 3369 } 3370 EXPORT_SYMBOL(default_wake_function); 3371 3372 #ifdef CONFIG_RT_MUTEXES 3373 3374 /* 3375 * rt_mutex_setprio - set the current priority of a task 3376 * @p: task 3377 * @prio: prio value (kernel-internal form) 3378 * 3379 * This function changes the 'effective' priority of a task. It does 3380 * not touch ->normal_prio like __setscheduler(). 3381 * 3382 * Used by the rt_mutex code to implement priority inheritance 3383 * logic. Call site only calls if the priority of the task changed. 3384 */ 3385 void rt_mutex_setprio(struct task_struct *p, int prio) 3386 { 3387 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; 3388 struct rq *rq; 3389 const struct sched_class *prev_class; 3390 3391 BUG_ON(prio > MAX_PRIO); 3392 3393 rq = __task_rq_lock(p); 3394 3395 /* 3396 * Idle task boosting is a nono in general. There is one 3397 * exception, when PREEMPT_RT and NOHZ is active: 3398 * 3399 * The idle task calls get_next_timer_interrupt() and holds 3400 * the timer wheel base->lock on the CPU and another CPU wants 3401 * to access the timer (probably to cancel it). 
We can safely 3402 * ignore the boosting request, as the idle CPU runs this code 3403 * with interrupts disabled and will complete the lock 3404 * protected section without being interrupted. So there is no 3405 * real need to boost. 3406 */ 3407 if (unlikely(p == rq->idle)) { 3408 WARN_ON(p != rq->curr); 3409 WARN_ON(p->pi_blocked_on); 3410 goto out_unlock; 3411 } 3412 3413 trace_sched_pi_setprio(p, prio); 3414 oldprio = p->prio; 3415 3416 if (oldprio == prio) 3417 queue_flag &= ~DEQUEUE_MOVE; 3418 3419 prev_class = p->sched_class; 3420 queued = task_on_rq_queued(p); 3421 running = task_current(rq, p); 3422 if (queued) 3423 dequeue_task(rq, p, queue_flag); 3424 if (running) 3425 put_prev_task(rq, p); 3426 3427 /* 3428 * Boosting condition are: 3429 * 1. -rt task is running and holds mutex A 3430 * --> -dl task blocks on mutex A 3431 * 3432 * 2. -dl task is running and holds mutex A 3433 * --> -dl task blocks on mutex A and could preempt the 3434 * running task 3435 */ 3436 if (dl_prio(prio)) { 3437 struct task_struct *pi_task = rt_mutex_get_top_task(p); 3438 if (!dl_prio(p->normal_prio) || 3439 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3440 p->dl.dl_boosted = 1; 3441 queue_flag |= ENQUEUE_REPLENISH; 3442 } else 3443 p->dl.dl_boosted = 0; 3444 p->sched_class = &dl_sched_class; 3445 } else if (rt_prio(prio)) { 3446 if (dl_prio(oldprio)) 3447 p->dl.dl_boosted = 0; 3448 if (oldprio < prio) 3449 queue_flag |= ENQUEUE_HEAD; 3450 p->sched_class = &rt_sched_class; 3451 } else { 3452 if (dl_prio(oldprio)) 3453 p->dl.dl_boosted = 0; 3454 if (rt_prio(oldprio)) 3455 p->rt.timeout = 0; 3456 p->sched_class = &fair_sched_class; 3457 } 3458 3459 p->prio = prio; 3460 3461 if (running) 3462 p->sched_class->set_curr_task(rq); 3463 if (queued) 3464 enqueue_task(rq, p, queue_flag); 3465 3466 check_class_changed(rq, p, prev_class, oldprio); 3467 out_unlock: 3468 preempt_disable(); /* avoid rq from going away on us */ 3469 __task_rq_unlock(rq); 3470 3471 balance_callback(rq); 3472 preempt_enable(); 3473 } 3474 #endif 3475 3476 void set_user_nice(struct task_struct *p, long nice) 3477 { 3478 int old_prio, delta, queued; 3479 unsigned long flags; 3480 struct rq *rq; 3481 3482 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3483 return; 3484 /* 3485 * We have to be careful, if called from sys_setpriority(), 3486 * the task might be in the middle of scheduling on another CPU. 
3487 */ 3488 rq = task_rq_lock(p, &flags); 3489 /* 3490 * The RT priorities are set via sched_setscheduler(), but we still 3491 * allow the 'normal' nice value to be set - but as expected 3492 * it wont have any effect on scheduling until the task is 3493 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3494 */ 3495 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3496 p->static_prio = NICE_TO_PRIO(nice); 3497 goto out_unlock; 3498 } 3499 queued = task_on_rq_queued(p); 3500 if (queued) 3501 dequeue_task(rq, p, DEQUEUE_SAVE); 3502 3503 p->static_prio = NICE_TO_PRIO(nice); 3504 set_load_weight(p); 3505 old_prio = p->prio; 3506 p->prio = effective_prio(p); 3507 delta = p->prio - old_prio; 3508 3509 if (queued) { 3510 enqueue_task(rq, p, ENQUEUE_RESTORE); 3511 /* 3512 * If the task increased its priority or is running and 3513 * lowered its priority, then reschedule its CPU: 3514 */ 3515 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3516 resched_curr(rq); 3517 } 3518 out_unlock: 3519 task_rq_unlock(rq, p, &flags); 3520 } 3521 EXPORT_SYMBOL(set_user_nice); 3522 3523 /* 3524 * can_nice - check if a task can reduce its nice value 3525 * @p: task 3526 * @nice: nice value 3527 */ 3528 int can_nice(const struct task_struct *p, const int nice) 3529 { 3530 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3531 int nice_rlim = nice_to_rlimit(nice); 3532 3533 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3534 capable(CAP_SYS_NICE)); 3535 } 3536 3537 #ifdef __ARCH_WANT_SYS_NICE 3538 3539 /* 3540 * sys_nice - change the priority of the current process. 3541 * @increment: priority increment 3542 * 3543 * sys_setpriority is a more generic, but much slower function that 3544 * does similar things. 3545 */ 3546 SYSCALL_DEFINE1(nice, int, increment) 3547 { 3548 long nice, retval; 3549 3550 /* 3551 * Setpriority might change our priority at the same moment. 3552 * We don't have to worry. Conceptually one call occurs first 3553 * and we have a single winner. 3554 */ 3555 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3556 nice = task_nice(current) + increment; 3557 3558 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3559 if (increment < 0 && !can_nice(current, nice)) 3560 return -EPERM; 3561 3562 retval = security_task_setnice(current, nice); 3563 if (retval) 3564 return retval; 3565 3566 set_user_nice(current, nice); 3567 return 0; 3568 } 3569 3570 #endif 3571 3572 /** 3573 * task_prio - return the priority value of a given task. 3574 * @p: the task in question. 3575 * 3576 * Return: The priority value as seen by users in /proc. 3577 * RT tasks are offset by -200. Normal tasks are centered 3578 * around 0, value goes from -16 to +15. 3579 */ 3580 int task_prio(const struct task_struct *p) 3581 { 3582 return p->prio - MAX_RT_PRIO; 3583 } 3584 3585 /** 3586 * idle_cpu - is a given cpu idle currently? 3587 * @cpu: the processor in question. 3588 * 3589 * Return: 1 if the CPU is currently idle. 0 otherwise. 3590 */ 3591 int idle_cpu(int cpu) 3592 { 3593 struct rq *rq = cpu_rq(cpu); 3594 3595 if (rq->curr != rq->idle) 3596 return 0; 3597 3598 if (rq->nr_running) 3599 return 0; 3600 3601 #ifdef CONFIG_SMP 3602 if (!llist_empty(&rq->wake_list)) 3603 return 0; 3604 #endif 3605 3606 return 1; 3607 } 3608 3609 /** 3610 * idle_task - return the idle task for a given cpu. 3611 * @cpu: the processor in question. 3612 * 3613 * Return: The idle task for the cpu @cpu. 
3614 */ 3615 struct task_struct *idle_task(int cpu) 3616 { 3617 return cpu_rq(cpu)->idle; 3618 } 3619 3620 /** 3621 * find_process_by_pid - find a process with a matching PID value. 3622 * @pid: the pid in question. 3623 * 3624 * The task of @pid, if found. %NULL otherwise. 3625 */ 3626 static struct task_struct *find_process_by_pid(pid_t pid) 3627 { 3628 return pid ? find_task_by_vpid(pid) : current; 3629 } 3630 3631 /* 3632 * This function initializes the sched_dl_entity of a newly becoming 3633 * SCHED_DEADLINE task. 3634 * 3635 * Only the static values are considered here, the actual runtime and the 3636 * absolute deadline will be properly calculated when the task is enqueued 3637 * for the first time with its new policy. 3638 */ 3639 static void 3640 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3641 { 3642 struct sched_dl_entity *dl_se = &p->dl; 3643 3644 dl_se->dl_runtime = attr->sched_runtime; 3645 dl_se->dl_deadline = attr->sched_deadline; 3646 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3647 dl_se->flags = attr->sched_flags; 3648 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3649 3650 /* 3651 * Changing the parameters of a task is 'tricky' and we're not doing 3652 * the correct thing -- also see task_dead_dl() and switched_from_dl(). 3653 * 3654 * What we SHOULD do is delay the bandwidth release until the 0-lag 3655 * point. This would include retaining the task_struct until that time 3656 * and change dl_overflow() to not immediately decrement the current 3657 * amount. 3658 * 3659 * Instead we retain the current runtime/deadline and let the new 3660 * parameters take effect after the current reservation period lapses. 3661 * This is safe (albeit pessimistic) because the 0-lag point is always 3662 * before the current scheduling deadline. 3663 * 3664 * We can still have temporary overloads because we do not delay the 3665 * change in bandwidth until that time; so admission control is 3666 * not on the safe side. It does however guarantee tasks will never 3667 * consume more than promised. 3668 */ 3669 } 3670 3671 /* 3672 * sched_setparam() passes in -1 for its policy, to let the functions 3673 * it calls know not to change it. 3674 */ 3675 #define SETPARAM_POLICY -1 3676 3677 static void __setscheduler_params(struct task_struct *p, 3678 const struct sched_attr *attr) 3679 { 3680 int policy = attr->sched_policy; 3681 3682 if (policy == SETPARAM_POLICY) 3683 policy = p->policy; 3684 3685 p->policy = policy; 3686 3687 if (dl_policy(policy)) 3688 __setparam_dl(p, attr); 3689 else if (fair_policy(policy)) 3690 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3691 3692 /* 3693 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3694 * !rt_policy. Always setting this ensures that things like 3695 * getparam()/getattr() don't report silly values for !rt tasks. 3696 */ 3697 p->rt_priority = attr->sched_priority; 3698 p->normal_prio = normal_prio(p); 3699 set_load_weight(p); 3700 } 3701 3702 /* Actually do priority change: must hold pi & rq lock. */ 3703 static void __setscheduler(struct rq *rq, struct task_struct *p, 3704 const struct sched_attr *attr, bool keep_boost) 3705 { 3706 __setscheduler_params(p, attr); 3707 3708 /* 3709 * Keep a potential priority boosting if called from 3710 * sched_setscheduler(). 
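 *
 * For example, if the task still owns an rt_mutex with a higher-priority
 * waiter, rt_mutex_get_effective_prio() keeps ->prio at the boosted
 * value here; the boost is only re-evaluated (and dropped when no longer
 * needed) by rt_mutex_adjust_pi() after the parameter change.
 *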
3711 */ 3712 if (keep_boost) 3713 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 3714 else 3715 p->prio = normal_prio(p); 3716 3717 if (dl_prio(p->prio)) 3718 p->sched_class = &dl_sched_class; 3719 else if (rt_prio(p->prio)) 3720 p->sched_class = &rt_sched_class; 3721 else 3722 p->sched_class = &fair_sched_class; 3723 } 3724 3725 static void 3726 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3727 { 3728 struct sched_dl_entity *dl_se = &p->dl; 3729 3730 attr->sched_priority = p->rt_priority; 3731 attr->sched_runtime = dl_se->dl_runtime; 3732 attr->sched_deadline = dl_se->dl_deadline; 3733 attr->sched_period = dl_se->dl_period; 3734 attr->sched_flags = dl_se->flags; 3735 } 3736 3737 /* 3738 * This function validates the new parameters of a -deadline task. 3739 * We ask for the deadline not being zero, and greater or equal 3740 * than the runtime, as well as the period of being zero or 3741 * greater than deadline. Furthermore, we have to be sure that 3742 * user parameters are above the internal resolution of 1us (we 3743 * check sched_runtime only since it is always the smaller one) and 3744 * below 2^63 ns (we have to check both sched_deadline and 3745 * sched_period, as the latter can be zero). 3746 */ 3747 static bool 3748 __checkparam_dl(const struct sched_attr *attr) 3749 { 3750 /* deadline != 0 */ 3751 if (attr->sched_deadline == 0) 3752 return false; 3753 3754 /* 3755 * Since we truncate DL_SCALE bits, make sure we're at least 3756 * that big. 3757 */ 3758 if (attr->sched_runtime < (1ULL << DL_SCALE)) 3759 return false; 3760 3761 /* 3762 * Since we use the MSB for wrap-around and sign issues, make 3763 * sure it's not set (mind that period can be equal to zero). 3764 */ 3765 if (attr->sched_deadline & (1ULL << 63) || 3766 attr->sched_period & (1ULL << 63)) 3767 return false; 3768 3769 /* runtime <= deadline <= period (if period != 0) */ 3770 if ((attr->sched_period != 0 && 3771 attr->sched_period < attr->sched_deadline) || 3772 attr->sched_deadline < attr->sched_runtime) 3773 return false; 3774 3775 return true; 3776 } 3777 3778 /* 3779 * check the target process has a UID that matches the current process's 3780 */ 3781 static bool check_same_owner(struct task_struct *p) 3782 { 3783 const struct cred *cred = current_cred(), *pcred; 3784 bool match; 3785 3786 rcu_read_lock(); 3787 pcred = __task_cred(p); 3788 match = (uid_eq(cred->euid, pcred->euid) || 3789 uid_eq(cred->euid, pcred->uid)); 3790 rcu_read_unlock(); 3791 return match; 3792 } 3793 3794 static bool dl_param_changed(struct task_struct *p, 3795 const struct sched_attr *attr) 3796 { 3797 struct sched_dl_entity *dl_se = &p->dl; 3798 3799 if (dl_se->dl_runtime != attr->sched_runtime || 3800 dl_se->dl_deadline != attr->sched_deadline || 3801 dl_se->dl_period != attr->sched_period || 3802 dl_se->flags != attr->sched_flags) 3803 return true; 3804 3805 return false; 3806 } 3807 3808 static int __sched_setscheduler(struct task_struct *p, 3809 const struct sched_attr *attr, 3810 bool user, bool pi) 3811 { 3812 int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : 3813 MAX_RT_PRIO - 1 - attr->sched_priority; 3814 int retval, oldprio, oldpolicy = -1, queued, running; 3815 int new_effective_prio, policy = attr->sched_policy; 3816 unsigned long flags; 3817 const struct sched_class *prev_class; 3818 struct rq *rq; 3819 int reset_on_fork; 3820 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 3821 3822 /* may grab non-irq protected spin_locks */ 3823 BUG_ON(in_interrupt()); 3824 recheck: 3825 /* double check policy once rq lock held */ 3826 if (policy < 0) { 3827 reset_on_fork = p->sched_reset_on_fork; 3828 policy = oldpolicy = p->policy; 3829 } else { 3830 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3831 3832 if (!valid_policy(policy)) 3833 return -EINVAL; 3834 } 3835 3836 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3837 return -EINVAL; 3838 3839 /* 3840 * Valid priorities for SCHED_FIFO and SCHED_RR are 3841 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3842 * SCHED_BATCH and SCHED_IDLE is 0. 3843 */ 3844 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3845 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3846 return -EINVAL; 3847 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3848 (rt_policy(policy) != (attr->sched_priority != 0))) 3849 return -EINVAL; 3850 3851 /* 3852 * Allow unprivileged RT tasks to decrease priority: 3853 */ 3854 if (user && !capable(CAP_SYS_NICE)) { 3855 if (fair_policy(policy)) { 3856 if (attr->sched_nice < task_nice(p) && 3857 !can_nice(p, attr->sched_nice)) 3858 return -EPERM; 3859 } 3860 3861 if (rt_policy(policy)) { 3862 unsigned long rlim_rtprio = 3863 task_rlimit(p, RLIMIT_RTPRIO); 3864 3865 /* can't set/change the rt policy */ 3866 if (policy != p->policy && !rlim_rtprio) 3867 return -EPERM; 3868 3869 /* can't increase priority */ 3870 if (attr->sched_priority > p->rt_priority && 3871 attr->sched_priority > rlim_rtprio) 3872 return -EPERM; 3873 } 3874 3875 /* 3876 * Can't set/change SCHED_DEADLINE policy at all for now 3877 * (safest behavior); in the future we would like to allow 3878 * unprivileged DL tasks to increase their relative deadline 3879 * or reduce their runtime (both ways reducing utilization) 3880 */ 3881 if (dl_policy(policy)) 3882 return -EPERM; 3883 3884 /* 3885 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3886 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3887 */ 3888 if (idle_policy(p->policy) && !idle_policy(policy)) { 3889 if (!can_nice(p, task_nice(p))) 3890 return -EPERM; 3891 } 3892 3893 /* can't change other user's priorities */ 3894 if (!check_same_owner(p)) 3895 return -EPERM; 3896 3897 /* Normal users shall not reset the sched_reset_on_fork flag */ 3898 if (p->sched_reset_on_fork && !reset_on_fork) 3899 return -EPERM; 3900 } 3901 3902 if (user) { 3903 retval = security_task_setscheduler(p); 3904 if (retval) 3905 return retval; 3906 } 3907 3908 /* 3909 * make sure no PI-waiters arrive (or leave) while we are 3910 * changing the priority of the task: 3911 * 3912 * To be able to change p->policy safely, the appropriate 3913 * runqueue lock must be held. 3914 */ 3915 rq = task_rq_lock(p, &flags); 3916 3917 /* 3918 * Changing the policy of the stop threads its a very bad idea 3919 */ 3920 if (p == rq->stop) { 3921 task_rq_unlock(rq, p, &flags); 3922 return -EINVAL; 3923 } 3924 3925 /* 3926 * If not changing anything there's no need to proceed further, 3927 * but store a possible modification of reset_on_fork. 
3928 */ 3929 if (unlikely(policy == p->policy)) { 3930 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3931 goto change; 3932 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3933 goto change; 3934 if (dl_policy(policy) && dl_param_changed(p, attr)) 3935 goto change; 3936 3937 p->sched_reset_on_fork = reset_on_fork; 3938 task_rq_unlock(rq, p, &flags); 3939 return 0; 3940 } 3941 change: 3942 3943 if (user) { 3944 #ifdef CONFIG_RT_GROUP_SCHED 3945 /* 3946 * Do not allow realtime tasks into groups that have no runtime 3947 * assigned. 3948 */ 3949 if (rt_bandwidth_enabled() && rt_policy(policy) && 3950 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3951 !task_group_is_autogroup(task_group(p))) { 3952 task_rq_unlock(rq, p, &flags); 3953 return -EPERM; 3954 } 3955 #endif 3956 #ifdef CONFIG_SMP 3957 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3958 cpumask_t *span = rq->rd->span; 3959 3960 /* 3961 * Don't allow tasks with an affinity mask smaller than 3962 * the entire root_domain to become SCHED_DEADLINE. We 3963 * will also fail if there's no bandwidth available. 3964 */ 3965 if (!cpumask_subset(span, &p->cpus_allowed) || 3966 rq->rd->dl_bw.bw == 0) { 3967 task_rq_unlock(rq, p, &flags); 3968 return -EPERM; 3969 } 3970 } 3971 #endif 3972 } 3973 3974 /* recheck policy now with rq lock held */ 3975 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3976 policy = oldpolicy = -1; 3977 task_rq_unlock(rq, p, &flags); 3978 goto recheck; 3979 } 3980 3981 /* 3982 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3983 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3984 * is available. 3985 */ 3986 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3987 task_rq_unlock(rq, p, &flags); 3988 return -EBUSY; 3989 } 3990 3991 p->sched_reset_on_fork = reset_on_fork; 3992 oldprio = p->prio; 3993 3994 if (pi) { 3995 /* 3996 * Take priority boosted tasks into account. If the new 3997 * effective priority is unchanged, we just store the new 3998 * normal parameters and do not touch the scheduler class and 3999 * the runqueue. This will be done when the task deboost 4000 * itself. 4001 */ 4002 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4003 if (new_effective_prio == oldprio) 4004 queue_flags &= ~DEQUEUE_MOVE; 4005 } 4006 4007 queued = task_on_rq_queued(p); 4008 running = task_current(rq, p); 4009 if (queued) 4010 dequeue_task(rq, p, queue_flags); 4011 if (running) 4012 put_prev_task(rq, p); 4013 4014 prev_class = p->sched_class; 4015 __setscheduler(rq, p, attr, pi); 4016 4017 if (running) 4018 p->sched_class->set_curr_task(rq); 4019 if (queued) { 4020 /* 4021 * We enqueue to tail when the priority of a task is 4022 * increased (user space view). 4023 */ 4024 if (oldprio < p->prio) 4025 queue_flags |= ENQUEUE_HEAD; 4026 4027 enqueue_task(rq, p, queue_flags); 4028 } 4029 4030 check_class_changed(rq, p, prev_class, oldprio); 4031 preempt_disable(); /* avoid rq from going away on us */ 4032 task_rq_unlock(rq, p, &flags); 4033 4034 if (pi) 4035 rt_mutex_adjust_pi(p); 4036 4037 /* 4038 * Run balance callbacks after we've adjusted the PI chain. 
4039 */ 4040 balance_callback(rq); 4041 preempt_enable(); 4042 4043 return 0; 4044 } 4045 4046 static int _sched_setscheduler(struct task_struct *p, int policy, 4047 const struct sched_param *param, bool check) 4048 { 4049 struct sched_attr attr = { 4050 .sched_policy = policy, 4051 .sched_priority = param->sched_priority, 4052 .sched_nice = PRIO_TO_NICE(p->static_prio), 4053 }; 4054 4055 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 4056 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 4057 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4058 policy &= ~SCHED_RESET_ON_FORK; 4059 attr.sched_policy = policy; 4060 } 4061 4062 return __sched_setscheduler(p, &attr, check, true); 4063 } 4064 /** 4065 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4066 * @p: the task in question. 4067 * @policy: new policy. 4068 * @param: structure containing the new RT priority. 4069 * 4070 * Return: 0 on success. An error code otherwise. 4071 * 4072 * NOTE that the task may be already dead. 4073 */ 4074 int sched_setscheduler(struct task_struct *p, int policy, 4075 const struct sched_param *param) 4076 { 4077 return _sched_setscheduler(p, policy, param, true); 4078 } 4079 EXPORT_SYMBOL_GPL(sched_setscheduler); 4080 4081 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 4082 { 4083 return __sched_setscheduler(p, attr, true, true); 4084 } 4085 EXPORT_SYMBOL_GPL(sched_setattr); 4086 4087 /** 4088 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4089 * @p: the task in question. 4090 * @policy: new policy. 4091 * @param: structure containing the new RT priority. 4092 * 4093 * Just like sched_setscheduler, only don't bother checking if the 4094 * current context has permission. For example, this is needed in 4095 * stop_machine(): we create temporary high priority worker threads, 4096 * but our caller might not have that capability. 4097 * 4098 * Return: 0 on success. An error code otherwise. 4099 */ 4100 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4101 const struct sched_param *param) 4102 { 4103 return _sched_setscheduler(p, policy, param, false); 4104 } 4105 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); 4106 4107 static int 4108 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4109 { 4110 struct sched_param lparam; 4111 struct task_struct *p; 4112 int retval; 4113 4114 if (!param || pid < 0) 4115 return -EINVAL; 4116 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4117 return -EFAULT; 4118 4119 rcu_read_lock(); 4120 retval = -ESRCH; 4121 p = find_process_by_pid(pid); 4122 if (p != NULL) 4123 retval = sched_setscheduler(p, policy, &lparam); 4124 rcu_read_unlock(); 4125 4126 return retval; 4127 } 4128 4129 /* 4130 * Mimics kernel/events/core.c perf_copy_attr(). 4131 */ 4132 static int sched_copy_attr(struct sched_attr __user *uattr, 4133 struct sched_attr *attr) 4134 { 4135 u32 size; 4136 int ret; 4137 4138 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 4139 return -EFAULT; 4140 4141 /* 4142 * zero the full structure, so that a short copy will be nice. 
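	 *
	 * For reference, the producing side in user space typically looks
	 * like this (illustrative sketch, not part of this file: there is
	 * no glibc wrapper, the caller declares struct sched_attr itself to
	 * match the layout handled here, the times are in nanoseconds and
	 * SCHED_DEADLINE additionally requires CAP_SYS_NICE):
	 *
	 *	struct sched_attr attr = {
	 *		.size		= sizeof(attr),
	 *		.sched_policy	= SCHED_DEADLINE,
	 *		.sched_runtime	= 10000000,
	 *		.sched_deadline	= 30000000,
	 *		.sched_period	= 30000000,
	 *	};
	 *
	 *	if (syscall(__NR_sched_setattr, 0, &attr, 0))
	 *		perror("sched_setattr");
	 *
	 * A caller built against an older ABI simply passes a smaller .size;
	 * the memset() below makes sure that the fields it never knew about
	 * read as zero on the kernel side.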
4143 */ 4144 memset(attr, 0, sizeof(*attr)); 4145 4146 ret = get_user(size, &uattr->size); 4147 if (ret) 4148 return ret; 4149 4150 if (size > PAGE_SIZE) /* silly large */ 4151 goto err_size; 4152 4153 if (!size) /* abi compat */ 4154 size = SCHED_ATTR_SIZE_VER0; 4155 4156 if (size < SCHED_ATTR_SIZE_VER0) 4157 goto err_size; 4158 4159 /* 4160 * If we're handed a bigger struct than we know of, 4161 * ensure all the unknown bits are 0 - i.e. new 4162 * user-space does not rely on any kernel feature 4163 * extensions we dont know about yet. 4164 */ 4165 if (size > sizeof(*attr)) { 4166 unsigned char __user *addr; 4167 unsigned char __user *end; 4168 unsigned char val; 4169 4170 addr = (void __user *)uattr + sizeof(*attr); 4171 end = (void __user *)uattr + size; 4172 4173 for (; addr < end; addr++) { 4174 ret = get_user(val, addr); 4175 if (ret) 4176 return ret; 4177 if (val) 4178 goto err_size; 4179 } 4180 size = sizeof(*attr); 4181 } 4182 4183 ret = copy_from_user(attr, uattr, size); 4184 if (ret) 4185 return -EFAULT; 4186 4187 /* 4188 * XXX: do we want to be lenient like existing syscalls; or do we want 4189 * to be strict and return an error on out-of-bounds values? 4190 */ 4191 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 4192 4193 return 0; 4194 4195 err_size: 4196 put_user(sizeof(*attr), &uattr->size); 4197 return -E2BIG; 4198 } 4199 4200 /** 4201 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4202 * @pid: the pid in question. 4203 * @policy: new policy. 4204 * @param: structure containing the new RT priority. 4205 * 4206 * Return: 0 on success. An error code otherwise. 4207 */ 4208 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4209 struct sched_param __user *, param) 4210 { 4211 /* negative values for policy are not valid */ 4212 if (policy < 0) 4213 return -EINVAL; 4214 4215 return do_sched_setscheduler(pid, policy, param); 4216 } 4217 4218 /** 4219 * sys_sched_setparam - set/change the RT priority of a thread 4220 * @pid: the pid in question. 4221 * @param: structure containing the new RT priority. 4222 * 4223 * Return: 0 on success. An error code otherwise. 4224 */ 4225 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4226 { 4227 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 4228 } 4229 4230 /** 4231 * sys_sched_setattr - same as above, but with extended sched_attr 4232 * @pid: the pid in question. 4233 * @uattr: structure containing the extended parameters. 4234 * @flags: for future extension. 4235 */ 4236 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 4237 unsigned int, flags) 4238 { 4239 struct sched_attr attr; 4240 struct task_struct *p; 4241 int retval; 4242 4243 if (!uattr || pid < 0 || flags) 4244 return -EINVAL; 4245 4246 retval = sched_copy_attr(uattr, &attr); 4247 if (retval) 4248 return retval; 4249 4250 if ((int)attr.sched_policy < 0) 4251 return -EINVAL; 4252 4253 rcu_read_lock(); 4254 retval = -ESRCH; 4255 p = find_process_by_pid(pid); 4256 if (p != NULL) 4257 retval = sched_setattr(p, &attr); 4258 rcu_read_unlock(); 4259 4260 return retval; 4261 } 4262 4263 /** 4264 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4265 * @pid: the pid in question. 4266 * 4267 * Return: On success, the policy of the thread. Otherwise, a negative error 4268 * code. 
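 *
 * Illustrative user-space sketch (not part of this file): since the
 * reset-on-fork bit is folded into the return value, callers interested
 * only in the policy should mask it off (SCHED_RESET_ON_FORK comes from
 * the <linux/sched.h> UAPI header):
 *
 *	int ret = sched_getscheduler(0);
 *
 *	if (ret >= 0) {
 *		int policy = ret & ~SCHED_RESET_ON_FORK;
 *		int reset_on_fork = !!(ret & SCHED_RESET_ON_FORK);
 *	}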
4269 */ 4270 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4271 { 4272 struct task_struct *p; 4273 int retval; 4274 4275 if (pid < 0) 4276 return -EINVAL; 4277 4278 retval = -ESRCH; 4279 rcu_read_lock(); 4280 p = find_process_by_pid(pid); 4281 if (p) { 4282 retval = security_task_getscheduler(p); 4283 if (!retval) 4284 retval = p->policy 4285 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4286 } 4287 rcu_read_unlock(); 4288 return retval; 4289 } 4290 4291 /** 4292 * sys_sched_getparam - get the RT priority of a thread 4293 * @pid: the pid in question. 4294 * @param: structure containing the RT priority. 4295 * 4296 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 4297 * code. 4298 */ 4299 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4300 { 4301 struct sched_param lp = { .sched_priority = 0 }; 4302 struct task_struct *p; 4303 int retval; 4304 4305 if (!param || pid < 0) 4306 return -EINVAL; 4307 4308 rcu_read_lock(); 4309 p = find_process_by_pid(pid); 4310 retval = -ESRCH; 4311 if (!p) 4312 goto out_unlock; 4313 4314 retval = security_task_getscheduler(p); 4315 if (retval) 4316 goto out_unlock; 4317 4318 if (task_has_rt_policy(p)) 4319 lp.sched_priority = p->rt_priority; 4320 rcu_read_unlock(); 4321 4322 /* 4323 * This one might sleep, we cannot do it with a spinlock held ... 4324 */ 4325 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4326 4327 return retval; 4328 4329 out_unlock: 4330 rcu_read_unlock(); 4331 return retval; 4332 } 4333 4334 static int sched_read_attr(struct sched_attr __user *uattr, 4335 struct sched_attr *attr, 4336 unsigned int usize) 4337 { 4338 int ret; 4339 4340 if (!access_ok(VERIFY_WRITE, uattr, usize)) 4341 return -EFAULT; 4342 4343 /* 4344 * If we're handed a smaller struct than we know of, 4345 * ensure all the unknown bits are 0 - i.e. old 4346 * user-space does not get uncomplete information. 4347 */ 4348 if (usize < sizeof(*attr)) { 4349 unsigned char *addr; 4350 unsigned char *end; 4351 4352 addr = (void *)attr + usize; 4353 end = (void *)attr + sizeof(*attr); 4354 4355 for (; addr < end; addr++) { 4356 if (*addr) 4357 return -EFBIG; 4358 } 4359 4360 attr->size = usize; 4361 } 4362 4363 ret = copy_to_user(uattr, attr, attr->size); 4364 if (ret) 4365 return -EFAULT; 4366 4367 return 0; 4368 } 4369 4370 /** 4371 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 4372 * @pid: the pid in question. 4373 * @uattr: structure containing the extended parameters. 4374 * @size: sizeof(attr) for fwd/bwd comp. 4375 * @flags: for future extension. 
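 *
 * Illustrative user-space sketch (not part of this file; as with
 * sched_setattr there is no glibc wrapper, so the raw syscall is used
 * and @size is the caller's own sizeof(struct sched_attr)):
 *
 *	struct sched_attr attr;
 *
 *	if (syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0) == 0)
 *		printf("policy %u, nice %d\n",
 *		       attr.sched_policy, attr.sched_nice);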
4376 */ 4377 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 4378 unsigned int, size, unsigned int, flags) 4379 { 4380 struct sched_attr attr = { 4381 .size = sizeof(struct sched_attr), 4382 }; 4383 struct task_struct *p; 4384 int retval; 4385 4386 if (!uattr || pid < 0 || size > PAGE_SIZE || 4387 size < SCHED_ATTR_SIZE_VER0 || flags) 4388 return -EINVAL; 4389 4390 rcu_read_lock(); 4391 p = find_process_by_pid(pid); 4392 retval = -ESRCH; 4393 if (!p) 4394 goto out_unlock; 4395 4396 retval = security_task_getscheduler(p); 4397 if (retval) 4398 goto out_unlock; 4399 4400 attr.sched_policy = p->policy; 4401 if (p->sched_reset_on_fork) 4402 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4403 if (task_has_dl_policy(p)) 4404 __getparam_dl(p, &attr); 4405 else if (task_has_rt_policy(p)) 4406 attr.sched_priority = p->rt_priority; 4407 else 4408 attr.sched_nice = task_nice(p); 4409 4410 rcu_read_unlock(); 4411 4412 retval = sched_read_attr(uattr, &attr, size); 4413 return retval; 4414 4415 out_unlock: 4416 rcu_read_unlock(); 4417 return retval; 4418 } 4419 4420 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4421 { 4422 cpumask_var_t cpus_allowed, new_mask; 4423 struct task_struct *p; 4424 int retval; 4425 4426 rcu_read_lock(); 4427 4428 p = find_process_by_pid(pid); 4429 if (!p) { 4430 rcu_read_unlock(); 4431 return -ESRCH; 4432 } 4433 4434 /* Prevent p going away */ 4435 get_task_struct(p); 4436 rcu_read_unlock(); 4437 4438 if (p->flags & PF_NO_SETAFFINITY) { 4439 retval = -EINVAL; 4440 goto out_put_task; 4441 } 4442 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4443 retval = -ENOMEM; 4444 goto out_put_task; 4445 } 4446 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4447 retval = -ENOMEM; 4448 goto out_free_cpus_allowed; 4449 } 4450 retval = -EPERM; 4451 if (!check_same_owner(p)) { 4452 rcu_read_lock(); 4453 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4454 rcu_read_unlock(); 4455 goto out_free_new_mask; 4456 } 4457 rcu_read_unlock(); 4458 } 4459 4460 retval = security_task_setscheduler(p); 4461 if (retval) 4462 goto out_free_new_mask; 4463 4464 4465 cpuset_cpus_allowed(p, cpus_allowed); 4466 cpumask_and(new_mask, in_mask, cpus_allowed); 4467 4468 /* 4469 * Since bandwidth control happens on root_domain basis, 4470 * if admission test is enabled, we only admit -deadline 4471 * tasks allowed to run on all the CPUs in the task's 4472 * root_domain. 4473 */ 4474 #ifdef CONFIG_SMP 4475 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4476 rcu_read_lock(); 4477 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4478 retval = -EBUSY; 4479 rcu_read_unlock(); 4480 goto out_free_new_mask; 4481 } 4482 rcu_read_unlock(); 4483 } 4484 #endif 4485 again: 4486 retval = __set_cpus_allowed_ptr(p, new_mask, true); 4487 4488 if (!retval) { 4489 cpuset_cpus_allowed(p, cpus_allowed); 4490 if (!cpumask_subset(new_mask, cpus_allowed)) { 4491 /* 4492 * We must have raced with a concurrent cpuset 4493 * update. 
Just reset the cpus_allowed to the 4494 * cpuset's cpus_allowed 4495 */ 4496 cpumask_copy(new_mask, cpus_allowed); 4497 goto again; 4498 } 4499 } 4500 out_free_new_mask: 4501 free_cpumask_var(new_mask); 4502 out_free_cpus_allowed: 4503 free_cpumask_var(cpus_allowed); 4504 out_put_task: 4505 put_task_struct(p); 4506 return retval; 4507 } 4508 4509 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4510 struct cpumask *new_mask) 4511 { 4512 if (len < cpumask_size()) 4513 cpumask_clear(new_mask); 4514 else if (len > cpumask_size()) 4515 len = cpumask_size(); 4516 4517 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4518 } 4519 4520 /** 4521 * sys_sched_setaffinity - set the cpu affinity of a process 4522 * @pid: pid of the process 4523 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4524 * @user_mask_ptr: user-space pointer to the new cpu mask 4525 * 4526 * Return: 0 on success. An error code otherwise. 4527 */ 4528 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4529 unsigned long __user *, user_mask_ptr) 4530 { 4531 cpumask_var_t new_mask; 4532 int retval; 4533 4534 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4535 return -ENOMEM; 4536 4537 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4538 if (retval == 0) 4539 retval = sched_setaffinity(pid, new_mask); 4540 free_cpumask_var(new_mask); 4541 return retval; 4542 } 4543 4544 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4545 { 4546 struct task_struct *p; 4547 unsigned long flags; 4548 int retval; 4549 4550 rcu_read_lock(); 4551 4552 retval = -ESRCH; 4553 p = find_process_by_pid(pid); 4554 if (!p) 4555 goto out_unlock; 4556 4557 retval = security_task_getscheduler(p); 4558 if (retval) 4559 goto out_unlock; 4560 4561 raw_spin_lock_irqsave(&p->pi_lock, flags); 4562 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4563 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4564 4565 out_unlock: 4566 rcu_read_unlock(); 4567 4568 return retval; 4569 } 4570 4571 /** 4572 * sys_sched_getaffinity - get the cpu affinity of a process 4573 * @pid: pid of the process 4574 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4575 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4576 * 4577 * Return: 0 on success. An error code otherwise. 4578 */ 4579 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4580 unsigned long __user *, user_mask_ptr) 4581 { 4582 int ret; 4583 cpumask_var_t mask; 4584 4585 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4586 return -EINVAL; 4587 if (len & (sizeof(unsigned long)-1)) 4588 return -EINVAL; 4589 4590 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4591 return -ENOMEM; 4592 4593 ret = sched_getaffinity(pid, mask); 4594 if (ret == 0) { 4595 size_t retlen = min_t(size_t, len, cpumask_size()); 4596 4597 if (copy_to_user(user_mask_ptr, mask, retlen)) 4598 ret = -EFAULT; 4599 else 4600 ret = retlen; 4601 } 4602 free_cpumask_var(mask); 4603 4604 return ret; 4605 } 4606 4607 /** 4608 * sys_sched_yield - yield the current processor to other threads. 4609 * 4610 * This function yields the current CPU to other tasks. If there are no 4611 * other threads running on this CPU then this function will return. 4612 * 4613 * Return: 0. 
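 *
 * (The calling thread is placed at the back of the queue for its static
 * priority, so a SCHED_FIFO or SCHED_RR caller only gives way to runnable
 * threads of equal priority and a lone runner at its priority simply keeps
 * the CPU. User space reaches this via the sched_yield() wrapper declared
 * in <sched.h>; see the warning above yield() further down in this file
 * before using it as a waiting primitive.)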
4614 */ 4615 SYSCALL_DEFINE0(sched_yield) 4616 { 4617 struct rq *rq = this_rq_lock(); 4618 4619 schedstat_inc(rq, yld_count); 4620 current->sched_class->yield_task(rq); 4621 4622 /* 4623 * Since we are going to call schedule() anyway, there's 4624 * no need to preempt or enable interrupts: 4625 */ 4626 __release(rq->lock); 4627 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4628 do_raw_spin_unlock(&rq->lock); 4629 sched_preempt_enable_no_resched(); 4630 4631 schedule(); 4632 4633 return 0; 4634 } 4635 4636 int __sched _cond_resched(void) 4637 { 4638 if (should_resched(0)) { 4639 preempt_schedule_common(); 4640 return 1; 4641 } 4642 return 0; 4643 } 4644 EXPORT_SYMBOL(_cond_resched); 4645 4646 /* 4647 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4648 * call schedule, and on return reacquire the lock. 4649 * 4650 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4651 * operations here to prevent schedule() from being called twice (once via 4652 * spin_unlock(), once by hand). 4653 */ 4654 int __cond_resched_lock(spinlock_t *lock) 4655 { 4656 int resched = should_resched(PREEMPT_LOCK_OFFSET); 4657 int ret = 0; 4658 4659 lockdep_assert_held(lock); 4660 4661 if (spin_needbreak(lock) || resched) { 4662 spin_unlock(lock); 4663 if (resched) 4664 preempt_schedule_common(); 4665 else 4666 cpu_relax(); 4667 ret = 1; 4668 spin_lock(lock); 4669 } 4670 return ret; 4671 } 4672 EXPORT_SYMBOL(__cond_resched_lock); 4673 4674 int __sched __cond_resched_softirq(void) 4675 { 4676 BUG_ON(!in_softirq()); 4677 4678 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { 4679 local_bh_enable(); 4680 preempt_schedule_common(); 4681 local_bh_disable(); 4682 return 1; 4683 } 4684 return 0; 4685 } 4686 EXPORT_SYMBOL(__cond_resched_softirq); 4687 4688 /** 4689 * yield - yield the current processor to other threads. 4690 * 4691 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4692 * 4693 * The scheduler is at all times free to pick the calling task as the most 4694 * eligible task to run, if removing the yield() call from your code breaks 4695 * it, its already broken. 4696 * 4697 * Typical broken usage is: 4698 * 4699 * while (!event) 4700 * yield(); 4701 * 4702 * where one assumes that yield() will let 'the other' process run that will 4703 * make event true. If the current task is a SCHED_FIFO task that will never 4704 * happen. Never use yield() as a progress guarantee!! 4705 * 4706 * If you want to use yield() to wait for something, use wait_event(). 4707 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4708 * If you still want to use yield(), do not! 4709 */ 4710 void __sched yield(void) 4711 { 4712 set_current_state(TASK_RUNNING); 4713 sys_sched_yield(); 4714 } 4715 EXPORT_SYMBOL(yield); 4716 4717 /** 4718 * yield_to - yield the current processor to another thread in 4719 * your thread group, or accelerate that thread toward the 4720 * processor it's on. 4721 * @p: target task 4722 * @preempt: whether task preemption is allowed or not 4723 * 4724 * It's the caller's job to ensure that the target task struct 4725 * can't go away on us before we can do any checks. 4726 * 4727 * Return: 4728 * true (>0) if we indeed boosted the target task. 4729 * false (0) if we failed to boost the target. 4730 * -ESRCH if there's no task to yield to. 
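 *
 * Illustrative in-kernel sketch (not taken from an actual caller; the
 * typical user is directed-yield logic in virtualization, where a
 * spinning vcpu thread donates its slice to the presumed lock holder):
 *
 *	get_task_struct(target);
 *	if (yield_to(target, false) > 0)
 *		boosted++;
 *	put_task_struct(target);
 *
 * where @target and @boosted are placeholders; holding a reference
 * satisfies the requirement above that the target cannot go away.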
4731 */ 4732 int __sched yield_to(struct task_struct *p, bool preempt) 4733 { 4734 struct task_struct *curr = current; 4735 struct rq *rq, *p_rq; 4736 unsigned long flags; 4737 int yielded = 0; 4738 4739 local_irq_save(flags); 4740 rq = this_rq(); 4741 4742 again: 4743 p_rq = task_rq(p); 4744 /* 4745 * If we're the only runnable task on the rq and target rq also 4746 * has only one task, there's absolutely no point in yielding. 4747 */ 4748 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4749 yielded = -ESRCH; 4750 goto out_irq; 4751 } 4752 4753 double_rq_lock(rq, p_rq); 4754 if (task_rq(p) != p_rq) { 4755 double_rq_unlock(rq, p_rq); 4756 goto again; 4757 } 4758 4759 if (!curr->sched_class->yield_to_task) 4760 goto out_unlock; 4761 4762 if (curr->sched_class != p->sched_class) 4763 goto out_unlock; 4764 4765 if (task_running(p_rq, p) || p->state) 4766 goto out_unlock; 4767 4768 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4769 if (yielded) { 4770 schedstat_inc(rq, yld_count); 4771 /* 4772 * Make p's CPU reschedule; pick_next_entity takes care of 4773 * fairness. 4774 */ 4775 if (preempt && rq != p_rq) 4776 resched_curr(p_rq); 4777 } 4778 4779 out_unlock: 4780 double_rq_unlock(rq, p_rq); 4781 out_irq: 4782 local_irq_restore(flags); 4783 4784 if (yielded > 0) 4785 schedule(); 4786 4787 return yielded; 4788 } 4789 EXPORT_SYMBOL_GPL(yield_to); 4790 4791 /* 4792 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4793 * that process accounting knows that this is a task in IO wait state. 4794 */ 4795 long __sched io_schedule_timeout(long timeout) 4796 { 4797 int old_iowait = current->in_iowait; 4798 struct rq *rq; 4799 long ret; 4800 4801 current->in_iowait = 1; 4802 blk_schedule_flush_plug(current); 4803 4804 delayacct_blkio_start(); 4805 rq = raw_rq(); 4806 atomic_inc(&rq->nr_iowait); 4807 ret = schedule_timeout(timeout); 4808 current->in_iowait = old_iowait; 4809 atomic_dec(&rq->nr_iowait); 4810 delayacct_blkio_end(); 4811 4812 return ret; 4813 } 4814 EXPORT_SYMBOL(io_schedule_timeout); 4815 4816 /** 4817 * sys_sched_get_priority_max - return maximum RT priority. 4818 * @policy: scheduling class. 4819 * 4820 * Return: On success, this syscall returns the maximum 4821 * rt_priority that can be used by a given scheduling class. 4822 * On failure, a negative error code is returned. 4823 */ 4824 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4825 { 4826 int ret = -EINVAL; 4827 4828 switch (policy) { 4829 case SCHED_FIFO: 4830 case SCHED_RR: 4831 ret = MAX_USER_RT_PRIO-1; 4832 break; 4833 case SCHED_DEADLINE: 4834 case SCHED_NORMAL: 4835 case SCHED_BATCH: 4836 case SCHED_IDLE: 4837 ret = 0; 4838 break; 4839 } 4840 return ret; 4841 } 4842 4843 /** 4844 * sys_sched_get_priority_min - return minimum RT priority. 4845 * @policy: scheduling class. 4846 * 4847 * Return: On success, this syscall returns the minimum 4848 * rt_priority that can be used by a given scheduling class. 4849 * On failure, a negative error code is returned. 4850 */ 4851 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4852 { 4853 int ret = -EINVAL; 4854 4855 switch (policy) { 4856 case SCHED_FIFO: 4857 case SCHED_RR: 4858 ret = 1; 4859 break; 4860 case SCHED_DEADLINE: 4861 case SCHED_NORMAL: 4862 case SCHED_BATCH: 4863 case SCHED_IDLE: 4864 ret = 0; 4865 } 4866 return ret; 4867 } 4868 4869 /** 4870 * sys_sched_rr_get_interval - return the default timeslice of a process. 4871 * @pid: pid of the process. 4872 * @interval: userspace pointer to the timeslice value. 
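 *
 * (Illustrative user-space use, not part of this file:
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n",
 *		       (long)ts.tv_sec, ts.tv_nsec);
 *
 * a SCHED_FIFO caller reads back 0 here, i.e. "infinity" in the sense
 * described below.)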
4873 * 4874 * this syscall writes the default timeslice value of a given process 4875 * into the user-space timespec buffer. A value of '0' means infinity. 4876 * 4877 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4878 * an error code. 4879 */ 4880 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4881 struct timespec __user *, interval) 4882 { 4883 struct task_struct *p; 4884 unsigned int time_slice; 4885 unsigned long flags; 4886 struct rq *rq; 4887 int retval; 4888 struct timespec t; 4889 4890 if (pid < 0) 4891 return -EINVAL; 4892 4893 retval = -ESRCH; 4894 rcu_read_lock(); 4895 p = find_process_by_pid(pid); 4896 if (!p) 4897 goto out_unlock; 4898 4899 retval = security_task_getscheduler(p); 4900 if (retval) 4901 goto out_unlock; 4902 4903 rq = task_rq_lock(p, &flags); 4904 time_slice = 0; 4905 if (p->sched_class->get_rr_interval) 4906 time_slice = p->sched_class->get_rr_interval(rq, p); 4907 task_rq_unlock(rq, p, &flags); 4908 4909 rcu_read_unlock(); 4910 jiffies_to_timespec(time_slice, &t); 4911 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4912 return retval; 4913 4914 out_unlock: 4915 rcu_read_unlock(); 4916 return retval; 4917 } 4918 4919 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4920 4921 void sched_show_task(struct task_struct *p) 4922 { 4923 unsigned long free = 0; 4924 int ppid; 4925 unsigned long state = p->state; 4926 4927 if (state) 4928 state = __ffs(state) + 1; 4929 printk(KERN_INFO "%-15.15s %c", p->comm, 4930 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4931 #if BITS_PER_LONG == 32 4932 if (state == TASK_RUNNING) 4933 printk(KERN_CONT " running "); 4934 else 4935 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4936 #else 4937 if (state == TASK_RUNNING) 4938 printk(KERN_CONT " running task "); 4939 else 4940 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4941 #endif 4942 #ifdef CONFIG_DEBUG_STACK_USAGE 4943 free = stack_not_used(p); 4944 #endif 4945 ppid = 0; 4946 rcu_read_lock(); 4947 if (pid_alive(p)) 4948 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4949 rcu_read_unlock(); 4950 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4951 task_pid_nr(p), ppid, 4952 (unsigned long)task_thread_info(p)->flags); 4953 4954 print_worker_info(KERN_INFO, p); 4955 show_stack(p, NULL); 4956 } 4957 4958 void show_state_filter(unsigned long state_filter) 4959 { 4960 struct task_struct *g, *p; 4961 4962 #if BITS_PER_LONG == 32 4963 printk(KERN_INFO 4964 " task PC stack pid father\n"); 4965 #else 4966 printk(KERN_INFO 4967 " task PC stack pid father\n"); 4968 #endif 4969 rcu_read_lock(); 4970 for_each_process_thread(g, p) { 4971 /* 4972 * reset the NMI-timeout, listing all files on a slow 4973 * console might take a lot of time: 4974 */ 4975 touch_nmi_watchdog(); 4976 if (!state_filter || (p->state & state_filter)) 4977 sched_show_task(p); 4978 } 4979 4980 touch_all_softlockup_watchdogs(); 4981 4982 #ifdef CONFIG_SCHED_DEBUG 4983 sysrq_sched_debug_show(); 4984 #endif 4985 rcu_read_unlock(); 4986 /* 4987 * Only show locks if all tasks are dumped: 4988 */ 4989 if (!state_filter) 4990 debug_show_all_locks(); 4991 } 4992 4993 void init_idle_bootup_task(struct task_struct *idle) 4994 { 4995 idle->sched_class = &idle_sched_class; 4996 } 4997 4998 /** 4999 * init_idle - set up an idle thread for a given CPU 5000 * @idle: task in question 5001 * @cpu: cpu the idle task belongs to 5002 * 5003 * NOTE: this function does not set the idle thread's NEED_RESCHED 5004 * flag, to make booting more robust. 
5005 */ 5006 void init_idle(struct task_struct *idle, int cpu) 5007 { 5008 struct rq *rq = cpu_rq(cpu); 5009 unsigned long flags; 5010 5011 raw_spin_lock_irqsave(&idle->pi_lock, flags); 5012 raw_spin_lock(&rq->lock); 5013 5014 __sched_fork(0, idle); 5015 idle->state = TASK_RUNNING; 5016 idle->se.exec_start = sched_clock(); 5017 5018 kasan_unpoison_task_stack(idle); 5019 5020 #ifdef CONFIG_SMP 5021 /* 5022 * Its possible that init_idle() gets called multiple times on a task, 5023 * in that case do_set_cpus_allowed() will not do the right thing. 5024 * 5025 * And since this is boot we can forgo the serialization. 5026 */ 5027 set_cpus_allowed_common(idle, cpumask_of(cpu)); 5028 #endif 5029 /* 5030 * We're having a chicken and egg problem, even though we are 5031 * holding rq->lock, the cpu isn't yet set to this cpu so the 5032 * lockdep check in task_group() will fail. 5033 * 5034 * Similar case to sched_fork(). / Alternatively we could 5035 * use task_rq_lock() here and obtain the other rq->lock. 5036 * 5037 * Silence PROVE_RCU 5038 */ 5039 rcu_read_lock(); 5040 __set_task_cpu(idle, cpu); 5041 rcu_read_unlock(); 5042 5043 rq->curr = rq->idle = idle; 5044 idle->on_rq = TASK_ON_RQ_QUEUED; 5045 #ifdef CONFIG_SMP 5046 idle->on_cpu = 1; 5047 #endif 5048 raw_spin_unlock(&rq->lock); 5049 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 5050 5051 /* Set the preempt count _outside_ the spinlocks! */ 5052 init_idle_preempt_count(idle, cpu); 5053 5054 /* 5055 * The idle tasks have their own, simple scheduling class: 5056 */ 5057 idle->sched_class = &idle_sched_class; 5058 ftrace_graph_init_idle_task(idle, cpu); 5059 vtime_init_idle(idle, cpu); 5060 #ifdef CONFIG_SMP 5061 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 5062 #endif 5063 } 5064 5065 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 5066 const struct cpumask *trial) 5067 { 5068 int ret = 1, trial_cpus; 5069 struct dl_bw *cur_dl_b; 5070 unsigned long flags; 5071 5072 if (!cpumask_weight(cur)) 5073 return ret; 5074 5075 rcu_read_lock_sched(); 5076 cur_dl_b = dl_bw_of(cpumask_any(cur)); 5077 trial_cpus = cpumask_weight(trial); 5078 5079 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 5080 if (cur_dl_b->bw != -1 && 5081 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 5082 ret = 0; 5083 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 5084 rcu_read_unlock_sched(); 5085 5086 return ret; 5087 } 5088 5089 int task_can_attach(struct task_struct *p, 5090 const struct cpumask *cs_cpus_allowed) 5091 { 5092 int ret = 0; 5093 5094 /* 5095 * Kthreads which disallow setaffinity shouldn't be moved 5096 * to a new cpuset; we don't want to change their cpu 5097 * affinity and isolating such threads by their set of 5098 * allowed nodes is unnecessary. Thus, cpusets are not 5099 * applicable for such threads. This prevents checking for 5100 * success of set_cpus_allowed_ptr() on all attached tasks 5101 * before cpus_allowed may be changed. 
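	 *
	 * (For instance, per-cpu kernel threads bound with kthread_bind(),
	 * such as the migration/N stopper threads, carry PF_NO_SETAFFINITY
	 * and are rejected here, just as they are by the corresponding
	 * check in sched_setaffinity() above.)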
5102 */ 5103 if (p->flags & PF_NO_SETAFFINITY) { 5104 ret = -EINVAL; 5105 goto out; 5106 } 5107 5108 #ifdef CONFIG_SMP 5109 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 5110 cs_cpus_allowed)) { 5111 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 5112 cs_cpus_allowed); 5113 struct dl_bw *dl_b; 5114 bool overflow; 5115 int cpus; 5116 unsigned long flags; 5117 5118 rcu_read_lock_sched(); 5119 dl_b = dl_bw_of(dest_cpu); 5120 raw_spin_lock_irqsave(&dl_b->lock, flags); 5121 cpus = dl_bw_cpus(dest_cpu); 5122 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 5123 if (overflow) 5124 ret = -EBUSY; 5125 else { 5126 /* 5127 * We reserve space for this task in the destination 5128 * root_domain, as we can't fail after this point. 5129 * We will free resources in the source root_domain 5130 * later on (see set_cpus_allowed_dl()). 5131 */ 5132 __dl_add(dl_b, p->dl.dl_bw); 5133 } 5134 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5135 rcu_read_unlock_sched(); 5136 5137 } 5138 #endif 5139 out: 5140 return ret; 5141 } 5142 5143 #ifdef CONFIG_SMP 5144 5145 #ifdef CONFIG_NUMA_BALANCING 5146 /* Migrate current task p to target_cpu */ 5147 int migrate_task_to(struct task_struct *p, int target_cpu) 5148 { 5149 struct migration_arg arg = { p, target_cpu }; 5150 int curr_cpu = task_cpu(p); 5151 5152 if (curr_cpu == target_cpu) 5153 return 0; 5154 5155 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 5156 return -EINVAL; 5157 5158 /* TODO: This is not properly updating schedstats */ 5159 5160 trace_sched_move_numa(p, curr_cpu, target_cpu); 5161 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 5162 } 5163 5164 /* 5165 * Requeue a task on a given node and accurately track the number of NUMA 5166 * tasks on the runqueues 5167 */ 5168 void sched_setnuma(struct task_struct *p, int nid) 5169 { 5170 struct rq *rq; 5171 unsigned long flags; 5172 bool queued, running; 5173 5174 rq = task_rq_lock(p, &flags); 5175 queued = task_on_rq_queued(p); 5176 running = task_current(rq, p); 5177 5178 if (queued) 5179 dequeue_task(rq, p, DEQUEUE_SAVE); 5180 if (running) 5181 put_prev_task(rq, p); 5182 5183 p->numa_preferred_nid = nid; 5184 5185 if (running) 5186 p->sched_class->set_curr_task(rq); 5187 if (queued) 5188 enqueue_task(rq, p, ENQUEUE_RESTORE); 5189 task_rq_unlock(rq, p, &flags); 5190 } 5191 #endif /* CONFIG_NUMA_BALANCING */ 5192 5193 #ifdef CONFIG_HOTPLUG_CPU 5194 /* 5195 * Ensures that the idle task is using init_mm right before its cpu goes 5196 * offline. 5197 */ 5198 void idle_task_exit(void) 5199 { 5200 struct mm_struct *mm = current->active_mm; 5201 5202 BUG_ON(cpu_online(smp_processor_id())); 5203 5204 if (mm != &init_mm) { 5205 switch_mm(mm, &init_mm, current); 5206 finish_arch_post_lock_switch(); 5207 } 5208 mmdrop(mm); 5209 } 5210 5211 /* 5212 * Since this CPU is going 'away' for a while, fold any nr_active delta 5213 * we might have. Assumes we're called after migrate_tasks() so that the 5214 * nr_active count is stable. 5215 * 5216 * Also see the comment "Global load-average calculations". 
 */
static void calc_load_migrate(struct rq *rq)
{
	long delta = calc_load_fold_active(rq);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);
}

static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
{
}

static const struct sched_class fake_sched_class = {
	.put_prev_task = put_prev_task_fake,
};

static struct task_struct fake_task = {
	/*
	 * Avoid pull_{rt,dl}_task()
	 */
	.prio = MAX_PRIO + 1,
	.sched_class = &fake_sched_class,
};

/*
 * Migrate all tasks from the rq; sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible; we hold the required locks anyway
 * because of lock validation efforts.
 */
static void migrate_tasks(struct rq *dead_rq)
{
	struct rq *rq = dead_rq;
	struct task_struct *next, *stop = rq->stop;
	int dest_cpu;

	/*
	 * Fudge the rq selection such that the below task selection loop
	 * doesn't get stuck on the currently eligible stop task.
	 *
	 * We're currently inside stop_machine() and the rq is either stuck
	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
	 * either way we should never end up calling schedule() until we're
	 * done here.
	 */
	rq->stop = NULL;

	/*
	 * The put_prev_task() and pick_next_task() sched
	 * class methods both need to have an up-to-date
	 * value of rq->clock[_task].
	 */
	update_rq_clock(rq);

	for (;;) {
		/*
		 * There's this thread running, bail when that's the only
		 * remaining thread.
		 */
		if (rq->nr_running == 1)
			break;

		/*
		 * pick_next_task() assumes pinned rq->lock.
		 */
		lockdep_pin_lock(&rq->lock);
		next = pick_next_task(rq, &fake_task);
		BUG_ON(!next);
		next->sched_class->put_prev_task(rq, next);

		/*
		 * Rules for changing task_struct::cpus_allowed are holding
		 * both pi_lock and rq->lock, such that holding either
		 * stabilizes the mask.
		 *
		 * Dropping rq->lock is not quite as disastrous as it usually
		 * is because !cpu_active at this point, which means
		 * load-balance will not interfere. Also, stop-machine.
		 */
		lockdep_unpin_lock(&rq->lock);
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&next->pi_lock);
		raw_spin_lock(&rq->lock);

		/*
		 * Since we're inside stop-machine, _nothing_ should have
		 * changed the task, WARN if weird stuff happened, because in
		 * that case the above rq->lock drop is a fail too.
		 */
		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
			raw_spin_unlock(&next->pi_lock);
			continue;
		}

		/* Find suitable destination for @next, with force if needed.
*/ 5314 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5315 5316 rq = __migrate_task(rq, next, dest_cpu); 5317 if (rq != dead_rq) { 5318 raw_spin_unlock(&rq->lock); 5319 rq = dead_rq; 5320 raw_spin_lock(&rq->lock); 5321 } 5322 raw_spin_unlock(&next->pi_lock); 5323 } 5324 5325 rq->stop = stop; 5326 } 5327 #endif /* CONFIG_HOTPLUG_CPU */ 5328 5329 static void set_rq_online(struct rq *rq) 5330 { 5331 if (!rq->online) { 5332 const struct sched_class *class; 5333 5334 cpumask_set_cpu(rq->cpu, rq->rd->online); 5335 rq->online = 1; 5336 5337 for_each_class(class) { 5338 if (class->rq_online) 5339 class->rq_online(rq); 5340 } 5341 } 5342 } 5343 5344 static void set_rq_offline(struct rq *rq) 5345 { 5346 if (rq->online) { 5347 const struct sched_class *class; 5348 5349 for_each_class(class) { 5350 if (class->rq_offline) 5351 class->rq_offline(rq); 5352 } 5353 5354 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5355 rq->online = 0; 5356 } 5357 } 5358 5359 /* 5360 * migration_call - callback that gets triggered when a CPU is added. 5361 * Here we can start up the necessary migration thread for the new CPU. 5362 */ 5363 static int 5364 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5365 { 5366 int cpu = (long)hcpu; 5367 unsigned long flags; 5368 struct rq *rq = cpu_rq(cpu); 5369 5370 switch (action & ~CPU_TASKS_FROZEN) { 5371 5372 case CPU_UP_PREPARE: 5373 rq->calc_load_update = calc_load_update; 5374 account_reset_rq(rq); 5375 break; 5376 5377 case CPU_ONLINE: 5378 /* Update our root-domain */ 5379 raw_spin_lock_irqsave(&rq->lock, flags); 5380 if (rq->rd) { 5381 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5382 5383 set_rq_online(rq); 5384 } 5385 raw_spin_unlock_irqrestore(&rq->lock, flags); 5386 break; 5387 5388 #ifdef CONFIG_HOTPLUG_CPU 5389 case CPU_DYING: 5390 sched_ttwu_pending(); 5391 /* Update our root-domain */ 5392 raw_spin_lock_irqsave(&rq->lock, flags); 5393 if (rq->rd) { 5394 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5395 set_rq_offline(rq); 5396 } 5397 migrate_tasks(rq); 5398 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5399 raw_spin_unlock_irqrestore(&rq->lock, flags); 5400 break; 5401 5402 case CPU_DEAD: 5403 calc_load_migrate(rq); 5404 break; 5405 #endif 5406 } 5407 5408 update_max_interval(); 5409 5410 return NOTIFY_OK; 5411 } 5412 5413 /* 5414 * Register at high priority so that task migration (migrate_all_tasks) 5415 * happens before everything else. This has to be lower priority than 5416 * the notifier in the perf_event subsystem, though. 
5417 */ 5418 static struct notifier_block migration_notifier = { 5419 .notifier_call = migration_call, 5420 .priority = CPU_PRI_MIGRATION, 5421 }; 5422 5423 static void set_cpu_rq_start_time(void) 5424 { 5425 int cpu = smp_processor_id(); 5426 struct rq *rq = cpu_rq(cpu); 5427 rq->age_stamp = sched_clock_cpu(cpu); 5428 } 5429 5430 static int sched_cpu_active(struct notifier_block *nfb, 5431 unsigned long action, void *hcpu) 5432 { 5433 int cpu = (long)hcpu; 5434 5435 switch (action & ~CPU_TASKS_FROZEN) { 5436 case CPU_STARTING: 5437 set_cpu_rq_start_time(); 5438 return NOTIFY_OK; 5439 5440 case CPU_DOWN_FAILED: 5441 set_cpu_active(cpu, true); 5442 return NOTIFY_OK; 5443 5444 default: 5445 return NOTIFY_DONE; 5446 } 5447 } 5448 5449 static int sched_cpu_inactive(struct notifier_block *nfb, 5450 unsigned long action, void *hcpu) 5451 { 5452 switch (action & ~CPU_TASKS_FROZEN) { 5453 case CPU_DOWN_PREPARE: 5454 set_cpu_active((long)hcpu, false); 5455 return NOTIFY_OK; 5456 default: 5457 return NOTIFY_DONE; 5458 } 5459 } 5460 5461 static int __init migration_init(void) 5462 { 5463 void *cpu = (void *)(long)smp_processor_id(); 5464 int err; 5465 5466 /* Initialize migration for the boot CPU */ 5467 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5468 BUG_ON(err == NOTIFY_BAD); 5469 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5470 register_cpu_notifier(&migration_notifier); 5471 5472 /* Register cpu active notifiers */ 5473 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5474 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5475 5476 return 0; 5477 } 5478 early_initcall(migration_init); 5479 5480 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5481 5482 #ifdef CONFIG_SCHED_DEBUG 5483 5484 static __read_mostly int sched_debug_enabled; 5485 5486 static int __init sched_debug_setup(char *str) 5487 { 5488 sched_debug_enabled = 1; 5489 5490 return 0; 5491 } 5492 early_param("sched_debug", sched_debug_setup); 5493 5494 static inline bool sched_debug(void) 5495 { 5496 return sched_debug_enabled; 5497 } 5498 5499 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5500 struct cpumask *groupmask) 5501 { 5502 struct sched_group *group = sd->groups; 5503 5504 cpumask_clear(groupmask); 5505 5506 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5507 5508 if (!(sd->flags & SD_LOAD_BALANCE)) { 5509 printk("does not load-balance\n"); 5510 if (sd->parent) 5511 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5512 " has parent"); 5513 return -1; 5514 } 5515 5516 printk(KERN_CONT "span %*pbl level %s\n", 5517 cpumask_pr_args(sched_domain_span(sd)), sd->name); 5518 5519 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5520 printk(KERN_ERR "ERROR: domain->span does not contain " 5521 "CPU%d\n", cpu); 5522 } 5523 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5524 printk(KERN_ERR "ERROR: domain->groups does not contain" 5525 " CPU%d\n", cpu); 5526 } 5527 5528 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5529 do { 5530 if (!group) { 5531 printk("\n"); 5532 printk(KERN_ERR "ERROR: group is NULL\n"); 5533 break; 5534 } 5535 5536 if (!cpumask_weight(sched_group_cpus(group))) { 5537 printk(KERN_CONT "\n"); 5538 printk(KERN_ERR "ERROR: empty group\n"); 5539 break; 5540 } 5541 5542 if (!(sd->flags & SD_OVERLAP) && 5543 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5544 printk(KERN_CONT "\n"); 5545 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5546 break; 5547 } 5548 5549 cpumask_or(groupmask, groupmask, 
sched_group_cpus(group)); 5550 5551 printk(KERN_CONT " %*pbl", 5552 cpumask_pr_args(sched_group_cpus(group))); 5553 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5554 printk(KERN_CONT " (cpu_capacity = %d)", 5555 group->sgc->capacity); 5556 } 5557 5558 group = group->next; 5559 } while (group != sd->groups); 5560 printk(KERN_CONT "\n"); 5561 5562 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5563 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5564 5565 if (sd->parent && 5566 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5567 printk(KERN_ERR "ERROR: parent span is not a superset " 5568 "of domain->span\n"); 5569 return 0; 5570 } 5571 5572 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5573 { 5574 int level = 0; 5575 5576 if (!sched_debug_enabled) 5577 return; 5578 5579 if (!sd) { 5580 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5581 return; 5582 } 5583 5584 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5585 5586 for (;;) { 5587 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5588 break; 5589 level++; 5590 sd = sd->parent; 5591 if (!sd) 5592 break; 5593 } 5594 } 5595 #else /* !CONFIG_SCHED_DEBUG */ 5596 # define sched_domain_debug(sd, cpu) do { } while (0) 5597 static inline bool sched_debug(void) 5598 { 5599 return false; 5600 } 5601 #endif /* CONFIG_SCHED_DEBUG */ 5602 5603 static int sd_degenerate(struct sched_domain *sd) 5604 { 5605 if (cpumask_weight(sched_domain_span(sd)) == 1) 5606 return 1; 5607 5608 /* Following flags need at least 2 groups */ 5609 if (sd->flags & (SD_LOAD_BALANCE | 5610 SD_BALANCE_NEWIDLE | 5611 SD_BALANCE_FORK | 5612 SD_BALANCE_EXEC | 5613 SD_SHARE_CPUCAPACITY | 5614 SD_SHARE_PKG_RESOURCES | 5615 SD_SHARE_POWERDOMAIN)) { 5616 if (sd->groups != sd->groups->next) 5617 return 0; 5618 } 5619 5620 /* Following flags don't use groups */ 5621 if (sd->flags & (SD_WAKE_AFFINE)) 5622 return 0; 5623 5624 return 1; 5625 } 5626 5627 static int 5628 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5629 { 5630 unsigned long cflags = sd->flags, pflags = parent->flags; 5631 5632 if (sd_degenerate(parent)) 5633 return 1; 5634 5635 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5636 return 0; 5637 5638 /* Flags needing groups don't count if only 1 group in parent */ 5639 if (parent->groups == parent->groups->next) { 5640 pflags &= ~(SD_LOAD_BALANCE | 5641 SD_BALANCE_NEWIDLE | 5642 SD_BALANCE_FORK | 5643 SD_BALANCE_EXEC | 5644 SD_SHARE_CPUCAPACITY | 5645 SD_SHARE_PKG_RESOURCES | 5646 SD_PREFER_SIBLING | 5647 SD_SHARE_POWERDOMAIN); 5648 if (nr_node_ids == 1) 5649 pflags &= ~SD_SERIALIZE; 5650 } 5651 if (~cflags & pflags) 5652 return 0; 5653 5654 return 1; 5655 } 5656 5657 static void free_rootdomain(struct rcu_head *rcu) 5658 { 5659 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5660 5661 cpupri_cleanup(&rd->cpupri); 5662 cpudl_cleanup(&rd->cpudl); 5663 free_cpumask_var(rd->dlo_mask); 5664 free_cpumask_var(rd->rto_mask); 5665 free_cpumask_var(rd->online); 5666 free_cpumask_var(rd->span); 5667 kfree(rd); 5668 } 5669 5670 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5671 { 5672 struct root_domain *old_rd = NULL; 5673 unsigned long flags; 5674 5675 raw_spin_lock_irqsave(&rq->lock, flags); 5676 5677 if (rq->rd) { 5678 old_rd = rq->rd; 5679 5680 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5681 set_rq_offline(rq); 5682 5683 cpumask_clear_cpu(rq->cpu, old_rd->span); 5684 5685 /* 5686 * If 
we dont want to free the old_rd yet then 5687 * set old_rd to NULL to skip the freeing later 5688 * in this function: 5689 */ 5690 if (!atomic_dec_and_test(&old_rd->refcount)) 5691 old_rd = NULL; 5692 } 5693 5694 atomic_inc(&rd->refcount); 5695 rq->rd = rd; 5696 5697 cpumask_set_cpu(rq->cpu, rd->span); 5698 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5699 set_rq_online(rq); 5700 5701 raw_spin_unlock_irqrestore(&rq->lock, flags); 5702 5703 if (old_rd) 5704 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5705 } 5706 5707 static int init_rootdomain(struct root_domain *rd) 5708 { 5709 memset(rd, 0, sizeof(*rd)); 5710 5711 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) 5712 goto out; 5713 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) 5714 goto free_span; 5715 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5716 goto free_online; 5717 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5718 goto free_dlo_mask; 5719 5720 init_dl_bw(&rd->dl_bw); 5721 if (cpudl_init(&rd->cpudl) != 0) 5722 goto free_dlo_mask; 5723 5724 if (cpupri_init(&rd->cpupri) != 0) 5725 goto free_rto_mask; 5726 return 0; 5727 5728 free_rto_mask: 5729 free_cpumask_var(rd->rto_mask); 5730 free_dlo_mask: 5731 free_cpumask_var(rd->dlo_mask); 5732 free_online: 5733 free_cpumask_var(rd->online); 5734 free_span: 5735 free_cpumask_var(rd->span); 5736 out: 5737 return -ENOMEM; 5738 } 5739 5740 /* 5741 * By default the system creates a single root-domain with all cpus as 5742 * members (mimicking the global state we have today). 5743 */ 5744 struct root_domain def_root_domain; 5745 5746 static void init_defrootdomain(void) 5747 { 5748 init_rootdomain(&def_root_domain); 5749 5750 atomic_set(&def_root_domain.refcount, 1); 5751 } 5752 5753 static struct root_domain *alloc_rootdomain(void) 5754 { 5755 struct root_domain *rd; 5756 5757 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5758 if (!rd) 5759 return NULL; 5760 5761 if (init_rootdomain(rd) != 0) { 5762 kfree(rd); 5763 return NULL; 5764 } 5765 5766 return rd; 5767 } 5768 5769 static void free_sched_groups(struct sched_group *sg, int free_sgc) 5770 { 5771 struct sched_group *tmp, *first; 5772 5773 if (!sg) 5774 return; 5775 5776 first = sg; 5777 do { 5778 tmp = sg->next; 5779 5780 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 5781 kfree(sg->sgc); 5782 5783 kfree(sg); 5784 sg = tmp; 5785 } while (sg != first); 5786 } 5787 5788 static void free_sched_domain(struct rcu_head *rcu) 5789 { 5790 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5791 5792 /* 5793 * If its an overlapping domain it has private groups, iterate and 5794 * nuke them all. 5795 */ 5796 if (sd->flags & SD_OVERLAP) { 5797 free_sched_groups(sd->groups, 1); 5798 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5799 kfree(sd->groups->sgc); 5800 kfree(sd->groups); 5801 } 5802 kfree(sd); 5803 } 5804 5805 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5806 { 5807 call_rcu(&sd->rcu, free_sched_domain); 5808 } 5809 5810 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5811 { 5812 for (; sd; sd = sd->parent) 5813 destroy_sched_domain(sd, cpu); 5814 } 5815 5816 /* 5817 * Keep a special pointer to the highest sched_domain that has 5818 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5819 * allows us to avoid some pointer chasing select_idle_sibling(). 
5820 * 5821 * Also keep a unique ID per domain (we use the first cpu number in 5822 * the cpumask of the domain), this allows us to quickly tell if 5823 * two cpus are in the same cache domain, see cpus_share_cache(). 5824 */ 5825 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5826 DEFINE_PER_CPU(int, sd_llc_size); 5827 DEFINE_PER_CPU(int, sd_llc_id); 5828 DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5829 DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5830 DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5831 5832 static void update_top_cache_domain(int cpu) 5833 { 5834 struct sched_domain *sd; 5835 struct sched_domain *busy_sd = NULL; 5836 int id = cpu; 5837 int size = 1; 5838 5839 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5840 if (sd) { 5841 id = cpumask_first(sched_domain_span(sd)); 5842 size = cpumask_weight(sched_domain_span(sd)); 5843 busy_sd = sd->parent; /* sd_busy */ 5844 } 5845 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5846 5847 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5848 per_cpu(sd_llc_size, cpu) = size; 5849 per_cpu(sd_llc_id, cpu) = id; 5850 5851 sd = lowest_flag_domain(cpu, SD_NUMA); 5852 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5853 5854 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5855 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5856 } 5857 5858 /* 5859 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5860 * hold the hotplug lock. 5861 */ 5862 static void 5863 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5864 { 5865 struct rq *rq = cpu_rq(cpu); 5866 struct sched_domain *tmp; 5867 5868 /* Remove the sched domains which do not contribute to scheduling. */ 5869 for (tmp = sd; tmp; ) { 5870 struct sched_domain *parent = tmp->parent; 5871 if (!parent) 5872 break; 5873 5874 if (sd_parent_degenerate(tmp, parent)) { 5875 tmp->parent = parent->parent; 5876 if (parent->parent) 5877 parent->parent->child = tmp; 5878 /* 5879 * Transfer SD_PREFER_SIBLING down in case of a 5880 * degenerate parent; the spans match for this 5881 * so the property transfers. 5882 */ 5883 if (parent->flags & SD_PREFER_SIBLING) 5884 tmp->flags |= SD_PREFER_SIBLING; 5885 destroy_sched_domain(parent, cpu); 5886 } else 5887 tmp = tmp->parent; 5888 } 5889 5890 if (sd && sd_degenerate(sd)) { 5891 tmp = sd; 5892 sd = sd->parent; 5893 destroy_sched_domain(tmp, cpu); 5894 if (sd) 5895 sd->child = NULL; 5896 } 5897 5898 sched_domain_debug(sd, cpu); 5899 5900 rq_attach_root(rq, rd); 5901 tmp = rq->sd; 5902 rcu_assign_pointer(rq->sd, sd); 5903 destroy_sched_domains(tmp, cpu); 5904 5905 update_top_cache_domain(cpu); 5906 } 5907 5908 /* Setup the mask of cpus configured for isolated domains */ 5909 static int __init isolated_cpu_setup(char *str) 5910 { 5911 int ret; 5912 5913 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5914 ret = cpulist_parse(str, cpu_isolated_map); 5915 if (ret) { 5916 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); 5917 return 0; 5918 } 5919 return 1; 5920 } 5921 __setup("isolcpus=", isolated_cpu_setup); 5922 5923 struct s_data { 5924 struct sched_domain ** __percpu sd; 5925 struct root_domain *rd; 5926 }; 5927 5928 enum s_alloc { 5929 sa_rootdomain, 5930 sa_sd, 5931 sa_sd_storage, 5932 sa_none, 5933 }; 5934 5935 /* 5936 * Build an iteration mask that can exclude certain CPUs from the upwards 5937 * domain traversal. 
5938 * 5939 * Asymmetric node setups can result in situations where the domain tree is of 5940 * unequal depth, make sure to skip domains that already cover the entire 5941 * range. 5942 * 5943 * In that case build_sched_domains() will have terminated the iteration early 5944 * and our sibling sd spans will be empty. Domains should always include the 5945 * cpu they're built on, so check that. 5946 * 5947 */ 5948 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5949 { 5950 const struct cpumask *span = sched_domain_span(sd); 5951 struct sd_data *sdd = sd->private; 5952 struct sched_domain *sibling; 5953 int i; 5954 5955 for_each_cpu(i, span) { 5956 sibling = *per_cpu_ptr(sdd->sd, i); 5957 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5958 continue; 5959 5960 cpumask_set_cpu(i, sched_group_mask(sg)); 5961 } 5962 } 5963 5964 /* 5965 * Return the canonical balance cpu for this group, this is the first cpu 5966 * of this group that's also in the iteration mask. 5967 */ 5968 int group_balance_cpu(struct sched_group *sg) 5969 { 5970 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5971 } 5972 5973 static int 5974 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5975 { 5976 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5977 const struct cpumask *span = sched_domain_span(sd); 5978 struct cpumask *covered = sched_domains_tmpmask; 5979 struct sd_data *sdd = sd->private; 5980 struct sched_domain *sibling; 5981 int i; 5982 5983 cpumask_clear(covered); 5984 5985 for_each_cpu(i, span) { 5986 struct cpumask *sg_span; 5987 5988 if (cpumask_test_cpu(i, covered)) 5989 continue; 5990 5991 sibling = *per_cpu_ptr(sdd->sd, i); 5992 5993 /* See the comment near build_group_mask(). */ 5994 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5995 continue; 5996 5997 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5998 GFP_KERNEL, cpu_to_node(cpu)); 5999 6000 if (!sg) 6001 goto fail; 6002 6003 sg_span = sched_group_cpus(sg); 6004 if (sibling->child) 6005 cpumask_copy(sg_span, sched_domain_span(sibling->child)); 6006 else 6007 cpumask_set_cpu(i, sg_span); 6008 6009 cpumask_or(covered, covered, sg_span); 6010 6011 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 6012 if (atomic_inc_return(&sg->sgc->ref) == 1) 6013 build_group_mask(sd, sg); 6014 6015 /* 6016 * Initialize sgc->capacity such that even if we mess up the 6017 * domains and no possible iteration will get us here, we won't 6018 * die on a /0 trap. 6019 */ 6020 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 6021 6022 /* 6023 * Make sure the first group of this domain contains the 6024 * canonical balance cpu. Otherwise the sched_domain iteration 6025 * breaks. See update_sg_lb_stats(). 
6026 */ 6027 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 6028 group_balance_cpu(sg) == cpu) 6029 groups = sg; 6030 6031 if (!first) 6032 first = sg; 6033 if (last) 6034 last->next = sg; 6035 last = sg; 6036 last->next = first; 6037 } 6038 sd->groups = groups; 6039 6040 return 0; 6041 6042 fail: 6043 free_sched_groups(first, 0); 6044 6045 return -ENOMEM; 6046 } 6047 6048 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6049 { 6050 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6051 struct sched_domain *child = sd->child; 6052 6053 if (child) 6054 cpu = cpumask_first(sched_domain_span(child)); 6055 6056 if (sg) { 6057 *sg = *per_cpu_ptr(sdd->sg, cpu); 6058 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 6059 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ 6060 } 6061 6062 return cpu; 6063 } 6064 6065 /* 6066 * build_sched_groups will build a circular linked list of the groups 6067 * covered by the given span, and will set each group's ->cpumask correctly, 6068 * and ->cpu_capacity to 0. 6069 * 6070 * Assumes the sched_domain tree is fully constructed 6071 */ 6072 static int 6073 build_sched_groups(struct sched_domain *sd, int cpu) 6074 { 6075 struct sched_group *first = NULL, *last = NULL; 6076 struct sd_data *sdd = sd->private; 6077 const struct cpumask *span = sched_domain_span(sd); 6078 struct cpumask *covered; 6079 int i; 6080 6081 get_group(cpu, sdd, &sd->groups); 6082 atomic_inc(&sd->groups->ref); 6083 6084 if (cpu != cpumask_first(span)) 6085 return 0; 6086 6087 lockdep_assert_held(&sched_domains_mutex); 6088 covered = sched_domains_tmpmask; 6089 6090 cpumask_clear(covered); 6091 6092 for_each_cpu(i, span) { 6093 struct sched_group *sg; 6094 int group, j; 6095 6096 if (cpumask_test_cpu(i, covered)) 6097 continue; 6098 6099 group = get_group(i, sdd, &sg); 6100 cpumask_setall(sched_group_mask(sg)); 6101 6102 for_each_cpu(j, span) { 6103 if (get_group(j, sdd, NULL) != group) 6104 continue; 6105 6106 cpumask_set_cpu(j, covered); 6107 cpumask_set_cpu(j, sched_group_cpus(sg)); 6108 } 6109 6110 if (!first) 6111 first = sg; 6112 if (last) 6113 last->next = sg; 6114 last = sg; 6115 } 6116 last->next = first; 6117 6118 return 0; 6119 } 6120 6121 /* 6122 * Initialize sched groups cpu_capacity. 6123 * 6124 * cpu_capacity indicates the capacity of sched group, which is used while 6125 * distributing the load between different sched groups in a sched domain. 6126 * Typically cpu_capacity for all the groups in a sched domain will be same 6127 * unless there are asymmetries in the topology. If there are asymmetries, 6128 * group having more cpu_capacity will pickup more load compared to the 6129 * group having less cpu_capacity. 
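 *
 * Illustrative numbers (assumed, with SCHED_CAPACITY_SCALE = 1024): in a
 * domain with one group of capacity 2048 (two full-capacity cpus) and one
 * group of capacity 1024, the balancer aims to leave roughly two thirds
 * of the load in the first group and one third in the second.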
6130 */ 6131 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 6132 { 6133 struct sched_group *sg = sd->groups; 6134 6135 WARN_ON(!sg); 6136 6137 do { 6138 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6139 sg = sg->next; 6140 } while (sg != sd->groups); 6141 6142 if (cpu != group_balance_cpu(sg)) 6143 return; 6144 6145 update_group_capacity(sd, cpu); 6146 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); 6147 } 6148 6149 /* 6150 * Initializers for schedule domains 6151 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6152 */ 6153 6154 static int default_relax_domain_level = -1; 6155 int sched_domain_level_max; 6156 6157 static int __init setup_relax_domain_level(char *str) 6158 { 6159 if (kstrtoint(str, 0, &default_relax_domain_level)) 6160 pr_warn("Unable to set relax_domain_level\n"); 6161 6162 return 1; 6163 } 6164 __setup("relax_domain_level=", setup_relax_domain_level); 6165 6166 static void set_domain_attribute(struct sched_domain *sd, 6167 struct sched_domain_attr *attr) 6168 { 6169 int request; 6170 6171 if (!attr || attr->relax_domain_level < 0) { 6172 if (default_relax_domain_level < 0) 6173 return; 6174 else 6175 request = default_relax_domain_level; 6176 } else 6177 request = attr->relax_domain_level; 6178 if (request < sd->level) { 6179 /* turn off idle balance on this domain */ 6180 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6181 } else { 6182 /* turn on idle balance on this domain */ 6183 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6184 } 6185 } 6186 6187 static void __sdt_free(const struct cpumask *cpu_map); 6188 static int __sdt_alloc(const struct cpumask *cpu_map); 6189 6190 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6191 const struct cpumask *cpu_map) 6192 { 6193 switch (what) { 6194 case sa_rootdomain: 6195 if (!atomic_read(&d->rd->refcount)) 6196 free_rootdomain(&d->rd->rcu); /* fall through */ 6197 case sa_sd: 6198 free_percpu(d->sd); /* fall through */ 6199 case sa_sd_storage: 6200 __sdt_free(cpu_map); /* fall through */ 6201 case sa_none: 6202 break; 6203 } 6204 } 6205 6206 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6207 const struct cpumask *cpu_map) 6208 { 6209 memset(d, 0, sizeof(*d)); 6210 6211 if (__sdt_alloc(cpu_map)) 6212 return sa_sd_storage; 6213 d->sd = alloc_percpu(struct sched_domain *); 6214 if (!d->sd) 6215 return sa_sd_storage; 6216 d->rd = alloc_rootdomain(); 6217 if (!d->rd) 6218 return sa_sd; 6219 return sa_rootdomain; 6220 } 6221 6222 /* 6223 * NULL the sd_data elements we've used to build the sched_domain and 6224 * sched_group structure so that the subsequent __free_domain_allocs() 6225 * will not free the data we're using. 
6226 */ 6227 static void claim_allocations(int cpu, struct sched_domain *sd) 6228 { 6229 struct sd_data *sdd = sd->private; 6230 6231 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6232 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6233 6234 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6235 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6236 6237 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 6238 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 6239 } 6240 6241 #ifdef CONFIG_NUMA 6242 static int sched_domains_numa_levels; 6243 enum numa_topology_type sched_numa_topology_type; 6244 static int *sched_domains_numa_distance; 6245 int sched_max_numa_distance; 6246 static struct cpumask ***sched_domains_numa_masks; 6247 static int sched_domains_curr_level; 6248 #endif 6249 6250 /* 6251 * SD_flags allowed in topology descriptions. 6252 * 6253 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6254 * SD_SHARE_PKG_RESOURCES - describes shared caches 6255 * SD_NUMA - describes NUMA topologies 6256 * SD_SHARE_POWERDOMAIN - describes shared power domain 6257 * 6258 * Odd one out: 6259 * SD_ASYM_PACKING - describes SMT quirks 6260 */ 6261 #define TOPOLOGY_SD_FLAGS \ 6262 (SD_SHARE_CPUCAPACITY | \ 6263 SD_SHARE_PKG_RESOURCES | \ 6264 SD_NUMA | \ 6265 SD_ASYM_PACKING | \ 6266 SD_SHARE_POWERDOMAIN) 6267 6268 static struct sched_domain * 6269 sd_init(struct sched_domain_topology_level *tl, int cpu) 6270 { 6271 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6272 int sd_weight, sd_flags = 0; 6273 6274 #ifdef CONFIG_NUMA 6275 /* 6276 * Ugly hack to pass state to sd_numa_mask()... 6277 */ 6278 sched_domains_curr_level = tl->numa_level; 6279 #endif 6280 6281 sd_weight = cpumask_weight(tl->mask(cpu)); 6282 6283 if (tl->sd_flags) 6284 sd_flags = (*tl->sd_flags)(); 6285 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6286 "wrong sd_flags in topology description\n")) 6287 sd_flags &= ~TOPOLOGY_SD_FLAGS; 6288 6289 *sd = (struct sched_domain){ 6290 .min_interval = sd_weight, 6291 .max_interval = 2*sd_weight, 6292 .busy_factor = 32, 6293 .imbalance_pct = 125, 6294 6295 .cache_nice_tries = 0, 6296 .busy_idx = 0, 6297 .idle_idx = 0, 6298 .newidle_idx = 0, 6299 .wake_idx = 0, 6300 .forkexec_idx = 0, 6301 6302 .flags = 1*SD_LOAD_BALANCE 6303 | 1*SD_BALANCE_NEWIDLE 6304 | 1*SD_BALANCE_EXEC 6305 | 1*SD_BALANCE_FORK 6306 | 0*SD_BALANCE_WAKE 6307 | 1*SD_WAKE_AFFINE 6308 | 0*SD_SHARE_CPUCAPACITY 6309 | 0*SD_SHARE_PKG_RESOURCES 6310 | 0*SD_SERIALIZE 6311 | 0*SD_PREFER_SIBLING 6312 | 0*SD_NUMA 6313 | sd_flags 6314 , 6315 6316 .last_balance = jiffies, 6317 .balance_interval = sd_weight, 6318 .smt_gain = 0, 6319 .max_newidle_lb_cost = 0, 6320 .next_decay_max_lb_cost = jiffies, 6321 #ifdef CONFIG_SCHED_DEBUG 6322 .name = tl->name, 6323 #endif 6324 }; 6325 6326 /* 6327 * Convert topological properties into behaviour. 
6328 */ 6329 6330 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6331 sd->flags |= SD_PREFER_SIBLING; 6332 sd->imbalance_pct = 110; 6333 sd->smt_gain = 1178; /* ~15% */ 6334 6335 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6336 sd->imbalance_pct = 117; 6337 sd->cache_nice_tries = 1; 6338 sd->busy_idx = 2; 6339 6340 #ifdef CONFIG_NUMA 6341 } else if (sd->flags & SD_NUMA) { 6342 sd->cache_nice_tries = 2; 6343 sd->busy_idx = 3; 6344 sd->idle_idx = 2; 6345 6346 sd->flags |= SD_SERIALIZE; 6347 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6348 sd->flags &= ~(SD_BALANCE_EXEC | 6349 SD_BALANCE_FORK | 6350 SD_WAKE_AFFINE); 6351 } 6352 6353 #endif 6354 } else { 6355 sd->flags |= SD_PREFER_SIBLING; 6356 sd->cache_nice_tries = 1; 6357 sd->busy_idx = 2; 6358 sd->idle_idx = 1; 6359 } 6360 6361 sd->private = &tl->data; 6362 6363 return sd; 6364 } 6365 6366 /* 6367 * Topology list, bottom-up. 6368 */ 6369 static struct sched_domain_topology_level default_topology[] = { 6370 #ifdef CONFIG_SCHED_SMT 6371 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6372 #endif 6373 #ifdef CONFIG_SCHED_MC 6374 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6375 #endif 6376 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6377 { NULL, }, 6378 }; 6379 6380 static struct sched_domain_topology_level *sched_domain_topology = 6381 default_topology; 6382 6383 #define for_each_sd_topology(tl) \ 6384 for (tl = sched_domain_topology; tl->mask; tl++) 6385 6386 void set_sched_topology(struct sched_domain_topology_level *tl) 6387 { 6388 sched_domain_topology = tl; 6389 } 6390 6391 #ifdef CONFIG_NUMA 6392 6393 static const struct cpumask *sd_numa_mask(int cpu) 6394 { 6395 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6396 } 6397 6398 static void sched_numa_warn(const char *str) 6399 { 6400 static int done = false; 6401 int i,j; 6402 6403 if (done) 6404 return; 6405 6406 done = true; 6407 6408 printk(KERN_WARNING "ERROR: %s\n\n", str); 6409 6410 for (i = 0; i < nr_node_ids; i++) { 6411 printk(KERN_WARNING " "); 6412 for (j = 0; j < nr_node_ids; j++) 6413 printk(KERN_CONT "%02d ", node_distance(i,j)); 6414 printk(KERN_CONT "\n"); 6415 } 6416 printk(KERN_WARNING "\n"); 6417 } 6418 6419 bool find_numa_distance(int distance) 6420 { 6421 int i; 6422 6423 if (distance == node_distance(0, 0)) 6424 return true; 6425 6426 for (i = 0; i < sched_domains_numa_levels; i++) { 6427 if (sched_domains_numa_distance[i] == distance) 6428 return true; 6429 } 6430 6431 return false; 6432 } 6433 6434 /* 6435 * A system can have three types of NUMA topology: 6436 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 6437 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 6438 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 6439 * 6440 * The difference between a glueless mesh topology and a backplane 6441 * topology lies in whether communication between not directly 6442 * connected nodes goes through intermediary nodes (where programs 6443 * could run), or through backplane controllers. This affects 6444 * placement of programs. 6445 * 6446 * The type of topology can be discerned with the following tests: 6447 * - If the maximum distance between any nodes is 1 hop, the system 6448 * is directly connected. 6449 * - If for two nodes A and B, located N > 1 hops away from each other, 6450 * there is an intermediary node C, which is < N hops away from both 6451 * nodes A and B, the system is a glueless mesh. 
6452 */ 6453 static void init_numa_topology_type(void) 6454 { 6455 int a, b, c, n; 6456 6457 n = sched_max_numa_distance; 6458 6459 if (sched_domains_numa_levels <= 1) { 6460 sched_numa_topology_type = NUMA_DIRECT; 6461 return; 6462 } 6463 6464 for_each_online_node(a) { 6465 for_each_online_node(b) { 6466 /* Find two nodes furthest removed from each other. */ 6467 if (node_distance(a, b) < n) 6468 continue; 6469 6470 /* Is there an intermediary node between a and b? */ 6471 for_each_online_node(c) { 6472 if (node_distance(a, c) < n && 6473 node_distance(b, c) < n) { 6474 sched_numa_topology_type = 6475 NUMA_GLUELESS_MESH; 6476 return; 6477 } 6478 } 6479 6480 sched_numa_topology_type = NUMA_BACKPLANE; 6481 return; 6482 } 6483 } 6484 } 6485 6486 static void sched_init_numa(void) 6487 { 6488 int next_distance, curr_distance = node_distance(0, 0); 6489 struct sched_domain_topology_level *tl; 6490 int level = 0; 6491 int i, j, k; 6492 6493 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6494 if (!sched_domains_numa_distance) 6495 return; 6496 6497 /* 6498 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6499 * unique distances in the node_distance() table. 6500 * 6501 * Assumes node_distance(0,j) includes all distances in 6502 * node_distance(i,j) in order to avoid cubic time. 6503 */ 6504 next_distance = curr_distance; 6505 for (i = 0; i < nr_node_ids; i++) { 6506 for (j = 0; j < nr_node_ids; j++) { 6507 for (k = 0; k < nr_node_ids; k++) { 6508 int distance = node_distance(i, k); 6509 6510 if (distance > curr_distance && 6511 (distance < next_distance || 6512 next_distance == curr_distance)) 6513 next_distance = distance; 6514 6515 /* 6516 * While not a strong assumption it would be nice to know 6517 * about cases where if node A is connected to B, B is not 6518 * equally connected to A. 6519 */ 6520 if (sched_debug() && node_distance(k, i) != distance) 6521 sched_numa_warn("Node-distance not symmetric"); 6522 6523 if (sched_debug() && i && !find_numa_distance(distance)) 6524 sched_numa_warn("Node-0 not representative"); 6525 } 6526 if (next_distance != curr_distance) { 6527 sched_domains_numa_distance[level++] = next_distance; 6528 sched_domains_numa_levels = level; 6529 curr_distance = next_distance; 6530 } else break; 6531 } 6532 6533 /* 6534 * In case of sched_debug() we verify the above assumption. 6535 */ 6536 if (!sched_debug()) 6537 break; 6538 } 6539 6540 if (!level) 6541 return; 6542 6543 /* 6544 * 'level' contains the number of unique distances, excluding the 6545 * identity distance node_distance(i,i). 6546 * 6547 * The sched_domains_numa_distance[] array includes the actual distance 6548 * numbers. 6549 */ 6550 6551 /* 6552 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6553 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6554 * the array will contain less then 'level' members. This could be 6555 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6556 * in other functions. 6557 * 6558 * We reset it to 'level' at the end of this function. 6559 */ 6560 sched_domains_numa_levels = 0; 6561 6562 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6563 if (!sched_domains_numa_masks) 6564 return; 6565 6566 /* 6567 * Now for each level, construct a mask per node which contains all 6568 * cpus of nodes that are that many hops away from us. 
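 *
 * For illustration (a sketch, not from the original comment): if the
 * unique non-local distances found above are { 20, 30 }, then for node 0
 *
 *	sched_domains_numa_masks[0][0] = cpus of all nodes within distance 20
 *	sched_domains_numa_masks[1][0] = cpus of all nodes within distance 30
 *
 * so each successive level widens the span used for the corresponding
 * NUMA sched_domain level.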
6569 */ 6570 for (i = 0; i < level; i++) { 6571 sched_domains_numa_masks[i] = 6572 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6573 if (!sched_domains_numa_masks[i]) 6574 return; 6575 6576 for (j = 0; j < nr_node_ids; j++) { 6577 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6578 if (!mask) 6579 return; 6580 6581 sched_domains_numa_masks[i][j] = mask; 6582 6583 for_each_node(k) { 6584 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6585 continue; 6586 6587 cpumask_or(mask, mask, cpumask_of_node(k)); 6588 } 6589 } 6590 } 6591 6592 /* Compute default topology size */ 6593 for (i = 0; sched_domain_topology[i].mask; i++); 6594 6595 tl = kzalloc((i + level + 1) * 6596 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6597 if (!tl) 6598 return; 6599 6600 /* 6601 * Copy the default topology bits.. 6602 */ 6603 for (i = 0; sched_domain_topology[i].mask; i++) 6604 tl[i] = sched_domain_topology[i]; 6605 6606 /* 6607 * .. and append 'j' levels of NUMA goodness. 6608 */ 6609 for (j = 0; j < level; i++, j++) { 6610 tl[i] = (struct sched_domain_topology_level){ 6611 .mask = sd_numa_mask, 6612 .sd_flags = cpu_numa_flags, 6613 .flags = SDTL_OVERLAP, 6614 .numa_level = j, 6615 SD_INIT_NAME(NUMA) 6616 }; 6617 } 6618 6619 sched_domain_topology = tl; 6620 6621 sched_domains_numa_levels = level; 6622 sched_max_numa_distance = sched_domains_numa_distance[level - 1]; 6623 6624 init_numa_topology_type(); 6625 } 6626 6627 static void sched_domains_numa_masks_set(int cpu) 6628 { 6629 int i, j; 6630 int node = cpu_to_node(cpu); 6631 6632 for (i = 0; i < sched_domains_numa_levels; i++) { 6633 for (j = 0; j < nr_node_ids; j++) { 6634 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6635 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6636 } 6637 } 6638 } 6639 6640 static void sched_domains_numa_masks_clear(int cpu) 6641 { 6642 int i, j; 6643 for (i = 0; i < sched_domains_numa_levels; i++) { 6644 for (j = 0; j < nr_node_ids; j++) 6645 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6646 } 6647 } 6648 6649 /* 6650 * Update sched_domains_numa_masks[level][node] array when new cpus 6651 * are onlined. 
6652 */ 6653 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6654 unsigned long action, 6655 void *hcpu) 6656 { 6657 int cpu = (long)hcpu; 6658 6659 switch (action & ~CPU_TASKS_FROZEN) { 6660 case CPU_ONLINE: 6661 sched_domains_numa_masks_set(cpu); 6662 break; 6663 6664 case CPU_DEAD: 6665 sched_domains_numa_masks_clear(cpu); 6666 break; 6667 6668 default: 6669 return NOTIFY_DONE; 6670 } 6671 6672 return NOTIFY_OK; 6673 } 6674 #else 6675 static inline void sched_init_numa(void) 6676 { 6677 } 6678 6679 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6680 unsigned long action, 6681 void *hcpu) 6682 { 6683 return 0; 6684 } 6685 #endif /* CONFIG_NUMA */ 6686 6687 static int __sdt_alloc(const struct cpumask *cpu_map) 6688 { 6689 struct sched_domain_topology_level *tl; 6690 int j; 6691 6692 for_each_sd_topology(tl) { 6693 struct sd_data *sdd = &tl->data; 6694 6695 sdd->sd = alloc_percpu(struct sched_domain *); 6696 if (!sdd->sd) 6697 return -ENOMEM; 6698 6699 sdd->sg = alloc_percpu(struct sched_group *); 6700 if (!sdd->sg) 6701 return -ENOMEM; 6702 6703 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 6704 if (!sdd->sgc) 6705 return -ENOMEM; 6706 6707 for_each_cpu(j, cpu_map) { 6708 struct sched_domain *sd; 6709 struct sched_group *sg; 6710 struct sched_group_capacity *sgc; 6711 6712 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6713 GFP_KERNEL, cpu_to_node(j)); 6714 if (!sd) 6715 return -ENOMEM; 6716 6717 *per_cpu_ptr(sdd->sd, j) = sd; 6718 6719 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6720 GFP_KERNEL, cpu_to_node(j)); 6721 if (!sg) 6722 return -ENOMEM; 6723 6724 sg->next = sg; 6725 6726 *per_cpu_ptr(sdd->sg, j) = sg; 6727 6728 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 6729 GFP_KERNEL, cpu_to_node(j)); 6730 if (!sgc) 6731 return -ENOMEM; 6732 6733 *per_cpu_ptr(sdd->sgc, j) = sgc; 6734 } 6735 } 6736 6737 return 0; 6738 } 6739 6740 static void __sdt_free(const struct cpumask *cpu_map) 6741 { 6742 struct sched_domain_topology_level *tl; 6743 int j; 6744 6745 for_each_sd_topology(tl) { 6746 struct sd_data *sdd = &tl->data; 6747 6748 for_each_cpu(j, cpu_map) { 6749 struct sched_domain *sd; 6750 6751 if (sdd->sd) { 6752 sd = *per_cpu_ptr(sdd->sd, j); 6753 if (sd && (sd->flags & SD_OVERLAP)) 6754 free_sched_groups(sd->groups, 0); 6755 kfree(*per_cpu_ptr(sdd->sd, j)); 6756 } 6757 6758 if (sdd->sg) 6759 kfree(*per_cpu_ptr(sdd->sg, j)); 6760 if (sdd->sgc) 6761 kfree(*per_cpu_ptr(sdd->sgc, j)); 6762 } 6763 free_percpu(sdd->sd); 6764 sdd->sd = NULL; 6765 free_percpu(sdd->sg); 6766 sdd->sg = NULL; 6767 free_percpu(sdd->sgc); 6768 sdd->sgc = NULL; 6769 } 6770 } 6771 6772 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6773 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6774 struct sched_domain *child, int cpu) 6775 { 6776 struct sched_domain *sd = sd_init(tl, cpu); 6777 if (!sd) 6778 return child; 6779 6780 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6781 if (child) { 6782 sd->level = child->level + 1; 6783 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6784 child->parent = sd; 6785 sd->child = child; 6786 6787 if (!cpumask_subset(sched_domain_span(child), 6788 sched_domain_span(sd))) { 6789 pr_err("BUG: arch topology borken\n"); 6790 #ifdef CONFIG_SCHED_DEBUG 6791 pr_err(" the %s domain not a subset of the %s domain\n", 6792 child->name, sd->name); 6793 #endif 6794 /* Fixup, ensure @sd has at least @child 
cpus. */
6795 cpumask_or(sched_domain_span(sd),
6796 sched_domain_span(sd),
6797 sched_domain_span(child));
6798 }
6799
6800 }
6801 set_domain_attribute(sd, attr);
6802
6803 return sd;
6804 }
6805
6806 /*
6807 * Build sched domains for a given set of cpus and attach the sched domains
6808 * to the individual cpus
6809 */
6810 static int build_sched_domains(const struct cpumask *cpu_map,
6811 struct sched_domain_attr *attr)
6812 {
6813 enum s_alloc alloc_state;
6814 struct sched_domain *sd;
6815 struct s_data d;
6816 int i, ret = -ENOMEM;
6817
6818 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6819 if (alloc_state != sa_rootdomain)
6820 goto error;
6821
6822 /* Set up domains for cpus specified by the cpu_map. */
6823 for_each_cpu(i, cpu_map) {
6824 struct sched_domain_topology_level *tl;
6825
6826 sd = NULL;
6827 for_each_sd_topology(tl) {
6828 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6829 if (tl == sched_domain_topology)
6830 *per_cpu_ptr(d.sd, i) = sd;
6831 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6832 sd->flags |= SD_OVERLAP;
6833 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6834 break;
6835 }
6836 }
6837
6838 /* Build the groups for the domains */
6839 for_each_cpu(i, cpu_map) {
6840 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6841 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6842 if (sd->flags & SD_OVERLAP) {
6843 if (build_overlap_sched_groups(sd, i))
6844 goto error;
6845 } else {
6846 if (build_sched_groups(sd, i))
6847 goto error;
6848 }
6849 }
6850 }
6851
6852 /* Calculate CPU capacity for physical packages and nodes */
6853 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6854 if (!cpumask_test_cpu(i, cpu_map))
6855 continue;
6856
6857 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6858 claim_allocations(i, sd);
6859 init_sched_groups_capacity(i, sd);
6860 }
6861 }
6862
6863 /* Attach the domains */
6864 rcu_read_lock();
6865 for_each_cpu(i, cpu_map) {
6866 sd = *per_cpu_ptr(d.sd, i);
6867 cpu_attach_domain(sd, d.rd, i);
6868 }
6869 rcu_read_unlock();
6870
6871 ret = 0;
6872 error:
6873 __free_domain_allocs(&d, alloc_state, cpu_map);
6874 return ret;
6875 }
6876
6877 static cpumask_var_t *doms_cur; /* current sched domains */
6878 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6879 static struct sched_domain_attr *dattr_cur;
6880 /* attributes of custom domains in 'doms_cur' */
6881
6882 /*
6883 * Special case: If a kmalloc of a doms_cur partition (array of
6884 * cpumask) fails, then fall back to a single sched domain,
6885 * as determined by the single cpumask fallback_doms.
6886 */
6887 static cpumask_var_t fallback_doms;
6888
6889 /*
6890 * arch_update_cpu_topology lets virtualized architectures update the
6891 * cpu core maps. It is supposed to return 1 if the topology changed
6892 * or 0 if it stayed the same.
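 *
 * An architecture overrides this simply by providing a non-weak definition.
 * A sketch (the 'topology_changed' flag below is hypothetical, not an
 * existing kernel symbol):
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		int changed = topology_changed;
 *
 *		topology_changed = 0;
 *		return changed;
 *	}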
6893 */ 6894 int __weak arch_update_cpu_topology(void) 6895 { 6896 return 0; 6897 } 6898 6899 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6900 { 6901 int i; 6902 cpumask_var_t *doms; 6903 6904 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6905 if (!doms) 6906 return NULL; 6907 for (i = 0; i < ndoms; i++) { 6908 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6909 free_sched_domains(doms, i); 6910 return NULL; 6911 } 6912 } 6913 return doms; 6914 } 6915 6916 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6917 { 6918 unsigned int i; 6919 for (i = 0; i < ndoms; i++) 6920 free_cpumask_var(doms[i]); 6921 kfree(doms); 6922 } 6923 6924 /* 6925 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6926 * For now this just excludes isolated cpus, but could be used to 6927 * exclude other special cases in the future. 6928 */ 6929 static int init_sched_domains(const struct cpumask *cpu_map) 6930 { 6931 int err; 6932 6933 arch_update_cpu_topology(); 6934 ndoms_cur = 1; 6935 doms_cur = alloc_sched_domains(ndoms_cur); 6936 if (!doms_cur) 6937 doms_cur = &fallback_doms; 6938 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6939 err = build_sched_domains(doms_cur[0], NULL); 6940 register_sched_domain_sysctl(); 6941 6942 return err; 6943 } 6944 6945 /* 6946 * Detach sched domains from a group of cpus specified in cpu_map 6947 * These cpus will now be attached to the NULL domain 6948 */ 6949 static void detach_destroy_domains(const struct cpumask *cpu_map) 6950 { 6951 int i; 6952 6953 rcu_read_lock(); 6954 for_each_cpu(i, cpu_map) 6955 cpu_attach_domain(NULL, &def_root_domain, i); 6956 rcu_read_unlock(); 6957 } 6958 6959 /* handle null as "default" */ 6960 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6961 struct sched_domain_attr *new, int idx_new) 6962 { 6963 struct sched_domain_attr tmp; 6964 6965 /* fast path */ 6966 if (!new && !cur) 6967 return 1; 6968 6969 tmp = SD_ATTR_INIT; 6970 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6971 new ? (new + idx_new) : &tmp, 6972 sizeof(struct sched_domain_attr)); 6973 } 6974 6975 /* 6976 * Partition sched domains as specified by the 'ndoms_new' 6977 * cpumasks in the array doms_new[] of cpumasks. This compares 6978 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6979 * It destroys each deleted domain and builds each new domain. 6980 * 6981 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6982 * The masks don't intersect (don't overlap.) We should setup one 6983 * sched domain for each mask. CPUs not in any of the cpumasks will 6984 * not be load balanced. If the same cpumask appears both in the 6985 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6986 * it as it is. 6987 * 6988 * The passed in 'doms_new' should be allocated using 6989 * alloc_sched_domains. This routine takes ownership of it and will 6990 * free_sched_domains it when done with it. If the caller failed the 6991 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6992 * and partition_sched_domains() will fallback to the single partition 6993 * 'fallback_doms', it also forces the domains to be rebuilt. 6994 * 6995 * If doms_new == NULL it will be replaced with cpu_online_mask. 6996 * ndoms_new == 0 is a special case for destroying existing domains, 6997 * and it will not create the default domain. 
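 *
 * Illustrative call sequence (a sketch, not taken from an actual caller;
 * 'mask_a' and 'mask_b' stand for hypothetical, non-overlapping cpumasks):
 *
 *	doms = alloc_sched_domains(2);
 *	if (doms) {
 *		cpumask_copy(doms[0], mask_a);
 *		cpumask_copy(doms[1], mask_b);
 *		partition_sched_domains(2, doms, NULL); /* takes ownership */
 *	}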
6998 * 6999 * Call with hotplug lock held 7000 */ 7001 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7002 struct sched_domain_attr *dattr_new) 7003 { 7004 int i, j, n; 7005 int new_topology; 7006 7007 mutex_lock(&sched_domains_mutex); 7008 7009 /* always unregister in case we don't destroy any domains */ 7010 unregister_sched_domain_sysctl(); 7011 7012 /* Let architecture update cpu core mappings. */ 7013 new_topology = arch_update_cpu_topology(); 7014 7015 n = doms_new ? ndoms_new : 0; 7016 7017 /* Destroy deleted domains */ 7018 for (i = 0; i < ndoms_cur; i++) { 7019 for (j = 0; j < n && !new_topology; j++) { 7020 if (cpumask_equal(doms_cur[i], doms_new[j]) 7021 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7022 goto match1; 7023 } 7024 /* no match - a current sched domain not in new doms_new[] */ 7025 detach_destroy_domains(doms_cur[i]); 7026 match1: 7027 ; 7028 } 7029 7030 n = ndoms_cur; 7031 if (doms_new == NULL) { 7032 n = 0; 7033 doms_new = &fallback_doms; 7034 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7035 WARN_ON_ONCE(dattr_new); 7036 } 7037 7038 /* Build new domains */ 7039 for (i = 0; i < ndoms_new; i++) { 7040 for (j = 0; j < n && !new_topology; j++) { 7041 if (cpumask_equal(doms_new[i], doms_cur[j]) 7042 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7043 goto match2; 7044 } 7045 /* no match - add a new doms_new */ 7046 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7047 match2: 7048 ; 7049 } 7050 7051 /* Remember the new sched domains */ 7052 if (doms_cur != &fallback_doms) 7053 free_sched_domains(doms_cur, ndoms_cur); 7054 kfree(dattr_cur); /* kfree(NULL) is safe */ 7055 doms_cur = doms_new; 7056 dattr_cur = dattr_new; 7057 ndoms_cur = ndoms_new; 7058 7059 register_sched_domain_sysctl(); 7060 7061 mutex_unlock(&sched_domains_mutex); 7062 } 7063 7064 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 7065 7066 /* 7067 * Update cpusets according to cpu_active mask. If cpusets are 7068 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7069 * around partition_sched_domains(). 7070 * 7071 * If we come here as part of a suspend/resume, don't touch cpusets because we 7072 * want to restore it back to its original state upon resume anyway. 7073 */ 7074 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7075 void *hcpu) 7076 { 7077 switch (action) { 7078 case CPU_ONLINE_FROZEN: 7079 case CPU_DOWN_FAILED_FROZEN: 7080 7081 /* 7082 * num_cpus_frozen tracks how many CPUs are involved in suspend 7083 * resume sequence. As long as this is not the last online 7084 * operation in the resume sequence, just build a single sched 7085 * domain, ignoring cpusets. 7086 */ 7087 num_cpus_frozen--; 7088 if (likely(num_cpus_frozen)) { 7089 partition_sched_domains(1, NULL, NULL); 7090 break; 7091 } 7092 7093 /* 7094 * This is the last CPU online operation. So fall through and 7095 * restore the original sched domains by considering the 7096 * cpuset configurations. 
7097 */ 7098 7099 case CPU_ONLINE: 7100 cpuset_update_active_cpus(true); 7101 break; 7102 default: 7103 return NOTIFY_DONE; 7104 } 7105 return NOTIFY_OK; 7106 } 7107 7108 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7109 void *hcpu) 7110 { 7111 unsigned long flags; 7112 long cpu = (long)hcpu; 7113 struct dl_bw *dl_b; 7114 bool overflow; 7115 int cpus; 7116 7117 switch (action) { 7118 case CPU_DOWN_PREPARE: 7119 rcu_read_lock_sched(); 7120 dl_b = dl_bw_of(cpu); 7121 7122 raw_spin_lock_irqsave(&dl_b->lock, flags); 7123 cpus = dl_bw_cpus(cpu); 7124 overflow = __dl_overflow(dl_b, cpus, 0, 0); 7125 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7126 7127 rcu_read_unlock_sched(); 7128 7129 if (overflow) 7130 return notifier_from_errno(-EBUSY); 7131 cpuset_update_active_cpus(false); 7132 break; 7133 case CPU_DOWN_PREPARE_FROZEN: 7134 num_cpus_frozen++; 7135 partition_sched_domains(1, NULL, NULL); 7136 break; 7137 default: 7138 return NOTIFY_DONE; 7139 } 7140 return NOTIFY_OK; 7141 } 7142 7143 void __init sched_init_smp(void) 7144 { 7145 cpumask_var_t non_isolated_cpus; 7146 7147 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7148 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7149 7150 sched_init_numa(); 7151 7152 /* 7153 * There's no userspace yet to cause hotplug operations; hence all the 7154 * cpu masks are stable and all blatant races in the below code cannot 7155 * happen. 7156 */ 7157 mutex_lock(&sched_domains_mutex); 7158 init_sched_domains(cpu_active_mask); 7159 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7160 if (cpumask_empty(non_isolated_cpus)) 7161 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7162 mutex_unlock(&sched_domains_mutex); 7163 7164 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 7165 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7166 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7167 7168 init_hrtick(); 7169 7170 /* Move init over to a non-isolated CPU */ 7171 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7172 BUG(); 7173 sched_init_granularity(); 7174 free_cpumask_var(non_isolated_cpus); 7175 7176 init_sched_rt_class(); 7177 init_sched_dl_class(); 7178 } 7179 #else 7180 void __init sched_init_smp(void) 7181 { 7182 sched_init_granularity(); 7183 } 7184 #endif /* CONFIG_SMP */ 7185 7186 int in_sched_functions(unsigned long addr) 7187 { 7188 return in_lock_functions(addr) || 7189 (addr >= (unsigned long)__sched_text_start 7190 && addr < (unsigned long)__sched_text_end); 7191 } 7192 7193 #ifdef CONFIG_CGROUP_SCHED 7194 /* 7195 * Default task group. 7196 * Every task in system belongs to this group at bootup. 
7197 */ 7198 struct task_group root_task_group; 7199 LIST_HEAD(task_groups); 7200 7201 /* Cacheline aligned slab cache for task_group */ 7202 static struct kmem_cache *task_group_cache __read_mostly; 7203 #endif 7204 7205 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7206 7207 void __init sched_init(void) 7208 { 7209 int i, j; 7210 unsigned long alloc_size = 0, ptr; 7211 7212 #ifdef CONFIG_FAIR_GROUP_SCHED 7213 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7214 #endif 7215 #ifdef CONFIG_RT_GROUP_SCHED 7216 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7217 #endif 7218 if (alloc_size) { 7219 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7220 7221 #ifdef CONFIG_FAIR_GROUP_SCHED 7222 root_task_group.se = (struct sched_entity **)ptr; 7223 ptr += nr_cpu_ids * sizeof(void **); 7224 7225 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7226 ptr += nr_cpu_ids * sizeof(void **); 7227 7228 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7229 #ifdef CONFIG_RT_GROUP_SCHED 7230 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7231 ptr += nr_cpu_ids * sizeof(void **); 7232 7233 root_task_group.rt_rq = (struct rt_rq **)ptr; 7234 ptr += nr_cpu_ids * sizeof(void **); 7235 7236 #endif /* CONFIG_RT_GROUP_SCHED */ 7237 } 7238 #ifdef CONFIG_CPUMASK_OFFSTACK 7239 for_each_possible_cpu(i) { 7240 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7241 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7242 } 7243 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7244 7245 init_rt_bandwidth(&def_rt_bandwidth, 7246 global_rt_period(), global_rt_runtime()); 7247 init_dl_bandwidth(&def_dl_bandwidth, 7248 global_rt_period(), global_rt_runtime()); 7249 7250 #ifdef CONFIG_SMP 7251 init_defrootdomain(); 7252 #endif 7253 7254 #ifdef CONFIG_RT_GROUP_SCHED 7255 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7256 global_rt_period(), global_rt_runtime()); 7257 #endif /* CONFIG_RT_GROUP_SCHED */ 7258 7259 #ifdef CONFIG_CGROUP_SCHED 7260 task_group_cache = KMEM_CACHE(task_group, 0); 7261 7262 list_add(&root_task_group.list, &task_groups); 7263 INIT_LIST_HEAD(&root_task_group.children); 7264 INIT_LIST_HEAD(&root_task_group.siblings); 7265 autogroup_init(&init_task); 7266 #endif /* CONFIG_CGROUP_SCHED */ 7267 7268 for_each_possible_cpu(i) { 7269 struct rq *rq; 7270 7271 rq = cpu_rq(i); 7272 raw_spin_lock_init(&rq->lock); 7273 rq->nr_running = 0; 7274 rq->calc_load_active = 0; 7275 rq->calc_load_update = jiffies + LOAD_FREQ; 7276 init_cfs_rq(&rq->cfs); 7277 init_rt_rq(&rq->rt); 7278 init_dl_rq(&rq->dl); 7279 #ifdef CONFIG_FAIR_GROUP_SCHED 7280 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7281 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7282 /* 7283 * How much cpu bandwidth does root_task_group get? 7284 * 7285 * In case of task-groups formed thr' the cgroup filesystem, it 7286 * gets 100% of the cpu resources in the system. This overall 7287 * system cpu resource is divided among the tasks of 7288 * root_task_group and its child task-groups in a fair manner, 7289 * based on each entity's (task or task-group's) weight 7290 * (se->load.weight). 7291 * 7292 * In other words, if root_task_group has 10 tasks of weight 7293 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7294 * then A0's share of the cpu resource is: 7295 * 7296 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7297 * 7298 * We achieve this by letting root_task_group's tasks sit 7299 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
7300 */ 7301 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 7302 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7303 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7304 7305 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7306 #ifdef CONFIG_RT_GROUP_SCHED 7307 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7308 #endif 7309 7310 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7311 rq->cpu_load[j] = 0; 7312 7313 rq->last_load_update_tick = jiffies; 7314 7315 #ifdef CONFIG_SMP 7316 rq->sd = NULL; 7317 rq->rd = NULL; 7318 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 7319 rq->balance_callback = NULL; 7320 rq->active_balance = 0; 7321 rq->next_balance = jiffies; 7322 rq->push_cpu = 0; 7323 rq->cpu = i; 7324 rq->online = 0; 7325 rq->idle_stamp = 0; 7326 rq->avg_idle = 2*sysctl_sched_migration_cost; 7327 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 7328 7329 INIT_LIST_HEAD(&rq->cfs_tasks); 7330 7331 rq_attach_root(rq, &def_root_domain); 7332 #ifdef CONFIG_NO_HZ_COMMON 7333 rq->nohz_flags = 0; 7334 #endif 7335 #ifdef CONFIG_NO_HZ_FULL 7336 rq->last_sched_tick = 0; 7337 #endif 7338 #endif 7339 init_rq_hrtick(rq); 7340 atomic_set(&rq->nr_iowait, 0); 7341 } 7342 7343 set_load_weight(&init_task); 7344 7345 #ifdef CONFIG_PREEMPT_NOTIFIERS 7346 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7347 #endif 7348 7349 /* 7350 * The boot idle thread does lazy MMU switching as well: 7351 */ 7352 atomic_inc(&init_mm.mm_count); 7353 enter_lazy_tlb(&init_mm, current); 7354 7355 /* 7356 * During early bootup we pretend to be a normal task: 7357 */ 7358 current->sched_class = &fair_sched_class; 7359 7360 /* 7361 * Make us the idle thread. Technically, schedule() should not be 7362 * called from this thread, however somewhere below it might be, 7363 * but because we are the idle thread, we just pick up running again 7364 * when this runqueue becomes "idle". 7365 */ 7366 init_idle(current, smp_processor_id()); 7367 7368 calc_load_update = jiffies + LOAD_FREQ; 7369 7370 #ifdef CONFIG_SMP 7371 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7372 /* May be allocated at isolcpus cmdline parse time */ 7373 if (cpu_isolated_map == NULL) 7374 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7375 idle_thread_set_boot_cpu(); 7376 set_cpu_rq_start_time(); 7377 #endif 7378 init_sched_fair_class(); 7379 7380 scheduler_running = 1; 7381 } 7382 7383 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7384 static inline int preempt_count_equals(int preempt_offset) 7385 { 7386 int nested = preempt_count() + rcu_preempt_depth(); 7387 7388 return (nested == preempt_offset); 7389 } 7390 7391 void __might_sleep(const char *file, int line, int preempt_offset) 7392 { 7393 /* 7394 * Blocking primitives will set (and therefore destroy) current->state, 7395 * since we will exit with TASK_RUNNING make sure we enter with it, 7396 * otherwise we will destroy state. 7397 */ 7398 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 7399 "do not call blocking ops when !TASK_RUNNING; " 7400 "state=%lx set at [<%p>] %pS\n", 7401 current->state, 7402 (void *)current->task_state_change, 7403 (void *)current->task_state_change); 7404 7405 ___might_sleep(file, line, preempt_offset); 7406 } 7407 EXPORT_SYMBOL(__might_sleep); 7408 7409 void ___might_sleep(const char *file, int line, int preempt_offset) 7410 { 7411 static unsigned long prev_jiffy; /* ratelimiting */ 7412 7413 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/
7414 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
7415 !is_idle_task(current)) ||
7416 system_state != SYSTEM_RUNNING || oops_in_progress)
7417 return;
7418 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7419 return;
7420 prev_jiffy = jiffies;
7421
7422 printk(KERN_ERR
7423 "BUG: sleeping function called from invalid context at %s:%d\n",
7424 file, line);
7425 printk(KERN_ERR
7426 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7427 in_atomic(), irqs_disabled(),
7428 current->pid, current->comm);
7429
7430 if (task_stack_end_corrupted(current))
7431 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
7432
7433 debug_show_held_locks(current);
7434 if (irqs_disabled())
7435 print_irqtrace_events(current);
7436 #ifdef CONFIG_DEBUG_PREEMPT
7437 if (!preempt_count_equals(preempt_offset)) {
7438 pr_err("Preemption disabled at:");
7439 print_ip_sym(current->preempt_disable_ip);
7440 pr_cont("\n");
7441 }
7442 #endif
7443 dump_stack();
7444 }
7445 EXPORT_SYMBOL(___might_sleep);
7446 #endif
7447
7448 #ifdef CONFIG_MAGIC_SYSRQ
7449 void normalize_rt_tasks(void)
7450 {
7451 struct task_struct *g, *p;
7452 struct sched_attr attr = {
7453 .sched_policy = SCHED_NORMAL,
7454 };
7455
7456 read_lock(&tasklist_lock);
7457 for_each_process_thread(g, p) {
7458 /*
7459 * Only normalize user tasks:
7460 */
7461 if (p->flags & PF_KTHREAD)
7462 continue;
7463
7464 p->se.exec_start = 0;
7465 #ifdef CONFIG_SCHEDSTATS
7466 p->se.statistics.wait_start = 0;
7467 p->se.statistics.sleep_start = 0;
7468 p->se.statistics.block_start = 0;
7469 #endif
7470
7471 if (!dl_task(p) && !rt_task(p)) {
7472 /*
7473 * Renice negative nice level userspace
7474 * tasks back to 0:
7475 */
7476 if (task_nice(p) < 0)
7477 set_user_nice(p, 0);
7478 continue;
7479 }
7480
7481 __sched_setscheduler(p, &attr, false, false);
7482 }
7483 read_unlock(&tasklist_lock);
7484 }
7485
7486 #endif /* CONFIG_MAGIC_SYSRQ */
7487
7488 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7489 /*
7490 * These functions are only useful for the IA64 MCA handling, or kdb.
7491 *
7492 * They can only be called when the whole system has been
7493 * stopped - every CPU needs to be quiescent, and no scheduling
7494 * activity can take place. Using them for anything else would
7495 * be a serious bug, and as a result, they aren't even visible
7496 * under any other configuration.
7497 */
7498
7499 /**
7500 * curr_task - return the current task for a given cpu.
7501 * @cpu: the processor in question.
7502 *
7503 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7504 *
7505 * Return: The current task for @cpu.
7506 */
7507 struct task_struct *curr_task(int cpu)
7508 {
7509 return cpu_curr(cpu);
7510 }
7511
7512 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7513
7514 #ifdef CONFIG_IA64
7515 /**
7516 * set_curr_task - set the current task for a given cpu.
7517 * @cpu: the processor in question.
7518 * @p: the task pointer to set.
7519 *
7520 * Description: This function must only be used when non-maskable interrupts
7521 * are serviced on a separate stack. It allows the architecture to switch the
7522 * notion of the current task on a cpu in a non-blocking manner. This function
7523 * must be called with all CPUs synchronized and interrupts disabled, and the
7524 * caller must save the original value of the current task (see
7525 * curr_task() above) and restore that value before reenabling interrupts and
7526 * re-starting the system.
7527 *
7528 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7529 */ 7530 void set_curr_task(int cpu, struct task_struct *p) 7531 { 7532 cpu_curr(cpu) = p; 7533 } 7534 7535 #endif 7536 7537 #ifdef CONFIG_CGROUP_SCHED 7538 /* task_group_lock serializes the addition/removal of task groups */ 7539 static DEFINE_SPINLOCK(task_group_lock); 7540 7541 static void sched_free_group(struct task_group *tg) 7542 { 7543 free_fair_sched_group(tg); 7544 free_rt_sched_group(tg); 7545 autogroup_free(tg); 7546 kmem_cache_free(task_group_cache, tg); 7547 } 7548 7549 /* allocate runqueue etc for a new task group */ 7550 struct task_group *sched_create_group(struct task_group *parent) 7551 { 7552 struct task_group *tg; 7553 7554 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); 7555 if (!tg) 7556 return ERR_PTR(-ENOMEM); 7557 7558 if (!alloc_fair_sched_group(tg, parent)) 7559 goto err; 7560 7561 if (!alloc_rt_sched_group(tg, parent)) 7562 goto err; 7563 7564 return tg; 7565 7566 err: 7567 sched_free_group(tg); 7568 return ERR_PTR(-ENOMEM); 7569 } 7570 7571 void sched_online_group(struct task_group *tg, struct task_group *parent) 7572 { 7573 unsigned long flags; 7574 7575 spin_lock_irqsave(&task_group_lock, flags); 7576 list_add_rcu(&tg->list, &task_groups); 7577 7578 WARN_ON(!parent); /* root should already exist */ 7579 7580 tg->parent = parent; 7581 INIT_LIST_HEAD(&tg->children); 7582 list_add_rcu(&tg->siblings, &parent->children); 7583 spin_unlock_irqrestore(&task_group_lock, flags); 7584 } 7585 7586 /* rcu callback to free various structures associated with a task group */ 7587 static void sched_free_group_rcu(struct rcu_head *rhp) 7588 { 7589 /* now it should be safe to free those cfs_rqs */ 7590 sched_free_group(container_of(rhp, struct task_group, rcu)); 7591 } 7592 7593 void sched_destroy_group(struct task_group *tg) 7594 { 7595 /* wait for possible concurrent references to cfs_rqs complete */ 7596 call_rcu(&tg->rcu, sched_free_group_rcu); 7597 } 7598 7599 void sched_offline_group(struct task_group *tg) 7600 { 7601 unsigned long flags; 7602 7603 /* end participation in shares distribution */ 7604 unregister_fair_sched_group(tg); 7605 7606 spin_lock_irqsave(&task_group_lock, flags); 7607 list_del_rcu(&tg->list); 7608 list_del_rcu(&tg->siblings); 7609 spin_unlock_irqrestore(&task_group_lock, flags); 7610 } 7611 7612 /* change task's runqueue when it moves between groups. 7613 * The caller of this function should have put the task in its new group 7614 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7615 * reflect its new group. 7616 */ 7617 void sched_move_task(struct task_struct *tsk) 7618 { 7619 struct task_group *tg; 7620 int queued, running; 7621 unsigned long flags; 7622 struct rq *rq; 7623 7624 rq = task_rq_lock(tsk, &flags); 7625 7626 running = task_current(rq, tsk); 7627 queued = task_on_rq_queued(tsk); 7628 7629 if (queued) 7630 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); 7631 if (unlikely(running)) 7632 put_prev_task(rq, tsk); 7633 7634 /* 7635 * All callers are synchronized by task_rq_lock(); we do not use RCU 7636 * which is pointless here. Thus, we pass "true" to task_css_check() 7637 * to prevent lockdep warnings. 
7638 */ 7639 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 7640 struct task_group, css); 7641 tg = autogroup_task_group(tsk, tg); 7642 tsk->sched_task_group = tg; 7643 7644 #ifdef CONFIG_FAIR_GROUP_SCHED 7645 if (tsk->sched_class->task_move_group) 7646 tsk->sched_class->task_move_group(tsk); 7647 else 7648 #endif 7649 set_task_rq(tsk, task_cpu(tsk)); 7650 7651 if (unlikely(running)) 7652 tsk->sched_class->set_curr_task(rq); 7653 if (queued) 7654 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 7655 7656 task_rq_unlock(rq, tsk, &flags); 7657 } 7658 #endif /* CONFIG_CGROUP_SCHED */ 7659 7660 #ifdef CONFIG_RT_GROUP_SCHED 7661 /* 7662 * Ensure that the real time constraints are schedulable. 7663 */ 7664 static DEFINE_MUTEX(rt_constraints_mutex); 7665 7666 /* Must be called with tasklist_lock held */ 7667 static inline int tg_has_rt_tasks(struct task_group *tg) 7668 { 7669 struct task_struct *g, *p; 7670 7671 /* 7672 * Autogroups do not have RT tasks; see autogroup_create(). 7673 */ 7674 if (task_group_is_autogroup(tg)) 7675 return 0; 7676 7677 for_each_process_thread(g, p) { 7678 if (rt_task(p) && task_group(p) == tg) 7679 return 1; 7680 } 7681 7682 return 0; 7683 } 7684 7685 struct rt_schedulable_data { 7686 struct task_group *tg; 7687 u64 rt_period; 7688 u64 rt_runtime; 7689 }; 7690 7691 static int tg_rt_schedulable(struct task_group *tg, void *data) 7692 { 7693 struct rt_schedulable_data *d = data; 7694 struct task_group *child; 7695 unsigned long total, sum = 0; 7696 u64 period, runtime; 7697 7698 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7699 runtime = tg->rt_bandwidth.rt_runtime; 7700 7701 if (tg == d->tg) { 7702 period = d->rt_period; 7703 runtime = d->rt_runtime; 7704 } 7705 7706 /* 7707 * Cannot have more runtime than the period. 7708 */ 7709 if (runtime > period && runtime != RUNTIME_INF) 7710 return -EINVAL; 7711 7712 /* 7713 * Ensure we don't starve existing RT tasks. 7714 */ 7715 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7716 return -EBUSY; 7717 7718 total = to_ratio(period, runtime); 7719 7720 /* 7721 * Nobody can have more than the global setting allows. 7722 */ 7723 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7724 return -EINVAL; 7725 7726 /* 7727 * The sum of our children's runtime should not exceed our own. 7728 */ 7729 list_for_each_entry_rcu(child, &tg->children, siblings) { 7730 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7731 runtime = child->rt_bandwidth.rt_runtime; 7732 7733 if (child == d->tg) { 7734 period = d->rt_period; 7735 runtime = d->rt_runtime; 7736 } 7737 7738 sum += to_ratio(period, runtime); 7739 } 7740 7741 if (sum > total) 7742 return -EINVAL; 7743 7744 return 0; 7745 } 7746 7747 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7748 { 7749 int ret; 7750 7751 struct rt_schedulable_data data = { 7752 .tg = tg, 7753 .rt_period = period, 7754 .rt_runtime = runtime, 7755 }; 7756 7757 rcu_read_lock(); 7758 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7759 rcu_read_unlock(); 7760 7761 return ret; 7762 } 7763 7764 static int tg_set_rt_bandwidth(struct task_group *tg, 7765 u64 rt_period, u64 rt_runtime) 7766 { 7767 int i, err = 0; 7768 7769 /* 7770 * Disallowing the root group RT runtime is BAD, it would disallow the 7771 * kernel creating (and or operating) RT threads. 7772 */ 7773 if (tg == &root_task_group && rt_runtime == 0) 7774 return -EINVAL; 7775 7776 /* No period doesn't make any sense. 
*/ 7777 if (rt_period == 0) 7778 return -EINVAL; 7779 7780 mutex_lock(&rt_constraints_mutex); 7781 read_lock(&tasklist_lock); 7782 err = __rt_schedulable(tg, rt_period, rt_runtime); 7783 if (err) 7784 goto unlock; 7785 7786 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7787 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7788 tg->rt_bandwidth.rt_runtime = rt_runtime; 7789 7790 for_each_possible_cpu(i) { 7791 struct rt_rq *rt_rq = tg->rt_rq[i]; 7792 7793 raw_spin_lock(&rt_rq->rt_runtime_lock); 7794 rt_rq->rt_runtime = rt_runtime; 7795 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7796 } 7797 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7798 unlock: 7799 read_unlock(&tasklist_lock); 7800 mutex_unlock(&rt_constraints_mutex); 7801 7802 return err; 7803 } 7804 7805 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7806 { 7807 u64 rt_runtime, rt_period; 7808 7809 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7810 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7811 if (rt_runtime_us < 0) 7812 rt_runtime = RUNTIME_INF; 7813 7814 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7815 } 7816 7817 static long sched_group_rt_runtime(struct task_group *tg) 7818 { 7819 u64 rt_runtime_us; 7820 7821 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7822 return -1; 7823 7824 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7825 do_div(rt_runtime_us, NSEC_PER_USEC); 7826 return rt_runtime_us; 7827 } 7828 7829 static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 7830 { 7831 u64 rt_runtime, rt_period; 7832 7833 rt_period = rt_period_us * NSEC_PER_USEC; 7834 rt_runtime = tg->rt_bandwidth.rt_runtime; 7835 7836 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7837 } 7838 7839 static long sched_group_rt_period(struct task_group *tg) 7840 { 7841 u64 rt_period_us; 7842 7843 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7844 do_div(rt_period_us, NSEC_PER_USEC); 7845 return rt_period_us; 7846 } 7847 #endif /* CONFIG_RT_GROUP_SCHED */ 7848 7849 #ifdef CONFIG_RT_GROUP_SCHED 7850 static int sched_rt_global_constraints(void) 7851 { 7852 int ret = 0; 7853 7854 mutex_lock(&rt_constraints_mutex); 7855 read_lock(&tasklist_lock); 7856 ret = __rt_schedulable(NULL, 0, 0); 7857 read_unlock(&tasklist_lock); 7858 mutex_unlock(&rt_constraints_mutex); 7859 7860 return ret; 7861 } 7862 7863 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7864 { 7865 /* Don't accept realtime tasks when there is no way for them to run */ 7866 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7867 return 0; 7868 7869 return 1; 7870 } 7871 7872 #else /* !CONFIG_RT_GROUP_SCHED */ 7873 static int sched_rt_global_constraints(void) 7874 { 7875 unsigned long flags; 7876 int i, ret = 0; 7877 7878 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7879 for_each_possible_cpu(i) { 7880 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7881 7882 raw_spin_lock(&rt_rq->rt_runtime_lock); 7883 rt_rq->rt_runtime = global_rt_runtime(); 7884 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7885 } 7886 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7887 7888 return ret; 7889 } 7890 #endif /* CONFIG_RT_GROUP_SCHED */ 7891 7892 static int sched_dl_global_validate(void) 7893 { 7894 u64 runtime = global_rt_runtime(); 7895 u64 period = global_rt_period(); 7896 u64 new_bw = to_ratio(period, runtime); 7897 struct dl_bw *dl_b; 7898 int cpu, ret = 0; 7899 unsigned long flags; 7900 7901 /* 7902 * Here we want to check the bandwidth 
not being set to some 7903 * value smaller than the currently allocated bandwidth in 7904 * any of the root_domains. 7905 * 7906 * FIXME: Cycling on all the CPUs is overdoing, but simpler than 7907 * cycling on root_domains... Discussion on different/better 7908 * solutions is welcome! 7909 */ 7910 for_each_possible_cpu(cpu) { 7911 rcu_read_lock_sched(); 7912 dl_b = dl_bw_of(cpu); 7913 7914 raw_spin_lock_irqsave(&dl_b->lock, flags); 7915 if (new_bw < dl_b->total_bw) 7916 ret = -EBUSY; 7917 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7918 7919 rcu_read_unlock_sched(); 7920 7921 if (ret) 7922 break; 7923 } 7924 7925 return ret; 7926 } 7927 7928 static void sched_dl_do_global(void) 7929 { 7930 u64 new_bw = -1; 7931 struct dl_bw *dl_b; 7932 int cpu; 7933 unsigned long flags; 7934 7935 def_dl_bandwidth.dl_period = global_rt_period(); 7936 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7937 7938 if (global_rt_runtime() != RUNTIME_INF) 7939 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7940 7941 /* 7942 * FIXME: As above... 7943 */ 7944 for_each_possible_cpu(cpu) { 7945 rcu_read_lock_sched(); 7946 dl_b = dl_bw_of(cpu); 7947 7948 raw_spin_lock_irqsave(&dl_b->lock, flags); 7949 dl_b->bw = new_bw; 7950 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7951 7952 rcu_read_unlock_sched(); 7953 } 7954 } 7955 7956 static int sched_rt_global_validate(void) 7957 { 7958 if (sysctl_sched_rt_period <= 0) 7959 return -EINVAL; 7960 7961 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7962 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7963 return -EINVAL; 7964 7965 return 0; 7966 } 7967 7968 static void sched_rt_do_global(void) 7969 { 7970 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7971 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7972 } 7973 7974 int sched_rt_handler(struct ctl_table *table, int write, 7975 void __user *buffer, size_t *lenp, 7976 loff_t *ppos) 7977 { 7978 int old_period, old_runtime; 7979 static DEFINE_MUTEX(mutex); 7980 int ret; 7981 7982 mutex_lock(&mutex); 7983 old_period = sysctl_sched_rt_period; 7984 old_runtime = sysctl_sched_rt_runtime; 7985 7986 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7987 7988 if (!ret && write) { 7989 ret = sched_rt_global_validate(); 7990 if (ret) 7991 goto undo; 7992 7993 ret = sched_dl_global_validate(); 7994 if (ret) 7995 goto undo; 7996 7997 ret = sched_rt_global_constraints(); 7998 if (ret) 7999 goto undo; 8000 8001 sched_rt_do_global(); 8002 sched_dl_do_global(); 8003 } 8004 if (0) { 8005 undo: 8006 sysctl_sched_rt_period = old_period; 8007 sysctl_sched_rt_runtime = old_runtime; 8008 } 8009 mutex_unlock(&mutex); 8010 8011 return ret; 8012 } 8013 8014 int sched_rr_handler(struct ctl_table *table, int write, 8015 void __user *buffer, size_t *lenp, 8016 loff_t *ppos) 8017 { 8018 int ret; 8019 static DEFINE_MUTEX(mutex); 8020 8021 mutex_lock(&mutex); 8022 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8023 /* make sure that internally we keep jiffies */ 8024 /* also, writing zero resets timeslice to default */ 8025 if (!ret && write) { 8026 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 8027 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 8028 } 8029 mutex_unlock(&mutex); 8030 return ret; 8031 } 8032 8033 #ifdef CONFIG_CGROUP_SCHED 8034 8035 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 8036 { 8037 return css ? 
container_of(css, struct task_group, css) : NULL;
8038 }
8039
8040 static struct cgroup_subsys_state *
8041 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8042 {
8043 struct task_group *parent = css_tg(parent_css);
8044 struct task_group *tg;
8045
8046 if (!parent) {
8047 /* This is early initialization for the top cgroup */
8048 return &root_task_group.css;
8049 }
8050
8051 tg = sched_create_group(parent);
8052 if (IS_ERR(tg))
8053 return ERR_PTR(-ENOMEM);
8054
8055 sched_online_group(tg, parent);
8056
8057 return &tg->css;
8058 }
8059
8060 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8061 {
8062 struct task_group *tg = css_tg(css);
8063
8064 sched_offline_group(tg);
8065 }
8066
8067 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8068 {
8069 struct task_group *tg = css_tg(css);
8070
8071 /*
8072 * Relies on the RCU grace period between css_released() and this.
8073 */
8074 sched_free_group(tg);
8075 }
8076
8077 static void cpu_cgroup_fork(struct task_struct *task)
8078 {
8079 sched_move_task(task);
8080 }
8081
8082 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8083 {
8084 struct task_struct *task;
8085 struct cgroup_subsys_state *css;
8086
8087 cgroup_taskset_for_each(task, css, tset) {
8088 #ifdef CONFIG_RT_GROUP_SCHED
8089 if (!sched_rt_can_attach(css_tg(css), task))
8090 return -EINVAL;
8091 #else
8092 /* We don't support RT-tasks being in separate groups */
8093 if (task->sched_class != &fair_sched_class)
8094 return -EINVAL;
8095 #endif
8096 }
8097 return 0;
8098 }
8099
8100 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8101 {
8102 struct task_struct *task;
8103 struct cgroup_subsys_state *css;
8104
8105 cgroup_taskset_for_each(task, css, tset)
8106 sched_move_task(task);
8107 }
8108
8109 #ifdef CONFIG_FAIR_GROUP_SCHED
8110 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8111 struct cftype *cftype, u64 shareval)
8112 {
8113 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8114 }
8115
8116 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8117 struct cftype *cft)
8118 {
8119 struct task_group *tg = css_tg(css);
8120
8121 return (u64) scale_load_down(tg->shares);
8122 }
8123
8124 #ifdef CONFIG_CFS_BANDWIDTH
8125 static DEFINE_MUTEX(cfs_constraints_mutex);
8126
8127 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
8128 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8129
8130 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8131
8132 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8133 {
8134 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8135 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8136
8137 if (tg == &root_task_group)
8138 return -EINVAL;
8139
8140 /*
8141 * Ensure we have some amount of bandwidth every period. This is
8142 * to prevent reaching a state of large arrears when throttled via
8143 * entity_tick() resulting in prolonged exit starvation.
8144 */
8145 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
8146 return -EINVAL;
8147
8148 /*
8149 * Likewise, bound things on the other side by preventing insane quota
8150 * periods. This also allows us to normalize in computing quota
8151 * feasibility.
8152 */
8153 if (period > max_cfs_quota_period)
8154 return -EINVAL;
8155
8156 /*
8157 * Prevent race between setting of cfs_rq->runtime_enabled and
8158 * unthrottle_offline_cfs_rqs().
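 *
 * Worked example of the quota/period semantics checked above (an
 * illustration, not from the original comment): quota = 50ms with
 * period = 100ms caps the group at roughly half of one CPU per period,
 * while quota = 200ms with period = 100ms allows up to two CPUs' worth
 * of runtime; a quota below 1ms or a period above 1s is rejected.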
8159 */ 8160 get_online_cpus(); 8161 mutex_lock(&cfs_constraints_mutex); 8162 ret = __cfs_schedulable(tg, period, quota); 8163 if (ret) 8164 goto out_unlock; 8165 8166 runtime_enabled = quota != RUNTIME_INF; 8167 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 8168 /* 8169 * If we need to toggle cfs_bandwidth_used, off->on must occur 8170 * before making related changes, and on->off must occur afterwards 8171 */ 8172 if (runtime_enabled && !runtime_was_enabled) 8173 cfs_bandwidth_usage_inc(); 8174 raw_spin_lock_irq(&cfs_b->lock); 8175 cfs_b->period = ns_to_ktime(period); 8176 cfs_b->quota = quota; 8177 8178 __refill_cfs_bandwidth_runtime(cfs_b); 8179 /* restart the period timer (if active) to handle new period expiry */ 8180 if (runtime_enabled) 8181 start_cfs_bandwidth(cfs_b); 8182 raw_spin_unlock_irq(&cfs_b->lock); 8183 8184 for_each_online_cpu(i) { 8185 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 8186 struct rq *rq = cfs_rq->rq; 8187 8188 raw_spin_lock_irq(&rq->lock); 8189 cfs_rq->runtime_enabled = runtime_enabled; 8190 cfs_rq->runtime_remaining = 0; 8191 8192 if (cfs_rq->throttled) 8193 unthrottle_cfs_rq(cfs_rq); 8194 raw_spin_unlock_irq(&rq->lock); 8195 } 8196 if (runtime_was_enabled && !runtime_enabled) 8197 cfs_bandwidth_usage_dec(); 8198 out_unlock: 8199 mutex_unlock(&cfs_constraints_mutex); 8200 put_online_cpus(); 8201 8202 return ret; 8203 } 8204 8205 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 8206 { 8207 u64 quota, period; 8208 8209 period = ktime_to_ns(tg->cfs_bandwidth.period); 8210 if (cfs_quota_us < 0) 8211 quota = RUNTIME_INF; 8212 else 8213 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 8214 8215 return tg_set_cfs_bandwidth(tg, period, quota); 8216 } 8217 8218 long tg_get_cfs_quota(struct task_group *tg) 8219 { 8220 u64 quota_us; 8221 8222 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 8223 return -1; 8224 8225 quota_us = tg->cfs_bandwidth.quota; 8226 do_div(quota_us, NSEC_PER_USEC); 8227 8228 return quota_us; 8229 } 8230 8231 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 8232 { 8233 u64 quota, period; 8234 8235 period = (u64)cfs_period_us * NSEC_PER_USEC; 8236 quota = tg->cfs_bandwidth.quota; 8237 8238 return tg_set_cfs_bandwidth(tg, period, quota); 8239 } 8240 8241 long tg_get_cfs_period(struct task_group *tg) 8242 { 8243 u64 cfs_period_us; 8244 8245 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 8246 do_div(cfs_period_us, NSEC_PER_USEC); 8247 8248 return cfs_period_us; 8249 } 8250 8251 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 8252 struct cftype *cft) 8253 { 8254 return tg_get_cfs_quota(css_tg(css)); 8255 } 8256 8257 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 8258 struct cftype *cftype, s64 cfs_quota_us) 8259 { 8260 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 8261 } 8262 8263 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 8264 struct cftype *cft) 8265 { 8266 return tg_get_cfs_period(css_tg(css)); 8267 } 8268 8269 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 8270 struct cftype *cftype, u64 cfs_period_us) 8271 { 8272 return tg_set_cfs_period(css_tg(css), cfs_period_us); 8273 } 8274 8275 struct cfs_schedulable_data { 8276 struct task_group *tg; 8277 u64 period, quota; 8278 }; 8279 8280 /* 8281 * normalize group quota/period to be quota/max_period 8282 * note: units are usecs 8283 */ 8284 static u64 normalize_cfs_quota(struct task_group *tg, 8285 struct cfs_schedulable_data *d) 8286 { 8287 u64 quota, period; 8288 8289 if (tg == d->tg) 
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set
		 */
		if (quota == RUNTIME_INF)
			quota = parent_quota;
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -EINVAL;
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int cpu_stats_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */

static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "shares",
		.read_u64 = cpu_shares_read_u64,
		.write_u64 = cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "cfs_quota_us",
		.read_s64 = cpu_cfs_quota_read_s64,
		.write_s64 = cpu_cfs_quota_write_s64,
	},
	{
		.name = "cfs_period_us",
		.read_u64 = cpu_cfs_period_read_u64,
		.write_u64 = cpu_cfs_period_write_u64,
	},
	{
		.name = "stat",
		.seq_show = cpu_stats_show,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name = "rt_runtime_us",
		.read_s64 = cpu_rt_runtime_read,
		.write_s64 = cpu_rt_runtime_write,
	},
	{
		.name = "rt_period_us",
		.read_u64 = cpu_rt_period_read_uint,
		.write_u64 = cpu_rt_period_write_uint,
	},
#endif
	{ }	/* terminate */
};

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_released	= cpu_cgroup_css_released,
	.css_free	= cpu_cgroup_css_free,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_files,
	.early_init	= true,
};

#endif	/* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage; if you go down 1 level,
 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25:
 * if one task goes up by ~10% and another goes down by ~10%, the
 * relative distance between them is ~25%.)
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetic by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
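
/*
 * Illustrative sketch, not part of the scheduler proper: the helper below
 * only demonstrates how the precalculated inverses in sched_prio_to_wmult[]
 * turn a division by a task's weight into a multiply-and-shift, as described
 * in the comment above. Its name is made up for this example, and it omits
 * the overflow handling of the real conversion, which lives in
 * kernel/sched/fair.c (__calc_delta()).
 *
 * The tables also encode the ~10%-per-nice-level rule: the weight at
 * nice -1 (1277) is roughly 1024 * 1.25, and the weight at nice +1 (820)
 * is roughly 1024 / 1.25.
 */
static inline u64 example_div_by_weight(u64 delta, int nice)
{
	/* nice levels -20..19 map to array indices 0..39 */
	u32 inv_weight = sched_prio_to_wmult[nice + 20];

	/*
	 * inv_weight ~= 2^32 / weight, so (delta * inv_weight) >> 32
	 * approximates delta / weight without a division. At nice 0,
	 * inv_weight is 4194304 == 2^32 / 1024, so the result is exactly
	 * delta / 1024.
	 */
	return (delta * inv_weight) >> 32;
}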