/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}
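
/*
 * Illustrative sketch (assumed caller pattern): update_rq_clock() is only
 * meaningful with rq->lock held, e.g.
 *
 *	raw_spin_lock(&rq->lock);
 *	update_rq_clock(rq);
 *	... read rq_clock(rq) / rq_clock_task(rq) ...
 *	raw_spin_unlock(&rq->lock);
 *
 * where rq_clock()/rq_clock_task() are the sched.h accessors for the
 * rq->clock and rq->clock_task values updated above.
 */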

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;
	struct inode *inode;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	/* Ensure the static_key remains in a consistent state */
	inode = file_inode(filp);
	mutex_lock(&inode->i_mutex);
	i = sched_feat_set(cmp);
	mutex_unlock(&inode->i_mutex);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */
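
/*
 * Illustrative usage sketch (assumed admin interaction): with
 * CONFIG_SCHED_DEBUG the feature bits above can be inspected and toggled
 * at runtime through debugfs, e.g.
 *
 *	cat /sys/kernel/debug/sched_features
 *	echo NO_TTWU_QUEUE > /sys/kernel/debug/sched_features
 *	echo TTWU_QUEUE    > /sys/kernel/debug/sched_features
 *
 * The "NO_" prefix is what sched_feat_set() parses; TTWU_QUEUE is just
 * one example feature name from features.h.
 */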

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, val)						\
({	typeof(*(ptr)) __old, __val = *(ptr);				\
	for (;;) {							\
		__old = cmpxchg((ptr), __val, __val | (val));		\
		if (__old == __val)					\
			break;						\
		__val = __old;						\
	}								\
	__old;								\
})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif
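
/*
 * wake_q: queue tasks for wakeup while holding a lock, then issue the
 * wakeups after the lock has been dropped.  Illustrative sketch of a
 * typical caller (assumed pattern, "lock" and "task" are placeholders):
 *
 *	WAKE_Q(wq);
 *
 *	raw_spin_lock(&lock);
 *	wake_q_add(&wq, task);
 *	raw_spin_unlock(&lock);
 *	wake_up_q(&wq);
 *
 * WAKE_Q() is assumed to be the on-stack initializer from linux/sched.h.
 */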

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_list().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* task can safely be re-inserted now */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	/*
	 * FIFO realtime policy runs the highest priority task. Other runnable
	 * tasks are of a lower priority. The scheduler tick does nothing.
	 */
	if (current->policy == SCHED_FIFO)
		return true;

	/*
	 * Round-robin realtime tasks time slice with other tasks at the same
	 * realtime priority. Is this task the only one at this priority?
	 */
	if (current->policy == SCHED_RR) {
		struct sched_rt_entity *rt_se = &current->rt;

		return rt_se->run_list.prev == rt_se->run_list.next;
	}

	/*
	 * More than one running task need preemption.
	 * nr_running update is assumed to be visible
	 * after IPI is sent from wakers.
	 */
	if (this_rq()->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}
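
/*
 * In other words: rq->rt_avg is halved once for every elapsed
 * sched_avg_period(), so it acts as an exponentially decaying accumulator
 * of RT and irq time with a half-life of one period.
 */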

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}
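
/*
 * For illustration (values taken from the prio_to_weight[] table in
 * sched.h): a nice-0 task gets weight 1024, nice 1 about 820 and nice -20
 * gets 88761, i.e. each nice level scales the CFS share by roughly 1.25x.
 */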

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight mis-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}
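
/*
 * For illustration of normal_prio() above: with MAX_RT_PRIO == 100, a
 * SCHED_FIFO task with rt_priority 10 ends up with p->prio == 100 - 1 - 10
 * == 89, while a SCHED_NORMAL task at nice 0 keeps p->prio ==
 * p->static_prio == 120; lower p->prio means higher effective priority.
 */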

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	dequeue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_MIGRATING;
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	p->on_rq = TASK_ON_RQ_QUEUED;
	enqueue_task(rq, p, 0);
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
	if (unlikely(!cpu_active(dest_cpu)))
		return rq;

	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		return rq;

	rq = move_queued_task(rq, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();

	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq && task_on_rq_queued(p))
		rq = __migrate_task(rq, p, arg->dest_cpu);
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	lockdep_assert_held(&p->pi_lock);
	p->sched_class->set_cpus_allowed(p, new_mask);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	unsigned long flags;
	struct rq *rq;
	unsigned int dest_cpu;
	int ret = 0;

	rq = task_rq_lock(p, &flags);

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &flags);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * OK, since we're going to drop the lock immediately
		 * afterwards anyway.
		 */
		lockdep_unpin_lock(&rq->lock);
		rq = move_queued_task(rq, p, dest_cpu);
		lockdep_pin_lock(&rq->lock);
	}
out:
	task_rq_unlock(rq, p, &flags);

	return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}
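
/*
 * Illustrative sketch of a typical set_cpus_allowed_ptr() caller
 * (placeholder names): pinning a task to a single CPU and letting the
 * migration machinery above move it if needed:
 *
 *	ret = set_cpus_allowed_ptr(task, cpumask_of(target_cpu));
 *	if (ret)
 *		... the new mask did not intersect cpu_active_mask ...
 */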

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);
	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous cpu our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);
	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = task_cpu(cur),
		.dst_task = p,
		.dst_cpu = task_cpu(p),
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, queued;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		queued = task_on_rq_queued(p);
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(queued)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}
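
/*
 * Illustrative sketch of the two-call pattern described above (assumed
 * caller, placeholder labels):
 *
 *	ncsw = wait_task_inactive(p, TASK_TRACED);
 *	... inspect p while it is presumed off its CPU ...
 *	if (!ncsw || wait_task_inactive(p, TASK_TRACED) != ncsw)
 *		goto retry;	(p ran or changed state in between)
 *
 * The switch-count cookie only matches if @p never scheduled in between.
 */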

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1)
		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 *   [ this allows ->select_task() to simply return task_cpu(p) and
	 *     not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}

#else

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 const struct cpumask *new_mask, bool check)
{
	return set_cpus_allowed_ptr(p, new_mask);
}

#endif /* CONFIG_SMP */

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = TASK_ON_RQ_QUEUED;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	check_preempt_curr(rq, p, wake_flags);
	p->state = TASK_RUNNING;
	trace_sched_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		lockdep_unpin_lock(&rq->lock);
		p->sched_class->task_woken(rq, p);
		lockdep_pin_lock(&rq->lock);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
	lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (task_on_rq_queued(p)) {
		/* check_preempt_curr() may use rq clock */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;
	unsigned long flags;

	if (!llist)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	lockdep_pin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	preempt_fold_need_resched();

	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
		if (!set_nr_if_polling(rq->idle))
			smp_send_reschedule(cpu);
		else
			trace_sched_wake_idle_without_ipi(cpu);
	}
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	rcu_read_lock();

	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	if (set_nr_if_polling(rq->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else cpu is not in idle, do nothing here */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

out:
	rcu_read_unlock();
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	lockdep_pin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock(&rq->lock);
}
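
/*
 * Illustrative sketch of the sleep/wakeup ordering try_to_wake_up()
 * relies on (assumed caller pattern):
 *
 *	waker				sleeper
 *	-----				-------
 *	CONDITION = 1;			set_current_state(TASK_UNINTERRUPTIBLE);
 *	try_to_wake_up(p, ...);		if (!CONDITION)
 *						schedule();
 *
 * The smp_mb__before_spinlock() below pairs with the barrier implied by
 * set_current_state(), so either the waker observes the sleeper's new
 * ->state or the sleeper observes CONDITION == 1 and does not sleep.
 */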

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running,
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	trace_sched_waking(p);

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until it's done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet picked a replacement task.
		 */
		lockdep_unpin_lock(&rq->lock);
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
		lockdep_pin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	trace_sched_waking(p);

	if (!task_on_rq_queued(p))
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
int wake_up_process(struct task_struct *p)
{
	WARN_ON(task_is_stopped_or_traced(p));
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * This function clears the sched_dl_entity static params.
 */
void __dl_clear_params(struct task_struct *p)
{
	struct sched_dl_entity *dl_se = &p->dl;

	dl_se->dl_runtime = 0;
	dl_se->dl_deadline = 0;
	dl_se->dl_period = 0;
	dl_se->flags = 0;
	dl_se->dl_bw = 0;

	dl_se->dl_throttled = 0;
	dl_se->dl_new = 1;
	dl_se->dl_yielded = 0;
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->on_rq = 0;

	p->se.on_rq = 0;
	p->se.exec_start = 0;
	p->se.sum_exec_runtime = 0;
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
	p->se.vruntime = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	__dl_clear_params(p);

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
		p->mm->numa_scan_seq = 0;
	}

	if (clone_flags & CLONE_VM)
		p->numa_preferred_nid = current->numa_preferred_nid;
	else
		p->numa_preferred_nid = -1;

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}

#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */

#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = numabalancing_enabled;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_numabalancing_state(state);
	return err;
}
#endif
#endif
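
/*
 * Illustrative usage sketch (assumed admin interaction): with
 * CONFIG_NUMA_BALANCING and CONFIG_PROC_SYSCTL the handler above is what
 * runs behind
 *
 *	echo 0 > /proc/sys/kernel/numa_balancing
 *	echo 1 > /proc/sys/kernel/numa_balancing
 *
 * which disables/enables automatic NUMA balancing at runtime; writes
 * require CAP_SYS_ADMIN.
 */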
2157 */ 2158 if (unlikely(p->sched_reset_on_fork)) { 2159 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2160 p->policy = SCHED_NORMAL; 2161 p->static_prio = NICE_TO_PRIO(0); 2162 p->rt_priority = 0; 2163 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2164 p->static_prio = NICE_TO_PRIO(0); 2165 2166 p->prio = p->normal_prio = __normal_prio(p); 2167 set_load_weight(p); 2168 2169 /* 2170 * We don't need the reset flag anymore after the fork. It has 2171 * fulfilled its duty: 2172 */ 2173 p->sched_reset_on_fork = 0; 2174 } 2175 2176 if (dl_prio(p->prio)) { 2177 put_cpu(); 2178 return -EAGAIN; 2179 } else if (rt_prio(p->prio)) { 2180 p->sched_class = &rt_sched_class; 2181 } else { 2182 p->sched_class = &fair_sched_class; 2183 } 2184 2185 if (p->sched_class->task_fork) 2186 p->sched_class->task_fork(p); 2187 2188 /* 2189 * The child is not yet in the pid-hash so no cgroup attach races, 2190 * and the cgroup is pinned to this child due to cgroup_fork() 2191 * is ran before sched_fork(). 2192 * 2193 * Silence PROVE_RCU. 2194 */ 2195 raw_spin_lock_irqsave(&p->pi_lock, flags); 2196 set_task_cpu(p, cpu); 2197 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2198 2199 #ifdef CONFIG_SCHED_INFO 2200 if (likely(sched_info_on())) 2201 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2202 #endif 2203 #if defined(CONFIG_SMP) 2204 p->on_cpu = 0; 2205 #endif 2206 init_task_preempt_count(p); 2207 #ifdef CONFIG_SMP 2208 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2209 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2210 #endif 2211 2212 put_cpu(); 2213 return 0; 2214 } 2215 2216 unsigned long to_ratio(u64 period, u64 runtime) 2217 { 2218 if (runtime == RUNTIME_INF) 2219 return 1ULL << 20; 2220 2221 /* 2222 * Doing this here saves a lot of checks in all 2223 * the calling paths, and returning zero seems 2224 * safe for them anyway. 2225 */ 2226 if (period == 0) 2227 return 0; 2228 2229 return div64_u64(runtime << 20, period); 2230 } 2231 2232 #ifdef CONFIG_SMP 2233 inline struct dl_bw *dl_bw_of(int i) 2234 { 2235 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2236 "sched RCU must be held"); 2237 return &cpu_rq(i)->rd->dl_bw; 2238 } 2239 2240 static inline int dl_bw_cpus(int i) 2241 { 2242 struct root_domain *rd = cpu_rq(i)->rd; 2243 int cpus = 0; 2244 2245 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2246 "sched RCU must be held"); 2247 for_each_cpu_and(i, rd->span, cpu_active_mask) 2248 cpus++; 2249 2250 return cpus; 2251 } 2252 #else 2253 inline struct dl_bw *dl_bw_of(int i) 2254 { 2255 return &cpu_rq(i)->dl.dl_bw; 2256 } 2257 2258 static inline int dl_bw_cpus(int i) 2259 { 2260 return 1; 2261 } 2262 #endif 2263 2264 /* 2265 * We must be sure that accepting a new task (or allowing changing the 2266 * parameters of an existing one) is consistent with the bandwidth 2267 * constraints. If yes, this function also accordingly updates the currently 2268 * allocated bandwidth to reflect the new situation. 2269 * 2270 * This function is called while holding p's rq->lock. 2271 * 2272 * XXX we should delay bw change until the task's 0-lag point, see 2273 * __setparam_dl(). 2274 */ 2275 static int dl_overflow(struct task_struct *p, int policy, 2276 const struct sched_attr *attr) 2277 { 2278 2279 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2280 u64 period = attr->sched_period ?: attr->sched_deadline; 2281 u64 runtime = attr->sched_runtime; 2282 u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; 2283 int cpus, err = -1; 2284 2285 if (new_bw == p->dl.dl_bw) 2286 return 0; 2287 2288 /* 2289 * Either if a task, enters, leave, or stays -deadline but changes 2290 * its parameters, we may need to update accordingly the total 2291 * allocated bandwidth of the container. 2292 */ 2293 raw_spin_lock(&dl_b->lock); 2294 cpus = dl_bw_cpus(task_cpu(p)); 2295 if (dl_policy(policy) && !task_has_dl_policy(p) && 2296 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2297 __dl_add(dl_b, new_bw); 2298 err = 0; 2299 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2300 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 2301 __dl_clear(dl_b, p->dl.dl_bw); 2302 __dl_add(dl_b, new_bw); 2303 err = 0; 2304 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 2305 __dl_clear(dl_b, p->dl.dl_bw); 2306 err = 0; 2307 } 2308 raw_spin_unlock(&dl_b->lock); 2309 2310 return err; 2311 } 2312 2313 extern void init_dl_bw(struct dl_bw *dl_b); 2314 2315 /* 2316 * wake_up_new_task - wake up a newly created task for the first time. 2317 * 2318 * This function will do some initial scheduler statistics housekeeping 2319 * that must be done for every newly created context, then puts the task 2320 * on the runqueue and wakes it. 2321 */ 2322 void wake_up_new_task(struct task_struct *p) 2323 { 2324 unsigned long flags; 2325 struct rq *rq; 2326 2327 raw_spin_lock_irqsave(&p->pi_lock, flags); 2328 #ifdef CONFIG_SMP 2329 /* 2330 * Fork balancing, do it here and not earlier because: 2331 * - cpus_allowed can change in the fork path 2332 * - any previously selected cpu might disappear through hotplug 2333 */ 2334 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2335 #endif 2336 2337 /* Initialize new task's runnable average */ 2338 init_entity_runnable_average(&p->se); 2339 rq = __task_rq_lock(p); 2340 activate_task(rq, p, 0); 2341 p->on_rq = TASK_ON_RQ_QUEUED; 2342 trace_sched_wakeup_new(p); 2343 check_preempt_curr(rq, p, WF_FORK); 2344 #ifdef CONFIG_SMP 2345 if (p->sched_class->task_woken) 2346 p->sched_class->task_woken(rq, p); 2347 #endif 2348 task_rq_unlock(rq, p, &flags); 2349 } 2350 2351 #ifdef CONFIG_PREEMPT_NOTIFIERS 2352 2353 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; 2354 2355 void preempt_notifier_inc(void) 2356 { 2357 static_key_slow_inc(&preempt_notifier_key); 2358 } 2359 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2360 2361 void preempt_notifier_dec(void) 2362 { 2363 static_key_slow_dec(&preempt_notifier_key); 2364 } 2365 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2366 2367 /** 2368 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2369 * @notifier: notifier struct to register 2370 */ 2371 void preempt_notifier_register(struct preempt_notifier *notifier) 2372 { 2373 if (!static_key_false(&preempt_notifier_key)) 2374 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2375 2376 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2377 } 2378 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2379 2380 /** 2381 * preempt_notifier_unregister - no longer interested in preemption notifications 2382 * @notifier: notifier struct to unregister 2383 * 2384 * This is *not* safe to call from within a preemption notifier. 
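 *
 * A minimal user of this API looks roughly like the sketch below; the
 * my_* names and callback bodies are illustrative only (KVM is the
 * in-tree user, with its own naming):
 *
 *      static void my_sched_in(struct preempt_notifier *pn, int cpu)
 *      { ... restore per-cpu state ... }
 *
 *      static void my_sched_out(struct preempt_notifier *pn,
 *                               struct task_struct *next)
 *      { ... save per-cpu state ... }
 *
 *      static struct preempt_notifier_ops my_ops = {
 *              .sched_in       = my_sched_in,
 *              .sched_out      = my_sched_out,
 *      };
 *
 *      preempt_notifier_inc();
 *      preempt_notifier_init(&my_notifier, &my_ops);
 *      preempt_notifier_register(&my_notifier);
 *
 * Registration affects only the current task, and should follow
 * preempt_notifier_inc() so the static key check above does not warn.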
2385 */ 2386 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2387 { 2388 hlist_del(¬ifier->link); 2389 } 2390 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2391 2392 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 2393 { 2394 struct preempt_notifier *notifier; 2395 2396 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2397 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2398 } 2399 2400 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2401 { 2402 if (static_key_false(&preempt_notifier_key)) 2403 __fire_sched_in_preempt_notifiers(curr); 2404 } 2405 2406 static void 2407 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 2408 struct task_struct *next) 2409 { 2410 struct preempt_notifier *notifier; 2411 2412 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2413 notifier->ops->sched_out(notifier, next); 2414 } 2415 2416 static __always_inline void 2417 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2418 struct task_struct *next) 2419 { 2420 if (static_key_false(&preempt_notifier_key)) 2421 __fire_sched_out_preempt_notifiers(curr, next); 2422 } 2423 2424 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2425 2426 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2427 { 2428 } 2429 2430 static inline void 2431 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2432 struct task_struct *next) 2433 { 2434 } 2435 2436 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2437 2438 /** 2439 * prepare_task_switch - prepare to switch tasks 2440 * @rq: the runqueue preparing to switch 2441 * @prev: the current task that is being switched out 2442 * @next: the task we are going to switch to. 2443 * 2444 * This is called with the rq lock held and interrupts off. It must 2445 * be paired with a subsequent finish_task_switch after the context 2446 * switch. 2447 * 2448 * prepare_task_switch sets up locking and calls architecture specific 2449 * hooks. 2450 */ 2451 static inline void 2452 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2453 struct task_struct *next) 2454 { 2455 trace_sched_switch(prev, next); 2456 sched_info_switch(rq, prev, next); 2457 perf_event_task_sched_out(prev, next); 2458 fire_sched_out_preempt_notifiers(prev, next); 2459 prepare_lock_switch(rq, next); 2460 prepare_arch_switch(next); 2461 } 2462 2463 /** 2464 * finish_task_switch - clean up after a task-switch 2465 * @prev: the thread we just switched away from. 2466 * 2467 * finish_task_switch must be called after the context switch, paired 2468 * with a prepare_task_switch call before the context switch. 2469 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2470 * and do any other architecture-specific cleanup actions. 2471 * 2472 * Note that we may have delayed dropping an mm in context_switch(). If 2473 * so, we finish that here outside of the runqueue lock. (Doing it 2474 * with the lock held can cause deadlocks; see schedule() for 2475 * details.) 2476 * 2477 * The context switch have flipped the stack from under us and restored the 2478 * local variables which were saved when this task called schedule() in the 2479 * past. prev == current is still correct but we need to recalculate this_rq 2480 * because prev may have moved to another CPU. 
2481 */ 2482 static struct rq *finish_task_switch(struct task_struct *prev) 2483 __releases(rq->lock) 2484 { 2485 struct rq *rq = this_rq(); 2486 struct mm_struct *mm = rq->prev_mm; 2487 long prev_state; 2488 2489 rq->prev_mm = NULL; 2490 2491 /* 2492 * A task struct has one reference for the use as "current". 2493 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2494 * schedule one last time. The schedule call will never return, and 2495 * the scheduled task must drop that reference. 2496 * The test for TASK_DEAD must occur while the runqueue locks are 2497 * still held, otherwise prev could be scheduled on another cpu, die 2498 * there before we look at prev->state, and then the reference would 2499 * be dropped twice. 2500 * Manfred Spraul <manfred@colorfullife.com> 2501 */ 2502 prev_state = prev->state; 2503 vtime_task_switch(prev); 2504 perf_event_task_sched_in(prev, current); 2505 finish_lock_switch(rq, prev); 2506 finish_arch_post_lock_switch(); 2507 2508 fire_sched_in_preempt_notifiers(current); 2509 if (mm) 2510 mmdrop(mm); 2511 if (unlikely(prev_state == TASK_DEAD)) { 2512 if (prev->sched_class->task_dead) 2513 prev->sched_class->task_dead(prev); 2514 2515 /* 2516 * Remove function-return probe instances associated with this 2517 * task and put them back on the free list. 2518 */ 2519 kprobe_flush_task(prev); 2520 put_task_struct(prev); 2521 } 2522 2523 tick_nohz_task_switch(current); 2524 return rq; 2525 } 2526 2527 #ifdef CONFIG_SMP 2528 2529 /* rq->lock is NOT held, but preemption is disabled */ 2530 static void __balance_callback(struct rq *rq) 2531 { 2532 struct callback_head *head, *next; 2533 void (*func)(struct rq *rq); 2534 unsigned long flags; 2535 2536 raw_spin_lock_irqsave(&rq->lock, flags); 2537 head = rq->balance_callback; 2538 rq->balance_callback = NULL; 2539 while (head) { 2540 func = (void (*)(struct rq *))head->func; 2541 next = head->next; 2542 head->next = NULL; 2543 head = next; 2544 2545 func(rq); 2546 } 2547 raw_spin_unlock_irqrestore(&rq->lock, flags); 2548 } 2549 2550 static inline void balance_callback(struct rq *rq) 2551 { 2552 if (unlikely(rq->balance_callback)) 2553 __balance_callback(rq); 2554 } 2555 2556 #else 2557 2558 static inline void balance_callback(struct rq *rq) 2559 { 2560 } 2561 2562 #endif 2563 2564 /** 2565 * schedule_tail - first thing a freshly forked thread must call. 2566 * @prev: the thread we just switched away from. 2567 */ 2568 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2569 __releases(rq->lock) 2570 { 2571 struct rq *rq; 2572 2573 /* finish_task_switch() drops rq->lock and enables preemtion */ 2574 preempt_disable(); 2575 rq = finish_task_switch(prev); 2576 balance_callback(rq); 2577 preempt_enable(); 2578 2579 if (current->set_child_tid) 2580 put_user(task_pid_vnr(current), current->set_child_tid); 2581 } 2582 2583 /* 2584 * context_switch - switch to the new MM and the new thread's register state. 2585 */ 2586 static inline struct rq * 2587 context_switch(struct rq *rq, struct task_struct *prev, 2588 struct task_struct *next) 2589 { 2590 struct mm_struct *mm, *oldmm; 2591 2592 prepare_task_switch(rq, prev, next); 2593 2594 mm = next->mm; 2595 oldmm = prev->active_mm; 2596 /* 2597 * For paravirt, this is coupled with an exit in switch_to to 2598 * combine the page table reload and the switch backend into 2599 * one hypercall. 
 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), we
	 * do an early lockdep release here:
	 */
	lockdep_unpin_lock(&rq->lock);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}

/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

/*
 * Check if only the current task is running on the cpu.
 */
bool single_task_running(void)
{
	if (cpu_rq(smp_processor_id())->nr_running == 1)
		return true;
	else
		return false;
}
EXPORT_SYMBOL(single_task_running);

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
	struct rq *rq = this_rq();
	*nr_waiters = atomic_read(&rq->nr_iowait);
	*load = rq->load.weight;
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that has not been accounted yet.
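 *
 * This is what ultimately backs the per-thread CPU-time clock, i.e. a
 * user-space reader doing (sketch):
 *
 *      struct timespec ts;
 *
 *      clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
 *
 * should see sum_exec_runtime plus whatever the running task has
 * accumulated since the last update_curr(), which is why the
 * still-queued-and-current case below forces a clock update before
 * reading - see the "breaking clock_gettime()" note in the function.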
2732 */ 2733 unsigned long long task_sched_runtime(struct task_struct *p) 2734 { 2735 unsigned long flags; 2736 struct rq *rq; 2737 u64 ns; 2738 2739 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2740 /* 2741 * 64-bit doesn't need locks to atomically read a 64bit value. 2742 * So we have a optimization chance when the task's delta_exec is 0. 2743 * Reading ->on_cpu is racy, but this is ok. 2744 * 2745 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2746 * If we race with it entering cpu, unaccounted time is 0. This is 2747 * indistinguishable from the read occurring a few cycles earlier. 2748 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2749 * been accounted, so we're correct here as well. 2750 */ 2751 if (!p->on_cpu || !task_on_rq_queued(p)) 2752 return p->se.sum_exec_runtime; 2753 #endif 2754 2755 rq = task_rq_lock(p, &flags); 2756 /* 2757 * Must be ->curr _and_ ->on_rq. If dequeued, we would 2758 * project cycles that may never be accounted to this 2759 * thread, breaking clock_gettime(). 2760 */ 2761 if (task_current(rq, p) && task_on_rq_queued(p)) { 2762 update_rq_clock(rq); 2763 p->sched_class->update_curr(rq); 2764 } 2765 ns = p->se.sum_exec_runtime; 2766 task_rq_unlock(rq, p, &flags); 2767 2768 return ns; 2769 } 2770 2771 /* 2772 * This function gets called by the timer code, with HZ frequency. 2773 * We call it with interrupts disabled. 2774 */ 2775 void scheduler_tick(void) 2776 { 2777 int cpu = smp_processor_id(); 2778 struct rq *rq = cpu_rq(cpu); 2779 struct task_struct *curr = rq->curr; 2780 2781 sched_clock_tick(); 2782 2783 raw_spin_lock(&rq->lock); 2784 update_rq_clock(rq); 2785 curr->sched_class->task_tick(rq, curr, 0); 2786 update_cpu_load_active(rq); 2787 calc_global_load_tick(rq); 2788 raw_spin_unlock(&rq->lock); 2789 2790 perf_event_task_tick(); 2791 2792 #ifdef CONFIG_SMP 2793 rq->idle_balance = idle_cpu(cpu); 2794 trigger_load_balance(rq); 2795 #endif 2796 rq_last_tick_reset(rq); 2797 } 2798 2799 #ifdef CONFIG_NO_HZ_FULL 2800 /** 2801 * scheduler_tick_max_deferment 2802 * 2803 * Keep at least one tick per second when a single 2804 * active task is running because the scheduler doesn't 2805 * yet completely support full dynticks environment. 2806 * 2807 * This makes sure that uptime, CFS vruntime, load 2808 * balancing, etc... continue to move forward, even 2809 * with a very low granularity. 2810 * 2811 * Return: Maximum deferment in nanoseconds. 2812 */ 2813 u64 scheduler_tick_max_deferment(void) 2814 { 2815 struct rq *rq = this_rq(); 2816 unsigned long next, now = READ_ONCE(jiffies); 2817 2818 next = rq->last_sched_tick + HZ; 2819 2820 if (time_before_eq(next, now)) 2821 return 0; 2822 2823 return jiffies_to_nsecs(next - now); 2824 } 2825 #endif 2826 2827 notrace unsigned long get_parent_ip(unsigned long addr) 2828 { 2829 if (in_lock_functions(addr)) { 2830 addr = CALLER_ADDR2; 2831 if (in_lock_functions(addr)) 2832 addr = CALLER_ADDR3; 2833 } 2834 return addr; 2835 } 2836 2837 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2838 defined(CONFIG_PREEMPT_TRACER)) 2839 2840 void preempt_count_add(int val) 2841 { 2842 #ifdef CONFIG_DEBUG_PREEMPT 2843 /* 2844 * Underflow? 2845 */ 2846 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2847 return; 2848 #endif 2849 __preempt_count_add(val); 2850 #ifdef CONFIG_DEBUG_PREEMPT 2851 /* 2852 * Spinlock count overflowing soon? 
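         * The low PREEMPT_MASK bits being checked here count
         * preempt_disable()/spin_lock() style nesting; each of those
         * reaches this function with val == 1, e.g. (sketch, with an
         * illustrative lock):
         *
         *      preempt_disable();              +1
         *      spin_lock(&some_lock);          +1
         *      ...
         *      spin_unlock(&some_lock);        -1 via preempt_count_sub()
         *      preempt_enable();               -1 via preempt_count_sub()
         *
         * so the DEBUG_LOCKS_WARN_ON() below fires once that nesting
         * depth gets within about 10 of PREEMPT_MASK, well before it can
         * wrap into the softirq bits.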
2853 */ 2854 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2855 PREEMPT_MASK - 10); 2856 #endif 2857 if (preempt_count() == val) { 2858 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2859 #ifdef CONFIG_DEBUG_PREEMPT 2860 current->preempt_disable_ip = ip; 2861 #endif 2862 trace_preempt_off(CALLER_ADDR0, ip); 2863 } 2864 } 2865 EXPORT_SYMBOL(preempt_count_add); 2866 NOKPROBE_SYMBOL(preempt_count_add); 2867 2868 void preempt_count_sub(int val) 2869 { 2870 #ifdef CONFIG_DEBUG_PREEMPT 2871 /* 2872 * Underflow? 2873 */ 2874 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2875 return; 2876 /* 2877 * Is the spinlock portion underflowing? 2878 */ 2879 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2880 !(preempt_count() & PREEMPT_MASK))) 2881 return; 2882 #endif 2883 2884 if (preempt_count() == val) 2885 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2886 __preempt_count_sub(val); 2887 } 2888 EXPORT_SYMBOL(preempt_count_sub); 2889 NOKPROBE_SYMBOL(preempt_count_sub); 2890 2891 #endif 2892 2893 /* 2894 * Print scheduling while atomic bug: 2895 */ 2896 static noinline void __schedule_bug(struct task_struct *prev) 2897 { 2898 if (oops_in_progress) 2899 return; 2900 2901 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2902 prev->comm, prev->pid, preempt_count()); 2903 2904 debug_show_held_locks(prev); 2905 print_modules(); 2906 if (irqs_disabled()) 2907 print_irqtrace_events(prev); 2908 #ifdef CONFIG_DEBUG_PREEMPT 2909 if (in_atomic_preempt_off()) { 2910 pr_err("Preemption disabled at:"); 2911 print_ip_sym(current->preempt_disable_ip); 2912 pr_cont("\n"); 2913 } 2914 #endif 2915 dump_stack(); 2916 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2917 } 2918 2919 /* 2920 * Various schedule()-time debugging checks and statistics: 2921 */ 2922 static inline void schedule_debug(struct task_struct *prev) 2923 { 2924 #ifdef CONFIG_SCHED_STACK_END_CHECK 2925 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2926 #endif 2927 /* 2928 * Test if we are atomic. Since do_exit() needs to call into 2929 * schedule() atomically, we ignore that path. Otherwise whine 2930 * if we are scheduling when we should not. 2931 */ 2932 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2933 __schedule_bug(prev); 2934 rcu_sleep_check(); 2935 2936 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2937 2938 schedstat_inc(this_rq(), sched_count); 2939 } 2940 2941 /* 2942 * Pick up the highest-prio task: 2943 */ 2944 static inline struct task_struct * 2945 pick_next_task(struct rq *rq, struct task_struct *prev) 2946 { 2947 const struct sched_class *class = &fair_sched_class; 2948 struct task_struct *p; 2949 2950 /* 2951 * Optimization: we know that if all tasks are in 2952 * the fair class we can call that function directly: 2953 */ 2954 if (likely(prev->sched_class == class && 2955 rq->nr_running == rq->cfs.h_nr_running)) { 2956 p = fair_sched_class.pick_next_task(rq, prev); 2957 if (unlikely(p == RETRY_TASK)) 2958 goto again; 2959 2960 /* assumes fair_sched_class->next == idle_sched_class */ 2961 if (unlikely(!p)) 2962 p = idle_sched_class.pick_next_task(rq, prev); 2963 2964 return p; 2965 } 2966 2967 again: 2968 for_each_class(class) { 2969 p = class->pick_next_task(rq, prev); 2970 if (p) { 2971 if (unlikely(p == RETRY_TASK)) 2972 goto again; 2973 return p; 2974 } 2975 } 2976 2977 BUG(); /* the idle class will always have a runnable task */ 2978 } 2979 2980 /* 2981 * __schedule() is the main scheduler function. 
2982 * 2983 * The main means of driving the scheduler and thus entering this function are: 2984 * 2985 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2986 * 2987 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2988 * paths. For example, see arch/x86/entry_64.S. 2989 * 2990 * To drive preemption between tasks, the scheduler sets the flag in timer 2991 * interrupt handler scheduler_tick(). 2992 * 2993 * 3. Wakeups don't really cause entry into schedule(). They add a 2994 * task to the run-queue and that's it. 2995 * 2996 * Now, if the new task added to the run-queue preempts the current 2997 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2998 * called on the nearest possible occasion: 2999 * 3000 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 3001 * 3002 * - in syscall or exception context, at the next outmost 3003 * preempt_enable(). (this might be as soon as the wake_up()'s 3004 * spin_unlock()!) 3005 * 3006 * - in IRQ context, return from interrupt-handler to 3007 * preemptible context 3008 * 3009 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 3010 * then at the next: 3011 * 3012 * - cond_resched() call 3013 * - explicit schedule() call 3014 * - return from syscall or exception to user-space 3015 * - return from interrupt-handler to user-space 3016 * 3017 * WARNING: must be called with preemption disabled! 3018 */ 3019 static void __sched __schedule(void) 3020 { 3021 struct task_struct *prev, *next; 3022 unsigned long *switch_count; 3023 struct rq *rq; 3024 int cpu; 3025 3026 cpu = smp_processor_id(); 3027 rq = cpu_rq(cpu); 3028 rcu_note_context_switch(); 3029 prev = rq->curr; 3030 3031 schedule_debug(prev); 3032 3033 if (sched_feat(HRTICK)) 3034 hrtick_clear(rq); 3035 3036 /* 3037 * Make sure that signal_pending_state()->signal_pending() below 3038 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3039 * done by the caller to avoid the race with signal_wake_up(). 3040 */ 3041 smp_mb__before_spinlock(); 3042 raw_spin_lock_irq(&rq->lock); 3043 lockdep_pin_lock(&rq->lock); 3044 3045 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3046 3047 switch_count = &prev->nivcsw; 3048 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3049 if (unlikely(signal_pending_state(prev->state, prev))) { 3050 prev->state = TASK_RUNNING; 3051 } else { 3052 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3053 prev->on_rq = 0; 3054 3055 /* 3056 * If a worker went to sleep, notify and ask workqueue 3057 * whether it wants to wake up a task to maintain 3058 * concurrency. 
3059 */ 3060 if (prev->flags & PF_WQ_WORKER) { 3061 struct task_struct *to_wakeup; 3062 3063 to_wakeup = wq_worker_sleeping(prev, cpu); 3064 if (to_wakeup) 3065 try_to_wake_up_local(to_wakeup); 3066 } 3067 } 3068 switch_count = &prev->nvcsw; 3069 } 3070 3071 if (task_on_rq_queued(prev)) 3072 update_rq_clock(rq); 3073 3074 next = pick_next_task(rq, prev); 3075 clear_tsk_need_resched(prev); 3076 clear_preempt_need_resched(); 3077 rq->clock_skip_update = 0; 3078 3079 if (likely(prev != next)) { 3080 rq->nr_switches++; 3081 rq->curr = next; 3082 ++*switch_count; 3083 3084 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3085 cpu = cpu_of(rq); 3086 } else { 3087 lockdep_unpin_lock(&rq->lock); 3088 raw_spin_unlock_irq(&rq->lock); 3089 } 3090 3091 balance_callback(rq); 3092 } 3093 3094 static inline void sched_submit_work(struct task_struct *tsk) 3095 { 3096 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3097 return; 3098 /* 3099 * If we are going to sleep and we have plugged IO queued, 3100 * make sure to submit it to avoid deadlocks. 3101 */ 3102 if (blk_needs_flush_plug(tsk)) 3103 blk_schedule_flush_plug(tsk); 3104 } 3105 3106 asmlinkage __visible void __sched schedule(void) 3107 { 3108 struct task_struct *tsk = current; 3109 3110 sched_submit_work(tsk); 3111 do { 3112 preempt_disable(); 3113 __schedule(); 3114 sched_preempt_enable_no_resched(); 3115 } while (need_resched()); 3116 } 3117 EXPORT_SYMBOL(schedule); 3118 3119 #ifdef CONFIG_CONTEXT_TRACKING 3120 asmlinkage __visible void __sched schedule_user(void) 3121 { 3122 /* 3123 * If we come here after a random call to set_need_resched(), 3124 * or we have been woken up remotely but the IPI has not yet arrived, 3125 * we haven't yet exited the RCU idle mode. Do it here manually until 3126 * we find a better solution. 3127 * 3128 * NB: There are buggy callers of this function. Ideally we 3129 * should warn if prev_state != CONTEXT_USER, but that will trigger 3130 * too frequently to make sense yet. 3131 */ 3132 enum ctx_state prev_state = exception_enter(); 3133 schedule(); 3134 exception_exit(prev_state); 3135 } 3136 #endif 3137 3138 /** 3139 * schedule_preempt_disabled - called with preemption disabled 3140 * 3141 * Returns with preemption disabled. Note: preempt_count must be 1 3142 */ 3143 void __sched schedule_preempt_disabled(void) 3144 { 3145 sched_preempt_enable_no_resched(); 3146 schedule(); 3147 preempt_disable(); 3148 } 3149 3150 static void __sched notrace preempt_schedule_common(void) 3151 { 3152 do { 3153 preempt_active_enter(); 3154 __schedule(); 3155 preempt_active_exit(); 3156 3157 /* 3158 * Check again in case we missed a preemption opportunity 3159 * between schedule and now. 3160 */ 3161 } while (need_resched()); 3162 } 3163 3164 #ifdef CONFIG_PREEMPT 3165 /* 3166 * this is the entry point to schedule() from in-kernel preemption 3167 * off of preempt_enable. Kernel preemptions off return from interrupt 3168 * occur there and call schedule directly. 3169 */ 3170 asmlinkage __visible void __sched notrace preempt_schedule(void) 3171 { 3172 /* 3173 * If there is a non-zero preempt_count or interrupts are disabled, 3174 * we do not want to preempt the current task. Just return.. 
3175 */ 3176 if (likely(!preemptible())) 3177 return; 3178 3179 preempt_schedule_common(); 3180 } 3181 NOKPROBE_SYMBOL(preempt_schedule); 3182 EXPORT_SYMBOL(preempt_schedule); 3183 3184 /** 3185 * preempt_schedule_notrace - preempt_schedule called by tracing 3186 * 3187 * The tracing infrastructure uses preempt_enable_notrace to prevent 3188 * recursion and tracing preempt enabling caused by the tracing 3189 * infrastructure itself. But as tracing can happen in areas coming 3190 * from userspace or just about to enter userspace, a preempt enable 3191 * can occur before user_exit() is called. This will cause the scheduler 3192 * to be called when the system is still in usermode. 3193 * 3194 * To prevent this, the preempt_enable_notrace will use this function 3195 * instead of preempt_schedule() to exit user context if needed before 3196 * calling the scheduler. 3197 */ 3198 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 3199 { 3200 enum ctx_state prev_ctx; 3201 3202 if (likely(!preemptible())) 3203 return; 3204 3205 do { 3206 /* 3207 * Use raw __prempt_count() ops that don't call function. 3208 * We can't call functions before disabling preemption which 3209 * disarm preemption tracing recursions. 3210 */ 3211 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); 3212 barrier(); 3213 /* 3214 * Needs preempt disabled in case user_exit() is traced 3215 * and the tracer calls preempt_enable_notrace() causing 3216 * an infinite recursion. 3217 */ 3218 prev_ctx = exception_enter(); 3219 __schedule(); 3220 exception_exit(prev_ctx); 3221 3222 barrier(); 3223 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); 3224 } while (need_resched()); 3225 } 3226 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3227 3228 #endif /* CONFIG_PREEMPT */ 3229 3230 /* 3231 * this is the entry point to schedule() from kernel preemption 3232 * off of irq context. 3233 * Note, that this is called and return with irqs disabled. This will 3234 * protect us against recursive calling from irq. 3235 */ 3236 asmlinkage __visible void __sched preempt_schedule_irq(void) 3237 { 3238 enum ctx_state prev_state; 3239 3240 /* Catch callers which need to be fixed */ 3241 BUG_ON(preempt_count() || !irqs_disabled()); 3242 3243 prev_state = exception_enter(); 3244 3245 do { 3246 preempt_active_enter(); 3247 local_irq_enable(); 3248 __schedule(); 3249 local_irq_disable(); 3250 preempt_active_exit(); 3251 } while (need_resched()); 3252 3253 exception_exit(prev_state); 3254 } 3255 3256 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3257 void *key) 3258 { 3259 return try_to_wake_up(curr->private, mode, wake_flags); 3260 } 3261 EXPORT_SYMBOL(default_wake_function); 3262 3263 #ifdef CONFIG_RT_MUTEXES 3264 3265 /* 3266 * rt_mutex_setprio - set the current priority of a task 3267 * @p: task 3268 * @prio: prio value (kernel-internal form) 3269 * 3270 * This function changes the 'effective' priority of a task. It does 3271 * not touch ->normal_prio like __setscheduler(). 3272 * 3273 * Used by the rt_mutex code to implement priority inheritance 3274 * logic. Call site only calls if the priority of the task changed. 3275 */ 3276 void rt_mutex_setprio(struct task_struct *p, int prio) 3277 { 3278 int oldprio, queued, running, enqueue_flag = 0; 3279 struct rq *rq; 3280 const struct sched_class *prev_class; 3281 3282 BUG_ON(prio > MAX_PRIO); 3283 3284 rq = __task_rq_lock(p); 3285 3286 /* 3287 * Idle task boosting is a nono in general. 
There is one
	 * exception, when PREEMPT_RT and NOHZ are active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, prio);
	oldprio = p->prio;
	prev_class = p->sched_class;
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, 0);
	if (running)
		put_prev_task(rq, p);

	/*
	 * Boosting conditions are:
	 * 1. -rt task is running and holds mutex A
	 *      --> -dl task blocks on mutex A
	 *
	 * 2. -dl task is running and holds mutex A
	 *      --> -dl task blocks on mutex A and could preempt the
	 *          running task
	 */
	if (dl_prio(prio)) {
		struct task_struct *pi_task = rt_mutex_get_top_task(p);
		if (!dl_prio(p->normal_prio) ||
		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
			p->dl.dl_boosted = 1;
			enqueue_flag = ENQUEUE_REPLENISH;
		} else
			p->dl.dl_boosted = 0;
		p->sched_class = &dl_sched_class;
	} else if (rt_prio(prio)) {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		if (oldprio < prio)
			enqueue_flag = ENQUEUE_HEAD;
		p->sched_class = &rt_sched_class;
	} else {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		if (rt_prio(oldprio))
			p->rt.timeout = 0;
		p->sched_class = &fair_sched_class;
	}

	p->prio = prio;

	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, enqueue_flag);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	preempt_disable(); /* avoid rq going away on us */
	__task_rq_unlock(rq);

	balance_callback(rq);
	preempt_enable();
}
#endif

void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, queued;
	unsigned long flags;
	struct rq *rq;

	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
		return;
	/*
	 * We have to be careful: if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
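	 *
	 * For the conversions used below: NICE_TO_PRIO(nice) is 120 + nice
	 * (with MAX_RT_PRIO == 100), so nice 0 maps to static_prio 120 and
	 * nice -20 to 100, while can_nice() compares nice_to_rlimit(nice)
	 * == 20 - nice (i.e. 1..40 for nice 19..-20) against RLIMIT_NICE;
	 * an RLIMIT_NICE of 25, for example, permits unprivileged renicing
	 * down to nice -5.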
3374 */ 3375 rq = task_rq_lock(p, &flags); 3376 /* 3377 * The RT priorities are set via sched_setscheduler(), but we still 3378 * allow the 'normal' nice value to be set - but as expected 3379 * it wont have any effect on scheduling until the task is 3380 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3381 */ 3382 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3383 p->static_prio = NICE_TO_PRIO(nice); 3384 goto out_unlock; 3385 } 3386 queued = task_on_rq_queued(p); 3387 if (queued) 3388 dequeue_task(rq, p, 0); 3389 3390 p->static_prio = NICE_TO_PRIO(nice); 3391 set_load_weight(p); 3392 old_prio = p->prio; 3393 p->prio = effective_prio(p); 3394 delta = p->prio - old_prio; 3395 3396 if (queued) { 3397 enqueue_task(rq, p, 0); 3398 /* 3399 * If the task increased its priority or is running and 3400 * lowered its priority, then reschedule its CPU: 3401 */ 3402 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3403 resched_curr(rq); 3404 } 3405 out_unlock: 3406 task_rq_unlock(rq, p, &flags); 3407 } 3408 EXPORT_SYMBOL(set_user_nice); 3409 3410 /* 3411 * can_nice - check if a task can reduce its nice value 3412 * @p: task 3413 * @nice: nice value 3414 */ 3415 int can_nice(const struct task_struct *p, const int nice) 3416 { 3417 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3418 int nice_rlim = nice_to_rlimit(nice); 3419 3420 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3421 capable(CAP_SYS_NICE)); 3422 } 3423 3424 #ifdef __ARCH_WANT_SYS_NICE 3425 3426 /* 3427 * sys_nice - change the priority of the current process. 3428 * @increment: priority increment 3429 * 3430 * sys_setpriority is a more generic, but much slower function that 3431 * does similar things. 3432 */ 3433 SYSCALL_DEFINE1(nice, int, increment) 3434 { 3435 long nice, retval; 3436 3437 /* 3438 * Setpriority might change our priority at the same moment. 3439 * We don't have to worry. Conceptually one call occurs first 3440 * and we have a single winner. 3441 */ 3442 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3443 nice = task_nice(current) + increment; 3444 3445 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3446 if (increment < 0 && !can_nice(current, nice)) 3447 return -EPERM; 3448 3449 retval = security_task_setnice(current, nice); 3450 if (retval) 3451 return retval; 3452 3453 set_user_nice(current, nice); 3454 return 0; 3455 } 3456 3457 #endif 3458 3459 /** 3460 * task_prio - return the priority value of a given task. 3461 * @p: the task in question. 3462 * 3463 * Return: The priority value as seen by users in /proc. 3464 * RT tasks are offset by -200. Normal tasks are centered 3465 * around 0, value goes from -16 to +15. 3466 */ 3467 int task_prio(const struct task_struct *p) 3468 { 3469 return p->prio - MAX_RT_PRIO; 3470 } 3471 3472 /** 3473 * idle_cpu - is a given cpu idle currently? 3474 * @cpu: the processor in question. 3475 * 3476 * Return: 1 if the CPU is currently idle. 0 otherwise. 3477 */ 3478 int idle_cpu(int cpu) 3479 { 3480 struct rq *rq = cpu_rq(cpu); 3481 3482 if (rq->curr != rq->idle) 3483 return 0; 3484 3485 if (rq->nr_running) 3486 return 0; 3487 3488 #ifdef CONFIG_SMP 3489 if (!llist_empty(&rq->wake_list)) 3490 return 0; 3491 #endif 3492 3493 return 1; 3494 } 3495 3496 /** 3497 * idle_task - return the idle task for a given cpu. 3498 * @cpu: the processor in question. 3499 * 3500 * Return: The idle task for the cpu @cpu. 
3501 */ 3502 struct task_struct *idle_task(int cpu) 3503 { 3504 return cpu_rq(cpu)->idle; 3505 } 3506 3507 /** 3508 * find_process_by_pid - find a process with a matching PID value. 3509 * @pid: the pid in question. 3510 * 3511 * The task of @pid, if found. %NULL otherwise. 3512 */ 3513 static struct task_struct *find_process_by_pid(pid_t pid) 3514 { 3515 return pid ? find_task_by_vpid(pid) : current; 3516 } 3517 3518 /* 3519 * This function initializes the sched_dl_entity of a newly becoming 3520 * SCHED_DEADLINE task. 3521 * 3522 * Only the static values are considered here, the actual runtime and the 3523 * absolute deadline will be properly calculated when the task is enqueued 3524 * for the first time with its new policy. 3525 */ 3526 static void 3527 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3528 { 3529 struct sched_dl_entity *dl_se = &p->dl; 3530 3531 dl_se->dl_runtime = attr->sched_runtime; 3532 dl_se->dl_deadline = attr->sched_deadline; 3533 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3534 dl_se->flags = attr->sched_flags; 3535 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3536 3537 /* 3538 * Changing the parameters of a task is 'tricky' and we're not doing 3539 * the correct thing -- also see task_dead_dl() and switched_from_dl(). 3540 * 3541 * What we SHOULD do is delay the bandwidth release until the 0-lag 3542 * point. This would include retaining the task_struct until that time 3543 * and change dl_overflow() to not immediately decrement the current 3544 * amount. 3545 * 3546 * Instead we retain the current runtime/deadline and let the new 3547 * parameters take effect after the current reservation period lapses. 3548 * This is safe (albeit pessimistic) because the 0-lag point is always 3549 * before the current scheduling deadline. 3550 * 3551 * We can still have temporary overloads because we do not delay the 3552 * change in bandwidth until that time; so admission control is 3553 * not on the safe side. It does however guarantee tasks will never 3554 * consume more than promised. 3555 */ 3556 } 3557 3558 /* 3559 * sched_setparam() passes in -1 for its policy, to let the functions 3560 * it calls know not to change it. 3561 */ 3562 #define SETPARAM_POLICY -1 3563 3564 static void __setscheduler_params(struct task_struct *p, 3565 const struct sched_attr *attr) 3566 { 3567 int policy = attr->sched_policy; 3568 3569 if (policy == SETPARAM_POLICY) 3570 policy = p->policy; 3571 3572 p->policy = policy; 3573 3574 if (dl_policy(policy)) 3575 __setparam_dl(p, attr); 3576 else if (fair_policy(policy)) 3577 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3578 3579 /* 3580 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3581 * !rt_policy. Always setting this ensures that things like 3582 * getparam()/getattr() don't report silly values for !rt tasks. 3583 */ 3584 p->rt_priority = attr->sched_priority; 3585 p->normal_prio = normal_prio(p); 3586 set_load_weight(p); 3587 } 3588 3589 /* Actually do priority change: must hold pi & rq lock. */ 3590 static void __setscheduler(struct rq *rq, struct task_struct *p, 3591 const struct sched_attr *attr, bool keep_boost) 3592 { 3593 __setscheduler_params(p, attr); 3594 3595 /* 3596 * Keep a potential priority boosting if called from 3597 * sched_setscheduler(). 
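	 *
	 * (For the dl_bw value computed in __setparam_dl() above: to_ratio()
	 * scales by 2^20, so e.g. sched_runtime = 10ms against a 100ms
	 * period gives (10^7 << 20) / 10^8 = 104857, i.e. roughly 10% of
	 * 1 << 20 - the fixed-point utilisation that dl_overflow() sums per
	 * root domain.)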
3598 */ 3599 if (keep_boost) 3600 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 3601 else 3602 p->prio = normal_prio(p); 3603 3604 if (dl_prio(p->prio)) 3605 p->sched_class = &dl_sched_class; 3606 else if (rt_prio(p->prio)) 3607 p->sched_class = &rt_sched_class; 3608 else 3609 p->sched_class = &fair_sched_class; 3610 } 3611 3612 static void 3613 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3614 { 3615 struct sched_dl_entity *dl_se = &p->dl; 3616 3617 attr->sched_priority = p->rt_priority; 3618 attr->sched_runtime = dl_se->dl_runtime; 3619 attr->sched_deadline = dl_se->dl_deadline; 3620 attr->sched_period = dl_se->dl_period; 3621 attr->sched_flags = dl_se->flags; 3622 } 3623 3624 /* 3625 * This function validates the new parameters of a -deadline task. 3626 * We ask for the deadline not being zero, and greater or equal 3627 * than the runtime, as well as the period of being zero or 3628 * greater than deadline. Furthermore, we have to be sure that 3629 * user parameters are above the internal resolution of 1us (we 3630 * check sched_runtime only since it is always the smaller one) and 3631 * below 2^63 ns (we have to check both sched_deadline and 3632 * sched_period, as the latter can be zero). 3633 */ 3634 static bool 3635 __checkparam_dl(const struct sched_attr *attr) 3636 { 3637 /* deadline != 0 */ 3638 if (attr->sched_deadline == 0) 3639 return false; 3640 3641 /* 3642 * Since we truncate DL_SCALE bits, make sure we're at least 3643 * that big. 3644 */ 3645 if (attr->sched_runtime < (1ULL << DL_SCALE)) 3646 return false; 3647 3648 /* 3649 * Since we use the MSB for wrap-around and sign issues, make 3650 * sure it's not set (mind that period can be equal to zero). 3651 */ 3652 if (attr->sched_deadline & (1ULL << 63) || 3653 attr->sched_period & (1ULL << 63)) 3654 return false; 3655 3656 /* runtime <= deadline <= period (if period != 0) */ 3657 if ((attr->sched_period != 0 && 3658 attr->sched_period < attr->sched_deadline) || 3659 attr->sched_deadline < attr->sched_runtime) 3660 return false; 3661 3662 return true; 3663 } 3664 3665 /* 3666 * check the target process has a UID that matches the current process's 3667 */ 3668 static bool check_same_owner(struct task_struct *p) 3669 { 3670 const struct cred *cred = current_cred(), *pcred; 3671 bool match; 3672 3673 rcu_read_lock(); 3674 pcred = __task_cred(p); 3675 match = (uid_eq(cred->euid, pcred->euid) || 3676 uid_eq(cred->euid, pcred->uid)); 3677 rcu_read_unlock(); 3678 return match; 3679 } 3680 3681 static bool dl_param_changed(struct task_struct *p, 3682 const struct sched_attr *attr) 3683 { 3684 struct sched_dl_entity *dl_se = &p->dl; 3685 3686 if (dl_se->dl_runtime != attr->sched_runtime || 3687 dl_se->dl_deadline != attr->sched_deadline || 3688 dl_se->dl_period != attr->sched_period || 3689 dl_se->flags != attr->sched_flags) 3690 return true; 3691 3692 return false; 3693 } 3694 3695 static int __sched_setscheduler(struct task_struct *p, 3696 const struct sched_attr *attr, 3697 bool user, bool pi) 3698 { 3699 int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : 3700 MAX_RT_PRIO - 1 - attr->sched_priority; 3701 int retval, oldprio, oldpolicy = -1, queued, running; 3702 int new_effective_prio, policy = attr->sched_policy; 3703 unsigned long flags; 3704 const struct sched_class *prev_class; 3705 struct rq *rq; 3706 int reset_on_fork; 3707 3708 /* may grab non-irq protected spin_locks */ 3709 BUG_ON(in_interrupt()); 3710 recheck: 3711 /* double check policy once rq lock held */ 3712 if (policy < 0) { 3713 reset_on_fork = p->sched_reset_on_fork; 3714 policy = oldpolicy = p->policy; 3715 } else { 3716 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3717 3718 if (policy != SCHED_DEADLINE && 3719 policy != SCHED_FIFO && policy != SCHED_RR && 3720 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3721 policy != SCHED_IDLE) 3722 return -EINVAL; 3723 } 3724 3725 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3726 return -EINVAL; 3727 3728 /* 3729 * Valid priorities for SCHED_FIFO and SCHED_RR are 3730 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3731 * SCHED_BATCH and SCHED_IDLE is 0. 3732 */ 3733 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3734 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3735 return -EINVAL; 3736 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3737 (rt_policy(policy) != (attr->sched_priority != 0))) 3738 return -EINVAL; 3739 3740 /* 3741 * Allow unprivileged RT tasks to decrease priority: 3742 */ 3743 if (user && !capable(CAP_SYS_NICE)) { 3744 if (fair_policy(policy)) { 3745 if (attr->sched_nice < task_nice(p) && 3746 !can_nice(p, attr->sched_nice)) 3747 return -EPERM; 3748 } 3749 3750 if (rt_policy(policy)) { 3751 unsigned long rlim_rtprio = 3752 task_rlimit(p, RLIMIT_RTPRIO); 3753 3754 /* can't set/change the rt policy */ 3755 if (policy != p->policy && !rlim_rtprio) 3756 return -EPERM; 3757 3758 /* can't increase priority */ 3759 if (attr->sched_priority > p->rt_priority && 3760 attr->sched_priority > rlim_rtprio) 3761 return -EPERM; 3762 } 3763 3764 /* 3765 * Can't set/change SCHED_DEADLINE policy at all for now 3766 * (safest behavior); in the future we would like to allow 3767 * unprivileged DL tasks to increase their relative deadline 3768 * or reduce their runtime (both ways reducing utilization) 3769 */ 3770 if (dl_policy(policy)) 3771 return -EPERM; 3772 3773 /* 3774 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3775 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3776 */ 3777 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3778 if (!can_nice(p, task_nice(p))) 3779 return -EPERM; 3780 } 3781 3782 /* can't change other user's priorities */ 3783 if (!check_same_owner(p)) 3784 return -EPERM; 3785 3786 /* Normal users shall not reset the sched_reset_on_fork flag */ 3787 if (p->sched_reset_on_fork && !reset_on_fork) 3788 return -EPERM; 3789 } 3790 3791 if (user) { 3792 retval = security_task_setscheduler(p); 3793 if (retval) 3794 return retval; 3795 } 3796 3797 /* 3798 * make sure no PI-waiters arrive (or leave) while we are 3799 * changing the priority of the task: 3800 * 3801 * To be able to change p->policy safely, the appropriate 3802 * runqueue lock must be held. 3803 */ 3804 rq = task_rq_lock(p, &flags); 3805 3806 /* 3807 * Changing the policy of the stop threads its a very bad idea 3808 */ 3809 if (p == rq->stop) { 3810 task_rq_unlock(rq, p, &flags); 3811 return -EINVAL; 3812 } 3813 3814 /* 3815 * If not changing anything there's no need to proceed further, 3816 * but store a possible modification of reset_on_fork. 
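	 *
	 * (The unprivileged-RT rules above boil down to RLIMIT_RTPRIO:
	 * without CAP_SYS_NICE a task may use SCHED_FIFO/SCHED_RR only up
	 * to that limit. A user-space sketch, with an arbitrary priority of
	 * 50 and the rlimit raised beforehand by a privileged parent or
	 * pam_limits:
	 *
	 *      struct sched_param sp = { .sched_priority = 50 };
	 *
	 *      sched_setscheduler(0, SCHED_FIFO, &sp);
	 *
	 * succeeds unprivileged as long as RLIMIT_RTPRIO >= 50.)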
3817 */ 3818 if (unlikely(policy == p->policy)) { 3819 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3820 goto change; 3821 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3822 goto change; 3823 if (dl_policy(policy) && dl_param_changed(p, attr)) 3824 goto change; 3825 3826 p->sched_reset_on_fork = reset_on_fork; 3827 task_rq_unlock(rq, p, &flags); 3828 return 0; 3829 } 3830 change: 3831 3832 if (user) { 3833 #ifdef CONFIG_RT_GROUP_SCHED 3834 /* 3835 * Do not allow realtime tasks into groups that have no runtime 3836 * assigned. 3837 */ 3838 if (rt_bandwidth_enabled() && rt_policy(policy) && 3839 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3840 !task_group_is_autogroup(task_group(p))) { 3841 task_rq_unlock(rq, p, &flags); 3842 return -EPERM; 3843 } 3844 #endif 3845 #ifdef CONFIG_SMP 3846 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3847 cpumask_t *span = rq->rd->span; 3848 3849 /* 3850 * Don't allow tasks with an affinity mask smaller than 3851 * the entire root_domain to become SCHED_DEADLINE. We 3852 * will also fail if there's no bandwidth available. 3853 */ 3854 if (!cpumask_subset(span, &p->cpus_allowed) || 3855 rq->rd->dl_bw.bw == 0) { 3856 task_rq_unlock(rq, p, &flags); 3857 return -EPERM; 3858 } 3859 } 3860 #endif 3861 } 3862 3863 /* recheck policy now with rq lock held */ 3864 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3865 policy = oldpolicy = -1; 3866 task_rq_unlock(rq, p, &flags); 3867 goto recheck; 3868 } 3869 3870 /* 3871 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3872 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3873 * is available. 3874 */ 3875 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3876 task_rq_unlock(rq, p, &flags); 3877 return -EBUSY; 3878 } 3879 3880 p->sched_reset_on_fork = reset_on_fork; 3881 oldprio = p->prio; 3882 3883 if (pi) { 3884 /* 3885 * Take priority boosted tasks into account. If the new 3886 * effective priority is unchanged, we just store the new 3887 * normal parameters and do not touch the scheduler class and 3888 * the runqueue. This will be done when the task deboost 3889 * itself. 3890 */ 3891 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 3892 if (new_effective_prio == oldprio) { 3893 __setscheduler_params(p, attr); 3894 task_rq_unlock(rq, p, &flags); 3895 return 0; 3896 } 3897 } 3898 3899 queued = task_on_rq_queued(p); 3900 running = task_current(rq, p); 3901 if (queued) 3902 dequeue_task(rq, p, 0); 3903 if (running) 3904 put_prev_task(rq, p); 3905 3906 prev_class = p->sched_class; 3907 __setscheduler(rq, p, attr, pi); 3908 3909 if (running) 3910 p->sched_class->set_curr_task(rq); 3911 if (queued) { 3912 /* 3913 * We enqueue to tail when the priority of a task is 3914 * increased (user space view). 3915 */ 3916 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3917 } 3918 3919 check_class_changed(rq, p, prev_class, oldprio); 3920 preempt_disable(); /* avoid rq from going away on us */ 3921 task_rq_unlock(rq, p, &flags); 3922 3923 if (pi) 3924 rt_mutex_adjust_pi(p); 3925 3926 /* 3927 * Run balance callbacks after we've adjusted the PI chain. 
3928 */ 3929 balance_callback(rq); 3930 preempt_enable(); 3931 3932 return 0; 3933 } 3934 3935 static int _sched_setscheduler(struct task_struct *p, int policy, 3936 const struct sched_param *param, bool check) 3937 { 3938 struct sched_attr attr = { 3939 .sched_policy = policy, 3940 .sched_priority = param->sched_priority, 3941 .sched_nice = PRIO_TO_NICE(p->static_prio), 3942 }; 3943 3944 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 3945 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 3946 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3947 policy &= ~SCHED_RESET_ON_FORK; 3948 attr.sched_policy = policy; 3949 } 3950 3951 return __sched_setscheduler(p, &attr, check, true); 3952 } 3953 /** 3954 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3955 * @p: the task in question. 3956 * @policy: new policy. 3957 * @param: structure containing the new RT priority. 3958 * 3959 * Return: 0 on success. An error code otherwise. 3960 * 3961 * NOTE that the task may be already dead. 3962 */ 3963 int sched_setscheduler(struct task_struct *p, int policy, 3964 const struct sched_param *param) 3965 { 3966 return _sched_setscheduler(p, policy, param, true); 3967 } 3968 EXPORT_SYMBOL_GPL(sched_setscheduler); 3969 3970 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3971 { 3972 return __sched_setscheduler(p, attr, true, true); 3973 } 3974 EXPORT_SYMBOL_GPL(sched_setattr); 3975 3976 /** 3977 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3978 * @p: the task in question. 3979 * @policy: new policy. 3980 * @param: structure containing the new RT priority. 3981 * 3982 * Just like sched_setscheduler, only don't bother checking if the 3983 * current context has permission. For example, this is needed in 3984 * stop_machine(): we create temporary high priority worker threads, 3985 * but our caller might not have that capability. 3986 * 3987 * Return: 0 on success. An error code otherwise. 3988 */ 3989 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3990 const struct sched_param *param) 3991 { 3992 return _sched_setscheduler(p, policy, param, false); 3993 } 3994 3995 static int 3996 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3997 { 3998 struct sched_param lparam; 3999 struct task_struct *p; 4000 int retval; 4001 4002 if (!param || pid < 0) 4003 return -EINVAL; 4004 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4005 return -EFAULT; 4006 4007 rcu_read_lock(); 4008 retval = -ESRCH; 4009 p = find_process_by_pid(pid); 4010 if (p != NULL) 4011 retval = sched_setscheduler(p, policy, &lparam); 4012 rcu_read_unlock(); 4013 4014 return retval; 4015 } 4016 4017 /* 4018 * Mimics kernel/events/core.c perf_copy_attr(). 4019 */ 4020 static int sched_copy_attr(struct sched_attr __user *uattr, 4021 struct sched_attr *attr) 4022 { 4023 u32 size; 4024 int ret; 4025 4026 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 4027 return -EFAULT; 4028 4029 /* 4030 * zero the full structure, so that a short copy will be nice. 
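	 *
	 * (The structure being copied in typically comes from a caller such
	 * as the user-space sketch below - values are illustrative, and
	 * syscall() is used directly since glibc has traditionally provided
	 * no wrapper:
	 *
	 *      struct sched_attr attr = {
	 *              .size           = sizeof(attr),
	 *              .sched_policy   = SCHED_DEADLINE,
	 *              .sched_runtime  =  10 * 1000 * 1000,
	 *              .sched_deadline =  30 * 1000 * 1000,
	 *              .sched_period   = 100 * 1000 * 1000,
	 *      };
	 *
	 *      syscall(__NR_sched_setattr, 0, &attr, 0);
	 *
	 * i.e. runtime 10ms <= deadline 30ms <= period 100ms, which is
	 * exactly what __checkparam_dl() insists on.)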
4031 */ 4032 memset(attr, 0, sizeof(*attr)); 4033 4034 ret = get_user(size, &uattr->size); 4035 if (ret) 4036 return ret; 4037 4038 if (size > PAGE_SIZE) /* silly large */ 4039 goto err_size; 4040 4041 if (!size) /* abi compat */ 4042 size = SCHED_ATTR_SIZE_VER0; 4043 4044 if (size < SCHED_ATTR_SIZE_VER0) 4045 goto err_size; 4046 4047 /* 4048 * If we're handed a bigger struct than we know of, 4049 * ensure all the unknown bits are 0 - i.e. new 4050 * user-space does not rely on any kernel feature 4051 * extensions we dont know about yet. 4052 */ 4053 if (size > sizeof(*attr)) { 4054 unsigned char __user *addr; 4055 unsigned char __user *end; 4056 unsigned char val; 4057 4058 addr = (void __user *)uattr + sizeof(*attr); 4059 end = (void __user *)uattr + size; 4060 4061 for (; addr < end; addr++) { 4062 ret = get_user(val, addr); 4063 if (ret) 4064 return ret; 4065 if (val) 4066 goto err_size; 4067 } 4068 size = sizeof(*attr); 4069 } 4070 4071 ret = copy_from_user(attr, uattr, size); 4072 if (ret) 4073 return -EFAULT; 4074 4075 /* 4076 * XXX: do we want to be lenient like existing syscalls; or do we want 4077 * to be strict and return an error on out-of-bounds values? 4078 */ 4079 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 4080 4081 return 0; 4082 4083 err_size: 4084 put_user(sizeof(*attr), &uattr->size); 4085 return -E2BIG; 4086 } 4087 4088 /** 4089 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4090 * @pid: the pid in question. 4091 * @policy: new policy. 4092 * @param: structure containing the new RT priority. 4093 * 4094 * Return: 0 on success. An error code otherwise. 4095 */ 4096 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4097 struct sched_param __user *, param) 4098 { 4099 /* negative values for policy are not valid */ 4100 if (policy < 0) 4101 return -EINVAL; 4102 4103 return do_sched_setscheduler(pid, policy, param); 4104 } 4105 4106 /** 4107 * sys_sched_setparam - set/change the RT priority of a thread 4108 * @pid: the pid in question. 4109 * @param: structure containing the new RT priority. 4110 * 4111 * Return: 0 on success. An error code otherwise. 4112 */ 4113 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4114 { 4115 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 4116 } 4117 4118 /** 4119 * sys_sched_setattr - same as above, but with extended sched_attr 4120 * @pid: the pid in question. 4121 * @uattr: structure containing the extended parameters. 4122 * @flags: for future extension. 4123 */ 4124 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 4125 unsigned int, flags) 4126 { 4127 struct sched_attr attr; 4128 struct task_struct *p; 4129 int retval; 4130 4131 if (!uattr || pid < 0 || flags) 4132 return -EINVAL; 4133 4134 retval = sched_copy_attr(uattr, &attr); 4135 if (retval) 4136 return retval; 4137 4138 if ((int)attr.sched_policy < 0) 4139 return -EINVAL; 4140 4141 rcu_read_lock(); 4142 retval = -ESRCH; 4143 p = find_process_by_pid(pid); 4144 if (p != NULL) 4145 retval = sched_setattr(p, &attr); 4146 rcu_read_unlock(); 4147 4148 return retval; 4149 } 4150 4151 /** 4152 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4153 * @pid: the pid in question. 4154 * 4155 * Return: On success, the policy of the thread. Otherwise, a negative error 4156 * code. 
4157 */ 4158 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4159 { 4160 struct task_struct *p; 4161 int retval; 4162 4163 if (pid < 0) 4164 return -EINVAL; 4165 4166 retval = -ESRCH; 4167 rcu_read_lock(); 4168 p = find_process_by_pid(pid); 4169 if (p) { 4170 retval = security_task_getscheduler(p); 4171 if (!retval) 4172 retval = p->policy 4173 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4174 } 4175 rcu_read_unlock(); 4176 return retval; 4177 } 4178 4179 /** 4180 * sys_sched_getparam - get the RT priority of a thread 4181 * @pid: the pid in question. 4182 * @param: structure containing the RT priority. 4183 * 4184 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 4185 * code. 4186 */ 4187 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4188 { 4189 struct sched_param lp = { .sched_priority = 0 }; 4190 struct task_struct *p; 4191 int retval; 4192 4193 if (!param || pid < 0) 4194 return -EINVAL; 4195 4196 rcu_read_lock(); 4197 p = find_process_by_pid(pid); 4198 retval = -ESRCH; 4199 if (!p) 4200 goto out_unlock; 4201 4202 retval = security_task_getscheduler(p); 4203 if (retval) 4204 goto out_unlock; 4205 4206 if (task_has_rt_policy(p)) 4207 lp.sched_priority = p->rt_priority; 4208 rcu_read_unlock(); 4209 4210 /* 4211 * This one might sleep, we cannot do it with a spinlock held ... 4212 */ 4213 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4214 4215 return retval; 4216 4217 out_unlock: 4218 rcu_read_unlock(); 4219 return retval; 4220 } 4221 4222 static int sched_read_attr(struct sched_attr __user *uattr, 4223 struct sched_attr *attr, 4224 unsigned int usize) 4225 { 4226 int ret; 4227 4228 if (!access_ok(VERIFY_WRITE, uattr, usize)) 4229 return -EFAULT; 4230 4231 /* 4232 * If we're handed a smaller struct than we know of, 4233 * ensure all the unknown bits are 0 - i.e. old 4234 * user-space does not get uncomplete information. 4235 */ 4236 if (usize < sizeof(*attr)) { 4237 unsigned char *addr; 4238 unsigned char *end; 4239 4240 addr = (void *)attr + usize; 4241 end = (void *)attr + sizeof(*attr); 4242 4243 for (; addr < end; addr++) { 4244 if (*addr) 4245 return -EFBIG; 4246 } 4247 4248 attr->size = usize; 4249 } 4250 4251 ret = copy_to_user(uattr, attr, attr->size); 4252 if (ret) 4253 return -EFAULT; 4254 4255 return 0; 4256 } 4257 4258 /** 4259 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 4260 * @pid: the pid in question. 4261 * @uattr: structure containing the extended parameters. 4262 * @size: sizeof(attr) for fwd/bwd comp. 4263 * @flags: for future extension. 
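 *
 * Return: 0 on success. An error code otherwise.
 *
 * A userspace caller without a libc wrapper might invoke this directly,
 * e.g. (sketch):
 *
 *	struct sched_attr attr;
 *
 *	syscall(__NR_sched_getattr, pid, &attr, sizeof(attr), 0);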
4264 */ 4265 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 4266 unsigned int, size, unsigned int, flags) 4267 { 4268 struct sched_attr attr = { 4269 .size = sizeof(struct sched_attr), 4270 }; 4271 struct task_struct *p; 4272 int retval; 4273 4274 if (!uattr || pid < 0 || size > PAGE_SIZE || 4275 size < SCHED_ATTR_SIZE_VER0 || flags) 4276 return -EINVAL; 4277 4278 rcu_read_lock(); 4279 p = find_process_by_pid(pid); 4280 retval = -ESRCH; 4281 if (!p) 4282 goto out_unlock; 4283 4284 retval = security_task_getscheduler(p); 4285 if (retval) 4286 goto out_unlock; 4287 4288 attr.sched_policy = p->policy; 4289 if (p->sched_reset_on_fork) 4290 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4291 if (task_has_dl_policy(p)) 4292 __getparam_dl(p, &attr); 4293 else if (task_has_rt_policy(p)) 4294 attr.sched_priority = p->rt_priority; 4295 else 4296 attr.sched_nice = task_nice(p); 4297 4298 rcu_read_unlock(); 4299 4300 retval = sched_read_attr(uattr, &attr, size); 4301 return retval; 4302 4303 out_unlock: 4304 rcu_read_unlock(); 4305 return retval; 4306 } 4307 4308 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4309 { 4310 cpumask_var_t cpus_allowed, new_mask; 4311 struct task_struct *p; 4312 int retval; 4313 4314 rcu_read_lock(); 4315 4316 p = find_process_by_pid(pid); 4317 if (!p) { 4318 rcu_read_unlock(); 4319 return -ESRCH; 4320 } 4321 4322 /* Prevent p going away */ 4323 get_task_struct(p); 4324 rcu_read_unlock(); 4325 4326 if (p->flags & PF_NO_SETAFFINITY) { 4327 retval = -EINVAL; 4328 goto out_put_task; 4329 } 4330 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4331 retval = -ENOMEM; 4332 goto out_put_task; 4333 } 4334 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4335 retval = -ENOMEM; 4336 goto out_free_cpus_allowed; 4337 } 4338 retval = -EPERM; 4339 if (!check_same_owner(p)) { 4340 rcu_read_lock(); 4341 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4342 rcu_read_unlock(); 4343 goto out_free_new_mask; 4344 } 4345 rcu_read_unlock(); 4346 } 4347 4348 retval = security_task_setscheduler(p); 4349 if (retval) 4350 goto out_free_new_mask; 4351 4352 4353 cpuset_cpus_allowed(p, cpus_allowed); 4354 cpumask_and(new_mask, in_mask, cpus_allowed); 4355 4356 /* 4357 * Since bandwidth control happens on root_domain basis, 4358 * if admission test is enabled, we only admit -deadline 4359 * tasks allowed to run on all the CPUs in the task's 4360 * root_domain. 4361 */ 4362 #ifdef CONFIG_SMP 4363 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4364 rcu_read_lock(); 4365 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4366 retval = -EBUSY; 4367 rcu_read_unlock(); 4368 goto out_free_new_mask; 4369 } 4370 rcu_read_unlock(); 4371 } 4372 #endif 4373 again: 4374 retval = __set_cpus_allowed_ptr(p, new_mask, true); 4375 4376 if (!retval) { 4377 cpuset_cpus_allowed(p, cpus_allowed); 4378 if (!cpumask_subset(new_mask, cpus_allowed)) { 4379 /* 4380 * We must have raced with a concurrent cpuset 4381 * update. 
Just reset the cpus_allowed to the 4382 * cpuset's cpus_allowed 4383 */ 4384 cpumask_copy(new_mask, cpus_allowed); 4385 goto again; 4386 } 4387 } 4388 out_free_new_mask: 4389 free_cpumask_var(new_mask); 4390 out_free_cpus_allowed: 4391 free_cpumask_var(cpus_allowed); 4392 out_put_task: 4393 put_task_struct(p); 4394 return retval; 4395 } 4396 4397 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4398 struct cpumask *new_mask) 4399 { 4400 if (len < cpumask_size()) 4401 cpumask_clear(new_mask); 4402 else if (len > cpumask_size()) 4403 len = cpumask_size(); 4404 4405 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4406 } 4407 4408 /** 4409 * sys_sched_setaffinity - set the cpu affinity of a process 4410 * @pid: pid of the process 4411 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4412 * @user_mask_ptr: user-space pointer to the new cpu mask 4413 * 4414 * Return: 0 on success. An error code otherwise. 4415 */ 4416 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4417 unsigned long __user *, user_mask_ptr) 4418 { 4419 cpumask_var_t new_mask; 4420 int retval; 4421 4422 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4423 return -ENOMEM; 4424 4425 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4426 if (retval == 0) 4427 retval = sched_setaffinity(pid, new_mask); 4428 free_cpumask_var(new_mask); 4429 return retval; 4430 } 4431 4432 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4433 { 4434 struct task_struct *p; 4435 unsigned long flags; 4436 int retval; 4437 4438 rcu_read_lock(); 4439 4440 retval = -ESRCH; 4441 p = find_process_by_pid(pid); 4442 if (!p) 4443 goto out_unlock; 4444 4445 retval = security_task_getscheduler(p); 4446 if (retval) 4447 goto out_unlock; 4448 4449 raw_spin_lock_irqsave(&p->pi_lock, flags); 4450 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4451 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4452 4453 out_unlock: 4454 rcu_read_unlock(); 4455 4456 return retval; 4457 } 4458 4459 /** 4460 * sys_sched_getaffinity - get the cpu affinity of a process 4461 * @pid: pid of the process 4462 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4463 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4464 * 4465 * Return: 0 on success. An error code otherwise. 4466 */ 4467 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4468 unsigned long __user *, user_mask_ptr) 4469 { 4470 int ret; 4471 cpumask_var_t mask; 4472 4473 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4474 return -EINVAL; 4475 if (len & (sizeof(unsigned long)-1)) 4476 return -EINVAL; 4477 4478 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4479 return -ENOMEM; 4480 4481 ret = sched_getaffinity(pid, mask); 4482 if (ret == 0) { 4483 size_t retlen = min_t(size_t, len, cpumask_size()); 4484 4485 if (copy_to_user(user_mask_ptr, mask, retlen)) 4486 ret = -EFAULT; 4487 else 4488 ret = retlen; 4489 } 4490 free_cpumask_var(mask); 4491 4492 return ret; 4493 } 4494 4495 /** 4496 * sys_sched_yield - yield the current processor to other threads. 4497 * 4498 * This function yields the current CPU to other tasks. If there are no 4499 * other threads running on this CPU then this function will return. 4500 * 4501 * Return: 0. 
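 *
 * This call cannot fail. See the yield() comment further down for why
 * open-coded sched_yield() polling loops are almost always a bug.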
4502 */ 4503 SYSCALL_DEFINE0(sched_yield) 4504 { 4505 struct rq *rq = this_rq_lock(); 4506 4507 schedstat_inc(rq, yld_count); 4508 current->sched_class->yield_task(rq); 4509 4510 /* 4511 * Since we are going to call schedule() anyway, there's 4512 * no need to preempt or enable interrupts: 4513 */ 4514 __release(rq->lock); 4515 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4516 do_raw_spin_unlock(&rq->lock); 4517 sched_preempt_enable_no_resched(); 4518 4519 schedule(); 4520 4521 return 0; 4522 } 4523 4524 int __sched _cond_resched(void) 4525 { 4526 if (should_resched(0)) { 4527 preempt_schedule_common(); 4528 return 1; 4529 } 4530 return 0; 4531 } 4532 EXPORT_SYMBOL(_cond_resched); 4533 4534 /* 4535 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4536 * call schedule, and on return reacquire the lock. 4537 * 4538 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4539 * operations here to prevent schedule() from being called twice (once via 4540 * spin_unlock(), once by hand). 4541 */ 4542 int __cond_resched_lock(spinlock_t *lock) 4543 { 4544 int resched = should_resched(PREEMPT_LOCK_OFFSET); 4545 int ret = 0; 4546 4547 lockdep_assert_held(lock); 4548 4549 if (spin_needbreak(lock) || resched) { 4550 spin_unlock(lock); 4551 if (resched) 4552 preempt_schedule_common(); 4553 else 4554 cpu_relax(); 4555 ret = 1; 4556 spin_lock(lock); 4557 } 4558 return ret; 4559 } 4560 EXPORT_SYMBOL(__cond_resched_lock); 4561 4562 int __sched __cond_resched_softirq(void) 4563 { 4564 BUG_ON(!in_softirq()); 4565 4566 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { 4567 local_bh_enable(); 4568 preempt_schedule_common(); 4569 local_bh_disable(); 4570 return 1; 4571 } 4572 return 0; 4573 } 4574 EXPORT_SYMBOL(__cond_resched_softirq); 4575 4576 /** 4577 * yield - yield the current processor to other threads. 4578 * 4579 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4580 * 4581 * The scheduler is at all times free to pick the calling task as the most 4582 * eligible task to run, if removing the yield() call from your code breaks 4583 * it, its already broken. 4584 * 4585 * Typical broken usage is: 4586 * 4587 * while (!event) 4588 * yield(); 4589 * 4590 * where one assumes that yield() will let 'the other' process run that will 4591 * make event true. If the current task is a SCHED_FIFO task that will never 4592 * happen. Never use yield() as a progress guarantee!! 4593 * 4594 * If you want to use yield() to wait for something, use wait_event(). 4595 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4596 * If you still want to use yield(), do not! 4597 */ 4598 void __sched yield(void) 4599 { 4600 set_current_state(TASK_RUNNING); 4601 sys_sched_yield(); 4602 } 4603 EXPORT_SYMBOL(yield); 4604 4605 /** 4606 * yield_to - yield the current processor to another thread in 4607 * your thread group, or accelerate that thread toward the 4608 * processor it's on. 4609 * @p: target task 4610 * @preempt: whether task preemption is allowed or not 4611 * 4612 * It's the caller's job to ensure that the target task struct 4613 * can't go away on us before we can do any checks. 4614 * 4615 * Return: 4616 * true (>0) if we indeed boosted the target task. 4617 * false (0) if we failed to boost the target. 4618 * -ESRCH if there's no task to yield to. 
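 *
 * A hypothetical caller would pin the target first, e.g.:
 *
 *	get_task_struct(p);
 *	yield_to(p, true);
 *	put_task_struct(p);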
4619 */ 4620 int __sched yield_to(struct task_struct *p, bool preempt) 4621 { 4622 struct task_struct *curr = current; 4623 struct rq *rq, *p_rq; 4624 unsigned long flags; 4625 int yielded = 0; 4626 4627 local_irq_save(flags); 4628 rq = this_rq(); 4629 4630 again: 4631 p_rq = task_rq(p); 4632 /* 4633 * If we're the only runnable task on the rq and target rq also 4634 * has only one task, there's absolutely no point in yielding. 4635 */ 4636 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4637 yielded = -ESRCH; 4638 goto out_irq; 4639 } 4640 4641 double_rq_lock(rq, p_rq); 4642 if (task_rq(p) != p_rq) { 4643 double_rq_unlock(rq, p_rq); 4644 goto again; 4645 } 4646 4647 if (!curr->sched_class->yield_to_task) 4648 goto out_unlock; 4649 4650 if (curr->sched_class != p->sched_class) 4651 goto out_unlock; 4652 4653 if (task_running(p_rq, p) || p->state) 4654 goto out_unlock; 4655 4656 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4657 if (yielded) { 4658 schedstat_inc(rq, yld_count); 4659 /* 4660 * Make p's CPU reschedule; pick_next_entity takes care of 4661 * fairness. 4662 */ 4663 if (preempt && rq != p_rq) 4664 resched_curr(p_rq); 4665 } 4666 4667 out_unlock: 4668 double_rq_unlock(rq, p_rq); 4669 out_irq: 4670 local_irq_restore(flags); 4671 4672 if (yielded > 0) 4673 schedule(); 4674 4675 return yielded; 4676 } 4677 EXPORT_SYMBOL_GPL(yield_to); 4678 4679 /* 4680 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4681 * that process accounting knows that this is a task in IO wait state. 4682 */ 4683 long __sched io_schedule_timeout(long timeout) 4684 { 4685 int old_iowait = current->in_iowait; 4686 struct rq *rq; 4687 long ret; 4688 4689 current->in_iowait = 1; 4690 blk_schedule_flush_plug(current); 4691 4692 delayacct_blkio_start(); 4693 rq = raw_rq(); 4694 atomic_inc(&rq->nr_iowait); 4695 ret = schedule_timeout(timeout); 4696 current->in_iowait = old_iowait; 4697 atomic_dec(&rq->nr_iowait); 4698 delayacct_blkio_end(); 4699 4700 return ret; 4701 } 4702 EXPORT_SYMBOL(io_schedule_timeout); 4703 4704 /** 4705 * sys_sched_get_priority_max - return maximum RT priority. 4706 * @policy: scheduling class. 4707 * 4708 * Return: On success, this syscall returns the maximum 4709 * rt_priority that can be used by a given scheduling class. 4710 * On failure, a negative error code is returned. 4711 */ 4712 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4713 { 4714 int ret = -EINVAL; 4715 4716 switch (policy) { 4717 case SCHED_FIFO: 4718 case SCHED_RR: 4719 ret = MAX_USER_RT_PRIO-1; 4720 break; 4721 case SCHED_DEADLINE: 4722 case SCHED_NORMAL: 4723 case SCHED_BATCH: 4724 case SCHED_IDLE: 4725 ret = 0; 4726 break; 4727 } 4728 return ret; 4729 } 4730 4731 /** 4732 * sys_sched_get_priority_min - return minimum RT priority. 4733 * @policy: scheduling class. 4734 * 4735 * Return: On success, this syscall returns the minimum 4736 * rt_priority that can be used by a given scheduling class. 4737 * On failure, a negative error code is returned. 4738 */ 4739 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4740 { 4741 int ret = -EINVAL; 4742 4743 switch (policy) { 4744 case SCHED_FIFO: 4745 case SCHED_RR: 4746 ret = 1; 4747 break; 4748 case SCHED_DEADLINE: 4749 case SCHED_NORMAL: 4750 case SCHED_BATCH: 4751 case SCHED_IDLE: 4752 ret = 0; 4753 } 4754 return ret; 4755 } 4756 4757 /** 4758 * sys_sched_rr_get_interval - return the default timeslice of a process. 4759 * @pid: pid of the process. 4760 * @interval: userspace pointer to the timeslice value. 
4761 * 4762 * this syscall writes the default timeslice value of a given process 4763 * into the user-space timespec buffer. A value of '0' means infinity. 4764 * 4765 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4766 * an error code. 4767 */ 4768 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4769 struct timespec __user *, interval) 4770 { 4771 struct task_struct *p; 4772 unsigned int time_slice; 4773 unsigned long flags; 4774 struct rq *rq; 4775 int retval; 4776 struct timespec t; 4777 4778 if (pid < 0) 4779 return -EINVAL; 4780 4781 retval = -ESRCH; 4782 rcu_read_lock(); 4783 p = find_process_by_pid(pid); 4784 if (!p) 4785 goto out_unlock; 4786 4787 retval = security_task_getscheduler(p); 4788 if (retval) 4789 goto out_unlock; 4790 4791 rq = task_rq_lock(p, &flags); 4792 time_slice = 0; 4793 if (p->sched_class->get_rr_interval) 4794 time_slice = p->sched_class->get_rr_interval(rq, p); 4795 task_rq_unlock(rq, p, &flags); 4796 4797 rcu_read_unlock(); 4798 jiffies_to_timespec(time_slice, &t); 4799 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4800 return retval; 4801 4802 out_unlock: 4803 rcu_read_unlock(); 4804 return retval; 4805 } 4806 4807 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4808 4809 void sched_show_task(struct task_struct *p) 4810 { 4811 unsigned long free = 0; 4812 int ppid; 4813 unsigned long state = p->state; 4814 4815 if (state) 4816 state = __ffs(state) + 1; 4817 printk(KERN_INFO "%-15.15s %c", p->comm, 4818 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4819 #if BITS_PER_LONG == 32 4820 if (state == TASK_RUNNING) 4821 printk(KERN_CONT " running "); 4822 else 4823 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4824 #else 4825 if (state == TASK_RUNNING) 4826 printk(KERN_CONT " running task "); 4827 else 4828 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4829 #endif 4830 #ifdef CONFIG_DEBUG_STACK_USAGE 4831 free = stack_not_used(p); 4832 #endif 4833 ppid = 0; 4834 rcu_read_lock(); 4835 if (pid_alive(p)) 4836 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4837 rcu_read_unlock(); 4838 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4839 task_pid_nr(p), ppid, 4840 (unsigned long)task_thread_info(p)->flags); 4841 4842 print_worker_info(KERN_INFO, p); 4843 show_stack(p, NULL); 4844 } 4845 4846 void show_state_filter(unsigned long state_filter) 4847 { 4848 struct task_struct *g, *p; 4849 4850 #if BITS_PER_LONG == 32 4851 printk(KERN_INFO 4852 " task PC stack pid father\n"); 4853 #else 4854 printk(KERN_INFO 4855 " task PC stack pid father\n"); 4856 #endif 4857 rcu_read_lock(); 4858 for_each_process_thread(g, p) { 4859 /* 4860 * reset the NMI-timeout, listing all files on a slow 4861 * console might take a lot of time: 4862 */ 4863 touch_nmi_watchdog(); 4864 if (!state_filter || (p->state & state_filter)) 4865 sched_show_task(p); 4866 } 4867 4868 touch_all_softlockup_watchdogs(); 4869 4870 #ifdef CONFIG_SCHED_DEBUG 4871 sysrq_sched_debug_show(); 4872 #endif 4873 rcu_read_unlock(); 4874 /* 4875 * Only show locks if all tasks are dumped: 4876 */ 4877 if (!state_filter) 4878 debug_show_all_locks(); 4879 } 4880 4881 void init_idle_bootup_task(struct task_struct *idle) 4882 { 4883 idle->sched_class = &idle_sched_class; 4884 } 4885 4886 /** 4887 * init_idle - set up an idle thread for a given CPU 4888 * @idle: task in question 4889 * @cpu: cpu the idle task belongs to 4890 * 4891 * NOTE: this function does not set the idle thread's NEED_RESCHED 4892 * flag, to make booting more robust. 
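 *
 * This is called for the boot CPU from sched_init() and for the other
 * CPUs' idle threads via fork_idle().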
4893 */ 4894 void init_idle(struct task_struct *idle, int cpu) 4895 { 4896 struct rq *rq = cpu_rq(cpu); 4897 unsigned long flags; 4898 4899 raw_spin_lock_irqsave(&idle->pi_lock, flags); 4900 raw_spin_lock(&rq->lock); 4901 4902 __sched_fork(0, idle); 4903 idle->state = TASK_RUNNING; 4904 idle->se.exec_start = sched_clock(); 4905 4906 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4907 /* 4908 * We're having a chicken and egg problem, even though we are 4909 * holding rq->lock, the cpu isn't yet set to this cpu so the 4910 * lockdep check in task_group() will fail. 4911 * 4912 * Similar case to sched_fork(). / Alternatively we could 4913 * use task_rq_lock() here and obtain the other rq->lock. 4914 * 4915 * Silence PROVE_RCU 4916 */ 4917 rcu_read_lock(); 4918 __set_task_cpu(idle, cpu); 4919 rcu_read_unlock(); 4920 4921 rq->curr = rq->idle = idle; 4922 idle->on_rq = TASK_ON_RQ_QUEUED; 4923 #if defined(CONFIG_SMP) 4924 idle->on_cpu = 1; 4925 #endif 4926 raw_spin_unlock(&rq->lock); 4927 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 4928 4929 /* Set the preempt count _outside_ the spinlocks! */ 4930 init_idle_preempt_count(idle, cpu); 4931 4932 /* 4933 * The idle tasks have their own, simple scheduling class: 4934 */ 4935 idle->sched_class = &idle_sched_class; 4936 ftrace_graph_init_idle_task(idle, cpu); 4937 vtime_init_idle(idle, cpu); 4938 #if defined(CONFIG_SMP) 4939 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4940 #endif 4941 } 4942 4943 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 4944 const struct cpumask *trial) 4945 { 4946 int ret = 1, trial_cpus; 4947 struct dl_bw *cur_dl_b; 4948 unsigned long flags; 4949 4950 if (!cpumask_weight(cur)) 4951 return ret; 4952 4953 rcu_read_lock_sched(); 4954 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4955 trial_cpus = cpumask_weight(trial); 4956 4957 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 4958 if (cur_dl_b->bw != -1 && 4959 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 4960 ret = 0; 4961 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 4962 rcu_read_unlock_sched(); 4963 4964 return ret; 4965 } 4966 4967 int task_can_attach(struct task_struct *p, 4968 const struct cpumask *cs_cpus_allowed) 4969 { 4970 int ret = 0; 4971 4972 /* 4973 * Kthreads which disallow setaffinity shouldn't be moved 4974 * to a new cpuset; we don't want to change their cpu 4975 * affinity and isolating such threads by their set of 4976 * allowed nodes is unnecessary. Thus, cpusets are not 4977 * applicable for such threads. This prevents checking for 4978 * success of set_cpus_allowed_ptr() on all attached tasks 4979 * before cpus_allowed may be changed. 4980 */ 4981 if (p->flags & PF_NO_SETAFFINITY) { 4982 ret = -EINVAL; 4983 goto out; 4984 } 4985 4986 #ifdef CONFIG_SMP 4987 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 4988 cs_cpus_allowed)) { 4989 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 4990 cs_cpus_allowed); 4991 struct dl_bw *dl_b; 4992 bool overflow; 4993 int cpus; 4994 unsigned long flags; 4995 4996 rcu_read_lock_sched(); 4997 dl_b = dl_bw_of(dest_cpu); 4998 raw_spin_lock_irqsave(&dl_b->lock, flags); 4999 cpus = dl_bw_cpus(dest_cpu); 5000 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 5001 if (overflow) 5002 ret = -EBUSY; 5003 else { 5004 /* 5005 * We reserve space for this task in the destination 5006 * root_domain, as we can't fail after this point. 5007 * We will free resources in the source root_domain 5008 * later on (see set_cpus_allowed_dl()). 
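 * The __dl_overflow() test above mirrors cpuset_cpumask_can_shrink():
 * the task is admitted only if the already reserved total_bw plus
 * p->dl.dl_bw still fits within dl_b->bw * cpus.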
5009 */ 5010 __dl_add(dl_b, p->dl.dl_bw); 5011 } 5012 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5013 rcu_read_unlock_sched(); 5014 5015 } 5016 #endif 5017 out: 5018 return ret; 5019 } 5020 5021 #ifdef CONFIG_SMP 5022 5023 #ifdef CONFIG_NUMA_BALANCING 5024 /* Migrate current task p to target_cpu */ 5025 int migrate_task_to(struct task_struct *p, int target_cpu) 5026 { 5027 struct migration_arg arg = { p, target_cpu }; 5028 int curr_cpu = task_cpu(p); 5029 5030 if (curr_cpu == target_cpu) 5031 return 0; 5032 5033 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 5034 return -EINVAL; 5035 5036 /* TODO: This is not properly updating schedstats */ 5037 5038 trace_sched_move_numa(p, curr_cpu, target_cpu); 5039 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 5040 } 5041 5042 /* 5043 * Requeue a task on a given node and accurately track the number of NUMA 5044 * tasks on the runqueues 5045 */ 5046 void sched_setnuma(struct task_struct *p, int nid) 5047 { 5048 struct rq *rq; 5049 unsigned long flags; 5050 bool queued, running; 5051 5052 rq = task_rq_lock(p, &flags); 5053 queued = task_on_rq_queued(p); 5054 running = task_current(rq, p); 5055 5056 if (queued) 5057 dequeue_task(rq, p, 0); 5058 if (running) 5059 put_prev_task(rq, p); 5060 5061 p->numa_preferred_nid = nid; 5062 5063 if (running) 5064 p->sched_class->set_curr_task(rq); 5065 if (queued) 5066 enqueue_task(rq, p, 0); 5067 task_rq_unlock(rq, p, &flags); 5068 } 5069 #endif /* CONFIG_NUMA_BALANCING */ 5070 5071 #ifdef CONFIG_HOTPLUG_CPU 5072 /* 5073 * Ensures that the idle task is using init_mm right before its cpu goes 5074 * offline. 5075 */ 5076 void idle_task_exit(void) 5077 { 5078 struct mm_struct *mm = current->active_mm; 5079 5080 BUG_ON(cpu_online(smp_processor_id())); 5081 5082 if (mm != &init_mm) { 5083 switch_mm(mm, &init_mm, current); 5084 finish_arch_post_lock_switch(); 5085 } 5086 mmdrop(mm); 5087 } 5088 5089 /* 5090 * Since this CPU is going 'away' for a while, fold any nr_active delta 5091 * we might have. Assumes we're called after migrate_tasks() so that the 5092 * nr_active count is stable. 5093 * 5094 * Also see the comment "Global load-average calculations". 5095 */ 5096 static void calc_load_migrate(struct rq *rq) 5097 { 5098 long delta = calc_load_fold_active(rq); 5099 if (delta) 5100 atomic_long_add(delta, &calc_load_tasks); 5101 } 5102 5103 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 5104 { 5105 } 5106 5107 static const struct sched_class fake_sched_class = { 5108 .put_prev_task = put_prev_task_fake, 5109 }; 5110 5111 static struct task_struct fake_task = { 5112 /* 5113 * Avoid pull_{rt,dl}_task() 5114 */ 5115 .prio = MAX_PRIO + 1, 5116 .sched_class = &fake_sched_class, 5117 }; 5118 5119 /* 5120 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5121 * try_to_wake_up()->select_task_rq(). 5122 * 5123 * Called with rq->lock held even though we'er in stop_machine() and 5124 * there's no concurrency possible, we hold the required locks anyway 5125 * because of lock validation efforts. 5126 */ 5127 static void migrate_tasks(struct rq *dead_rq) 5128 { 5129 struct rq *rq = dead_rq; 5130 struct task_struct *next, *stop = rq->stop; 5131 int dest_cpu; 5132 5133 /* 5134 * Fudge the rq selection such that the below task selection loop 5135 * doesn't get stuck on the currently eligible stop task. 
5136 * 5137 * We're currently inside stop_machine() and the rq is either stuck 5138 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5139 * either way we should never end up calling schedule() until we're 5140 * done here. 5141 */ 5142 rq->stop = NULL; 5143 5144 /* 5145 * put_prev_task() and pick_next_task() sched 5146 * class method both need to have an up-to-date 5147 * value of rq->clock[_task] 5148 */ 5149 update_rq_clock(rq); 5150 5151 for (;;) { 5152 /* 5153 * There's this thread running, bail when that's the only 5154 * remaining thread. 5155 */ 5156 if (rq->nr_running == 1) 5157 break; 5158 5159 /* 5160 * Ensure rq->lock covers the entire task selection 5161 * until the migration. 5162 */ 5163 lockdep_pin_lock(&rq->lock); 5164 next = pick_next_task(rq, &fake_task); 5165 BUG_ON(!next); 5166 next->sched_class->put_prev_task(rq, next); 5167 5168 /* Find suitable destination for @next, with force if needed. */ 5169 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5170 5171 lockdep_unpin_lock(&rq->lock); 5172 rq = __migrate_task(rq, next, dest_cpu); 5173 if (rq != dead_rq) { 5174 raw_spin_unlock(&rq->lock); 5175 rq = dead_rq; 5176 raw_spin_lock(&rq->lock); 5177 } 5178 } 5179 5180 rq->stop = stop; 5181 } 5182 #endif /* CONFIG_HOTPLUG_CPU */ 5183 5184 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5185 5186 static struct ctl_table sd_ctl_dir[] = { 5187 { 5188 .procname = "sched_domain", 5189 .mode = 0555, 5190 }, 5191 {} 5192 }; 5193 5194 static struct ctl_table sd_ctl_root[] = { 5195 { 5196 .procname = "kernel", 5197 .mode = 0555, 5198 .child = sd_ctl_dir, 5199 }, 5200 {} 5201 }; 5202 5203 static struct ctl_table *sd_alloc_ctl_entry(int n) 5204 { 5205 struct ctl_table *entry = 5206 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5207 5208 return entry; 5209 } 5210 5211 static void sd_free_ctl_entry(struct ctl_table **tablep) 5212 { 5213 struct ctl_table *entry; 5214 5215 /* 5216 * In the intermediate directories, both the child directory and 5217 * procname are dynamically allocated and could fail but the mode 5218 * will always be set. In the lowest directory the names are 5219 * static strings and all have proc handlers. 
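 * Hence the loop below walks entries while ->mode is set, recurses into
 * any ->child table, and only kfree()s the kstrdup()'d procnames, i.e.
 * those of entries without a proc_handler.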
5220 */ 5221 for (entry = *tablep; entry->mode; entry++) { 5222 if (entry->child) 5223 sd_free_ctl_entry(&entry->child); 5224 if (entry->proc_handler == NULL) 5225 kfree(entry->procname); 5226 } 5227 5228 kfree(*tablep); 5229 *tablep = NULL; 5230 } 5231 5232 static int min_load_idx = 0; 5233 static int max_load_idx = CPU_LOAD_IDX_MAX-1; 5234 5235 static void 5236 set_table_entry(struct ctl_table *entry, 5237 const char *procname, void *data, int maxlen, 5238 umode_t mode, proc_handler *proc_handler, 5239 bool load_idx) 5240 { 5241 entry->procname = procname; 5242 entry->data = data; 5243 entry->maxlen = maxlen; 5244 entry->mode = mode; 5245 entry->proc_handler = proc_handler; 5246 5247 if (load_idx) { 5248 entry->extra1 = &min_load_idx; 5249 entry->extra2 = &max_load_idx; 5250 } 5251 } 5252 5253 static struct ctl_table * 5254 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5255 { 5256 struct ctl_table *table = sd_alloc_ctl_entry(14); 5257 5258 if (table == NULL) 5259 return NULL; 5260 5261 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5262 sizeof(long), 0644, proc_doulongvec_minmax, false); 5263 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5264 sizeof(long), 0644, proc_doulongvec_minmax, false); 5265 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5266 sizeof(int), 0644, proc_dointvec_minmax, true); 5267 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5268 sizeof(int), 0644, proc_dointvec_minmax, true); 5269 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5270 sizeof(int), 0644, proc_dointvec_minmax, true); 5271 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5272 sizeof(int), 0644, proc_dointvec_minmax, true); 5273 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5274 sizeof(int), 0644, proc_dointvec_minmax, true); 5275 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5276 sizeof(int), 0644, proc_dointvec_minmax, false); 5277 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5278 sizeof(int), 0644, proc_dointvec_minmax, false); 5279 set_table_entry(&table[9], "cache_nice_tries", 5280 &sd->cache_nice_tries, 5281 sizeof(int), 0644, proc_dointvec_minmax, false); 5282 set_table_entry(&table[10], "flags", &sd->flags, 5283 sizeof(int), 0644, proc_dointvec_minmax, false); 5284 set_table_entry(&table[11], "max_newidle_lb_cost", 5285 &sd->max_newidle_lb_cost, 5286 sizeof(long), 0644, proc_doulongvec_minmax, false); 5287 set_table_entry(&table[12], "name", sd->name, 5288 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 5289 /* &table[13] is terminator */ 5290 5291 return table; 5292 } 5293 5294 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5295 { 5296 struct ctl_table *entry, *table; 5297 struct sched_domain *sd; 5298 int domain_num = 0, i; 5299 char buf[32]; 5300 5301 for_each_domain(cpu, sd) 5302 domain_num++; 5303 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5304 if (table == NULL) 5305 return NULL; 5306 5307 i = 0; 5308 for_each_domain(cpu, sd) { 5309 snprintf(buf, 32, "domain%d", i); 5310 entry->procname = kstrdup(buf, GFP_KERNEL); 5311 entry->mode = 0555; 5312 entry->child = sd_alloc_ctl_domain_table(sd); 5313 entry++; 5314 i++; 5315 } 5316 return table; 5317 } 5318 5319 static struct ctl_table_header *sd_sysctl_header; 5320 static void register_sched_domain_sysctl(void) 5321 { 5322 int i, cpu_num = num_possible_cpus(); 5323 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5324 char buf[32]; 5325 5326 WARN_ON(sd_ctl_dir[0].child); 5327 sd_ctl_dir[0].child = entry; 
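	/*
	 * One "cpuN" directory is created per possible CPU under
	 * /proc/sys/kernel/sched_domain/, each holding one "domainM"
	 * sub-table per sched_domain attached to that CPU.
	 */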
5328 5329 if (entry == NULL) 5330 return; 5331 5332 for_each_possible_cpu(i) { 5333 snprintf(buf, 32, "cpu%d", i); 5334 entry->procname = kstrdup(buf, GFP_KERNEL); 5335 entry->mode = 0555; 5336 entry->child = sd_alloc_ctl_cpu_table(i); 5337 entry++; 5338 } 5339 5340 WARN_ON(sd_sysctl_header); 5341 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5342 } 5343 5344 /* may be called multiple times per register */ 5345 static void unregister_sched_domain_sysctl(void) 5346 { 5347 unregister_sysctl_table(sd_sysctl_header); 5348 sd_sysctl_header = NULL; 5349 if (sd_ctl_dir[0].child) 5350 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5351 } 5352 #else 5353 static void register_sched_domain_sysctl(void) 5354 { 5355 } 5356 static void unregister_sched_domain_sysctl(void) 5357 { 5358 } 5359 #endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ 5360 5361 static void set_rq_online(struct rq *rq) 5362 { 5363 if (!rq->online) { 5364 const struct sched_class *class; 5365 5366 cpumask_set_cpu(rq->cpu, rq->rd->online); 5367 rq->online = 1; 5368 5369 for_each_class(class) { 5370 if (class->rq_online) 5371 class->rq_online(rq); 5372 } 5373 } 5374 } 5375 5376 static void set_rq_offline(struct rq *rq) 5377 { 5378 if (rq->online) { 5379 const struct sched_class *class; 5380 5381 for_each_class(class) { 5382 if (class->rq_offline) 5383 class->rq_offline(rq); 5384 } 5385 5386 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5387 rq->online = 0; 5388 } 5389 } 5390 5391 /* 5392 * migration_call - callback that gets triggered when a CPU is added. 5393 * Here we can start up the necessary migration thread for the new CPU. 5394 */ 5395 static int 5396 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5397 { 5398 int cpu = (long)hcpu; 5399 unsigned long flags; 5400 struct rq *rq = cpu_rq(cpu); 5401 5402 switch (action & ~CPU_TASKS_FROZEN) { 5403 5404 case CPU_UP_PREPARE: 5405 rq->calc_load_update = calc_load_update; 5406 break; 5407 5408 case CPU_ONLINE: 5409 /* Update our root-domain */ 5410 raw_spin_lock_irqsave(&rq->lock, flags); 5411 if (rq->rd) { 5412 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5413 5414 set_rq_online(rq); 5415 } 5416 raw_spin_unlock_irqrestore(&rq->lock, flags); 5417 break; 5418 5419 #ifdef CONFIG_HOTPLUG_CPU 5420 case CPU_DYING: 5421 sched_ttwu_pending(); 5422 /* Update our root-domain */ 5423 raw_spin_lock_irqsave(&rq->lock, flags); 5424 if (rq->rd) { 5425 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5426 set_rq_offline(rq); 5427 } 5428 migrate_tasks(rq); 5429 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5430 raw_spin_unlock_irqrestore(&rq->lock, flags); 5431 break; 5432 5433 case CPU_DEAD: 5434 calc_load_migrate(rq); 5435 break; 5436 #endif 5437 } 5438 5439 update_max_interval(); 5440 5441 return NOTIFY_OK; 5442 } 5443 5444 /* 5445 * Register at high priority so that task migration (migrate_all_tasks) 5446 * happens before everything else. This has to be lower priority than 5447 * the notifier in the perf_event subsystem, though. 
5448 */ 5449 static struct notifier_block migration_notifier = { 5450 .notifier_call = migration_call, 5451 .priority = CPU_PRI_MIGRATION, 5452 }; 5453 5454 static void set_cpu_rq_start_time(void) 5455 { 5456 int cpu = smp_processor_id(); 5457 struct rq *rq = cpu_rq(cpu); 5458 rq->age_stamp = sched_clock_cpu(cpu); 5459 } 5460 5461 static int sched_cpu_active(struct notifier_block *nfb, 5462 unsigned long action, void *hcpu) 5463 { 5464 switch (action & ~CPU_TASKS_FROZEN) { 5465 case CPU_STARTING: 5466 set_cpu_rq_start_time(); 5467 return NOTIFY_OK; 5468 case CPU_DOWN_FAILED: 5469 set_cpu_active((long)hcpu, true); 5470 return NOTIFY_OK; 5471 default: 5472 return NOTIFY_DONE; 5473 } 5474 } 5475 5476 static int sched_cpu_inactive(struct notifier_block *nfb, 5477 unsigned long action, void *hcpu) 5478 { 5479 switch (action & ~CPU_TASKS_FROZEN) { 5480 case CPU_DOWN_PREPARE: 5481 set_cpu_active((long)hcpu, false); 5482 return NOTIFY_OK; 5483 default: 5484 return NOTIFY_DONE; 5485 } 5486 } 5487 5488 static int __init migration_init(void) 5489 { 5490 void *cpu = (void *)(long)smp_processor_id(); 5491 int err; 5492 5493 /* Initialize migration for the boot CPU */ 5494 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5495 BUG_ON(err == NOTIFY_BAD); 5496 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5497 register_cpu_notifier(&migration_notifier); 5498 5499 /* Register cpu active notifiers */ 5500 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5501 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5502 5503 return 0; 5504 } 5505 early_initcall(migration_init); 5506 5507 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5508 5509 #ifdef CONFIG_SCHED_DEBUG 5510 5511 static __read_mostly int sched_debug_enabled; 5512 5513 static int __init sched_debug_setup(char *str) 5514 { 5515 sched_debug_enabled = 1; 5516 5517 return 0; 5518 } 5519 early_param("sched_debug", sched_debug_setup); 5520 5521 static inline bool sched_debug(void) 5522 { 5523 return sched_debug_enabled; 5524 } 5525 5526 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5527 struct cpumask *groupmask) 5528 { 5529 struct sched_group *group = sd->groups; 5530 5531 cpumask_clear(groupmask); 5532 5533 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5534 5535 if (!(sd->flags & SD_LOAD_BALANCE)) { 5536 printk("does not load-balance\n"); 5537 if (sd->parent) 5538 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5539 " has parent"); 5540 return -1; 5541 } 5542 5543 printk(KERN_CONT "span %*pbl level %s\n", 5544 cpumask_pr_args(sched_domain_span(sd)), sd->name); 5545 5546 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5547 printk(KERN_ERR "ERROR: domain->span does not contain " 5548 "CPU%d\n", cpu); 5549 } 5550 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5551 printk(KERN_ERR "ERROR: domain->groups does not contain" 5552 " CPU%d\n", cpu); 5553 } 5554 5555 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5556 do { 5557 if (!group) { 5558 printk("\n"); 5559 printk(KERN_ERR "ERROR: group is NULL\n"); 5560 break; 5561 } 5562 5563 if (!cpumask_weight(sched_group_cpus(group))) { 5564 printk(KERN_CONT "\n"); 5565 printk(KERN_ERR "ERROR: empty group\n"); 5566 break; 5567 } 5568 5569 if (!(sd->flags & SD_OVERLAP) && 5570 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5571 printk(KERN_CONT "\n"); 5572 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5573 break; 5574 } 5575 5576 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5577 5578 
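		/* Dump this group's span, and its capacity if it is not the default. */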
printk(KERN_CONT " %*pbl", 5579 cpumask_pr_args(sched_group_cpus(group))); 5580 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5581 printk(KERN_CONT " (cpu_capacity = %d)", 5582 group->sgc->capacity); 5583 } 5584 5585 group = group->next; 5586 } while (group != sd->groups); 5587 printk(KERN_CONT "\n"); 5588 5589 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5590 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5591 5592 if (sd->parent && 5593 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5594 printk(KERN_ERR "ERROR: parent span is not a superset " 5595 "of domain->span\n"); 5596 return 0; 5597 } 5598 5599 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5600 { 5601 int level = 0; 5602 5603 if (!sched_debug_enabled) 5604 return; 5605 5606 if (!sd) { 5607 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5608 return; 5609 } 5610 5611 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5612 5613 for (;;) { 5614 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5615 break; 5616 level++; 5617 sd = sd->parent; 5618 if (!sd) 5619 break; 5620 } 5621 } 5622 #else /* !CONFIG_SCHED_DEBUG */ 5623 # define sched_domain_debug(sd, cpu) do { } while (0) 5624 static inline bool sched_debug(void) 5625 { 5626 return false; 5627 } 5628 #endif /* CONFIG_SCHED_DEBUG */ 5629 5630 static int sd_degenerate(struct sched_domain *sd) 5631 { 5632 if (cpumask_weight(sched_domain_span(sd)) == 1) 5633 return 1; 5634 5635 /* Following flags need at least 2 groups */ 5636 if (sd->flags & (SD_LOAD_BALANCE | 5637 SD_BALANCE_NEWIDLE | 5638 SD_BALANCE_FORK | 5639 SD_BALANCE_EXEC | 5640 SD_SHARE_CPUCAPACITY | 5641 SD_SHARE_PKG_RESOURCES | 5642 SD_SHARE_POWERDOMAIN)) { 5643 if (sd->groups != sd->groups->next) 5644 return 0; 5645 } 5646 5647 /* Following flags don't use groups */ 5648 if (sd->flags & (SD_WAKE_AFFINE)) 5649 return 0; 5650 5651 return 1; 5652 } 5653 5654 static int 5655 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5656 { 5657 unsigned long cflags = sd->flags, pflags = parent->flags; 5658 5659 if (sd_degenerate(parent)) 5660 return 1; 5661 5662 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5663 return 0; 5664 5665 /* Flags needing groups don't count if only 1 group in parent */ 5666 if (parent->groups == parent->groups->next) { 5667 pflags &= ~(SD_LOAD_BALANCE | 5668 SD_BALANCE_NEWIDLE | 5669 SD_BALANCE_FORK | 5670 SD_BALANCE_EXEC | 5671 SD_SHARE_CPUCAPACITY | 5672 SD_SHARE_PKG_RESOURCES | 5673 SD_PREFER_SIBLING | 5674 SD_SHARE_POWERDOMAIN); 5675 if (nr_node_ids == 1) 5676 pflags &= ~SD_SERIALIZE; 5677 } 5678 if (~cflags & pflags) 5679 return 0; 5680 5681 return 1; 5682 } 5683 5684 static void free_rootdomain(struct rcu_head *rcu) 5685 { 5686 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5687 5688 cpupri_cleanup(&rd->cpupri); 5689 cpudl_cleanup(&rd->cpudl); 5690 free_cpumask_var(rd->dlo_mask); 5691 free_cpumask_var(rd->rto_mask); 5692 free_cpumask_var(rd->online); 5693 free_cpumask_var(rd->span); 5694 kfree(rd); 5695 } 5696 5697 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5698 { 5699 struct root_domain *old_rd = NULL; 5700 unsigned long flags; 5701 5702 raw_spin_lock_irqsave(&rq->lock, flags); 5703 5704 if (rq->rd) { 5705 old_rd = rq->rd; 5706 5707 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5708 set_rq_offline(rq); 5709 5710 cpumask_clear_cpu(rq->cpu, old_rd->span); 5711 5712 /* 5713 * If we dont want to free the old_rd yet 
then 5714 * set old_rd to NULL to skip the freeing later 5715 * in this function: 5716 */ 5717 if (!atomic_dec_and_test(&old_rd->refcount)) 5718 old_rd = NULL; 5719 } 5720 5721 atomic_inc(&rd->refcount); 5722 rq->rd = rd; 5723 5724 cpumask_set_cpu(rq->cpu, rd->span); 5725 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5726 set_rq_online(rq); 5727 5728 raw_spin_unlock_irqrestore(&rq->lock, flags); 5729 5730 if (old_rd) 5731 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5732 } 5733 5734 static int init_rootdomain(struct root_domain *rd) 5735 { 5736 memset(rd, 0, sizeof(*rd)); 5737 5738 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5739 goto out; 5740 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5741 goto free_span; 5742 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5743 goto free_online; 5744 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5745 goto free_dlo_mask; 5746 5747 init_dl_bw(&rd->dl_bw); 5748 if (cpudl_init(&rd->cpudl) != 0) 5749 goto free_dlo_mask; 5750 5751 if (cpupri_init(&rd->cpupri) != 0) 5752 goto free_rto_mask; 5753 return 0; 5754 5755 free_rto_mask: 5756 free_cpumask_var(rd->rto_mask); 5757 free_dlo_mask: 5758 free_cpumask_var(rd->dlo_mask); 5759 free_online: 5760 free_cpumask_var(rd->online); 5761 free_span: 5762 free_cpumask_var(rd->span); 5763 out: 5764 return -ENOMEM; 5765 } 5766 5767 /* 5768 * By default the system creates a single root-domain with all cpus as 5769 * members (mimicking the global state we have today). 5770 */ 5771 struct root_domain def_root_domain; 5772 5773 static void init_defrootdomain(void) 5774 { 5775 init_rootdomain(&def_root_domain); 5776 5777 atomic_set(&def_root_domain.refcount, 1); 5778 } 5779 5780 static struct root_domain *alloc_rootdomain(void) 5781 { 5782 struct root_domain *rd; 5783 5784 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5785 if (!rd) 5786 return NULL; 5787 5788 if (init_rootdomain(rd) != 0) { 5789 kfree(rd); 5790 return NULL; 5791 } 5792 5793 return rd; 5794 } 5795 5796 static void free_sched_groups(struct sched_group *sg, int free_sgc) 5797 { 5798 struct sched_group *tmp, *first; 5799 5800 if (!sg) 5801 return; 5802 5803 first = sg; 5804 do { 5805 tmp = sg->next; 5806 5807 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 5808 kfree(sg->sgc); 5809 5810 kfree(sg); 5811 sg = tmp; 5812 } while (sg != first); 5813 } 5814 5815 static void free_sched_domain(struct rcu_head *rcu) 5816 { 5817 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5818 5819 /* 5820 * If its an overlapping domain it has private groups, iterate and 5821 * nuke them all. 5822 */ 5823 if (sd->flags & SD_OVERLAP) { 5824 free_sched_groups(sd->groups, 1); 5825 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5826 kfree(sd->groups->sgc); 5827 kfree(sd->groups); 5828 } 5829 kfree(sd); 5830 } 5831 5832 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5833 { 5834 call_rcu(&sd->rcu, free_sched_domain); 5835 } 5836 5837 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5838 { 5839 for (; sd; sd = sd->parent) 5840 destroy_sched_domain(sd, cpu); 5841 } 5842 5843 /* 5844 * Keep a special pointer to the highest sched_domain that has 5845 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5846 * allows us to avoid some pointer chasing select_idle_sibling(). 5847 * 5848 * Also keep a unique ID per domain (we use the first cpu number in 5849 * the cpumask of the domain), this allows us to quickly tell if 5850 * two cpus are in the same cache domain, see cpus_share_cache(). 
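 * cpus_share_cache() then reduces to comparing the two cpus' sd_llc_id
 * values.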
5851 */ 5852 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5853 DEFINE_PER_CPU(int, sd_llc_size); 5854 DEFINE_PER_CPU(int, sd_llc_id); 5855 DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5856 DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5857 DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5858 5859 static void update_top_cache_domain(int cpu) 5860 { 5861 struct sched_domain *sd; 5862 struct sched_domain *busy_sd = NULL; 5863 int id = cpu; 5864 int size = 1; 5865 5866 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5867 if (sd) { 5868 id = cpumask_first(sched_domain_span(sd)); 5869 size = cpumask_weight(sched_domain_span(sd)); 5870 busy_sd = sd->parent; /* sd_busy */ 5871 } 5872 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5873 5874 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5875 per_cpu(sd_llc_size, cpu) = size; 5876 per_cpu(sd_llc_id, cpu) = id; 5877 5878 sd = lowest_flag_domain(cpu, SD_NUMA); 5879 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5880 5881 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5882 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5883 } 5884 5885 /* 5886 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5887 * hold the hotplug lock. 5888 */ 5889 static void 5890 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5891 { 5892 struct rq *rq = cpu_rq(cpu); 5893 struct sched_domain *tmp; 5894 5895 /* Remove the sched domains which do not contribute to scheduling. */ 5896 for (tmp = sd; tmp; ) { 5897 struct sched_domain *parent = tmp->parent; 5898 if (!parent) 5899 break; 5900 5901 if (sd_parent_degenerate(tmp, parent)) { 5902 tmp->parent = parent->parent; 5903 if (parent->parent) 5904 parent->parent->child = tmp; 5905 /* 5906 * Transfer SD_PREFER_SIBLING down in case of a 5907 * degenerate parent; the spans match for this 5908 * so the property transfers. 5909 */ 5910 if (parent->flags & SD_PREFER_SIBLING) 5911 tmp->flags |= SD_PREFER_SIBLING; 5912 destroy_sched_domain(parent, cpu); 5913 } else 5914 tmp = tmp->parent; 5915 } 5916 5917 if (sd && sd_degenerate(sd)) { 5918 tmp = sd; 5919 sd = sd->parent; 5920 destroy_sched_domain(tmp, cpu); 5921 if (sd) 5922 sd->child = NULL; 5923 } 5924 5925 sched_domain_debug(sd, cpu); 5926 5927 rq_attach_root(rq, rd); 5928 tmp = rq->sd; 5929 rcu_assign_pointer(rq->sd, sd); 5930 destroy_sched_domains(tmp, cpu); 5931 5932 update_top_cache_domain(cpu); 5933 } 5934 5935 /* Setup the mask of cpus configured for isolated domains */ 5936 static int __init isolated_cpu_setup(char *str) 5937 { 5938 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5939 cpulist_parse(str, cpu_isolated_map); 5940 return 1; 5941 } 5942 5943 __setup("isolcpus=", isolated_cpu_setup); 5944 5945 struct s_data { 5946 struct sched_domain ** __percpu sd; 5947 struct root_domain *rd; 5948 }; 5949 5950 enum s_alloc { 5951 sa_rootdomain, 5952 sa_sd, 5953 sa_sd_storage, 5954 sa_none, 5955 }; 5956 5957 /* 5958 * Build an iteration mask that can exclude certain CPUs from the upwards 5959 * domain traversal. 5960 * 5961 * Asymmetric node setups can result in situations where the domain tree is of 5962 * unequal depth, make sure to skip domains that already cover the entire 5963 * range. 5964 * 5965 * In that case build_sched_domains() will have terminated the iteration early 5966 * and our sibling sd spans will be empty. Domains should always include the 5967 * cpu they're built on, so check that. 
5968 * 5969 */ 5970 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5971 { 5972 const struct cpumask *span = sched_domain_span(sd); 5973 struct sd_data *sdd = sd->private; 5974 struct sched_domain *sibling; 5975 int i; 5976 5977 for_each_cpu(i, span) { 5978 sibling = *per_cpu_ptr(sdd->sd, i); 5979 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5980 continue; 5981 5982 cpumask_set_cpu(i, sched_group_mask(sg)); 5983 } 5984 } 5985 5986 /* 5987 * Return the canonical balance cpu for this group, this is the first cpu 5988 * of this group that's also in the iteration mask. 5989 */ 5990 int group_balance_cpu(struct sched_group *sg) 5991 { 5992 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5993 } 5994 5995 static int 5996 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5997 { 5998 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5999 const struct cpumask *span = sched_domain_span(sd); 6000 struct cpumask *covered = sched_domains_tmpmask; 6001 struct sd_data *sdd = sd->private; 6002 struct sched_domain *sibling; 6003 int i; 6004 6005 cpumask_clear(covered); 6006 6007 for_each_cpu(i, span) { 6008 struct cpumask *sg_span; 6009 6010 if (cpumask_test_cpu(i, covered)) 6011 continue; 6012 6013 sibling = *per_cpu_ptr(sdd->sd, i); 6014 6015 /* See the comment near build_group_mask(). */ 6016 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 6017 continue; 6018 6019 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6020 GFP_KERNEL, cpu_to_node(cpu)); 6021 6022 if (!sg) 6023 goto fail; 6024 6025 sg_span = sched_group_cpus(sg); 6026 if (sibling->child) 6027 cpumask_copy(sg_span, sched_domain_span(sibling->child)); 6028 else 6029 cpumask_set_cpu(i, sg_span); 6030 6031 cpumask_or(covered, covered, sg_span); 6032 6033 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 6034 if (atomic_inc_return(&sg->sgc->ref) == 1) 6035 build_group_mask(sd, sg); 6036 6037 /* 6038 * Initialize sgc->capacity such that even if we mess up the 6039 * domains and no possible iteration will get us here, we won't 6040 * die on a /0 trap. 6041 */ 6042 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 6043 6044 /* 6045 * Make sure the first group of this domain contains the 6046 * canonical balance cpu. Otherwise the sched_domain iteration 6047 * breaks. See update_sg_lb_stats(). 6048 */ 6049 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 6050 group_balance_cpu(sg) == cpu) 6051 groups = sg; 6052 6053 if (!first) 6054 first = sg; 6055 if (last) 6056 last->next = sg; 6057 last = sg; 6058 last->next = first; 6059 } 6060 sd->groups = groups; 6061 6062 return 0; 6063 6064 fail: 6065 free_sched_groups(first, 0); 6066 6067 return -ENOMEM; 6068 } 6069 6070 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6071 { 6072 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6073 struct sched_domain *child = sd->child; 6074 6075 if (child) 6076 cpu = cpumask_first(sched_domain_span(child)); 6077 6078 if (sg) { 6079 *sg = *per_cpu_ptr(sdd->sg, cpu); 6080 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 6081 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ 6082 } 6083 6084 return cpu; 6085 } 6086 6087 /* 6088 * build_sched_groups will build a circular linked list of the groups 6089 * covered by the given span, and will set each group's ->cpumask correctly, 6090 * and ->cpu_capacity to 0. 
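 *
 * For example, on a hypothetical four-CPU package with SMT siblings
 * {0,1} and {2,3}, the MC-level domain of CPU0 ends up with two groups,
 * {0,1} -> {2,3} -> {0,1}, one per child (SMT) domain span.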
6091 * 6092 * Assumes the sched_domain tree is fully constructed 6093 */ 6094 static int 6095 build_sched_groups(struct sched_domain *sd, int cpu) 6096 { 6097 struct sched_group *first = NULL, *last = NULL; 6098 struct sd_data *sdd = sd->private; 6099 const struct cpumask *span = sched_domain_span(sd); 6100 struct cpumask *covered; 6101 int i; 6102 6103 get_group(cpu, sdd, &sd->groups); 6104 atomic_inc(&sd->groups->ref); 6105 6106 if (cpu != cpumask_first(span)) 6107 return 0; 6108 6109 lockdep_assert_held(&sched_domains_mutex); 6110 covered = sched_domains_tmpmask; 6111 6112 cpumask_clear(covered); 6113 6114 for_each_cpu(i, span) { 6115 struct sched_group *sg; 6116 int group, j; 6117 6118 if (cpumask_test_cpu(i, covered)) 6119 continue; 6120 6121 group = get_group(i, sdd, &sg); 6122 cpumask_setall(sched_group_mask(sg)); 6123 6124 for_each_cpu(j, span) { 6125 if (get_group(j, sdd, NULL) != group) 6126 continue; 6127 6128 cpumask_set_cpu(j, covered); 6129 cpumask_set_cpu(j, sched_group_cpus(sg)); 6130 } 6131 6132 if (!first) 6133 first = sg; 6134 if (last) 6135 last->next = sg; 6136 last = sg; 6137 } 6138 last->next = first; 6139 6140 return 0; 6141 } 6142 6143 /* 6144 * Initialize sched groups cpu_capacity. 6145 * 6146 * cpu_capacity indicates the capacity of sched group, which is used while 6147 * distributing the load between different sched groups in a sched domain. 6148 * Typically cpu_capacity for all the groups in a sched domain will be same 6149 * unless there are asymmetries in the topology. If there are asymmetries, 6150 * group having more cpu_capacity will pickup more load compared to the 6151 * group having less cpu_capacity. 6152 */ 6153 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 6154 { 6155 struct sched_group *sg = sd->groups; 6156 6157 WARN_ON(!sg); 6158 6159 do { 6160 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6161 sg = sg->next; 6162 } while (sg != sd->groups); 6163 6164 if (cpu != group_balance_cpu(sg)) 6165 return; 6166 6167 update_group_capacity(sd, cpu); 6168 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); 6169 } 6170 6171 /* 6172 * Initializers for schedule domains 6173 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6174 */ 6175 6176 static int default_relax_domain_level = -1; 6177 int sched_domain_level_max; 6178 6179 static int __init setup_relax_domain_level(char *str) 6180 { 6181 if (kstrtoint(str, 0, &default_relax_domain_level)) 6182 pr_warn("Unable to set relax_domain_level\n"); 6183 6184 return 1; 6185 } 6186 __setup("relax_domain_level=", setup_relax_domain_level); 6187 6188 static void set_domain_attribute(struct sched_domain *sd, 6189 struct sched_domain_attr *attr) 6190 { 6191 int request; 6192 6193 if (!attr || attr->relax_domain_level < 0) { 6194 if (default_relax_domain_level < 0) 6195 return; 6196 else 6197 request = default_relax_domain_level; 6198 } else 6199 request = attr->relax_domain_level; 6200 if (request < sd->level) { 6201 /* turn off idle balance on this domain */ 6202 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6203 } else { 6204 /* turn on idle balance on this domain */ 6205 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6206 } 6207 } 6208 6209 static void __sdt_free(const struct cpumask *cpu_map); 6210 static int __sdt_alloc(const struct cpumask *cpu_map); 6211 6212 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6213 const struct cpumask *cpu_map) 6214 { 6215 switch (what) { 6216 case sa_rootdomain: 6217 if 
(!atomic_read(&d->rd->refcount)) 6218 free_rootdomain(&d->rd->rcu); /* fall through */ 6219 case sa_sd: 6220 free_percpu(d->sd); /* fall through */ 6221 case sa_sd_storage: 6222 __sdt_free(cpu_map); /* fall through */ 6223 case sa_none: 6224 break; 6225 } 6226 } 6227 6228 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6229 const struct cpumask *cpu_map) 6230 { 6231 memset(d, 0, sizeof(*d)); 6232 6233 if (__sdt_alloc(cpu_map)) 6234 return sa_sd_storage; 6235 d->sd = alloc_percpu(struct sched_domain *); 6236 if (!d->sd) 6237 return sa_sd_storage; 6238 d->rd = alloc_rootdomain(); 6239 if (!d->rd) 6240 return sa_sd; 6241 return sa_rootdomain; 6242 } 6243 6244 /* 6245 * NULL the sd_data elements we've used to build the sched_domain and 6246 * sched_group structure so that the subsequent __free_domain_allocs() 6247 * will not free the data we're using. 6248 */ 6249 static void claim_allocations(int cpu, struct sched_domain *sd) 6250 { 6251 struct sd_data *sdd = sd->private; 6252 6253 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6254 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6255 6256 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6257 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6258 6259 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 6260 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 6261 } 6262 6263 #ifdef CONFIG_NUMA 6264 static int sched_domains_numa_levels; 6265 enum numa_topology_type sched_numa_topology_type; 6266 static int *sched_domains_numa_distance; 6267 int sched_max_numa_distance; 6268 static struct cpumask ***sched_domains_numa_masks; 6269 static int sched_domains_curr_level; 6270 #endif 6271 6272 /* 6273 * SD_flags allowed in topology descriptions. 6274 * 6275 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6276 * SD_SHARE_PKG_RESOURCES - describes shared caches 6277 * SD_NUMA - describes NUMA topologies 6278 * SD_SHARE_POWERDOMAIN - describes shared power domain 6279 * 6280 * Odd one out: 6281 * SD_ASYM_PACKING - describes SMT quirks 6282 */ 6283 #define TOPOLOGY_SD_FLAGS \ 6284 (SD_SHARE_CPUCAPACITY | \ 6285 SD_SHARE_PKG_RESOURCES | \ 6286 SD_NUMA | \ 6287 SD_ASYM_PACKING | \ 6288 SD_SHARE_POWERDOMAIN) 6289 6290 static struct sched_domain * 6291 sd_init(struct sched_domain_topology_level *tl, int cpu) 6292 { 6293 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6294 int sd_weight, sd_flags = 0; 6295 6296 #ifdef CONFIG_NUMA 6297 /* 6298 * Ugly hack to pass state to sd_numa_mask()... 
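 * (The tl->mask() callbacks such as sd_numa_mask() only receive a cpu
 * argument, so the NUMA level being built has to be passed out of band.)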
6299 */ 6300 sched_domains_curr_level = tl->numa_level; 6301 #endif 6302 6303 sd_weight = cpumask_weight(tl->mask(cpu)); 6304 6305 if (tl->sd_flags) 6306 sd_flags = (*tl->sd_flags)(); 6307 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6308 "wrong sd_flags in topology description\n")) 6309 sd_flags &= ~TOPOLOGY_SD_FLAGS; 6310 6311 *sd = (struct sched_domain){ 6312 .min_interval = sd_weight, 6313 .max_interval = 2*sd_weight, 6314 .busy_factor = 32, 6315 .imbalance_pct = 125, 6316 6317 .cache_nice_tries = 0, 6318 .busy_idx = 0, 6319 .idle_idx = 0, 6320 .newidle_idx = 0, 6321 .wake_idx = 0, 6322 .forkexec_idx = 0, 6323 6324 .flags = 1*SD_LOAD_BALANCE 6325 | 1*SD_BALANCE_NEWIDLE 6326 | 1*SD_BALANCE_EXEC 6327 | 1*SD_BALANCE_FORK 6328 | 0*SD_BALANCE_WAKE 6329 | 1*SD_WAKE_AFFINE 6330 | 0*SD_SHARE_CPUCAPACITY 6331 | 0*SD_SHARE_PKG_RESOURCES 6332 | 0*SD_SERIALIZE 6333 | 0*SD_PREFER_SIBLING 6334 | 0*SD_NUMA 6335 | sd_flags 6336 , 6337 6338 .last_balance = jiffies, 6339 .balance_interval = sd_weight, 6340 .smt_gain = 0, 6341 .max_newidle_lb_cost = 0, 6342 .next_decay_max_lb_cost = jiffies, 6343 #ifdef CONFIG_SCHED_DEBUG 6344 .name = tl->name, 6345 #endif 6346 }; 6347 6348 /* 6349 * Convert topological properties into behaviour. 6350 */ 6351 6352 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6353 sd->flags |= SD_PREFER_SIBLING; 6354 sd->imbalance_pct = 110; 6355 sd->smt_gain = 1178; /* ~15% */ 6356 6357 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6358 sd->imbalance_pct = 117; 6359 sd->cache_nice_tries = 1; 6360 sd->busy_idx = 2; 6361 6362 #ifdef CONFIG_NUMA 6363 } else if (sd->flags & SD_NUMA) { 6364 sd->cache_nice_tries = 2; 6365 sd->busy_idx = 3; 6366 sd->idle_idx = 2; 6367 6368 sd->flags |= SD_SERIALIZE; 6369 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6370 sd->flags &= ~(SD_BALANCE_EXEC | 6371 SD_BALANCE_FORK | 6372 SD_WAKE_AFFINE); 6373 } 6374 6375 #endif 6376 } else { 6377 sd->flags |= SD_PREFER_SIBLING; 6378 sd->cache_nice_tries = 1; 6379 sd->busy_idx = 2; 6380 sd->idle_idx = 1; 6381 } 6382 6383 sd->private = &tl->data; 6384 6385 return sd; 6386 } 6387 6388 /* 6389 * Topology list, bottom-up. 
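 *
 * With CONFIG_SCHED_SMT and CONFIG_SCHED_MC enabled the levels built from
 * this table come out as SMT -> MC -> DIE, and sched_init_numa() appends
 * one further NUMA level per unique inter-node distance. An architecture
 * that needs a different layout can install its own table through
 * set_sched_topology(); a purely illustrative sketch (my_arch_topology is
 * a made-up name):
 *
 *	static struct sched_domain_topology_level my_arch_topology[] = {
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_arch_topology);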
6390 */ 6391 static struct sched_domain_topology_level default_topology[] = { 6392 #ifdef CONFIG_SCHED_SMT 6393 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6394 #endif 6395 #ifdef CONFIG_SCHED_MC 6396 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6397 #endif 6398 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6399 { NULL, }, 6400 }; 6401 6402 struct sched_domain_topology_level *sched_domain_topology = default_topology; 6403 6404 #define for_each_sd_topology(tl) \ 6405 for (tl = sched_domain_topology; tl->mask; tl++) 6406 6407 void set_sched_topology(struct sched_domain_topology_level *tl) 6408 { 6409 sched_domain_topology = tl; 6410 } 6411 6412 #ifdef CONFIG_NUMA 6413 6414 static const struct cpumask *sd_numa_mask(int cpu) 6415 { 6416 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6417 } 6418 6419 static void sched_numa_warn(const char *str) 6420 { 6421 static int done = false; 6422 int i,j; 6423 6424 if (done) 6425 return; 6426 6427 done = true; 6428 6429 printk(KERN_WARNING "ERROR: %s\n\n", str); 6430 6431 for (i = 0; i < nr_node_ids; i++) { 6432 printk(KERN_WARNING " "); 6433 for (j = 0; j < nr_node_ids; j++) 6434 printk(KERN_CONT "%02d ", node_distance(i,j)); 6435 printk(KERN_CONT "\n"); 6436 } 6437 printk(KERN_WARNING "\n"); 6438 } 6439 6440 bool find_numa_distance(int distance) 6441 { 6442 int i; 6443 6444 if (distance == node_distance(0, 0)) 6445 return true; 6446 6447 for (i = 0; i < sched_domains_numa_levels; i++) { 6448 if (sched_domains_numa_distance[i] == distance) 6449 return true; 6450 } 6451 6452 return false; 6453 } 6454 6455 /* 6456 * A system can have three types of NUMA topology: 6457 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 6458 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 6459 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 6460 * 6461 * The difference between a glueless mesh topology and a backplane 6462 * topology lies in whether communication between not directly 6463 * connected nodes goes through intermediary nodes (where programs 6464 * could run), or through backplane controllers. This affects 6465 * placement of programs. 6466 * 6467 * The type of topology can be discerned with the following tests: 6468 * - If the maximum distance between any nodes is 1 hop, the system 6469 * is directly connected. 6470 * - If for two nodes A and B, located N > 1 hops away from each other, 6471 * there is an intermediary node C, which is < N hops away from both 6472 * nodes A and B, the system is a glueless mesh. 6473 */ 6474 static void init_numa_topology_type(void) 6475 { 6476 int a, b, c, n; 6477 6478 n = sched_max_numa_distance; 6479 6480 if (sched_domains_numa_levels <= 1) { 6481 sched_numa_topology_type = NUMA_DIRECT; 6482 return; 6483 } 6484 6485 for_each_online_node(a) { 6486 for_each_online_node(b) { 6487 /* Find two nodes furthest removed from each other. */ 6488 if (node_distance(a, b) < n) 6489 continue; 6490 6491 /* Is there an intermediary node between a and b? 
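 *
 * E.g. if the maximum distance is n == 20 and some node c has
 * node_distance(a, c) == node_distance(b, c) == 15, both < n, the
 * machine is classed as a glueless mesh; if no such c exists for this
 * furthest-apart pair we fall through to NUMA_BACKPLANE.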
*/ 6492 for_each_online_node(c) { 6493 if (node_distance(a, c) < n && 6494 node_distance(b, c) < n) { 6495 sched_numa_topology_type = 6496 NUMA_GLUELESS_MESH; 6497 return; 6498 } 6499 } 6500 6501 sched_numa_topology_type = NUMA_BACKPLANE; 6502 return; 6503 } 6504 } 6505 } 6506 6507 static void sched_init_numa(void) 6508 { 6509 int next_distance, curr_distance = node_distance(0, 0); 6510 struct sched_domain_topology_level *tl; 6511 int level = 0; 6512 int i, j, k; 6513 6514 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6515 if (!sched_domains_numa_distance) 6516 return; 6517 6518 /* 6519 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6520 * unique distances in the node_distance() table. 6521 * 6522 * Assumes node_distance(0,j) includes all distances in 6523 * node_distance(i,j) in order to avoid cubic time. 6524 */ 6525 next_distance = curr_distance; 6526 for (i = 0; i < nr_node_ids; i++) { 6527 for (j = 0; j < nr_node_ids; j++) { 6528 for (k = 0; k < nr_node_ids; k++) { 6529 int distance = node_distance(i, k); 6530 6531 if (distance > curr_distance && 6532 (distance < next_distance || 6533 next_distance == curr_distance)) 6534 next_distance = distance; 6535 6536 /* 6537 * While not a strong assumption it would be nice to know 6538 * about cases where if node A is connected to B, B is not 6539 * equally connected to A. 6540 */ 6541 if (sched_debug() && node_distance(k, i) != distance) 6542 sched_numa_warn("Node-distance not symmetric"); 6543 6544 if (sched_debug() && i && !find_numa_distance(distance)) 6545 sched_numa_warn("Node-0 not representative"); 6546 } 6547 if (next_distance != curr_distance) { 6548 sched_domains_numa_distance[level++] = next_distance; 6549 sched_domains_numa_levels = level; 6550 curr_distance = next_distance; 6551 } else break; 6552 } 6553 6554 /* 6555 * In case of sched_debug() we verify the above assumption. 6556 */ 6557 if (!sched_debug()) 6558 break; 6559 } 6560 6561 if (!level) 6562 return; 6563 6564 /* 6565 * 'level' contains the number of unique distances, excluding the 6566 * identity distance node_distance(i,i). 6567 * 6568 * The sched_domains_numa_distance[] array includes the actual distance 6569 * numbers. 6570 */ 6571 6572 /* 6573 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6574 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6575 * the array will contain less then 'level' members. This could be 6576 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6577 * in other functions. 6578 * 6579 * We reset it to 'level' at the end of this function. 6580 */ 6581 sched_domains_numa_levels = 0; 6582 6583 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6584 if (!sched_domains_numa_masks) 6585 return; 6586 6587 /* 6588 * Now for each level, construct a mask per node which contains all 6589 * cpus of nodes that are that many hops away from us. 
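 *
 * E.g. with node distances 10 (local), 20 and 30 there are two levels:
 * the level-0 mask for node j covers every node k with
 * node_distance(j, k) <= 20, and the level-1 mask widens that to <= 30.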
6590 */ 6591 for (i = 0; i < level; i++) { 6592 sched_domains_numa_masks[i] = 6593 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6594 if (!sched_domains_numa_masks[i]) 6595 return; 6596 6597 for (j = 0; j < nr_node_ids; j++) { 6598 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6599 if (!mask) 6600 return; 6601 6602 sched_domains_numa_masks[i][j] = mask; 6603 6604 for (k = 0; k < nr_node_ids; k++) { 6605 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6606 continue; 6607 6608 cpumask_or(mask, mask, cpumask_of_node(k)); 6609 } 6610 } 6611 } 6612 6613 /* Compute default topology size */ 6614 for (i = 0; sched_domain_topology[i].mask; i++); 6615 6616 tl = kzalloc((i + level + 1) * 6617 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6618 if (!tl) 6619 return; 6620 6621 /* 6622 * Copy the default topology bits.. 6623 */ 6624 for (i = 0; sched_domain_topology[i].mask; i++) 6625 tl[i] = sched_domain_topology[i]; 6626 6627 /* 6628 * .. and append 'j' levels of NUMA goodness. 6629 */ 6630 for (j = 0; j < level; i++, j++) { 6631 tl[i] = (struct sched_domain_topology_level){ 6632 .mask = sd_numa_mask, 6633 .sd_flags = cpu_numa_flags, 6634 .flags = SDTL_OVERLAP, 6635 .numa_level = j, 6636 SD_INIT_NAME(NUMA) 6637 }; 6638 } 6639 6640 sched_domain_topology = tl; 6641 6642 sched_domains_numa_levels = level; 6643 sched_max_numa_distance = sched_domains_numa_distance[level - 1]; 6644 6645 init_numa_topology_type(); 6646 } 6647 6648 static void sched_domains_numa_masks_set(int cpu) 6649 { 6650 int i, j; 6651 int node = cpu_to_node(cpu); 6652 6653 for (i = 0; i < sched_domains_numa_levels; i++) { 6654 for (j = 0; j < nr_node_ids; j++) { 6655 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6656 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6657 } 6658 } 6659 } 6660 6661 static void sched_domains_numa_masks_clear(int cpu) 6662 { 6663 int i, j; 6664 for (i = 0; i < sched_domains_numa_levels; i++) { 6665 for (j = 0; j < nr_node_ids; j++) 6666 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6667 } 6668 } 6669 6670 /* 6671 * Update sched_domains_numa_masks[level][node] array when new cpus 6672 * are onlined. 
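 * The notifier below is registered from sched_init_smp() via
 * hotcpu_notifier(): CPU_ONLINE sets the new cpu in every mask whose node
 * lies within that level's distance of the cpu's node, and CPU_DEAD
 * clears it again.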
6673 */ 6674 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6675 unsigned long action, 6676 void *hcpu) 6677 { 6678 int cpu = (long)hcpu; 6679 6680 switch (action & ~CPU_TASKS_FROZEN) { 6681 case CPU_ONLINE: 6682 sched_domains_numa_masks_set(cpu); 6683 break; 6684 6685 case CPU_DEAD: 6686 sched_domains_numa_masks_clear(cpu); 6687 break; 6688 6689 default: 6690 return NOTIFY_DONE; 6691 } 6692 6693 return NOTIFY_OK; 6694 } 6695 #else 6696 static inline void sched_init_numa(void) 6697 { 6698 } 6699 6700 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6701 unsigned long action, 6702 void *hcpu) 6703 { 6704 return 0; 6705 } 6706 #endif /* CONFIG_NUMA */ 6707 6708 static int __sdt_alloc(const struct cpumask *cpu_map) 6709 { 6710 struct sched_domain_topology_level *tl; 6711 int j; 6712 6713 for_each_sd_topology(tl) { 6714 struct sd_data *sdd = &tl->data; 6715 6716 sdd->sd = alloc_percpu(struct sched_domain *); 6717 if (!sdd->sd) 6718 return -ENOMEM; 6719 6720 sdd->sg = alloc_percpu(struct sched_group *); 6721 if (!sdd->sg) 6722 return -ENOMEM; 6723 6724 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 6725 if (!sdd->sgc) 6726 return -ENOMEM; 6727 6728 for_each_cpu(j, cpu_map) { 6729 struct sched_domain *sd; 6730 struct sched_group *sg; 6731 struct sched_group_capacity *sgc; 6732 6733 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6734 GFP_KERNEL, cpu_to_node(j)); 6735 if (!sd) 6736 return -ENOMEM; 6737 6738 *per_cpu_ptr(sdd->sd, j) = sd; 6739 6740 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6741 GFP_KERNEL, cpu_to_node(j)); 6742 if (!sg) 6743 return -ENOMEM; 6744 6745 sg->next = sg; 6746 6747 *per_cpu_ptr(sdd->sg, j) = sg; 6748 6749 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 6750 GFP_KERNEL, cpu_to_node(j)); 6751 if (!sgc) 6752 return -ENOMEM; 6753 6754 *per_cpu_ptr(sdd->sgc, j) = sgc; 6755 } 6756 } 6757 6758 return 0; 6759 } 6760 6761 static void __sdt_free(const struct cpumask *cpu_map) 6762 { 6763 struct sched_domain_topology_level *tl; 6764 int j; 6765 6766 for_each_sd_topology(tl) { 6767 struct sd_data *sdd = &tl->data; 6768 6769 for_each_cpu(j, cpu_map) { 6770 struct sched_domain *sd; 6771 6772 if (sdd->sd) { 6773 sd = *per_cpu_ptr(sdd->sd, j); 6774 if (sd && (sd->flags & SD_OVERLAP)) 6775 free_sched_groups(sd->groups, 0); 6776 kfree(*per_cpu_ptr(sdd->sd, j)); 6777 } 6778 6779 if (sdd->sg) 6780 kfree(*per_cpu_ptr(sdd->sg, j)); 6781 if (sdd->sgc) 6782 kfree(*per_cpu_ptr(sdd->sgc, j)); 6783 } 6784 free_percpu(sdd->sd); 6785 sdd->sd = NULL; 6786 free_percpu(sdd->sg); 6787 sdd->sg = NULL; 6788 free_percpu(sdd->sgc); 6789 sdd->sgc = NULL; 6790 } 6791 } 6792 6793 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6794 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6795 struct sched_domain *child, int cpu) 6796 { 6797 struct sched_domain *sd = sd_init(tl, cpu); 6798 if (!sd) 6799 return child; 6800 6801 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6802 if (child) { 6803 sd->level = child->level + 1; 6804 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6805 child->parent = sd; 6806 sd->child = child; 6807 6808 if (!cpumask_subset(sched_domain_span(child), 6809 sched_domain_span(sd))) { 6810 pr_err("BUG: arch topology borken\n"); 6811 #ifdef CONFIG_SCHED_DEBUG 6812 pr_err(" the %s domain not a subset of the %s domain\n", 6813 child->name, sd->name); 6814 #endif 6815 /* Fixup, ensure @sd has at least @child 
cpus. */ 6816 cpumask_or(sched_domain_span(sd), 6817 sched_domain_span(sd), 6818 sched_domain_span(child)); 6819 } 6820 6821 } 6822 set_domain_attribute(sd, attr); 6823 6824 return sd; 6825 } 6826 6827 /* 6828 * Build sched domains for a given set of cpus and attach the sched domains 6829 * to the individual cpus 6830 */ 6831 static int build_sched_domains(const struct cpumask *cpu_map, 6832 struct sched_domain_attr *attr) 6833 { 6834 enum s_alloc alloc_state; 6835 struct sched_domain *sd; 6836 struct s_data d; 6837 int i, ret = -ENOMEM; 6838 6839 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6840 if (alloc_state != sa_rootdomain) 6841 goto error; 6842 6843 /* Set up domains for cpus specified by the cpu_map. */ 6844 for_each_cpu(i, cpu_map) { 6845 struct sched_domain_topology_level *tl; 6846 6847 sd = NULL; 6848 for_each_sd_topology(tl) { 6849 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6850 if (tl == sched_domain_topology) 6851 *per_cpu_ptr(d.sd, i) = sd; 6852 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6853 sd->flags |= SD_OVERLAP; 6854 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6855 break; 6856 } 6857 } 6858 6859 /* Build the groups for the domains */ 6860 for_each_cpu(i, cpu_map) { 6861 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6862 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6863 if (sd->flags & SD_OVERLAP) { 6864 if (build_overlap_sched_groups(sd, i)) 6865 goto error; 6866 } else { 6867 if (build_sched_groups(sd, i)) 6868 goto error; 6869 } 6870 } 6871 } 6872 6873 /* Calculate CPU capacity for physical packages and nodes */ 6874 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6875 if (!cpumask_test_cpu(i, cpu_map)) 6876 continue; 6877 6878 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6879 claim_allocations(i, sd); 6880 init_sched_groups_capacity(i, sd); 6881 } 6882 } 6883 6884 /* Attach the domains */ 6885 rcu_read_lock(); 6886 for_each_cpu(i, cpu_map) { 6887 sd = *per_cpu_ptr(d.sd, i); 6888 cpu_attach_domain(sd, d.rd, i); 6889 } 6890 rcu_read_unlock(); 6891 6892 ret = 0; 6893 error: 6894 __free_domain_allocs(&d, alloc_state, cpu_map); 6895 return ret; 6896 } 6897 6898 static cpumask_var_t *doms_cur; /* current sched domains */ 6899 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6900 static struct sched_domain_attr *dattr_cur; 6901 /* attribues of custom domains in 'doms_cur' */ 6902 6903 /* 6904 * Special case: If a kmalloc of a doms_cur partition (array of 6905 * cpumask) fails, then fallback to a single sched domain, 6906 * as determined by the single cpumask fallback_doms. 6907 */ 6908 static cpumask_var_t fallback_doms; 6909 6910 /* 6911 * arch_update_cpu_topology lets virtualized architectures update the 6912 * cpu core maps. It is supposed to return 1 if the topology changed 6913 * or 0 if it stayed the same. 
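 *
 * A minimal, purely hypothetical override in architecture code could look
 * like the following (rebuild_core_masks() is an invented helper standing
 * in for whatever the architecture actually does):
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		bool changed = rebuild_core_masks();
 *
 *		return changed ? 1 : 0;
 *	}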
6914 */ 6915 int __weak arch_update_cpu_topology(void) 6916 { 6917 return 0; 6918 } 6919 6920 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6921 { 6922 int i; 6923 cpumask_var_t *doms; 6924 6925 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6926 if (!doms) 6927 return NULL; 6928 for (i = 0; i < ndoms; i++) { 6929 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6930 free_sched_domains(doms, i); 6931 return NULL; 6932 } 6933 } 6934 return doms; 6935 } 6936 6937 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6938 { 6939 unsigned int i; 6940 for (i = 0; i < ndoms; i++) 6941 free_cpumask_var(doms[i]); 6942 kfree(doms); 6943 } 6944 6945 /* 6946 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6947 * For now this just excludes isolated cpus, but could be used to 6948 * exclude other special cases in the future. 6949 */ 6950 static int init_sched_domains(const struct cpumask *cpu_map) 6951 { 6952 int err; 6953 6954 arch_update_cpu_topology(); 6955 ndoms_cur = 1; 6956 doms_cur = alloc_sched_domains(ndoms_cur); 6957 if (!doms_cur) 6958 doms_cur = &fallback_doms; 6959 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6960 err = build_sched_domains(doms_cur[0], NULL); 6961 register_sched_domain_sysctl(); 6962 6963 return err; 6964 } 6965 6966 /* 6967 * Detach sched domains from a group of cpus specified in cpu_map 6968 * These cpus will now be attached to the NULL domain 6969 */ 6970 static void detach_destroy_domains(const struct cpumask *cpu_map) 6971 { 6972 int i; 6973 6974 rcu_read_lock(); 6975 for_each_cpu(i, cpu_map) 6976 cpu_attach_domain(NULL, &def_root_domain, i); 6977 rcu_read_unlock(); 6978 } 6979 6980 /* handle null as "default" */ 6981 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6982 struct sched_domain_attr *new, int idx_new) 6983 { 6984 struct sched_domain_attr tmp; 6985 6986 /* fast path */ 6987 if (!new && !cur) 6988 return 1; 6989 6990 tmp = SD_ATTR_INIT; 6991 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6992 new ? (new + idx_new) : &tmp, 6993 sizeof(struct sched_domain_attr)); 6994 } 6995 6996 /* 6997 * Partition sched domains as specified by the 'ndoms_new' 6998 * cpumasks in the array doms_new[] of cpumasks. This compares 6999 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7000 * It destroys each deleted domain and builds each new domain. 7001 * 7002 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7003 * The masks don't intersect (don't overlap.) We should setup one 7004 * sched domain for each mask. CPUs not in any of the cpumasks will 7005 * not be load balanced. If the same cpumask appears both in the 7006 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7007 * it as it is. 7008 * 7009 * The passed in 'doms_new' should be allocated using 7010 * alloc_sched_domains. This routine takes ownership of it and will 7011 * free_sched_domains it when done with it. If the caller failed the 7012 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7013 * and partition_sched_domains() will fallback to the single partition 7014 * 'fallback_doms', it also forces the domains to be rebuilt. 7015 * 7016 * If doms_new == NULL it will be replaced with cpu_online_mask. 7017 * ndoms_new == 0 is a special case for destroying existing domains, 7018 * and it will not create the default domain. 
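 *
 * Illustrative caller sketch (mask_a and mask_b stand for two disjoint,
 * caller-provided cpumasks):
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	cpumask_copy(doms[0], mask_a);
 *	cpumask_copy(doms[1], mask_b);
 *	partition_sched_domains(2, doms, NULL);
 *
 * After the call the scheduler owns 'doms', and CPUs outside both masks
 * are no longer load balanced.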
7019 * 7020 * Call with hotplug lock held 7021 */ 7022 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7023 struct sched_domain_attr *dattr_new) 7024 { 7025 int i, j, n; 7026 int new_topology; 7027 7028 mutex_lock(&sched_domains_mutex); 7029 7030 /* always unregister in case we don't destroy any domains */ 7031 unregister_sched_domain_sysctl(); 7032 7033 /* Let architecture update cpu core mappings. */ 7034 new_topology = arch_update_cpu_topology(); 7035 7036 n = doms_new ? ndoms_new : 0; 7037 7038 /* Destroy deleted domains */ 7039 for (i = 0; i < ndoms_cur; i++) { 7040 for (j = 0; j < n && !new_topology; j++) { 7041 if (cpumask_equal(doms_cur[i], doms_new[j]) 7042 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7043 goto match1; 7044 } 7045 /* no match - a current sched domain not in new doms_new[] */ 7046 detach_destroy_domains(doms_cur[i]); 7047 match1: 7048 ; 7049 } 7050 7051 n = ndoms_cur; 7052 if (doms_new == NULL) { 7053 n = 0; 7054 doms_new = &fallback_doms; 7055 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7056 WARN_ON_ONCE(dattr_new); 7057 } 7058 7059 /* Build new domains */ 7060 for (i = 0; i < ndoms_new; i++) { 7061 for (j = 0; j < n && !new_topology; j++) { 7062 if (cpumask_equal(doms_new[i], doms_cur[j]) 7063 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7064 goto match2; 7065 } 7066 /* no match - add a new doms_new */ 7067 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 7068 match2: 7069 ; 7070 } 7071 7072 /* Remember the new sched domains */ 7073 if (doms_cur != &fallback_doms) 7074 free_sched_domains(doms_cur, ndoms_cur); 7075 kfree(dattr_cur); /* kfree(NULL) is safe */ 7076 doms_cur = doms_new; 7077 dattr_cur = dattr_new; 7078 ndoms_cur = ndoms_new; 7079 7080 register_sched_domain_sysctl(); 7081 7082 mutex_unlock(&sched_domains_mutex); 7083 } 7084 7085 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 7086 7087 /* 7088 * Update cpusets according to cpu_active mask. If cpusets are 7089 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7090 * around partition_sched_domains(). 7091 * 7092 * If we come here as part of a suspend/resume, don't touch cpusets because we 7093 * want to restore it back to its original state upon resume anyway. 7094 */ 7095 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7096 void *hcpu) 7097 { 7098 switch (action) { 7099 case CPU_ONLINE_FROZEN: 7100 case CPU_DOWN_FAILED_FROZEN: 7101 7102 /* 7103 * num_cpus_frozen tracks how many CPUs are involved in suspend 7104 * resume sequence. As long as this is not the last online 7105 * operation in the resume sequence, just build a single sched 7106 * domain, ignoring cpusets. 7107 */ 7108 num_cpus_frozen--; 7109 if (likely(num_cpus_frozen)) { 7110 partition_sched_domains(1, NULL, NULL); 7111 break; 7112 } 7113 7114 /* 7115 * This is the last CPU online operation. So fall through and 7116 * restore the original sched domains by considering the 7117 * cpuset configurations. 
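 *
 * E.g. resuming a 4-CPU box that suspended with CPUs 1-3 taken down
 * (num_cpus_frozen == 3): the first two CPU_ONLINE_FROZEN events just
 * rebuild the single fallback domain, and the third one, which drops the
 * counter to zero, falls through to cpuset_update_active_cpus().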
7118 */ 7119 7120 case CPU_ONLINE: 7121 cpuset_update_active_cpus(true); 7122 break; 7123 default: 7124 return NOTIFY_DONE; 7125 } 7126 return NOTIFY_OK; 7127 } 7128 7129 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7130 void *hcpu) 7131 { 7132 unsigned long flags; 7133 long cpu = (long)hcpu; 7134 struct dl_bw *dl_b; 7135 bool overflow; 7136 int cpus; 7137 7138 switch (action) { 7139 case CPU_DOWN_PREPARE: 7140 rcu_read_lock_sched(); 7141 dl_b = dl_bw_of(cpu); 7142 7143 raw_spin_lock_irqsave(&dl_b->lock, flags); 7144 cpus = dl_bw_cpus(cpu); 7145 overflow = __dl_overflow(dl_b, cpus, 0, 0); 7146 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7147 7148 rcu_read_unlock_sched(); 7149 7150 if (overflow) 7151 return notifier_from_errno(-EBUSY); 7152 cpuset_update_active_cpus(false); 7153 break; 7154 case CPU_DOWN_PREPARE_FROZEN: 7155 num_cpus_frozen++; 7156 partition_sched_domains(1, NULL, NULL); 7157 break; 7158 default: 7159 return NOTIFY_DONE; 7160 } 7161 return NOTIFY_OK; 7162 } 7163 7164 void __init sched_init_smp(void) 7165 { 7166 cpumask_var_t non_isolated_cpus; 7167 7168 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7169 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7170 7171 /* nohz_full won't take effect without isolating the cpus. */ 7172 tick_nohz_full_add_cpus_to(cpu_isolated_map); 7173 7174 sched_init_numa(); 7175 7176 /* 7177 * There's no userspace yet to cause hotplug operations; hence all the 7178 * cpu masks are stable and all blatant races in the below code cannot 7179 * happen. 7180 */ 7181 mutex_lock(&sched_domains_mutex); 7182 init_sched_domains(cpu_active_mask); 7183 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7184 if (cpumask_empty(non_isolated_cpus)) 7185 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7186 mutex_unlock(&sched_domains_mutex); 7187 7188 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 7189 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7190 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7191 7192 init_hrtick(); 7193 7194 /* Move init over to a non-isolated CPU */ 7195 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7196 BUG(); 7197 sched_init_granularity(); 7198 free_cpumask_var(non_isolated_cpus); 7199 7200 init_sched_rt_class(); 7201 init_sched_dl_class(); 7202 } 7203 #else 7204 void __init sched_init_smp(void) 7205 { 7206 sched_init_granularity(); 7207 } 7208 #endif /* CONFIG_SMP */ 7209 7210 int in_sched_functions(unsigned long addr) 7211 { 7212 return in_lock_functions(addr) || 7213 (addr >= (unsigned long)__sched_text_start 7214 && addr < (unsigned long)__sched_text_end); 7215 } 7216 7217 #ifdef CONFIG_CGROUP_SCHED 7218 /* 7219 * Default task group. 7220 * Every task in system belongs to this group at bootup. 
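 * It is also the root of the cpu cgroup hierarchy: cpu_cgroup_css_alloc()
 * below returns &root_task_group.css for the top cgroup, and every group
 * created later hangs off it via sched_create_group()/sched_online_group().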
7221 */ 7222 struct task_group root_task_group; 7223 LIST_HEAD(task_groups); 7224 #endif 7225 7226 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7227 7228 void __init sched_init(void) 7229 { 7230 int i, j; 7231 unsigned long alloc_size = 0, ptr; 7232 7233 #ifdef CONFIG_FAIR_GROUP_SCHED 7234 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7235 #endif 7236 #ifdef CONFIG_RT_GROUP_SCHED 7237 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7238 #endif 7239 if (alloc_size) { 7240 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7241 7242 #ifdef CONFIG_FAIR_GROUP_SCHED 7243 root_task_group.se = (struct sched_entity **)ptr; 7244 ptr += nr_cpu_ids * sizeof(void **); 7245 7246 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7247 ptr += nr_cpu_ids * sizeof(void **); 7248 7249 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7250 #ifdef CONFIG_RT_GROUP_SCHED 7251 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7252 ptr += nr_cpu_ids * sizeof(void **); 7253 7254 root_task_group.rt_rq = (struct rt_rq **)ptr; 7255 ptr += nr_cpu_ids * sizeof(void **); 7256 7257 #endif /* CONFIG_RT_GROUP_SCHED */ 7258 } 7259 #ifdef CONFIG_CPUMASK_OFFSTACK 7260 for_each_possible_cpu(i) { 7261 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7262 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7263 } 7264 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7265 7266 init_rt_bandwidth(&def_rt_bandwidth, 7267 global_rt_period(), global_rt_runtime()); 7268 init_dl_bandwidth(&def_dl_bandwidth, 7269 global_rt_period(), global_rt_runtime()); 7270 7271 #ifdef CONFIG_SMP 7272 init_defrootdomain(); 7273 #endif 7274 7275 #ifdef CONFIG_RT_GROUP_SCHED 7276 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7277 global_rt_period(), global_rt_runtime()); 7278 #endif /* CONFIG_RT_GROUP_SCHED */ 7279 7280 #ifdef CONFIG_CGROUP_SCHED 7281 list_add(&root_task_group.list, &task_groups); 7282 INIT_LIST_HEAD(&root_task_group.children); 7283 INIT_LIST_HEAD(&root_task_group.siblings); 7284 autogroup_init(&init_task); 7285 7286 #endif /* CONFIG_CGROUP_SCHED */ 7287 7288 for_each_possible_cpu(i) { 7289 struct rq *rq; 7290 7291 rq = cpu_rq(i); 7292 raw_spin_lock_init(&rq->lock); 7293 rq->nr_running = 0; 7294 rq->calc_load_active = 0; 7295 rq->calc_load_update = jiffies + LOAD_FREQ; 7296 init_cfs_rq(&rq->cfs); 7297 init_rt_rq(&rq->rt); 7298 init_dl_rq(&rq->dl); 7299 #ifdef CONFIG_FAIR_GROUP_SCHED 7300 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7301 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7302 /* 7303 * How much cpu bandwidth does root_task_group get? 7304 * 7305 * In the case of task-groups formed through the cgroup filesystem, it 7306 * gets 100% of the cpu resources in the system. This overall 7307 * system cpu resource is divided among the tasks of 7308 * root_task_group and its child task-groups in a fair manner, 7309 * based on each entity's (task or task-group's) weight 7310 * (se->load.weight). 7311 * 7312 * In other words, if root_task_group has 10 tasks of weight 7313 * 1024 and two child groups A0 and A1 (of weight 1024 each), 7314 * then A0's share of the cpu resource is: 7315 * 7316 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7317 * 7318 * We achieve this by letting root_task_group's tasks sit 7319 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
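 *
 * By the same arithmetic each of the ten root-level tasks also gets
 * 1024 / (10*1024 + 1024 + 1024) = 8.33%, while tasks inside A0 split
 * A0's 8.33% among themselves according to their own weights.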
7320 */ 7321 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 7322 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7323 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7324 7325 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7326 #ifdef CONFIG_RT_GROUP_SCHED 7327 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7328 #endif 7329 7330 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7331 rq->cpu_load[j] = 0; 7332 7333 rq->last_load_update_tick = jiffies; 7334 7335 #ifdef CONFIG_SMP 7336 rq->sd = NULL; 7337 rq->rd = NULL; 7338 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 7339 rq->balance_callback = NULL; 7340 rq->active_balance = 0; 7341 rq->next_balance = jiffies; 7342 rq->push_cpu = 0; 7343 rq->cpu = i; 7344 rq->online = 0; 7345 rq->idle_stamp = 0; 7346 rq->avg_idle = 2*sysctl_sched_migration_cost; 7347 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 7348 7349 INIT_LIST_HEAD(&rq->cfs_tasks); 7350 7351 rq_attach_root(rq, &def_root_domain); 7352 #ifdef CONFIG_NO_HZ_COMMON 7353 rq->nohz_flags = 0; 7354 #endif 7355 #ifdef CONFIG_NO_HZ_FULL 7356 rq->last_sched_tick = 0; 7357 #endif 7358 #endif 7359 init_rq_hrtick(rq); 7360 atomic_set(&rq->nr_iowait, 0); 7361 } 7362 7363 set_load_weight(&init_task); 7364 7365 #ifdef CONFIG_PREEMPT_NOTIFIERS 7366 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7367 #endif 7368 7369 /* 7370 * The boot idle thread does lazy MMU switching as well: 7371 */ 7372 atomic_inc(&init_mm.mm_count); 7373 enter_lazy_tlb(&init_mm, current); 7374 7375 /* 7376 * During early bootup we pretend to be a normal task: 7377 */ 7378 current->sched_class = &fair_sched_class; 7379 7380 /* 7381 * Make us the idle thread. Technically, schedule() should not be 7382 * called from this thread, however somewhere below it might be, 7383 * but because we are the idle thread, we just pick up running again 7384 * when this runqueue becomes "idle". 7385 */ 7386 init_idle(current, smp_processor_id()); 7387 7388 calc_load_update = jiffies + LOAD_FREQ; 7389 7390 #ifdef CONFIG_SMP 7391 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7392 /* May be allocated at isolcpus cmdline parse time */ 7393 if (cpu_isolated_map == NULL) 7394 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7395 idle_thread_set_boot_cpu(); 7396 set_cpu_rq_start_time(); 7397 #endif 7398 init_sched_fair_class(); 7399 7400 scheduler_running = 1; 7401 } 7402 7403 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7404 static inline int preempt_count_equals(int preempt_offset) 7405 { 7406 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7407 7408 return (nested == preempt_offset); 7409 } 7410 7411 void __might_sleep(const char *file, int line, int preempt_offset) 7412 { 7413 /* 7414 * Blocking primitives will set (and therefore destroy) current->state, 7415 * since we will exit with TASK_RUNNING make sure we enter with it, 7416 * otherwise we will destroy state. 7417 */ 7418 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 7419 "do not call blocking ops when !TASK_RUNNING; " 7420 "state=%lx set at [<%p>] %pS\n", 7421 current->state, 7422 (void *)current->task_state_change, 7423 (void *)current->task_state_change); 7424 7425 ___might_sleep(file, line, preempt_offset); 7426 } 7427 EXPORT_SYMBOL(__might_sleep); 7428 7429 void ___might_sleep(const char *file, int line, int preempt_offset) 7430 { 7431 static unsigned long prev_jiffy; /* ratelimiting */ 7432 7433 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ 7434 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7435 !is_idle_task(current)) || 7436 system_state != SYSTEM_RUNNING || oops_in_progress) 7437 return; 7438 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7439 return; 7440 prev_jiffy = jiffies; 7441 7442 printk(KERN_ERR 7443 "BUG: sleeping function called from invalid context at %s:%d\n", 7444 file, line); 7445 printk(KERN_ERR 7446 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7447 in_atomic(), irqs_disabled(), 7448 current->pid, current->comm); 7449 7450 if (task_stack_end_corrupted(current)) 7451 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 7452 7453 debug_show_held_locks(current); 7454 if (irqs_disabled()) 7455 print_irqtrace_events(current); 7456 #ifdef CONFIG_DEBUG_PREEMPT 7457 if (!preempt_count_equals(preempt_offset)) { 7458 pr_err("Preemption disabled at:"); 7459 print_ip_sym(current->preempt_disable_ip); 7460 pr_cont("\n"); 7461 } 7462 #endif 7463 dump_stack(); 7464 } 7465 EXPORT_SYMBOL(___might_sleep); 7466 #endif 7467 7468 #ifdef CONFIG_MAGIC_SYSRQ 7469 void normalize_rt_tasks(void) 7470 { 7471 struct task_struct *g, *p; 7472 struct sched_attr attr = { 7473 .sched_policy = SCHED_NORMAL, 7474 }; 7475 7476 read_lock(&tasklist_lock); 7477 for_each_process_thread(g, p) { 7478 /* 7479 * Only normalize user tasks: 7480 */ 7481 if (p->flags & PF_KTHREAD) 7482 continue; 7483 7484 p->se.exec_start = 0; 7485 #ifdef CONFIG_SCHEDSTATS 7486 p->se.statistics.wait_start = 0; 7487 p->se.statistics.sleep_start = 0; 7488 p->se.statistics.block_start = 0; 7489 #endif 7490 7491 if (!dl_task(p) && !rt_task(p)) { 7492 /* 7493 * Renice negative nice level userspace 7494 * tasks back to 0: 7495 */ 7496 if (task_nice(p) < 0) 7497 set_user_nice(p, 0); 7498 continue; 7499 } 7500 7501 __sched_setscheduler(p, &attr, false, false); 7502 } 7503 read_unlock(&tasklist_lock); 7504 } 7505 7506 #endif /* CONFIG_MAGIC_SYSRQ */ 7507 7508 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7509 /* 7510 * These functions are only useful for the IA64 MCA handling, or kdb. 7511 * 7512 * They can only be called when the whole system has been 7513 * stopped - every CPU needs to be quiescent, and no scheduling 7514 * activity can take place. Using them for anything else would 7515 * be a serious bug, and as a result, they aren't even visible 7516 * under any other configuration. 7517 */ 7518 7519 /** 7520 * curr_task - return the current task for a given cpu. 7521 * @cpu: the processor in question. 7522 * 7523 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7524 * 7525 * Return: The current task for @cpu. 7526 */ 7527 struct task_struct *curr_task(int cpu) 7528 { 7529 return cpu_curr(cpu); 7530 } 7531 7532 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7533 7534 #ifdef CONFIG_IA64 7535 /** 7536 * set_curr_task - set the current task for a given cpu. 7537 * @cpu: the processor in question. 7538 * @p: the task pointer to set. 7539 * 7540 * Description: This function must only be used when non-maskable interrupts 7541 * are serviced on a separate stack. It allows the architecture to switch the 7542 * notion of the current task on a cpu in a non-blocking manner. This function 7543 * must be called with all CPU's synchronized, and interrupts disabled, the 7544 * and caller must save the original value of the current task (see 7545 * curr_task() above) and restore that value before reenabling interrupts and 7546 * re-starting the system. 7547 * 7548 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
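 *
 * Illustrative save/restore sequence matching the contract above
 * (mca_recovery_task is a made-up placeholder):
 *
 *	struct task_struct *saved = curr_task(cpu);
 *
 *	set_curr_task(cpu, mca_recovery_task);
 *	... do the stopped-system work ...
 *	set_curr_task(cpu, saved);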
7549 */ 7550 void set_curr_task(int cpu, struct task_struct *p) 7551 { 7552 cpu_curr(cpu) = p; 7553 } 7554 7555 #endif 7556 7557 #ifdef CONFIG_CGROUP_SCHED 7558 /* task_group_lock serializes the addition/removal of task groups */ 7559 static DEFINE_SPINLOCK(task_group_lock); 7560 7561 static void free_sched_group(struct task_group *tg) 7562 { 7563 free_fair_sched_group(tg); 7564 free_rt_sched_group(tg); 7565 autogroup_free(tg); 7566 kfree(tg); 7567 } 7568 7569 /* allocate runqueue etc for a new task group */ 7570 struct task_group *sched_create_group(struct task_group *parent) 7571 { 7572 struct task_group *tg; 7573 7574 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7575 if (!tg) 7576 return ERR_PTR(-ENOMEM); 7577 7578 if (!alloc_fair_sched_group(tg, parent)) 7579 goto err; 7580 7581 if (!alloc_rt_sched_group(tg, parent)) 7582 goto err; 7583 7584 return tg; 7585 7586 err: 7587 free_sched_group(tg); 7588 return ERR_PTR(-ENOMEM); 7589 } 7590 7591 void sched_online_group(struct task_group *tg, struct task_group *parent) 7592 { 7593 unsigned long flags; 7594 7595 spin_lock_irqsave(&task_group_lock, flags); 7596 list_add_rcu(&tg->list, &task_groups); 7597 7598 WARN_ON(!parent); /* root should already exist */ 7599 7600 tg->parent = parent; 7601 INIT_LIST_HEAD(&tg->children); 7602 list_add_rcu(&tg->siblings, &parent->children); 7603 spin_unlock_irqrestore(&task_group_lock, flags); 7604 } 7605 7606 /* rcu callback to free various structures associated with a task group */ 7607 static void free_sched_group_rcu(struct rcu_head *rhp) 7608 { 7609 /* now it should be safe to free those cfs_rqs */ 7610 free_sched_group(container_of(rhp, struct task_group, rcu)); 7611 } 7612 7613 /* Destroy runqueue etc associated with a task group */ 7614 void sched_destroy_group(struct task_group *tg) 7615 { 7616 /* wait for possible concurrent references to cfs_rqs complete */ 7617 call_rcu(&tg->rcu, free_sched_group_rcu); 7618 } 7619 7620 void sched_offline_group(struct task_group *tg) 7621 { 7622 unsigned long flags; 7623 int i; 7624 7625 /* end participation in shares distribution */ 7626 for_each_possible_cpu(i) 7627 unregister_fair_sched_group(tg, i); 7628 7629 spin_lock_irqsave(&task_group_lock, flags); 7630 list_del_rcu(&tg->list); 7631 list_del_rcu(&tg->siblings); 7632 spin_unlock_irqrestore(&task_group_lock, flags); 7633 } 7634 7635 /* change task's runqueue when it moves between groups. 7636 * The caller of this function should have put the task in its new group 7637 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7638 * reflect its new group. 7639 */ 7640 void sched_move_task(struct task_struct *tsk) 7641 { 7642 struct task_group *tg; 7643 int queued, running; 7644 unsigned long flags; 7645 struct rq *rq; 7646 7647 rq = task_rq_lock(tsk, &flags); 7648 7649 running = task_current(rq, tsk); 7650 queued = task_on_rq_queued(tsk); 7651 7652 if (queued) 7653 dequeue_task(rq, tsk, 0); 7654 if (unlikely(running)) 7655 put_prev_task(rq, tsk); 7656 7657 /* 7658 * All callers are synchronized by task_rq_lock(); we do not use RCU 7659 * which is pointless here. Thus, we pass "true" to task_css_check() 7660 * to prevent lockdep warnings. 
7661 */ 7662 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 7663 struct task_group, css); 7664 tg = autogroup_task_group(tsk, tg); 7665 tsk->sched_task_group = tg; 7666 7667 #ifdef CONFIG_FAIR_GROUP_SCHED 7668 if (tsk->sched_class->task_move_group) 7669 tsk->sched_class->task_move_group(tsk, queued); 7670 else 7671 #endif 7672 set_task_rq(tsk, task_cpu(tsk)); 7673 7674 if (unlikely(running)) 7675 tsk->sched_class->set_curr_task(rq); 7676 if (queued) 7677 enqueue_task(rq, tsk, 0); 7678 7679 task_rq_unlock(rq, tsk, &flags); 7680 } 7681 #endif /* CONFIG_CGROUP_SCHED */ 7682 7683 #ifdef CONFIG_RT_GROUP_SCHED 7684 /* 7685 * Ensure that the real time constraints are schedulable. 7686 */ 7687 static DEFINE_MUTEX(rt_constraints_mutex); 7688 7689 /* Must be called with tasklist_lock held */ 7690 static inline int tg_has_rt_tasks(struct task_group *tg) 7691 { 7692 struct task_struct *g, *p; 7693 7694 /* 7695 * Autogroups do not have RT tasks; see autogroup_create(). 7696 */ 7697 if (task_group_is_autogroup(tg)) 7698 return 0; 7699 7700 for_each_process_thread(g, p) { 7701 if (rt_task(p) && task_group(p) == tg) 7702 return 1; 7703 } 7704 7705 return 0; 7706 } 7707 7708 struct rt_schedulable_data { 7709 struct task_group *tg; 7710 u64 rt_period; 7711 u64 rt_runtime; 7712 }; 7713 7714 static int tg_rt_schedulable(struct task_group *tg, void *data) 7715 { 7716 struct rt_schedulable_data *d = data; 7717 struct task_group *child; 7718 unsigned long total, sum = 0; 7719 u64 period, runtime; 7720 7721 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7722 runtime = tg->rt_bandwidth.rt_runtime; 7723 7724 if (tg == d->tg) { 7725 period = d->rt_period; 7726 runtime = d->rt_runtime; 7727 } 7728 7729 /* 7730 * Cannot have more runtime than the period. 7731 */ 7732 if (runtime > period && runtime != RUNTIME_INF) 7733 return -EINVAL; 7734 7735 /* 7736 * Ensure we don't starve existing RT tasks. 7737 */ 7738 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7739 return -EBUSY; 7740 7741 total = to_ratio(period, runtime); 7742 7743 /* 7744 * Nobody can have more than the global setting allows. 7745 */ 7746 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7747 return -EINVAL; 7748 7749 /* 7750 * The sum of our children's runtime should not exceed our own. 7751 */ 7752 list_for_each_entry_rcu(child, &tg->children, siblings) { 7753 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7754 runtime = child->rt_bandwidth.rt_runtime; 7755 7756 if (child == d->tg) { 7757 period = d->rt_period; 7758 runtime = d->rt_runtime; 7759 } 7760 7761 sum += to_ratio(period, runtime); 7762 } 7763 7764 if (sum > total) 7765 return -EINVAL; 7766 7767 return 0; 7768 } 7769 7770 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7771 { 7772 int ret; 7773 7774 struct rt_schedulable_data data = { 7775 .tg = tg, 7776 .rt_period = period, 7777 .rt_runtime = runtime, 7778 }; 7779 7780 rcu_read_lock(); 7781 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7782 rcu_read_unlock(); 7783 7784 return ret; 7785 } 7786 7787 static int tg_set_rt_bandwidth(struct task_group *tg, 7788 u64 rt_period, u64 rt_runtime) 7789 { 7790 int i, err = 0; 7791 7792 /* 7793 * Disallowing the root group RT runtime is BAD, it would disallow the 7794 * kernel creating (and or operating) RT threads. 7795 */ 7796 if (tg == &root_task_group && rt_runtime == 0) 7797 return -EINVAL; 7798 7799 /* No period doesn't make any sense. 
*/ 7800 if (rt_period == 0) 7801 return -EINVAL; 7802 7803 mutex_lock(&rt_constraints_mutex); 7804 read_lock(&tasklist_lock); 7805 err = __rt_schedulable(tg, rt_period, rt_runtime); 7806 if (err) 7807 goto unlock; 7808 7809 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7810 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7811 tg->rt_bandwidth.rt_runtime = rt_runtime; 7812 7813 for_each_possible_cpu(i) { 7814 struct rt_rq *rt_rq = tg->rt_rq[i]; 7815 7816 raw_spin_lock(&rt_rq->rt_runtime_lock); 7817 rt_rq->rt_runtime = rt_runtime; 7818 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7819 } 7820 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7821 unlock: 7822 read_unlock(&tasklist_lock); 7823 mutex_unlock(&rt_constraints_mutex); 7824 7825 return err; 7826 } 7827 7828 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7829 { 7830 u64 rt_runtime, rt_period; 7831 7832 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7833 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7834 if (rt_runtime_us < 0) 7835 rt_runtime = RUNTIME_INF; 7836 7837 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7838 } 7839 7840 static long sched_group_rt_runtime(struct task_group *tg) 7841 { 7842 u64 rt_runtime_us; 7843 7844 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7845 return -1; 7846 7847 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7848 do_div(rt_runtime_us, NSEC_PER_USEC); 7849 return rt_runtime_us; 7850 } 7851 7852 static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 7853 { 7854 u64 rt_runtime, rt_period; 7855 7856 rt_period = rt_period_us * NSEC_PER_USEC; 7857 rt_runtime = tg->rt_bandwidth.rt_runtime; 7858 7859 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7860 } 7861 7862 static long sched_group_rt_period(struct task_group *tg) 7863 { 7864 u64 rt_period_us; 7865 7866 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7867 do_div(rt_period_us, NSEC_PER_USEC); 7868 return rt_period_us; 7869 } 7870 #endif /* CONFIG_RT_GROUP_SCHED */ 7871 7872 #ifdef CONFIG_RT_GROUP_SCHED 7873 static int sched_rt_global_constraints(void) 7874 { 7875 int ret = 0; 7876 7877 mutex_lock(&rt_constraints_mutex); 7878 read_lock(&tasklist_lock); 7879 ret = __rt_schedulable(NULL, 0, 0); 7880 read_unlock(&tasklist_lock); 7881 mutex_unlock(&rt_constraints_mutex); 7882 7883 return ret; 7884 } 7885 7886 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7887 { 7888 /* Don't accept realtime tasks when there is no way for them to run */ 7889 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7890 return 0; 7891 7892 return 1; 7893 } 7894 7895 #else /* !CONFIG_RT_GROUP_SCHED */ 7896 static int sched_rt_global_constraints(void) 7897 { 7898 unsigned long flags; 7899 int i, ret = 0; 7900 7901 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7902 for_each_possible_cpu(i) { 7903 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7904 7905 raw_spin_lock(&rt_rq->rt_runtime_lock); 7906 rt_rq->rt_runtime = global_rt_runtime(); 7907 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7908 } 7909 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7910 7911 return ret; 7912 } 7913 #endif /* CONFIG_RT_GROUP_SCHED */ 7914 7915 static int sched_dl_global_validate(void) 7916 { 7917 u64 runtime = global_rt_runtime(); 7918 u64 period = global_rt_period(); 7919 u64 new_bw = to_ratio(period, runtime); 7920 struct dl_bw *dl_b; 7921 int cpu, ret = 0; 7922 unsigned long flags; 7923 7924 /* 7925 * Here we want to check the bandwidth 
not being set to some 7926 * value smaller than the currently allocated bandwidth in 7927 * any of the root_domains. 7928 * 7929 * FIXME: Cycling on all the CPUs is overdoing, but simpler than 7930 * cycling on root_domains... Discussion on different/better 7931 * solutions is welcome! 7932 */ 7933 for_each_possible_cpu(cpu) { 7934 rcu_read_lock_sched(); 7935 dl_b = dl_bw_of(cpu); 7936 7937 raw_spin_lock_irqsave(&dl_b->lock, flags); 7938 if (new_bw < dl_b->total_bw) 7939 ret = -EBUSY; 7940 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7941 7942 rcu_read_unlock_sched(); 7943 7944 if (ret) 7945 break; 7946 } 7947 7948 return ret; 7949 } 7950 7951 static void sched_dl_do_global(void) 7952 { 7953 u64 new_bw = -1; 7954 struct dl_bw *dl_b; 7955 int cpu; 7956 unsigned long flags; 7957 7958 def_dl_bandwidth.dl_period = global_rt_period(); 7959 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7960 7961 if (global_rt_runtime() != RUNTIME_INF) 7962 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7963 7964 /* 7965 * FIXME: As above... 7966 */ 7967 for_each_possible_cpu(cpu) { 7968 rcu_read_lock_sched(); 7969 dl_b = dl_bw_of(cpu); 7970 7971 raw_spin_lock_irqsave(&dl_b->lock, flags); 7972 dl_b->bw = new_bw; 7973 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7974 7975 rcu_read_unlock_sched(); 7976 } 7977 } 7978 7979 static int sched_rt_global_validate(void) 7980 { 7981 if (sysctl_sched_rt_period <= 0) 7982 return -EINVAL; 7983 7984 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7985 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7986 return -EINVAL; 7987 7988 return 0; 7989 } 7990 7991 static void sched_rt_do_global(void) 7992 { 7993 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7994 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7995 } 7996 7997 int sched_rt_handler(struct ctl_table *table, int write, 7998 void __user *buffer, size_t *lenp, 7999 loff_t *ppos) 8000 { 8001 int old_period, old_runtime; 8002 static DEFINE_MUTEX(mutex); 8003 int ret; 8004 8005 mutex_lock(&mutex); 8006 old_period = sysctl_sched_rt_period; 8007 old_runtime = sysctl_sched_rt_runtime; 8008 8009 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8010 8011 if (!ret && write) { 8012 ret = sched_rt_global_validate(); 8013 if (ret) 8014 goto undo; 8015 8016 ret = sched_dl_global_validate(); 8017 if (ret) 8018 goto undo; 8019 8020 ret = sched_rt_global_constraints(); 8021 if (ret) 8022 goto undo; 8023 8024 sched_rt_do_global(); 8025 sched_dl_do_global(); 8026 } 8027 if (0) { 8028 undo: 8029 sysctl_sched_rt_period = old_period; 8030 sysctl_sched_rt_runtime = old_runtime; 8031 } 8032 mutex_unlock(&mutex); 8033 8034 return ret; 8035 } 8036 8037 int sched_rr_handler(struct ctl_table *table, int write, 8038 void __user *buffer, size_t *lenp, 8039 loff_t *ppos) 8040 { 8041 int ret; 8042 static DEFINE_MUTEX(mutex); 8043 8044 mutex_lock(&mutex); 8045 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8046 /* make sure that internally we keep jiffies */ 8047 /* also, writing zero resets timeslice to default */ 8048 if (!ret && write) { 8049 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 8050 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 8051 } 8052 mutex_unlock(&mutex); 8053 return ret; 8054 } 8055 8056 #ifdef CONFIG_CGROUP_SCHED 8057 8058 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 8059 { 8060 return css ? 
container_of(css, struct task_group, css) : NULL; 8061 } 8062 8063 static struct cgroup_subsys_state * 8064 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 8065 { 8066 struct task_group *parent = css_tg(parent_css); 8067 struct task_group *tg; 8068 8069 if (!parent) { 8070 /* This is early initialization for the top cgroup */ 8071 return &root_task_group.css; 8072 } 8073 8074 tg = sched_create_group(parent); 8075 if (IS_ERR(tg)) 8076 return ERR_PTR(-ENOMEM); 8077 8078 return &tg->css; 8079 } 8080 8081 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 8082 { 8083 struct task_group *tg = css_tg(css); 8084 struct task_group *parent = css_tg(css->parent); 8085 8086 if (parent) 8087 sched_online_group(tg, parent); 8088 return 0; 8089 } 8090 8091 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 8092 { 8093 struct task_group *tg = css_tg(css); 8094 8095 sched_destroy_group(tg); 8096 } 8097 8098 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 8099 { 8100 struct task_group *tg = css_tg(css); 8101 8102 sched_offline_group(tg); 8103 } 8104 8105 static void cpu_cgroup_fork(struct task_struct *task) 8106 { 8107 sched_move_task(task); 8108 } 8109 8110 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 8111 struct cgroup_taskset *tset) 8112 { 8113 struct task_struct *task; 8114 8115 cgroup_taskset_for_each(task, tset) { 8116 #ifdef CONFIG_RT_GROUP_SCHED 8117 if (!sched_rt_can_attach(css_tg(css), task)) 8118 return -EINVAL; 8119 #else 8120 /* We don't support RT-tasks being in separate groups */ 8121 if (task->sched_class != &fair_sched_class) 8122 return -EINVAL; 8123 #endif 8124 } 8125 return 0; 8126 } 8127 8128 static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 8129 struct cgroup_taskset *tset) 8130 { 8131 struct task_struct *task; 8132 8133 cgroup_taskset_for_each(task, tset) 8134 sched_move_task(task); 8135 } 8136 8137 static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 8138 struct cgroup_subsys_state *old_css, 8139 struct task_struct *task) 8140 { 8141 /* 8142 * cgroup_exit() is called in the copy_process() failure path. 8143 * Ignore this case since the task hasn't ran yet, this avoids 8144 * trying to poke a half freed task state from generic code. 8145 */ 8146 if (!(task->flags & PF_EXITING)) 8147 return; 8148 8149 sched_move_task(task); 8150 } 8151 8152 #ifdef CONFIG_FAIR_GROUP_SCHED 8153 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8154 struct cftype *cftype, u64 shareval) 8155 { 8156 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 8157 } 8158 8159 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 8160 struct cftype *cft) 8161 { 8162 struct task_group *tg = css_tg(css); 8163 8164 return (u64) scale_load_down(tg->shares); 8165 } 8166 8167 #ifdef CONFIG_CFS_BANDWIDTH 8168 static DEFINE_MUTEX(cfs_constraints_mutex); 8169 8170 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 8171 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 8172 8173 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 8174 8175 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 8176 { 8177 int i, ret = 0, runtime_enabled, runtime_was_enabled; 8178 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 8179 8180 if (tg == &root_task_group) 8181 return -EINVAL; 8182 8183 /* 8184 * Ensure we have at some amount of bandwidth every period. 
This is 8185 * to prevent reaching a state of large arrears when throttled via 8186 * entity_tick() resulting in prolonged exit starvation. 8187 */ 8188 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 8189 return -EINVAL; 8190 8191 /* 8192 * Likewise, bound things on the otherside by preventing insane quota 8193 * periods. This also allows us to normalize in computing quota 8194 * feasibility. 8195 */ 8196 if (period > max_cfs_quota_period) 8197 return -EINVAL; 8198 8199 /* 8200 * Prevent race between setting of cfs_rq->runtime_enabled and 8201 * unthrottle_offline_cfs_rqs(). 8202 */ 8203 get_online_cpus(); 8204 mutex_lock(&cfs_constraints_mutex); 8205 ret = __cfs_schedulable(tg, period, quota); 8206 if (ret) 8207 goto out_unlock; 8208 8209 runtime_enabled = quota != RUNTIME_INF; 8210 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 8211 /* 8212 * If we need to toggle cfs_bandwidth_used, off->on must occur 8213 * before making related changes, and on->off must occur afterwards 8214 */ 8215 if (runtime_enabled && !runtime_was_enabled) 8216 cfs_bandwidth_usage_inc(); 8217 raw_spin_lock_irq(&cfs_b->lock); 8218 cfs_b->period = ns_to_ktime(period); 8219 cfs_b->quota = quota; 8220 8221 __refill_cfs_bandwidth_runtime(cfs_b); 8222 /* restart the period timer (if active) to handle new period expiry */ 8223 if (runtime_enabled) 8224 start_cfs_bandwidth(cfs_b); 8225 raw_spin_unlock_irq(&cfs_b->lock); 8226 8227 for_each_online_cpu(i) { 8228 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 8229 struct rq *rq = cfs_rq->rq; 8230 8231 raw_spin_lock_irq(&rq->lock); 8232 cfs_rq->runtime_enabled = runtime_enabled; 8233 cfs_rq->runtime_remaining = 0; 8234 8235 if (cfs_rq->throttled) 8236 unthrottle_cfs_rq(cfs_rq); 8237 raw_spin_unlock_irq(&rq->lock); 8238 } 8239 if (runtime_was_enabled && !runtime_enabled) 8240 cfs_bandwidth_usage_dec(); 8241 out_unlock: 8242 mutex_unlock(&cfs_constraints_mutex); 8243 put_online_cpus(); 8244 8245 return ret; 8246 } 8247 8248 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 8249 { 8250 u64 quota, period; 8251 8252 period = ktime_to_ns(tg->cfs_bandwidth.period); 8253 if (cfs_quota_us < 0) 8254 quota = RUNTIME_INF; 8255 else 8256 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 8257 8258 return tg_set_cfs_bandwidth(tg, period, quota); 8259 } 8260 8261 long tg_get_cfs_quota(struct task_group *tg) 8262 { 8263 u64 quota_us; 8264 8265 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 8266 return -1; 8267 8268 quota_us = tg->cfs_bandwidth.quota; 8269 do_div(quota_us, NSEC_PER_USEC); 8270 8271 return quota_us; 8272 } 8273 8274 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 8275 { 8276 u64 quota, period; 8277 8278 period = (u64)cfs_period_us * NSEC_PER_USEC; 8279 quota = tg->cfs_bandwidth.quota; 8280 8281 return tg_set_cfs_bandwidth(tg, period, quota); 8282 } 8283 8284 long tg_get_cfs_period(struct task_group *tg) 8285 { 8286 u64 cfs_period_us; 8287 8288 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 8289 do_div(cfs_period_us, NSEC_PER_USEC); 8290 8291 return cfs_period_us; 8292 } 8293 8294 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 8295 struct cftype *cft) 8296 { 8297 return tg_get_cfs_quota(css_tg(css)); 8298 } 8299 8300 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 8301 struct cftype *cftype, s64 cfs_quota_us) 8302 { 8303 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 8304 } 8305 8306 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 8307 struct cftype *cft) 8308 { 8309 return 
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	return tg_get_cfs_quota(css_tg(css));
}

static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}

static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return tg_get_cfs_period(css_tg(css));
}

static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	return tg_set_cfs_period(css_tg(css), cfs_period_us);
}

struct cfs_schedulable_data {
	struct task_group *tg;
	u64 period, quota;
};

/*
 * normalize group quota/period to be quota/max_period
 * note: units are usecs
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * ensure max(child_quota) <= parent_quota, inherit when no
		 * limit is set
		 */
		if (quota == RUNTIME_INF)
			quota = parent_quota;
		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
			return -EINVAL;
	}
	cfs_b->hierarchical_quota = quota;

	return 0;
}

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	int ret;
	struct cfs_schedulable_data data = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};

	if (quota != RUNTIME_INF) {
		do_div(data.period, NSEC_PER_USEC);
		do_div(data.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int cpu_stats_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

	return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
				struct cftype *cft, s64 val)
{
	return sched_group_set_rt_runtime(css_tg(css), val);
}

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return sched_group_rt_runtime(css_tg(css));
}

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 rt_period_us)
{
	return sched_group_set_rt_period(css_tg(css), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */
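/*
 * The tree walk above enforces that a child group's quota/period ratio never
 * exceeds its parent's, and that an unlimited child simply inherits the
 * parent's effective limit.  The guarded block below is a minimal userspace
 * restatement of that rule on plain arithmetic (doubles instead of
 * to_ratio()'s fixed-point representation); the group values are made up for
 * illustration.
 */
#if 0	/* illustrative only, never compiled as part of the kernel */
#include <stdio.h>

#define UNLIMITED	-1.0

/* Return the effective ratio, or -2.0 when the child would exceed its parent. */
static double effective_ratio(double parent, double quota_us, double period_us)
{
	double ratio = quota_us < 0 ? UNLIMITED : quota_us / period_us;

	if (ratio < 0)				/* no limit set: inherit */
		return parent;
	if (parent >= 0 && ratio > parent)	/* exceeds parent: reject */
		return -2.0;
	return ratio;
}

int main(void)
{
	double root = UNLIMITED;
	double parent = effective_ratio(root, 50000, 100000);	/* 0.5 of a CPU */
	double child = effective_ratio(parent, 80000, 100000);	/* 0.8 of a CPU */

	printf("parent %.2f, child %s\n", parent,
	       child <= -2.0 ? "rejected (-EINVAL)" : "accepted");
	return 0;
}
#endif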
.name = "shares", 8440 .read_u64 = cpu_shares_read_u64, 8441 .write_u64 = cpu_shares_write_u64, 8442 }, 8443 #endif 8444 #ifdef CONFIG_CFS_BANDWIDTH 8445 { 8446 .name = "cfs_quota_us", 8447 .read_s64 = cpu_cfs_quota_read_s64, 8448 .write_s64 = cpu_cfs_quota_write_s64, 8449 }, 8450 { 8451 .name = "cfs_period_us", 8452 .read_u64 = cpu_cfs_period_read_u64, 8453 .write_u64 = cpu_cfs_period_write_u64, 8454 }, 8455 { 8456 .name = "stat", 8457 .seq_show = cpu_stats_show, 8458 }, 8459 #endif 8460 #ifdef CONFIG_RT_GROUP_SCHED 8461 { 8462 .name = "rt_runtime_us", 8463 .read_s64 = cpu_rt_runtime_read, 8464 .write_s64 = cpu_rt_runtime_write, 8465 }, 8466 { 8467 .name = "rt_period_us", 8468 .read_u64 = cpu_rt_period_read_uint, 8469 .write_u64 = cpu_rt_period_write_uint, 8470 }, 8471 #endif 8472 { } /* terminate */ 8473 }; 8474 8475 struct cgroup_subsys cpu_cgrp_subsys = { 8476 .css_alloc = cpu_cgroup_css_alloc, 8477 .css_free = cpu_cgroup_css_free, 8478 .css_online = cpu_cgroup_css_online, 8479 .css_offline = cpu_cgroup_css_offline, 8480 .fork = cpu_cgroup_fork, 8481 .can_attach = cpu_cgroup_can_attach, 8482 .attach = cpu_cgroup_attach, 8483 .exit = cpu_cgroup_exit, 8484 .legacy_cftypes = cpu_files, 8485 .early_init = 1, 8486 }; 8487 8488 #endif /* CONFIG_CGROUP_SCHED */ 8489 8490 void dump_cpu_task(int cpu) 8491 { 8492 pr_info("Task dump for CPU %d:\n", cpu); 8493 sched_show_task(cpu_curr(cpu)); 8494 } 8495