/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues. Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
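/*
 * The per-CPU runqueues themselves; code in this file normally reaches
 * them through the cpu_rq(), this_rq() and task_rq() helpers (see
 * "sched.h") rather than by touching the per-CPU variable directly.
 */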
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
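 * Raising this lets a single balance pass move more tasks at once, at the
 * price of a longer IRQs-off section.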
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;


/*
 * __task_rq_lock - lock the rq @p resides on.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void
task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static int __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = hrtimer_get_softexpires(timer);

	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SCHED_HRTICK */

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP
void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
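 *
 * If no busy CPU is found in any of this CPU's sched domains, the local
 * CPU is returned as a fallback.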
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			smp_send_reschedule(cpu);
		return true;
	}

	return false;
}

void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
	struct rq *rq;

	rq = this_rq();

	/* Make sure rq->nr_running update is visible after the IPI */
	smp_rmb();

	/* More than one running task need preemption */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
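		 *
		 * The empty asm body with the "+rm" constraint makes the
		 * compiler treat rq->age_stamp as modified on every pass,
		 * so the loop cannot be strength-reduced into a division.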
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#else /* !CONFIG_SMP */
void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight mis-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);

void register_task_migration_notifier(struct notifier_block *n)
{
	atomic_notifier_chain_register(&task_migration_notifier, n);
}

#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		struct task_migration_notifier tmn;

		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

		tmn.task = p;
		tmn.from_cpu = task_cpu(p);
		tmn.to_cpu = new_cpu;

		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
	}

	__set_task_cpu(p, new_cpu);
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

static int migration_cpu_stop(void *data);

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
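 *
 * A caller that needs to know @p stayed off the CPU across some operation
 * can therefore call this twice and compare the returned counts, e.g.:
 *
 *	ncsw = wait_task_inactive(p, state);
 *	... operate on the (inactive) task ...
 *	if (wait_task_inactive(p, state) != ncsw)
 *		goto retry;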
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
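 *
 * Signal delivery is a typical user: e.g. signal_wake_up_state() falls
 * back to kicking the task when wake_up_state() found it already running.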
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */

#ifdef CONFIG_SMP
/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the cpu is on has been offlined, cpu_to_node()
	 * will return -1. There is no cpu on the node, and we should
	 * select the cpu on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_sched("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}
#endif

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = 1;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	check_preempt_curr(rq, p, wake_flags);
	trace_sched_wakeup(p, true);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*sysctl_sched_migration_cost;

		if (delta > max)
			rq->avg_idle = max;
		else
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
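 *
 * Returns 1 when the light wakeup was done, 0 when the task got fully
 * dequeued in the meantime and the caller has to do a full wakeup.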
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	if (p->on_rq) {
		/* check_preempt_curr() may use rq clock */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}

#ifdef CONFIG_SMP
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}

void scheduler_ipi(void)
{
	if (llist_empty(&this_rq()->wake_list)
			&& !tick_nohz_full_cpu(smp_processor_id())
			&& !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	tick_nohz_full_check();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Return: %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
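 *
 * Wakers pair with the barrier implied by set_current_state() on the
 * sleeper's side: the waker's store of the wake condition, done before
 * calling this, cannot be reordered past the @p->state check below (see
 * the smp_mb__before_spinlock() comment in the body).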
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until it's done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	if (!p->on_rq)
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
int wake_up_process(struct task_struct *p)
{
	WARN_ON(task_is_stopped_or_traced(p));
	return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);

int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 */
static void __sched_fork(struct task_struct *p)
{
	p->on_rq = 0;

	p->se.on_rq = 0;
	p->se.exec_start = 0;
	p->se.sum_exec_runtime = 0;
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
	p->se.vruntime = 0;
	INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies;
		p->mm->numa_next_reset = jiffies;
		p->mm->numa_scan_seq = 0;
	}

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}

#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */
#endif /* CONFIG_NUMA_BALANCING */

/*
 * fork()/clone()-time setup:
 */
void sched_fork(struct task_struct *p)
{
	unsigned long flags;
	int cpu = get_cpu();

	__sched_fork(p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child because cgroup_fork()
	 * runs before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	/* Initialize new task's runnable average */
	init_task_runnable_average(p);
	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}

	tick_nohz_task_switch(current);
}

#ifdef CONFIG_SMP

/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}

#else

static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}

static inline void post_schedule(struct rq *rq)
{
}

#endif

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/*
	 * FIXME: do we need to worry about rq being invalidated by the
	 * task_switch?
	 */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * The runqueue lock will be released by the next task (which is an
	 * invalid locking op but in the case of the scheduler it's an obvious
	 * special-case), so we do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack.
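	 *
	 * switch_to() returns in the context of @next; when this task is
	 * later switched back in, execution resumes below with @prev then
	 * pointing at whatever task ran immediately before us.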
	 */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}

/*
 * nr_running and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, total number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_iowait_cpu(int cpu)
{
	struct rq *this = cpu_rq(cpu);
	return atomic_read(&this->nr_iowait);
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * Return any ns on the sched_clock that have not yet been accounted in
 * @p in case that task is currently running.
 *
 * Called with task_rq_lock() held on @rq.
 */
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
	u64 ns = 0;

	if (task_current(rq, p)) {
		update_rq_clock(rq);
		ns = rq_clock_task(rq) - p->se.exec_start;
		if ((s64)ns < 0)
			ns = 0;
	}

	return ns;
}

unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	u64 ns = 0;

	rq = task_rq_lock(p, &flags);
	ns = do_task_delta_exec(p, rq);
	task_rq_unlock(rq, p, &flags);

	return ns;
}

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
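 *
 * I.e. p->se.sum_exec_runtime plus the not-yet-accounted delta computed
 * by do_task_delta_exec() above, taken under task_rq_lock().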
2151 */ 2152 unsigned long long task_sched_runtime(struct task_struct *p) 2153 { 2154 unsigned long flags; 2155 struct rq *rq; 2156 u64 ns = 0; 2157 2158 rq = task_rq_lock(p, &flags); 2159 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2160 task_rq_unlock(rq, p, &flags); 2161 2162 return ns; 2163 } 2164 2165 /* 2166 * This function gets called by the timer code, with HZ frequency. 2167 * We call it with interrupts disabled. 2168 */ 2169 void scheduler_tick(void) 2170 { 2171 int cpu = smp_processor_id(); 2172 struct rq *rq = cpu_rq(cpu); 2173 struct task_struct *curr = rq->curr; 2174 2175 sched_clock_tick(); 2176 2177 raw_spin_lock(&rq->lock); 2178 update_rq_clock(rq); 2179 curr->sched_class->task_tick(rq, curr, 0); 2180 update_cpu_load_active(rq); 2181 raw_spin_unlock(&rq->lock); 2182 2183 perf_event_task_tick(); 2184 2185 #ifdef CONFIG_SMP 2186 rq->idle_balance = idle_cpu(cpu); 2187 trigger_load_balance(rq, cpu); 2188 #endif 2189 rq_last_tick_reset(rq); 2190 } 2191 2192 #ifdef CONFIG_NO_HZ_FULL 2193 /** 2194 * scheduler_tick_max_deferment 2195 * 2196 * Keep at least one tick per second when a single 2197 * active task is running because the scheduler doesn't 2198 * yet completely support full dynticks environment. 2199 * 2200 * This makes sure that uptime, CFS vruntime, load 2201 * balancing, etc... continue to move forward, even 2202 * with a very low granularity. 2203 * 2204 * Return: Maximum deferment in nanoseconds. 2205 */ 2206 u64 scheduler_tick_max_deferment(void) 2207 { 2208 struct rq *rq = this_rq(); 2209 unsigned long next, now = ACCESS_ONCE(jiffies); 2210 2211 next = rq->last_sched_tick + HZ; 2212 2213 if (time_before_eq(next, now)) 2214 return 0; 2215 2216 return jiffies_to_usecs(next - now) * NSEC_PER_USEC; 2217 } 2218 #endif 2219 2220 notrace unsigned long get_parent_ip(unsigned long addr) 2221 { 2222 if (in_lock_functions(addr)) { 2223 addr = CALLER_ADDR2; 2224 if (in_lock_functions(addr)) 2225 addr = CALLER_ADDR3; 2226 } 2227 return addr; 2228 } 2229 2230 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2231 defined(CONFIG_PREEMPT_TRACER)) 2232 2233 void __kprobes add_preempt_count(int val) 2234 { 2235 #ifdef CONFIG_DEBUG_PREEMPT 2236 /* 2237 * Underflow? 2238 */ 2239 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2240 return; 2241 #endif 2242 preempt_count() += val; 2243 #ifdef CONFIG_DEBUG_PREEMPT 2244 /* 2245 * Spinlock count overflowing soon? 2246 */ 2247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2248 PREEMPT_MASK - 10); 2249 #endif 2250 if (preempt_count() == val) 2251 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2252 } 2253 EXPORT_SYMBOL(add_preempt_count); 2254 2255 void __kprobes sub_preempt_count(int val) 2256 { 2257 #ifdef CONFIG_DEBUG_PREEMPT 2258 /* 2259 * Underflow? 2260 */ 2261 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2262 return; 2263 /* 2264 * Is the spinlock portion underflowing? 
2265 */ 2266 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2267 !(preempt_count() & PREEMPT_MASK))) 2268 return; 2269 #endif 2270 2271 if (preempt_count() == val) 2272 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2273 preempt_count() -= val; 2274 } 2275 EXPORT_SYMBOL(sub_preempt_count); 2276 2277 #endif 2278 2279 /* 2280 * Print scheduling while atomic bug: 2281 */ 2282 static noinline void __schedule_bug(struct task_struct *prev) 2283 { 2284 if (oops_in_progress) 2285 return; 2286 2287 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2288 prev->comm, prev->pid, preempt_count()); 2289 2290 debug_show_held_locks(prev); 2291 print_modules(); 2292 if (irqs_disabled()) 2293 print_irqtrace_events(prev); 2294 dump_stack(); 2295 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2296 } 2297 2298 /* 2299 * Various schedule()-time debugging checks and statistics: 2300 */ 2301 static inline void schedule_debug(struct task_struct *prev) 2302 { 2303 /* 2304 * Test if we are atomic. Since do_exit() needs to call into 2305 * schedule() atomically, we ignore that path for now. 2306 * Otherwise, whine if we are scheduling when we should not be. 2307 */ 2308 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 2309 __schedule_bug(prev); 2310 rcu_sleep_check(); 2311 2312 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2313 2314 schedstat_inc(this_rq(), sched_count); 2315 } 2316 2317 static void put_prev_task(struct rq *rq, struct task_struct *prev) 2318 { 2319 if (prev->on_rq || rq->skip_clock_update < 0) 2320 update_rq_clock(rq); 2321 prev->sched_class->put_prev_task(rq, prev); 2322 } 2323 2324 /* 2325 * Pick up the highest-prio task: 2326 */ 2327 static inline struct task_struct * 2328 pick_next_task(struct rq *rq) 2329 { 2330 const struct sched_class *class; 2331 struct task_struct *p; 2332 2333 /* 2334 * Optimization: we know that if all tasks are in 2335 * the fair class we can call that function directly: 2336 */ 2337 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2338 p = fair_sched_class.pick_next_task(rq); 2339 if (likely(p)) 2340 return p; 2341 } 2342 2343 for_each_class(class) { 2344 p = class->pick_next_task(rq); 2345 if (p) 2346 return p; 2347 } 2348 2349 BUG(); /* the idle class will always have a runnable task */ 2350 } 2351 2352 /* 2353 * __schedule() is the main scheduler function. 2354 * 2355 * The main means of driving the scheduler and thus entering this function are: 2356 * 2357 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2358 * 2359 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2360 * paths. For example, see arch/x86/entry_64.S. 2361 * 2362 * To drive preemption between tasks, the scheduler sets the flag in timer 2363 * interrupt handler scheduler_tick(). 2364 * 2365 * 3. Wakeups don't really cause entry into schedule(). They add a 2366 * task to the run-queue and that's it. 2367 * 2368 * Now, if the new task added to the run-queue preempts the current 2369 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2370 * called on the nearest possible occasion: 2371 * 2372 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2373 * 2374 * - in syscall or exception context, at the next outmost 2375 * preempt_enable(). (this might be as soon as the wake_up()'s 2376 * spin_unlock()!) 
2377 * 2378 * - in IRQ context, return from interrupt-handler to 2379 * preemptible context 2380 * 2381 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2382 * then at the next: 2383 * 2384 * - cond_resched() call 2385 * - explicit schedule() call 2386 * - return from syscall or exception to user-space 2387 * - return from interrupt-handler to user-space 2388 */ 2389 static void __sched __schedule(void) 2390 { 2391 struct task_struct *prev, *next; 2392 unsigned long *switch_count; 2393 struct rq *rq; 2394 int cpu; 2395 2396 need_resched: 2397 preempt_disable(); 2398 cpu = smp_processor_id(); 2399 rq = cpu_rq(cpu); 2400 rcu_note_context_switch(cpu); 2401 prev = rq->curr; 2402 2403 schedule_debug(prev); 2404 2405 if (sched_feat(HRTICK)) 2406 hrtick_clear(rq); 2407 2408 /* 2409 * Make sure that signal_pending_state()->signal_pending() below 2410 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2411 * done by the caller to avoid the race with signal_wake_up(). 2412 */ 2413 smp_mb__before_spinlock(); 2414 raw_spin_lock_irq(&rq->lock); 2415 2416 switch_count = &prev->nivcsw; 2417 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2418 if (unlikely(signal_pending_state(prev->state, prev))) { 2419 prev->state = TASK_RUNNING; 2420 } else { 2421 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2422 prev->on_rq = 0; 2423 2424 /* 2425 * If a worker went to sleep, notify and ask workqueue 2426 * whether it wants to wake up a task to maintain 2427 * concurrency. 2428 */ 2429 if (prev->flags & PF_WQ_WORKER) { 2430 struct task_struct *to_wakeup; 2431 2432 to_wakeup = wq_worker_sleeping(prev, cpu); 2433 if (to_wakeup) 2434 try_to_wake_up_local(to_wakeup); 2435 } 2436 } 2437 switch_count = &prev->nvcsw; 2438 } 2439 2440 pre_schedule(rq, prev); 2441 2442 if (unlikely(!rq->nr_running)) 2443 idle_balance(cpu, rq); 2444 2445 put_prev_task(rq, prev); 2446 next = pick_next_task(rq); 2447 clear_tsk_need_resched(prev); 2448 rq->skip_clock_update = 0; 2449 2450 if (likely(prev != next)) { 2451 rq->nr_switches++; 2452 rq->curr = next; 2453 ++*switch_count; 2454 2455 context_switch(rq, prev, next); /* unlocks the rq */ 2456 /* 2457 * The context switch have flipped the stack from under us 2458 * and restored the local variables which were saved when 2459 * this task called schedule() in the past. prev == current 2460 * is still correct, but it can be moved to another cpu/rq. 2461 */ 2462 cpu = smp_processor_id(); 2463 rq = cpu_rq(cpu); 2464 } else 2465 raw_spin_unlock_irq(&rq->lock); 2466 2467 post_schedule(rq); 2468 2469 sched_preempt_enable_no_resched(); 2470 if (need_resched()) 2471 goto need_resched; 2472 } 2473 2474 static inline void sched_submit_work(struct task_struct *tsk) 2475 { 2476 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2477 return; 2478 /* 2479 * If we are going to sleep and we have plugged IO queued, 2480 * make sure to submit it to avoid deadlocks. 2481 */ 2482 if (blk_needs_flush_plug(tsk)) 2483 blk_schedule_flush_plug(tsk); 2484 } 2485 2486 asmlinkage void __sched schedule(void) 2487 { 2488 struct task_struct *tsk = current; 2489 2490 sched_submit_work(tsk); 2491 __schedule(); 2492 } 2493 EXPORT_SYMBOL(schedule); 2494 2495 #ifdef CONFIG_CONTEXT_TRACKING 2496 asmlinkage void __sched schedule_user(void) 2497 { 2498 /* 2499 * If we come here after a random call to set_need_resched(), 2500 * or we have been woken up remotely but the IPI has not yet arrived, 2501 * we haven't yet exited the RCU idle mode. Do it here manually until 2502 * we find a better solution. 
2503 */ 2504 user_exit(); 2505 schedule(); 2506 user_enter(); 2507 } 2508 #endif 2509 2510 /** 2511 * schedule_preempt_disabled - called with preemption disabled 2512 * 2513 * Returns with preemption disabled. Note: preempt_count must be 1 2514 */ 2515 void __sched schedule_preempt_disabled(void) 2516 { 2517 sched_preempt_enable_no_resched(); 2518 schedule(); 2519 preempt_disable(); 2520 } 2521 2522 #ifdef CONFIG_PREEMPT 2523 /* 2524 * this is the entry point to schedule() from in-kernel preemption 2525 * off of preempt_enable. Kernel preemptions off return from interrupt 2526 * occur there and call schedule directly. 2527 */ 2528 asmlinkage void __sched notrace preempt_schedule(void) 2529 { 2530 struct thread_info *ti = current_thread_info(); 2531 2532 /* 2533 * If there is a non-zero preempt_count or interrupts are disabled, 2534 * we do not want to preempt the current task. Just return.. 2535 */ 2536 if (likely(ti->preempt_count || irqs_disabled())) 2537 return; 2538 2539 do { 2540 add_preempt_count_notrace(PREEMPT_ACTIVE); 2541 __schedule(); 2542 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2543 2544 /* 2545 * Check again in case we missed a preemption opportunity 2546 * between schedule and now. 2547 */ 2548 barrier(); 2549 } while (need_resched()); 2550 } 2551 EXPORT_SYMBOL(preempt_schedule); 2552 2553 /* 2554 * this is the entry point to schedule() from kernel preemption 2555 * off of irq context. 2556 * Note, that this is called and return with irqs disabled. This will 2557 * protect us against recursive calling from irq. 2558 */ 2559 asmlinkage void __sched preempt_schedule_irq(void) 2560 { 2561 struct thread_info *ti = current_thread_info(); 2562 enum ctx_state prev_state; 2563 2564 /* Catch callers which need to be fixed */ 2565 BUG_ON(ti->preempt_count || !irqs_disabled()); 2566 2567 prev_state = exception_enter(); 2568 2569 do { 2570 add_preempt_count(PREEMPT_ACTIVE); 2571 local_irq_enable(); 2572 __schedule(); 2573 local_irq_disable(); 2574 sub_preempt_count(PREEMPT_ACTIVE); 2575 2576 /* 2577 * Check again in case we missed a preemption opportunity 2578 * between schedule and now. 2579 */ 2580 barrier(); 2581 } while (need_resched()); 2582 2583 exception_exit(prev_state); 2584 } 2585 2586 #endif /* CONFIG_PREEMPT */ 2587 2588 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2589 void *key) 2590 { 2591 return try_to_wake_up(curr->private, mode, wake_flags); 2592 } 2593 EXPORT_SYMBOL(default_wake_function); 2594 2595 /* 2596 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 2597 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 2598 * number) then we wake all the non-exclusive tasks and one exclusive task. 2599 * 2600 * There are circumstances in which we can try to wake a task which has already 2601 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 2602 * zero in this (rare) case, and we handle it by continuing to scan the queue. 2603 */ 2604 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 2605 int nr_exclusive, int wake_flags, void *key) 2606 { 2607 wait_queue_t *curr, *next; 2608 2609 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 2610 unsigned flags = curr->flags; 2611 2612 if (curr->func(curr, mode, wake_flags, key) && 2613 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 2614 break; 2615 } 2616 } 2617 2618 /** 2619 * __wake_up - wake up threads blocked on a waitqueue. 
2620 * @q: the waitqueue 2621 * @mode: which threads 2622 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2623 * @key: is directly passed to the wakeup function 2624 * 2625 * It may be assumed that this function implies a write memory barrier before 2626 * changing the task state if and only if any tasks are woken up. 2627 */ 2628 void __wake_up(wait_queue_head_t *q, unsigned int mode, 2629 int nr_exclusive, void *key) 2630 { 2631 unsigned long flags; 2632 2633 spin_lock_irqsave(&q->lock, flags); 2634 __wake_up_common(q, mode, nr_exclusive, 0, key); 2635 spin_unlock_irqrestore(&q->lock, flags); 2636 } 2637 EXPORT_SYMBOL(__wake_up); 2638 2639 /* 2640 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 2641 */ 2642 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) 2643 { 2644 __wake_up_common(q, mode, nr, 0, NULL); 2645 } 2646 EXPORT_SYMBOL_GPL(__wake_up_locked); 2647 2648 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 2649 { 2650 __wake_up_common(q, mode, 1, 0, key); 2651 } 2652 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 2653 2654 /** 2655 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 2656 * @q: the waitqueue 2657 * @mode: which threads 2658 * @nr_exclusive: how many wake-one or wake-many threads to wake up 2659 * @key: opaque value to be passed to wakeup targets 2660 * 2661 * The sync wakeup differs that the waker knows that it will schedule 2662 * away soon, so while the target thread will be woken up, it will not 2663 * be migrated to another CPU - ie. the two threads are 'synchronized' 2664 * with each other. This can prevent needless bouncing between CPUs. 2665 * 2666 * On UP it can prevent extra preemption. 2667 * 2668 * It may be assumed that this function implies a write memory barrier before 2669 * changing the task state if and only if any tasks are woken up. 2670 */ 2671 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 2672 int nr_exclusive, void *key) 2673 { 2674 unsigned long flags; 2675 int wake_flags = WF_SYNC; 2676 2677 if (unlikely(!q)) 2678 return; 2679 2680 if (unlikely(!nr_exclusive)) 2681 wake_flags = 0; 2682 2683 spin_lock_irqsave(&q->lock, flags); 2684 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 2685 spin_unlock_irqrestore(&q->lock, flags); 2686 } 2687 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 2688 2689 /* 2690 * __wake_up_sync - see __wake_up_sync_key() 2691 */ 2692 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 2693 { 2694 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 2695 } 2696 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 2697 2698 /** 2699 * complete: - signals a single thread waiting on this completion 2700 * @x: holds the state of this particular completion 2701 * 2702 * This will wake up a single thread waiting on this completion. Threads will be 2703 * awakened in the same order in which they were queued. 2704 * 2705 * See also complete_all(), wait_for_completion() and related routines. 2706 * 2707 * It may be assumed that this function implies a write memory barrier before 2708 * changing the task state if and only if any tasks are woken up. 
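 *
 * Purely as an illustration of the intended pairing with
 * wait_for_completion() -- the completion and helper names below are made
 * up for this example and are not part of this file:
 *
 *	static DECLARE_COMPLETION(setup_done);
 *
 *	static int setup_thread_fn(void *unused)
 *	{
 *		do_expensive_setup();
 *		complete(&setup_done);
 *		return 0;
 *	}
 *
 * while the side that needs the result simply calls
 * wait_for_completion(&setup_done) and is woken exactly once.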
2709 */ 2710 void complete(struct completion *x) 2711 { 2712 unsigned long flags; 2713 2714 spin_lock_irqsave(&x->wait.lock, flags); 2715 x->done++; 2716 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 2717 spin_unlock_irqrestore(&x->wait.lock, flags); 2718 } 2719 EXPORT_SYMBOL(complete); 2720 2721 /** 2722 * complete_all: - signals all threads waiting on this completion 2723 * @x: holds the state of this particular completion 2724 * 2725 * This will wake up all threads waiting on this particular completion event. 2726 * 2727 * It may be assumed that this function implies a write memory barrier before 2728 * changing the task state if and only if any tasks are woken up. 2729 */ 2730 void complete_all(struct completion *x) 2731 { 2732 unsigned long flags; 2733 2734 spin_lock_irqsave(&x->wait.lock, flags); 2735 x->done += UINT_MAX/2; 2736 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 2737 spin_unlock_irqrestore(&x->wait.lock, flags); 2738 } 2739 EXPORT_SYMBOL(complete_all); 2740 2741 static inline long __sched 2742 do_wait_for_common(struct completion *x, 2743 long (*action)(long), long timeout, int state) 2744 { 2745 if (!x->done) { 2746 DECLARE_WAITQUEUE(wait, current); 2747 2748 __add_wait_queue_tail_exclusive(&x->wait, &wait); 2749 do { 2750 if (signal_pending_state(state, current)) { 2751 timeout = -ERESTARTSYS; 2752 break; 2753 } 2754 __set_current_state(state); 2755 spin_unlock_irq(&x->wait.lock); 2756 timeout = action(timeout); 2757 spin_lock_irq(&x->wait.lock); 2758 } while (!x->done && timeout); 2759 __remove_wait_queue(&x->wait, &wait); 2760 if (!x->done) 2761 return timeout; 2762 } 2763 x->done--; 2764 return timeout ?: 1; 2765 } 2766 2767 static inline long __sched 2768 __wait_for_common(struct completion *x, 2769 long (*action)(long), long timeout, int state) 2770 { 2771 might_sleep(); 2772 2773 spin_lock_irq(&x->wait.lock); 2774 timeout = do_wait_for_common(x, action, timeout, state); 2775 spin_unlock_irq(&x->wait.lock); 2776 return timeout; 2777 } 2778 2779 static long __sched 2780 wait_for_common(struct completion *x, long timeout, int state) 2781 { 2782 return __wait_for_common(x, schedule_timeout, timeout, state); 2783 } 2784 2785 static long __sched 2786 wait_for_common_io(struct completion *x, long timeout, int state) 2787 { 2788 return __wait_for_common(x, io_schedule_timeout, timeout, state); 2789 } 2790 2791 /** 2792 * wait_for_completion: - waits for completion of a task 2793 * @x: holds the state of this particular completion 2794 * 2795 * This waits to be signaled for completion of a specific task. It is NOT 2796 * interruptible and there is no timeout. 2797 * 2798 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 2799 * and interrupt capability. Also see complete(). 2800 */ 2801 void __sched wait_for_completion(struct completion *x) 2802 { 2803 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 2804 } 2805 EXPORT_SYMBOL(wait_for_completion); 2806 2807 /** 2808 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 2809 * @x: holds the state of this particular completion 2810 * @timeout: timeout value in jiffies 2811 * 2812 * This waits for either a completion of a specific task to be signaled or for a 2813 * specified timeout to expire. The timeout is in jiffies. It is not 2814 * interruptible. 2815 * 2816 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 2817 * till timeout) if completed. 
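 *
 * A minimal, purely illustrative way to consume the return value (the
 * "dev_ready" completion and the two second bound are hypothetical):
 *
 *	unsigned long left;
 *
 *	left = wait_for_completion_timeout(&dev_ready, 2 * HZ);
 *	if (!left)
 *		return -ETIMEDOUT;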
2818 */ 2819 unsigned long __sched 2820 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2821 { 2822 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 2823 } 2824 EXPORT_SYMBOL(wait_for_completion_timeout); 2825 2826 /** 2827 * wait_for_completion_io: - waits for completion of a task 2828 * @x: holds the state of this particular completion 2829 * 2830 * This waits to be signaled for completion of a specific task. It is NOT 2831 * interruptible and there is no timeout. The caller is accounted as waiting 2832 * for IO. 2833 */ 2834 void __sched wait_for_completion_io(struct completion *x) 2835 { 2836 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 2837 } 2838 EXPORT_SYMBOL(wait_for_completion_io); 2839 2840 /** 2841 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) 2842 * @x: holds the state of this particular completion 2843 * @timeout: timeout value in jiffies 2844 * 2845 * This waits for either a completion of a specific task to be signaled or for a 2846 * specified timeout to expire. The timeout is in jiffies. It is not 2847 * interruptible. The caller is accounted as waiting for IO. 2848 * 2849 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left 2850 * till timeout) if completed. 2851 */ 2852 unsigned long __sched 2853 wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2854 { 2855 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); 2856 } 2857 EXPORT_SYMBOL(wait_for_completion_io_timeout); 2858 2859 /** 2860 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 2861 * @x: holds the state of this particular completion 2862 * 2863 * This waits for completion of a specific task to be signaled. It is 2864 * interruptible. 2865 * 2866 * Return: -ERESTARTSYS if interrupted, 0 if completed. 2867 */ 2868 int __sched wait_for_completion_interruptible(struct completion *x) 2869 { 2870 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 2871 if (t == -ERESTARTSYS) 2872 return t; 2873 return 0; 2874 } 2875 EXPORT_SYMBOL(wait_for_completion_interruptible); 2876 2877 /** 2878 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 2879 * @x: holds the state of this particular completion 2880 * @timeout: timeout value in jiffies 2881 * 2882 * This waits for either a completion of a specific task to be signaled or for a 2883 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2884 * 2885 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, 2886 * or number of jiffies left till timeout) if completed. 2887 */ 2888 long __sched 2889 wait_for_completion_interruptible_timeout(struct completion *x, 2890 unsigned long timeout) 2891 { 2892 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 2893 } 2894 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 2895 2896 /** 2897 * wait_for_completion_killable: - waits for completion of a task (killable) 2898 * @x: holds the state of this particular completion 2899 * 2900 * This waits to be signaled for completion of a specific task. It can be 2901 * interrupted by a kill signal. 2902 * 2903 * Return: -ERESTARTSYS if interrupted, 0 if completed. 
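 *
 * Illustrative use only (the completion name is hypothetical); callers
 * typically just propagate the error so that a fatal signal aborts the
 * wait:
 *
 *	ret = wait_for_completion_killable(&fw_load_done);
 *	if (ret)
 *		return ret;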
2904 */ 2905 int __sched wait_for_completion_killable(struct completion *x) 2906 { 2907 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 2908 if (t == -ERESTARTSYS) 2909 return t; 2910 return 0; 2911 } 2912 EXPORT_SYMBOL(wait_for_completion_killable); 2913 2914 /** 2915 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 2916 * @x: holds the state of this particular completion 2917 * @timeout: timeout value in jiffies 2918 * 2919 * This waits for either a completion of a specific task to be 2920 * signaled or for a specified timeout to expire. It can be 2921 * interrupted by a kill signal. The timeout is in jiffies. 2922 * 2923 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, 2924 * or number of jiffies left till timeout) if completed. 2925 */ 2926 long __sched 2927 wait_for_completion_killable_timeout(struct completion *x, 2928 unsigned long timeout) 2929 { 2930 return wait_for_common(x, timeout, TASK_KILLABLE); 2931 } 2932 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 2933 2934 /** 2935 * try_wait_for_completion - try to decrement a completion without blocking 2936 * @x: completion structure 2937 * 2938 * Return: 0 if a decrement cannot be done without blocking 2939 * 1 if a decrement succeeded. 2940 * 2941 * If a completion is being used as a counting completion, 2942 * attempt to decrement the counter without blocking. This 2943 * enables us to avoid waiting if the resource the completion 2944 * is protecting is not available. 2945 */ 2946 bool try_wait_for_completion(struct completion *x) 2947 { 2948 unsigned long flags; 2949 int ret = 1; 2950 2951 spin_lock_irqsave(&x->wait.lock, flags); 2952 if (!x->done) 2953 ret = 0; 2954 else 2955 x->done--; 2956 spin_unlock_irqrestore(&x->wait.lock, flags); 2957 return ret; 2958 } 2959 EXPORT_SYMBOL(try_wait_for_completion); 2960 2961 /** 2962 * completion_done - Test to see if a completion has any waiters 2963 * @x: completion structure 2964 * 2965 * Return: 0 if there are waiters (wait_for_completion() in progress) 2966 * 1 if there are no waiters. 
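 *
 * One illustrative (hypothetical) use is a teardown path that avoids a
 * redundant complete() when the completion has already been signalled:
 *
 *	if (!completion_done(&req->done))
 *		complete(&req->done);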
2967 * 2968 */ 2969 bool completion_done(struct completion *x) 2970 { 2971 unsigned long flags; 2972 int ret = 1; 2973 2974 spin_lock_irqsave(&x->wait.lock, flags); 2975 if (!x->done) 2976 ret = 0; 2977 spin_unlock_irqrestore(&x->wait.lock, flags); 2978 return ret; 2979 } 2980 EXPORT_SYMBOL(completion_done); 2981 2982 static long __sched 2983 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2984 { 2985 unsigned long flags; 2986 wait_queue_t wait; 2987 2988 init_waitqueue_entry(&wait, current); 2989 2990 __set_current_state(state); 2991 2992 spin_lock_irqsave(&q->lock, flags); 2993 __add_wait_queue(q, &wait); 2994 spin_unlock(&q->lock); 2995 timeout = schedule_timeout(timeout); 2996 spin_lock_irq(&q->lock); 2997 __remove_wait_queue(q, &wait); 2998 spin_unlock_irqrestore(&q->lock, flags); 2999 3000 return timeout; 3001 } 3002 3003 void __sched interruptible_sleep_on(wait_queue_head_t *q) 3004 { 3005 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3006 } 3007 EXPORT_SYMBOL(interruptible_sleep_on); 3008 3009 long __sched 3010 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3011 { 3012 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3013 } 3014 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3015 3016 void __sched sleep_on(wait_queue_head_t *q) 3017 { 3018 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3019 } 3020 EXPORT_SYMBOL(sleep_on); 3021 3022 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3023 { 3024 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3025 } 3026 EXPORT_SYMBOL(sleep_on_timeout); 3027 3028 #ifdef CONFIG_RT_MUTEXES 3029 3030 /* 3031 * rt_mutex_setprio - set the current priority of a task 3032 * @p: task 3033 * @prio: prio value (kernel-internal form) 3034 * 3035 * This function changes the 'effective' priority of a task. It does 3036 * not touch ->normal_prio like __setscheduler(). 3037 * 3038 * Used by the rt_mutex code to implement priority inheritance logic. 3039 */ 3040 void rt_mutex_setprio(struct task_struct *p, int prio) 3041 { 3042 int oldprio, on_rq, running; 3043 struct rq *rq; 3044 const struct sched_class *prev_class; 3045 3046 BUG_ON(prio < 0 || prio > MAX_PRIO); 3047 3048 rq = __task_rq_lock(p); 3049 3050 /* 3051 * Idle task boosting is a nono in general. There is one 3052 * exception, when PREEMPT_RT and NOHZ is active: 3053 * 3054 * The idle task calls get_next_timer_interrupt() and holds 3055 * the timer wheel base->lock on the CPU and another CPU wants 3056 * to access the timer (probably to cancel it). We can safely 3057 * ignore the boosting request, as the idle CPU runs this code 3058 * with interrupts disabled and will complete the lock 3059 * protected section without being interrupted. So there is no 3060 * real need to boost. 3061 */ 3062 if (unlikely(p == rq->idle)) { 3063 WARN_ON(p != rq->curr); 3064 WARN_ON(p->pi_blocked_on); 3065 goto out_unlock; 3066 } 3067 3068 trace_sched_pi_setprio(p, prio); 3069 oldprio = p->prio; 3070 prev_class = p->sched_class; 3071 on_rq = p->on_rq; 3072 running = task_current(rq, p); 3073 if (on_rq) 3074 dequeue_task(rq, p, 0); 3075 if (running) 3076 p->sched_class->put_prev_task(rq, p); 3077 3078 if (rt_prio(prio)) 3079 p->sched_class = &rt_sched_class; 3080 else 3081 p->sched_class = &fair_sched_class; 3082 3083 p->prio = prio; 3084 3085 if (running) 3086 p->sched_class->set_curr_task(rq); 3087 if (on_rq) 3088 enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0); 3089 3090 check_class_changed(rq, p, prev_class, oldprio); 3091 out_unlock: 3092 __task_rq_unlock(rq); 3093 } 3094 #endif 3095 void set_user_nice(struct task_struct *p, long nice) 3096 { 3097 int old_prio, delta, on_rq; 3098 unsigned long flags; 3099 struct rq *rq; 3100 3101 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3102 return; 3103 /* 3104 * We have to be careful, if called from sys_setpriority(), 3105 * the task might be in the middle of scheduling on another CPU. 3106 */ 3107 rq = task_rq_lock(p, &flags); 3108 /* 3109 * The RT priorities are set via sched_setscheduler(), but we still 3110 * allow the 'normal' nice value to be set - but as expected 3111 * it wont have any effect on scheduling until the task is 3112 * SCHED_FIFO/SCHED_RR: 3113 */ 3114 if (task_has_rt_policy(p)) { 3115 p->static_prio = NICE_TO_PRIO(nice); 3116 goto out_unlock; 3117 } 3118 on_rq = p->on_rq; 3119 if (on_rq) 3120 dequeue_task(rq, p, 0); 3121 3122 p->static_prio = NICE_TO_PRIO(nice); 3123 set_load_weight(p); 3124 old_prio = p->prio; 3125 p->prio = effective_prio(p); 3126 delta = p->prio - old_prio; 3127 3128 if (on_rq) { 3129 enqueue_task(rq, p, 0); 3130 /* 3131 * If the task increased its priority or is running and 3132 * lowered its priority, then reschedule its CPU: 3133 */ 3134 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3135 resched_task(rq->curr); 3136 } 3137 out_unlock: 3138 task_rq_unlock(rq, p, &flags); 3139 } 3140 EXPORT_SYMBOL(set_user_nice); 3141 3142 /* 3143 * can_nice - check if a task can reduce its nice value 3144 * @p: task 3145 * @nice: nice value 3146 */ 3147 int can_nice(const struct task_struct *p, const int nice) 3148 { 3149 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3150 int nice_rlim = 20 - nice; 3151 3152 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3153 capable(CAP_SYS_NICE)); 3154 } 3155 3156 #ifdef __ARCH_WANT_SYS_NICE 3157 3158 /* 3159 * sys_nice - change the priority of the current process. 3160 * @increment: priority increment 3161 * 3162 * sys_setpriority is a more generic, but much slower function that 3163 * does similar things. 3164 */ 3165 SYSCALL_DEFINE1(nice, int, increment) 3166 { 3167 long nice, retval; 3168 3169 /* 3170 * Setpriority might change our priority at the same moment. 3171 * We don't have to worry. Conceptually one call occurs first 3172 * and we have a single winner. 3173 */ 3174 if (increment < -40) 3175 increment = -40; 3176 if (increment > 40) 3177 increment = 40; 3178 3179 nice = TASK_NICE(current) + increment; 3180 if (nice < -20) 3181 nice = -20; 3182 if (nice > 19) 3183 nice = 19; 3184 3185 if (increment < 0 && !can_nice(current, nice)) 3186 return -EPERM; 3187 3188 retval = security_task_setnice(current, nice); 3189 if (retval) 3190 return retval; 3191 3192 set_user_nice(current, nice); 3193 return 0; 3194 } 3195 3196 #endif 3197 3198 /** 3199 * task_prio - return the priority value of a given task. 3200 * @p: the task in question. 3201 * 3202 * Return: The priority value as seen by users in /proc. 3203 * RT tasks are offset by -200. Normal tasks are centered 3204 * around 0, value goes from -16 to +15. 3205 */ 3206 int task_prio(const struct task_struct *p) 3207 { 3208 return p->prio - MAX_RT_PRIO; 3209 } 3210 3211 /** 3212 * task_nice - return the nice value of a given task. 3213 * @p: the task in question. 3214 * 3215 * Return: The nice value [ -20 ... 0 ... 19 ]. 
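 *
 * The value is @p->static_prio mapped back from the kernel's internal
 * 100..139 static-priority range into the user-visible -20..19 range.
 * A purely illustrative use, e.g. in a debug printout:
 *
 *	pr_info("%s: nice=%d\n", p->comm, task_nice(p));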
3216 */ 3217 int task_nice(const struct task_struct *p) 3218 { 3219 return TASK_NICE(p); 3220 } 3221 EXPORT_SYMBOL(task_nice); 3222 3223 /** 3224 * idle_cpu - is a given cpu idle currently? 3225 * @cpu: the processor in question. 3226 * 3227 * Return: 1 if the CPU is currently idle. 0 otherwise. 3228 */ 3229 int idle_cpu(int cpu) 3230 { 3231 struct rq *rq = cpu_rq(cpu); 3232 3233 if (rq->curr != rq->idle) 3234 return 0; 3235 3236 if (rq->nr_running) 3237 return 0; 3238 3239 #ifdef CONFIG_SMP 3240 if (!llist_empty(&rq->wake_list)) 3241 return 0; 3242 #endif 3243 3244 return 1; 3245 } 3246 3247 /** 3248 * idle_task - return the idle task for a given cpu. 3249 * @cpu: the processor in question. 3250 * 3251 * Return: The idle task for the cpu @cpu. 3252 */ 3253 struct task_struct *idle_task(int cpu) 3254 { 3255 return cpu_rq(cpu)->idle; 3256 } 3257 3258 /** 3259 * find_process_by_pid - find a process with a matching PID value. 3260 * @pid: the pid in question. 3261 * 3262 * The task of @pid, if found. %NULL otherwise. 3263 */ 3264 static struct task_struct *find_process_by_pid(pid_t pid) 3265 { 3266 return pid ? find_task_by_vpid(pid) : current; 3267 } 3268 3269 /* Actually do priority change: must hold rq lock. */ 3270 static void 3271 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3272 { 3273 p->policy = policy; 3274 p->rt_priority = prio; 3275 p->normal_prio = normal_prio(p); 3276 /* we are holding p->pi_lock already */ 3277 p->prio = rt_mutex_getprio(p); 3278 if (rt_prio(p->prio)) 3279 p->sched_class = &rt_sched_class; 3280 else 3281 p->sched_class = &fair_sched_class; 3282 set_load_weight(p); 3283 } 3284 3285 /* 3286 * check the target process has a UID that matches the current process's 3287 */ 3288 static bool check_same_owner(struct task_struct *p) 3289 { 3290 const struct cred *cred = current_cred(), *pcred; 3291 bool match; 3292 3293 rcu_read_lock(); 3294 pcred = __task_cred(p); 3295 match = (uid_eq(cred->euid, pcred->euid) || 3296 uid_eq(cred->euid, pcred->uid)); 3297 rcu_read_unlock(); 3298 return match; 3299 } 3300 3301 static int __sched_setscheduler(struct task_struct *p, int policy, 3302 const struct sched_param *param, bool user) 3303 { 3304 int retval, oldprio, oldpolicy = -1, on_rq, running; 3305 unsigned long flags; 3306 const struct sched_class *prev_class; 3307 struct rq *rq; 3308 int reset_on_fork; 3309 3310 /* may grab non-irq protected spin_locks */ 3311 BUG_ON(in_interrupt()); 3312 recheck: 3313 /* double check policy once rq lock held */ 3314 if (policy < 0) { 3315 reset_on_fork = p->sched_reset_on_fork; 3316 policy = oldpolicy = p->policy; 3317 } else { 3318 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 3319 policy &= ~SCHED_RESET_ON_FORK; 3320 3321 if (policy != SCHED_FIFO && policy != SCHED_RR && 3322 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3323 policy != SCHED_IDLE) 3324 return -EINVAL; 3325 } 3326 3327 /* 3328 * Valid priorities for SCHED_FIFO and SCHED_RR are 3329 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3330 * SCHED_BATCH and SCHED_IDLE is 0. 
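 * (With MAX_USER_RT_PRIO defined as 100 that means user-visible RT
 * priorities 1..99, which is what the range checks below enforce.)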
3331 */ 3332 if (param->sched_priority < 0 || 3333 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3334 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3335 return -EINVAL; 3336 if (rt_policy(policy) != (param->sched_priority != 0)) 3337 return -EINVAL; 3338 3339 /* 3340 * Allow unprivileged RT tasks to decrease priority: 3341 */ 3342 if (user && !capable(CAP_SYS_NICE)) { 3343 if (rt_policy(policy)) { 3344 unsigned long rlim_rtprio = 3345 task_rlimit(p, RLIMIT_RTPRIO); 3346 3347 /* can't set/change the rt policy */ 3348 if (policy != p->policy && !rlim_rtprio) 3349 return -EPERM; 3350 3351 /* can't increase priority */ 3352 if (param->sched_priority > p->rt_priority && 3353 param->sched_priority > rlim_rtprio) 3354 return -EPERM; 3355 } 3356 3357 /* 3358 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3359 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3360 */ 3361 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3362 if (!can_nice(p, TASK_NICE(p))) 3363 return -EPERM; 3364 } 3365 3366 /* can't change other user's priorities */ 3367 if (!check_same_owner(p)) 3368 return -EPERM; 3369 3370 /* Normal users shall not reset the sched_reset_on_fork flag */ 3371 if (p->sched_reset_on_fork && !reset_on_fork) 3372 return -EPERM; 3373 } 3374 3375 if (user) { 3376 retval = security_task_setscheduler(p); 3377 if (retval) 3378 return retval; 3379 } 3380 3381 /* 3382 * make sure no PI-waiters arrive (or leave) while we are 3383 * changing the priority of the task: 3384 * 3385 * To be able to change p->policy safely, the appropriate 3386 * runqueue lock must be held. 3387 */ 3388 rq = task_rq_lock(p, &flags); 3389 3390 /* 3391 * Changing the policy of the stop threads its a very bad idea 3392 */ 3393 if (p == rq->stop) { 3394 task_rq_unlock(rq, p, &flags); 3395 return -EINVAL; 3396 } 3397 3398 /* 3399 * If not changing anything there's no need to proceed further: 3400 */ 3401 if (unlikely(policy == p->policy && (!rt_policy(policy) || 3402 param->sched_priority == p->rt_priority))) { 3403 task_rq_unlock(rq, p, &flags); 3404 return 0; 3405 } 3406 3407 #ifdef CONFIG_RT_GROUP_SCHED 3408 if (user) { 3409 /* 3410 * Do not allow realtime tasks into groups that have no runtime 3411 * assigned. 3412 */ 3413 if (rt_bandwidth_enabled() && rt_policy(policy) && 3414 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3415 !task_group_is_autogroup(task_group(p))) { 3416 task_rq_unlock(rq, p, &flags); 3417 return -EPERM; 3418 } 3419 } 3420 #endif 3421 3422 /* recheck policy now with rq lock held */ 3423 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3424 policy = oldpolicy = -1; 3425 task_rq_unlock(rq, p, &flags); 3426 goto recheck; 3427 } 3428 on_rq = p->on_rq; 3429 running = task_current(rq, p); 3430 if (on_rq) 3431 dequeue_task(rq, p, 0); 3432 if (running) 3433 p->sched_class->put_prev_task(rq, p); 3434 3435 p->sched_reset_on_fork = reset_on_fork; 3436 3437 oldprio = p->prio; 3438 prev_class = p->sched_class; 3439 __setscheduler(rq, p, policy, param->sched_priority); 3440 3441 if (running) 3442 p->sched_class->set_curr_task(rq); 3443 if (on_rq) 3444 enqueue_task(rq, p, 0); 3445 3446 check_class_changed(rq, p, prev_class, oldprio); 3447 task_rq_unlock(rq, p, &flags); 3448 3449 rt_mutex_adjust_pi(p); 3450 3451 return 0; 3452 } 3453 3454 /** 3455 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3456 * @p: the task in question. 3457 * @policy: new policy. 3458 * @param: structure containing the new RT priority. 
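 *
 * A minimal illustration (the worker thread pointer and the priority of
 * 50 are hypothetical, not taken from this file):
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(worker, SCHED_FIFO, &sp))
 *		pr_warn("could not switch worker to SCHED_FIFO\n");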
3459 * 3460 * Return: 0 on success. An error code otherwise. 3461 * 3462 * NOTE that the task may be already dead. 3463 */ 3464 int sched_setscheduler(struct task_struct *p, int policy, 3465 const struct sched_param *param) 3466 { 3467 return __sched_setscheduler(p, policy, param, true); 3468 } 3469 EXPORT_SYMBOL_GPL(sched_setscheduler); 3470 3471 /** 3472 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3473 * @p: the task in question. 3474 * @policy: new policy. 3475 * @param: structure containing the new RT priority. 3476 * 3477 * Just like sched_setscheduler, only don't bother checking if the 3478 * current context has permission. For example, this is needed in 3479 * stop_machine(): we create temporary high priority worker threads, 3480 * but our caller might not have that capability. 3481 * 3482 * Return: 0 on success. An error code otherwise. 3483 */ 3484 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3485 const struct sched_param *param) 3486 { 3487 return __sched_setscheduler(p, policy, param, false); 3488 } 3489 3490 static int 3491 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3492 { 3493 struct sched_param lparam; 3494 struct task_struct *p; 3495 int retval; 3496 3497 if (!param || pid < 0) 3498 return -EINVAL; 3499 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3500 return -EFAULT; 3501 3502 rcu_read_lock(); 3503 retval = -ESRCH; 3504 p = find_process_by_pid(pid); 3505 if (p != NULL) 3506 retval = sched_setscheduler(p, policy, &lparam); 3507 rcu_read_unlock(); 3508 3509 return retval; 3510 } 3511 3512 /** 3513 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3514 * @pid: the pid in question. 3515 * @policy: new policy. 3516 * @param: structure containing the new RT priority. 3517 * 3518 * Return: 0 on success. An error code otherwise. 3519 */ 3520 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3521 struct sched_param __user *, param) 3522 { 3523 /* negative values for policy are not valid */ 3524 if (policy < 0) 3525 return -EINVAL; 3526 3527 return do_sched_setscheduler(pid, policy, param); 3528 } 3529 3530 /** 3531 * sys_sched_setparam - set/change the RT priority of a thread 3532 * @pid: the pid in question. 3533 * @param: structure containing the new RT priority. 3534 * 3535 * Return: 0 on success. An error code otherwise. 3536 */ 3537 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3538 { 3539 return do_sched_setscheduler(pid, -1, param); 3540 } 3541 3542 /** 3543 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3544 * @pid: the pid in question. 3545 * 3546 * Return: On success, the policy of the thread. Otherwise, a negative error 3547 * code. 3548 */ 3549 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3550 { 3551 struct task_struct *p; 3552 int retval; 3553 3554 if (pid < 0) 3555 return -EINVAL; 3556 3557 retval = -ESRCH; 3558 rcu_read_lock(); 3559 p = find_process_by_pid(pid); 3560 if (p) { 3561 retval = security_task_getscheduler(p); 3562 if (!retval) 3563 retval = p->policy 3564 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3565 } 3566 rcu_read_unlock(); 3567 return retval; 3568 } 3569 3570 /** 3571 * sys_sched_getparam - get the RT priority of a thread 3572 * @pid: the pid in question. 3573 * @param: structure containing the RT priority. 3574 * 3575 * Return: On success, 0 and the RT priority is in @param. 
Otherwise, an error 3576 * code. 3577 */ 3578 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3579 { 3580 struct sched_param lp; 3581 struct task_struct *p; 3582 int retval; 3583 3584 if (!param || pid < 0) 3585 return -EINVAL; 3586 3587 rcu_read_lock(); 3588 p = find_process_by_pid(pid); 3589 retval = -ESRCH; 3590 if (!p) 3591 goto out_unlock; 3592 3593 retval = security_task_getscheduler(p); 3594 if (retval) 3595 goto out_unlock; 3596 3597 lp.sched_priority = p->rt_priority; 3598 rcu_read_unlock(); 3599 3600 /* 3601 * This one might sleep, we cannot do it with a spinlock held ... 3602 */ 3603 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3604 3605 return retval; 3606 3607 out_unlock: 3608 rcu_read_unlock(); 3609 return retval; 3610 } 3611 3612 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3613 { 3614 cpumask_var_t cpus_allowed, new_mask; 3615 struct task_struct *p; 3616 int retval; 3617 3618 get_online_cpus(); 3619 rcu_read_lock(); 3620 3621 p = find_process_by_pid(pid); 3622 if (!p) { 3623 rcu_read_unlock(); 3624 put_online_cpus(); 3625 return -ESRCH; 3626 } 3627 3628 /* Prevent p going away */ 3629 get_task_struct(p); 3630 rcu_read_unlock(); 3631 3632 if (p->flags & PF_NO_SETAFFINITY) { 3633 retval = -EINVAL; 3634 goto out_put_task; 3635 } 3636 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 3637 retval = -ENOMEM; 3638 goto out_put_task; 3639 } 3640 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 3641 retval = -ENOMEM; 3642 goto out_free_cpus_allowed; 3643 } 3644 retval = -EPERM; 3645 if (!check_same_owner(p)) { 3646 rcu_read_lock(); 3647 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 3648 rcu_read_unlock(); 3649 goto out_unlock; 3650 } 3651 rcu_read_unlock(); 3652 } 3653 3654 retval = security_task_setscheduler(p); 3655 if (retval) 3656 goto out_unlock; 3657 3658 cpuset_cpus_allowed(p, cpus_allowed); 3659 cpumask_and(new_mask, in_mask, cpus_allowed); 3660 again: 3661 retval = set_cpus_allowed_ptr(p, new_mask); 3662 3663 if (!retval) { 3664 cpuset_cpus_allowed(p, cpus_allowed); 3665 if (!cpumask_subset(new_mask, cpus_allowed)) { 3666 /* 3667 * We must have raced with a concurrent cpuset 3668 * update. Just reset the cpus_allowed to the 3669 * cpuset's cpus_allowed 3670 */ 3671 cpumask_copy(new_mask, cpus_allowed); 3672 goto again; 3673 } 3674 } 3675 out_unlock: 3676 free_cpumask_var(new_mask); 3677 out_free_cpus_allowed: 3678 free_cpumask_var(cpus_allowed); 3679 out_put_task: 3680 put_task_struct(p); 3681 put_online_cpus(); 3682 return retval; 3683 } 3684 3685 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3686 struct cpumask *new_mask) 3687 { 3688 if (len < cpumask_size()) 3689 cpumask_clear(new_mask); 3690 else if (len > cpumask_size()) 3691 len = cpumask_size(); 3692 3693 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3694 } 3695 3696 /** 3697 * sys_sched_setaffinity - set the cpu affinity of a process 3698 * @pid: pid of the process 3699 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3700 * @user_mask_ptr: user-space pointer to the new cpu mask 3701 * 3702 * Return: 0 on success. An error code otherwise. 
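 *
 * From user space this is normally reached through the C library wrapper
 * in <sched.h>; a purely illustrative call pinning the calling thread to
 * CPU 2 looks like:
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(2, &set);
 *	sched_setaffinity(0, sizeof(set), &set);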
3703 */ 3704 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3705 unsigned long __user *, user_mask_ptr) 3706 { 3707 cpumask_var_t new_mask; 3708 int retval; 3709 3710 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 3711 return -ENOMEM; 3712 3713 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 3714 if (retval == 0) 3715 retval = sched_setaffinity(pid, new_mask); 3716 free_cpumask_var(new_mask); 3717 return retval; 3718 } 3719 3720 long sched_getaffinity(pid_t pid, struct cpumask *mask) 3721 { 3722 struct task_struct *p; 3723 unsigned long flags; 3724 int retval; 3725 3726 get_online_cpus(); 3727 rcu_read_lock(); 3728 3729 retval = -ESRCH; 3730 p = find_process_by_pid(pid); 3731 if (!p) 3732 goto out_unlock; 3733 3734 retval = security_task_getscheduler(p); 3735 if (retval) 3736 goto out_unlock; 3737 3738 raw_spin_lock_irqsave(&p->pi_lock, flags); 3739 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3740 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3741 3742 out_unlock: 3743 rcu_read_unlock(); 3744 put_online_cpus(); 3745 3746 return retval; 3747 } 3748 3749 /** 3750 * sys_sched_getaffinity - get the cpu affinity of a process 3751 * @pid: pid of the process 3752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3753 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3754 * 3755 * Return: 0 on success. An error code otherwise. 3756 */ 3757 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3758 unsigned long __user *, user_mask_ptr) 3759 { 3760 int ret; 3761 cpumask_var_t mask; 3762 3763 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 3764 return -EINVAL; 3765 if (len & (sizeof(unsigned long)-1)) 3766 return -EINVAL; 3767 3768 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 3769 return -ENOMEM; 3770 3771 ret = sched_getaffinity(pid, mask); 3772 if (ret == 0) { 3773 size_t retlen = min_t(size_t, len, cpumask_size()); 3774 3775 if (copy_to_user(user_mask_ptr, mask, retlen)) 3776 ret = -EFAULT; 3777 else 3778 ret = retlen; 3779 } 3780 free_cpumask_var(mask); 3781 3782 return ret; 3783 } 3784 3785 /** 3786 * sys_sched_yield - yield the current processor to other threads. 3787 * 3788 * This function yields the current CPU to other tasks. If there are no 3789 * other threads running on this CPU then this function will return. 3790 * 3791 * Return: 0. 3792 */ 3793 SYSCALL_DEFINE0(sched_yield) 3794 { 3795 struct rq *rq = this_rq_lock(); 3796 3797 schedstat_inc(rq, yld_count); 3798 current->sched_class->yield_task(rq); 3799 3800 /* 3801 * Since we are going to call schedule() anyway, there's 3802 * no need to preempt or enable interrupts: 3803 */ 3804 __release(rq->lock); 3805 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 3806 do_raw_spin_unlock(&rq->lock); 3807 sched_preempt_enable_no_resched(); 3808 3809 schedule(); 3810 3811 return 0; 3812 } 3813 3814 static inline int should_resched(void) 3815 { 3816 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 3817 } 3818 3819 static void __cond_resched(void) 3820 { 3821 add_preempt_count(PREEMPT_ACTIVE); 3822 __schedule(); 3823 sub_preempt_count(PREEMPT_ACTIVE); 3824 } 3825 3826 int __sched _cond_resched(void) 3827 { 3828 if (should_resched()) { 3829 __cond_resched(); 3830 return 1; 3831 } 3832 return 0; 3833 } 3834 EXPORT_SYMBOL(_cond_resched); 3835 3836 /* 3837 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 3838 * call schedule, and on return reacquire the lock. 3839 * 3840 * This works OK both with and without CONFIG_PREEMPT. 
We do strange low-level 3841 * operations here to prevent schedule() from being called twice (once via 3842 * spin_unlock(), once by hand). 3843 */ 3844 int __cond_resched_lock(spinlock_t *lock) 3845 { 3846 int resched = should_resched(); 3847 int ret = 0; 3848 3849 lockdep_assert_held(lock); 3850 3851 if (spin_needbreak(lock) || resched) { 3852 spin_unlock(lock); 3853 if (resched) 3854 __cond_resched(); 3855 else 3856 cpu_relax(); 3857 ret = 1; 3858 spin_lock(lock); 3859 } 3860 return ret; 3861 } 3862 EXPORT_SYMBOL(__cond_resched_lock); 3863 3864 int __sched __cond_resched_softirq(void) 3865 { 3866 BUG_ON(!in_softirq()); 3867 3868 if (should_resched()) { 3869 local_bh_enable(); 3870 __cond_resched(); 3871 local_bh_disable(); 3872 return 1; 3873 } 3874 return 0; 3875 } 3876 EXPORT_SYMBOL(__cond_resched_softirq); 3877 3878 /** 3879 * yield - yield the current processor to other threads. 3880 * 3881 * Do not ever use this function, there's a 99% chance you're doing it wrong. 3882 * 3883 * The scheduler is at all times free to pick the calling task as the most 3884 * eligible task to run, if removing the yield() call from your code breaks 3885 * it, its already broken. 3886 * 3887 * Typical broken usage is: 3888 * 3889 * while (!event) 3890 * yield(); 3891 * 3892 * where one assumes that yield() will let 'the other' process run that will 3893 * make event true. If the current task is a SCHED_FIFO task that will never 3894 * happen. Never use yield() as a progress guarantee!! 3895 * 3896 * If you want to use yield() to wait for something, use wait_event(). 3897 * If you want to use yield() to be 'nice' for others, use cond_resched(). 3898 * If you still want to use yield(), do not! 3899 */ 3900 void __sched yield(void) 3901 { 3902 set_current_state(TASK_RUNNING); 3903 sys_sched_yield(); 3904 } 3905 EXPORT_SYMBOL(yield); 3906 3907 /** 3908 * yield_to - yield the current processor to another thread in 3909 * your thread group, or accelerate that thread toward the 3910 * processor it's on. 3911 * @p: target task 3912 * @preempt: whether task preemption is allowed or not 3913 * 3914 * It's the caller's job to ensure that the target task struct 3915 * can't go away on us before we can do any checks. 3916 * 3917 * Return: 3918 * true (>0) if we indeed boosted the target task. 3919 * false (0) if we failed to boost the target. 3920 * -ESRCH if there's no task to yield to. 3921 */ 3922 bool __sched yield_to(struct task_struct *p, bool preempt) 3923 { 3924 struct task_struct *curr = current; 3925 struct rq *rq, *p_rq; 3926 unsigned long flags; 3927 int yielded = 0; 3928 3929 local_irq_save(flags); 3930 rq = this_rq(); 3931 3932 again: 3933 p_rq = task_rq(p); 3934 /* 3935 * If we're the only runnable task on the rq and target rq also 3936 * has only one task, there's absolutely no point in yielding. 3937 */ 3938 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 3939 yielded = -ESRCH; 3940 goto out_irq; 3941 } 3942 3943 double_rq_lock(rq, p_rq); 3944 while (task_rq(p) != p_rq) { 3945 double_rq_unlock(rq, p_rq); 3946 goto again; 3947 } 3948 3949 if (!curr->sched_class->yield_to_task) 3950 goto out_unlock; 3951 3952 if (curr->sched_class != p->sched_class) 3953 goto out_unlock; 3954 3955 if (task_running(p_rq, p) || p->state) 3956 goto out_unlock; 3957 3958 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 3959 if (yielded) { 3960 schedstat_inc(rq, yld_count); 3961 /* 3962 * Make p's CPU reschedule; pick_next_entity takes care of 3963 * fairness. 
3964 */ 3965 if (preempt && rq != p_rq) 3966 resched_task(p_rq->curr); 3967 } 3968 3969 out_unlock: 3970 double_rq_unlock(rq, p_rq); 3971 out_irq: 3972 local_irq_restore(flags); 3973 3974 if (yielded > 0) 3975 schedule(); 3976 3977 return yielded; 3978 } 3979 EXPORT_SYMBOL_GPL(yield_to); 3980 3981 /* 3982 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 3983 * that process accounting knows that this is a task in IO wait state. 3984 */ 3985 void __sched io_schedule(void) 3986 { 3987 struct rq *rq = raw_rq(); 3988 3989 delayacct_blkio_start(); 3990 atomic_inc(&rq->nr_iowait); 3991 blk_flush_plug(current); 3992 current->in_iowait = 1; 3993 schedule(); 3994 current->in_iowait = 0; 3995 atomic_dec(&rq->nr_iowait); 3996 delayacct_blkio_end(); 3997 } 3998 EXPORT_SYMBOL(io_schedule); 3999 4000 long __sched io_schedule_timeout(long timeout) 4001 { 4002 struct rq *rq = raw_rq(); 4003 long ret; 4004 4005 delayacct_blkio_start(); 4006 atomic_inc(&rq->nr_iowait); 4007 blk_flush_plug(current); 4008 current->in_iowait = 1; 4009 ret = schedule_timeout(timeout); 4010 current->in_iowait = 0; 4011 atomic_dec(&rq->nr_iowait); 4012 delayacct_blkio_end(); 4013 return ret; 4014 } 4015 4016 /** 4017 * sys_sched_get_priority_max - return maximum RT priority. 4018 * @policy: scheduling class. 4019 * 4020 * Return: On success, this syscall returns the maximum 4021 * rt_priority that can be used by a given scheduling class. 4022 * On failure, a negative error code is returned. 4023 */ 4024 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4025 { 4026 int ret = -EINVAL; 4027 4028 switch (policy) { 4029 case SCHED_FIFO: 4030 case SCHED_RR: 4031 ret = MAX_USER_RT_PRIO-1; 4032 break; 4033 case SCHED_NORMAL: 4034 case SCHED_BATCH: 4035 case SCHED_IDLE: 4036 ret = 0; 4037 break; 4038 } 4039 return ret; 4040 } 4041 4042 /** 4043 * sys_sched_get_priority_min - return minimum RT priority. 4044 * @policy: scheduling class. 4045 * 4046 * Return: On success, this syscall returns the minimum 4047 * rt_priority that can be used by a given scheduling class. 4048 * On failure, a negative error code is returned. 4049 */ 4050 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4051 { 4052 int ret = -EINVAL; 4053 4054 switch (policy) { 4055 case SCHED_FIFO: 4056 case SCHED_RR: 4057 ret = 1; 4058 break; 4059 case SCHED_NORMAL: 4060 case SCHED_BATCH: 4061 case SCHED_IDLE: 4062 ret = 0; 4063 } 4064 return ret; 4065 } 4066 4067 /** 4068 * sys_sched_rr_get_interval - return the default timeslice of a process. 4069 * @pid: pid of the process. 4070 * @interval: userspace pointer to the timeslice value. 4071 * 4072 * this syscall writes the default timeslice value of a given process 4073 * into the user-space timespec buffer. A value of '0' means infinity. 4074 * 4075 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4076 * an error code. 
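 *
 * From user space this is normally reached through the C library wrapper
 * in <sched.h>; a purely illustrative query of the caller's own timeslice:
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);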
4077 */ 4078 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4079 struct timespec __user *, interval) 4080 { 4081 struct task_struct *p; 4082 unsigned int time_slice; 4083 unsigned long flags; 4084 struct rq *rq; 4085 int retval; 4086 struct timespec t; 4087 4088 if (pid < 0) 4089 return -EINVAL; 4090 4091 retval = -ESRCH; 4092 rcu_read_lock(); 4093 p = find_process_by_pid(pid); 4094 if (!p) 4095 goto out_unlock; 4096 4097 retval = security_task_getscheduler(p); 4098 if (retval) 4099 goto out_unlock; 4100 4101 rq = task_rq_lock(p, &flags); 4102 time_slice = p->sched_class->get_rr_interval(rq, p); 4103 task_rq_unlock(rq, p, &flags); 4104 4105 rcu_read_unlock(); 4106 jiffies_to_timespec(time_slice, &t); 4107 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4108 return retval; 4109 4110 out_unlock: 4111 rcu_read_unlock(); 4112 return retval; 4113 } 4114 4115 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4116 4117 void sched_show_task(struct task_struct *p) 4118 { 4119 unsigned long free = 0; 4120 int ppid; 4121 unsigned state; 4122 4123 state = p->state ? __ffs(p->state) + 1 : 0; 4124 printk(KERN_INFO "%-15.15s %c", p->comm, 4125 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4126 #if BITS_PER_LONG == 32 4127 if (state == TASK_RUNNING) 4128 printk(KERN_CONT " running "); 4129 else 4130 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4131 #else 4132 if (state == TASK_RUNNING) 4133 printk(KERN_CONT " running task "); 4134 else 4135 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4136 #endif 4137 #ifdef CONFIG_DEBUG_STACK_USAGE 4138 free = stack_not_used(p); 4139 #endif 4140 rcu_read_lock(); 4141 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4142 rcu_read_unlock(); 4143 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4144 task_pid_nr(p), ppid, 4145 (unsigned long)task_thread_info(p)->flags); 4146 4147 print_worker_info(KERN_INFO, p); 4148 show_stack(p, NULL); 4149 } 4150 4151 void show_state_filter(unsigned long state_filter) 4152 { 4153 struct task_struct *g, *p; 4154 4155 #if BITS_PER_LONG == 32 4156 printk(KERN_INFO 4157 " task PC stack pid father\n"); 4158 #else 4159 printk(KERN_INFO 4160 " task PC stack pid father\n"); 4161 #endif 4162 rcu_read_lock(); 4163 do_each_thread(g, p) { 4164 /* 4165 * reset the NMI-timeout, listing all files on a slow 4166 * console might take a lot of time: 4167 */ 4168 touch_nmi_watchdog(); 4169 if (!state_filter || (p->state & state_filter)) 4170 sched_show_task(p); 4171 } while_each_thread(g, p); 4172 4173 touch_all_softlockup_watchdogs(); 4174 4175 #ifdef CONFIG_SCHED_DEBUG 4176 sysrq_sched_debug_show(); 4177 #endif 4178 rcu_read_unlock(); 4179 /* 4180 * Only show locks if all tasks are dumped: 4181 */ 4182 if (!state_filter) 4183 debug_show_all_locks(); 4184 } 4185 4186 void init_idle_bootup_task(struct task_struct *idle) 4187 { 4188 idle->sched_class = &idle_sched_class; 4189 } 4190 4191 /** 4192 * init_idle - set up an idle thread for a given CPU 4193 * @idle: task in question 4194 * @cpu: cpu the idle task belongs to 4195 * 4196 * NOTE: this function does not set the idle thread's NEED_RESCHED 4197 * flag, to make booting more robust. 
4198 */ 4199 void init_idle(struct task_struct *idle, int cpu) 4200 { 4201 struct rq *rq = cpu_rq(cpu); 4202 unsigned long flags; 4203 4204 raw_spin_lock_irqsave(&rq->lock, flags); 4205 4206 __sched_fork(idle); 4207 idle->state = TASK_RUNNING; 4208 idle->se.exec_start = sched_clock(); 4209 4210 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4211 /* 4212 * We're having a chicken and egg problem, even though we are 4213 * holding rq->lock, the cpu isn't yet set to this cpu so the 4214 * lockdep check in task_group() will fail. 4215 * 4216 * Similar case to sched_fork(). / Alternatively we could 4217 * use task_rq_lock() here and obtain the other rq->lock. 4218 * 4219 * Silence PROVE_RCU 4220 */ 4221 rcu_read_lock(); 4222 __set_task_cpu(idle, cpu); 4223 rcu_read_unlock(); 4224 4225 rq->curr = rq->idle = idle; 4226 #if defined(CONFIG_SMP) 4227 idle->on_cpu = 1; 4228 #endif 4229 raw_spin_unlock_irqrestore(&rq->lock, flags); 4230 4231 /* Set the preempt count _outside_ the spinlocks! */ 4232 task_thread_info(idle)->preempt_count = 0; 4233 4234 /* 4235 * The idle tasks have their own, simple scheduling class: 4236 */ 4237 idle->sched_class = &idle_sched_class; 4238 ftrace_graph_init_idle_task(idle, cpu); 4239 vtime_init_idle(idle, cpu); 4240 #if defined(CONFIG_SMP) 4241 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4242 #endif 4243 } 4244 4245 #ifdef CONFIG_SMP 4246 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4247 { 4248 if (p->sched_class && p->sched_class->set_cpus_allowed) 4249 p->sched_class->set_cpus_allowed(p, new_mask); 4250 4251 cpumask_copy(&p->cpus_allowed, new_mask); 4252 p->nr_cpus_allowed = cpumask_weight(new_mask); 4253 } 4254 4255 /* 4256 * This is how migration works: 4257 * 4258 * 1) we invoke migration_cpu_stop() on the target CPU using 4259 * stop_one_cpu(). 4260 * 2) stopper starts to run (implicitly forcing the migrated thread 4261 * off the CPU) 4262 * 3) it checks whether the migrated task is still in the wrong runqueue. 4263 * 4) if it's in the wrong runqueue then the migration thread removes 4264 * it and puts it into the right queue. 4265 * 5) stopper completes and stop_one_cpu() returns and the migration 4266 * is done. 4267 */ 4268 4269 /* 4270 * Change a given task's CPU affinity. Migrate the thread to a 4271 * proper CPU and schedule it away if the CPU it's executing on 4272 * is removed from the allowed bitmask. 4273 * 4274 * NOTE: the caller must have a valid reference to the task, the 4275 * task must not exit() & deallocate itself prematurely. The 4276 * call is not atomic; no spinlocks may be held. 4277 */ 4278 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4279 { 4280 unsigned long flags; 4281 struct rq *rq; 4282 unsigned int dest_cpu; 4283 int ret = 0; 4284 4285 rq = task_rq_lock(p, &flags); 4286 4287 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4288 goto out; 4289 4290 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4291 ret = -EINVAL; 4292 goto out; 4293 } 4294 4295 do_set_cpus_allowed(p, new_mask); 4296 4297 /* Can the task run on the task's current CPU? If so, we're done */ 4298 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4299 goto out; 4300 4301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4302 if (p->on_rq) { 4303 struct migration_arg arg = { p, dest_cpu }; 4304 /* Need help from migration thread: drop lock and wait. 
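 * stop_one_cpu() runs migration_cpu_stop() on the stopper thread of the CPU
 * @p is currently on and waits for it to finish; rq->lock must be dropped
 * first because the stopper ends up taking p->pi_lock and both runqueue
 * locks itself in __migrate_task().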
*/ 4305 task_rq_unlock(rq, p, &flags); 4306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4307 tlb_migrate_finish(p->mm); 4308 return 0; 4309 } 4310 out: 4311 task_rq_unlock(rq, p, &flags); 4312 4313 return ret; 4314 } 4315 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4316 4317 /* 4318 * Move (not current) task off this cpu, onto dest cpu. We're doing 4319 * this because either it can't run here any more (set_cpus_allowed() 4320 * away from this CPU, or CPU going down), or because we're 4321 * attempting to rebalance this task on exec (sched_exec). 4322 * 4323 * So we race with normal scheduler movements, but that's OK, as long 4324 * as the task is no longer on this CPU. 4325 * 4326 * Returns non-zero if task was successfully migrated. 4327 */ 4328 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4329 { 4330 struct rq *rq_dest, *rq_src; 4331 int ret = 0; 4332 4333 if (unlikely(!cpu_active(dest_cpu))) 4334 return ret; 4335 4336 rq_src = cpu_rq(src_cpu); 4337 rq_dest = cpu_rq(dest_cpu); 4338 4339 raw_spin_lock(&p->pi_lock); 4340 double_rq_lock(rq_src, rq_dest); 4341 /* Already moved. */ 4342 if (task_cpu(p) != src_cpu) 4343 goto done; 4344 /* Affinity changed (again). */ 4345 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4346 goto fail; 4347 4348 /* 4349 * If we're not on a rq, the next wake-up will ensure we're 4350 * placed properly. 4351 */ 4352 if (p->on_rq) { 4353 dequeue_task(rq_src, p, 0); 4354 set_task_cpu(p, dest_cpu); 4355 enqueue_task(rq_dest, p, 0); 4356 check_preempt_curr(rq_dest, p, 0); 4357 } 4358 done: 4359 ret = 1; 4360 fail: 4361 double_rq_unlock(rq_src, rq_dest); 4362 raw_spin_unlock(&p->pi_lock); 4363 return ret; 4364 } 4365 4366 /* 4367 * migration_cpu_stop - this will be executed by a highprio stopper thread 4368 * and performs thread migration by bumping thread off CPU then 4369 * 'pushing' onto another runqueue. 4370 */ 4371 static int migration_cpu_stop(void *data) 4372 { 4373 struct migration_arg *arg = data; 4374 4375 /* 4376 * The original target cpu might have gone down and we might 4377 * be on another cpu but it doesn't matter. 4378 */ 4379 local_irq_disable(); 4380 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4381 local_irq_enable(); 4382 return 0; 4383 } 4384 4385 #ifdef CONFIG_HOTPLUG_CPU 4386 4387 /* 4388 * Ensures that the idle task is using init_mm right before its cpu goes 4389 * offline. 4390 */ 4391 void idle_task_exit(void) 4392 { 4393 struct mm_struct *mm = current->active_mm; 4394 4395 BUG_ON(cpu_online(smp_processor_id())); 4396 4397 if (mm != &init_mm) 4398 switch_mm(mm, &init_mm, current); 4399 mmdrop(mm); 4400 } 4401 4402 /* 4403 * Since this CPU is going 'away' for a while, fold any nr_active delta 4404 * we might have. Assumes we're called after migrate_tasks() so that the 4405 * nr_active count is stable. 4406 * 4407 * Also see the comment "Global load-average calculations". 4408 */ 4409 static void calc_load_migrate(struct rq *rq) 4410 { 4411 long delta = calc_load_fold_active(rq); 4412 if (delta) 4413 atomic_long_add(delta, &calc_load_tasks); 4414 } 4415 4416 /* 4417 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4418 * try_to_wake_up()->select_task_rq(). 4419 * 4420 * Called with rq->lock held even though we'er in stop_machine() and 4421 * there's no concurrency possible, we hold the required locks anyway 4422 * because of lock validation efforts. 
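 *
 * rq->lock is dropped and re-acquired around each __migrate_task() call in
 * the loop below, since __migrate_task() takes p->pi_lock and both runqueue
 * locks via double_rq_lock() on its own.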
4423 */ 4424 static void migrate_tasks(unsigned int dead_cpu) 4425 { 4426 struct rq *rq = cpu_rq(dead_cpu); 4427 struct task_struct *next, *stop = rq->stop; 4428 int dest_cpu; 4429 4430 /* 4431 * Fudge the rq selection such that the below task selection loop 4432 * doesn't get stuck on the currently eligible stop task. 4433 * 4434 * We're currently inside stop_machine() and the rq is either stuck 4435 * in the stop_machine_cpu_stop() loop, or we're executing this code, 4436 * either way we should never end up calling schedule() until we're 4437 * done here. 4438 */ 4439 rq->stop = NULL; 4440 4441 /* 4442 * put_prev_task() and pick_next_task() sched 4443 * class method both need to have an up-to-date 4444 * value of rq->clock[_task] 4445 */ 4446 update_rq_clock(rq); 4447 4448 for ( ; ; ) { 4449 /* 4450 * There's this thread running, bail when that's the only 4451 * remaining thread. 4452 */ 4453 if (rq->nr_running == 1) 4454 break; 4455 4456 next = pick_next_task(rq); 4457 BUG_ON(!next); 4458 next->sched_class->put_prev_task(rq, next); 4459 4460 /* Find suitable destination for @next, with force if needed. */ 4461 dest_cpu = select_fallback_rq(dead_cpu, next); 4462 raw_spin_unlock(&rq->lock); 4463 4464 __migrate_task(next, dead_cpu, dest_cpu); 4465 4466 raw_spin_lock(&rq->lock); 4467 } 4468 4469 rq->stop = stop; 4470 } 4471 4472 #endif /* CONFIG_HOTPLUG_CPU */ 4473 4474 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4475 4476 static struct ctl_table sd_ctl_dir[] = { 4477 { 4478 .procname = "sched_domain", 4479 .mode = 0555, 4480 }, 4481 {} 4482 }; 4483 4484 static struct ctl_table sd_ctl_root[] = { 4485 { 4486 .procname = "kernel", 4487 .mode = 0555, 4488 .child = sd_ctl_dir, 4489 }, 4490 {} 4491 }; 4492 4493 static struct ctl_table *sd_alloc_ctl_entry(int n) 4494 { 4495 struct ctl_table *entry = 4496 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4497 4498 return entry; 4499 } 4500 4501 static void sd_free_ctl_entry(struct ctl_table **tablep) 4502 { 4503 struct ctl_table *entry; 4504 4505 /* 4506 * In the intermediate directories, both the child directory and 4507 * procname are dynamically allocated and could fail but the mode 4508 * will always be set. In the lowest directory the names are 4509 * static strings and all have proc handlers. 
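 *
 * For orientation, the table tree built by register_sched_domain_sysctl()
 * and freed here shows up under /proc/sys roughly as follows (an
 * illustrative sketch, derived from the entry names used below):
 *
 *	kernel/sched_domain/
 *		cpu0/
 *			domain0/ min_interval max_interval busy_idx idle_idx
 *				 newidle_idx wake_idx forkexec_idx busy_factor
 *				 imbalance_pct cache_nice_tries flags name
 *			domain1/ ...
 *		cpu1/
 *			...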
4510 */ 4511 for (entry = *tablep; entry->mode; entry++) { 4512 if (entry->child) 4513 sd_free_ctl_entry(&entry->child); 4514 if (entry->proc_handler == NULL) 4515 kfree(entry->procname); 4516 } 4517 4518 kfree(*tablep); 4519 *tablep = NULL; 4520 } 4521 4522 static int min_load_idx = 0; 4523 static int max_load_idx = CPU_LOAD_IDX_MAX-1; 4524 4525 static void 4526 set_table_entry(struct ctl_table *entry, 4527 const char *procname, void *data, int maxlen, 4528 umode_t mode, proc_handler *proc_handler, 4529 bool load_idx) 4530 { 4531 entry->procname = procname; 4532 entry->data = data; 4533 entry->maxlen = maxlen; 4534 entry->mode = mode; 4535 entry->proc_handler = proc_handler; 4536 4537 if (load_idx) { 4538 entry->extra1 = &min_load_idx; 4539 entry->extra2 = &max_load_idx; 4540 } 4541 } 4542 4543 static struct ctl_table * 4544 sd_alloc_ctl_domain_table(struct sched_domain *sd) 4545 { 4546 struct ctl_table *table = sd_alloc_ctl_entry(13); 4547 4548 if (table == NULL) 4549 return NULL; 4550 4551 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4552 sizeof(long), 0644, proc_doulongvec_minmax, false); 4553 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4554 sizeof(long), 0644, proc_doulongvec_minmax, false); 4555 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4556 sizeof(int), 0644, proc_dointvec_minmax, true); 4557 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4558 sizeof(int), 0644, proc_dointvec_minmax, true); 4559 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4560 sizeof(int), 0644, proc_dointvec_minmax, true); 4561 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4562 sizeof(int), 0644, proc_dointvec_minmax, true); 4563 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4564 sizeof(int), 0644, proc_dointvec_minmax, true); 4565 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4566 sizeof(int), 0644, proc_dointvec_minmax, false); 4567 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4568 sizeof(int), 0644, proc_dointvec_minmax, false); 4569 set_table_entry(&table[9], "cache_nice_tries", 4570 &sd->cache_nice_tries, 4571 sizeof(int), 0644, proc_dointvec_minmax, false); 4572 set_table_entry(&table[10], "flags", &sd->flags, 4573 sizeof(int), 0644, proc_dointvec_minmax, false); 4574 set_table_entry(&table[11], "name", sd->name, 4575 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4576 /* &table[12] is terminator */ 4577 4578 return table; 4579 } 4580 4581 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4582 { 4583 struct ctl_table *entry, *table; 4584 struct sched_domain *sd; 4585 int domain_num = 0, i; 4586 char buf[32]; 4587 4588 for_each_domain(cpu, sd) 4589 domain_num++; 4590 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4591 if (table == NULL) 4592 return NULL; 4593 4594 i = 0; 4595 for_each_domain(cpu, sd) { 4596 snprintf(buf, 32, "domain%d", i); 4597 entry->procname = kstrdup(buf, GFP_KERNEL); 4598 entry->mode = 0555; 4599 entry->child = sd_alloc_ctl_domain_table(sd); 4600 entry++; 4601 i++; 4602 } 4603 return table; 4604 } 4605 4606 static struct ctl_table_header *sd_sysctl_header; 4607 static void register_sched_domain_sysctl(void) 4608 { 4609 int i, cpu_num = num_possible_cpus(); 4610 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4611 char buf[32]; 4612 4613 WARN_ON(sd_ctl_dir[0].child); 4614 sd_ctl_dir[0].child = entry; 4615 4616 if (entry == NULL) 4617 return; 4618 4619 for_each_possible_cpu(i) { 4620 snprintf(buf, 32, "cpu%d", i); 4621 entry->procname = 
kstrdup(buf, GFP_KERNEL); 4622 entry->mode = 0555; 4623 entry->child = sd_alloc_ctl_cpu_table(i); 4624 entry++; 4625 } 4626 4627 WARN_ON(sd_sysctl_header); 4628 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 4629 } 4630 4631 /* may be called multiple times per register */ 4632 static void unregister_sched_domain_sysctl(void) 4633 { 4634 if (sd_sysctl_header) 4635 unregister_sysctl_table(sd_sysctl_header); 4636 sd_sysctl_header = NULL; 4637 if (sd_ctl_dir[0].child) 4638 sd_free_ctl_entry(&sd_ctl_dir[0].child); 4639 } 4640 #else 4641 static void register_sched_domain_sysctl(void) 4642 { 4643 } 4644 static void unregister_sched_domain_sysctl(void) 4645 { 4646 } 4647 #endif 4648 4649 static void set_rq_online(struct rq *rq) 4650 { 4651 if (!rq->online) { 4652 const struct sched_class *class; 4653 4654 cpumask_set_cpu(rq->cpu, rq->rd->online); 4655 rq->online = 1; 4656 4657 for_each_class(class) { 4658 if (class->rq_online) 4659 class->rq_online(rq); 4660 } 4661 } 4662 } 4663 4664 static void set_rq_offline(struct rq *rq) 4665 { 4666 if (rq->online) { 4667 const struct sched_class *class; 4668 4669 for_each_class(class) { 4670 if (class->rq_offline) 4671 class->rq_offline(rq); 4672 } 4673 4674 cpumask_clear_cpu(rq->cpu, rq->rd->online); 4675 rq->online = 0; 4676 } 4677 } 4678 4679 /* 4680 * migration_call - callback that gets triggered when a CPU is added. 4681 * Here we can start up the necessary migration thread for the new CPU. 4682 */ 4683 static int 4684 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4685 { 4686 int cpu = (long)hcpu; 4687 unsigned long flags; 4688 struct rq *rq = cpu_rq(cpu); 4689 4690 switch (action & ~CPU_TASKS_FROZEN) { 4691 4692 case CPU_UP_PREPARE: 4693 rq->calc_load_update = calc_load_update; 4694 break; 4695 4696 case CPU_ONLINE: 4697 /* Update our root-domain */ 4698 raw_spin_lock_irqsave(&rq->lock, flags); 4699 if (rq->rd) { 4700 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 4701 4702 set_rq_online(rq); 4703 } 4704 raw_spin_unlock_irqrestore(&rq->lock, flags); 4705 break; 4706 4707 #ifdef CONFIG_HOTPLUG_CPU 4708 case CPU_DYING: 4709 sched_ttwu_pending(); 4710 /* Update our root-domain */ 4711 raw_spin_lock_irqsave(&rq->lock, flags); 4712 if (rq->rd) { 4713 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 4714 set_rq_offline(rq); 4715 } 4716 migrate_tasks(cpu); 4717 BUG_ON(rq->nr_running != 1); /* the migration thread */ 4718 raw_spin_unlock_irqrestore(&rq->lock, flags); 4719 break; 4720 4721 case CPU_DEAD: 4722 calc_load_migrate(rq); 4723 break; 4724 #endif 4725 } 4726 4727 update_max_interval(); 4728 4729 return NOTIFY_OK; 4730 } 4731 4732 /* 4733 * Register at high priority so that task migration (migrate_all_tasks) 4734 * happens before everything else. This has to be lower priority than 4735 * the notifier in the perf_event subsystem, though. 
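 *
 * The hotplug notifiers in this file all follow the same shape; a minimal
 * sketch of the pattern (illustrative, with a hypothetical foo_cpu_notify()
 * callback):
 *
 *	static int foo_cpu_notify(struct notifier_block *nfb,
 *				  unsigned long action, void *hcpu)
 *	{
 *		int cpu = (long)hcpu;
 *
 *		switch (action & ~CPU_TASKS_FROZEN) {
 *		case CPU_ONLINE:
 *			pr_info("cpu%d came online\n", cpu);
 *			return NOTIFY_OK;
 *		default:
 *			return NOTIFY_DONE;
 *		}
 *	}
 *
 * hooked up either with cpu_notifier(foo_cpu_notify, priority) or by
 * registering a struct notifier_block through register_cpu_notifier(), as
 * is done below.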
4736 */ 4737 static struct notifier_block migration_notifier = { 4738 .notifier_call = migration_call, 4739 .priority = CPU_PRI_MIGRATION, 4740 }; 4741 4742 static int sched_cpu_active(struct notifier_block *nfb, 4743 unsigned long action, void *hcpu) 4744 { 4745 switch (action & ~CPU_TASKS_FROZEN) { 4746 case CPU_STARTING: 4747 case CPU_DOWN_FAILED: 4748 set_cpu_active((long)hcpu, true); 4749 return NOTIFY_OK; 4750 default: 4751 return NOTIFY_DONE; 4752 } 4753 } 4754 4755 static int sched_cpu_inactive(struct notifier_block *nfb, 4756 unsigned long action, void *hcpu) 4757 { 4758 switch (action & ~CPU_TASKS_FROZEN) { 4759 case CPU_DOWN_PREPARE: 4760 set_cpu_active((long)hcpu, false); 4761 return NOTIFY_OK; 4762 default: 4763 return NOTIFY_DONE; 4764 } 4765 } 4766 4767 static int __init migration_init(void) 4768 { 4769 void *cpu = (void *)(long)smp_processor_id(); 4770 int err; 4771 4772 /* Initialize migration for the boot CPU */ 4773 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4774 BUG_ON(err == NOTIFY_BAD); 4775 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4776 register_cpu_notifier(&migration_notifier); 4777 4778 /* Register cpu active notifiers */ 4779 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 4780 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 4781 4782 return 0; 4783 } 4784 early_initcall(migration_init); 4785 #endif 4786 4787 #ifdef CONFIG_SMP 4788 4789 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 4790 4791 #ifdef CONFIG_SCHED_DEBUG 4792 4793 static __read_mostly int sched_debug_enabled; 4794 4795 static int __init sched_debug_setup(char *str) 4796 { 4797 sched_debug_enabled = 1; 4798 4799 return 0; 4800 } 4801 early_param("sched_debug", sched_debug_setup); 4802 4803 static inline bool sched_debug(void) 4804 { 4805 return sched_debug_enabled; 4806 } 4807 4808 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 4809 struct cpumask *groupmask) 4810 { 4811 struct sched_group *group = sd->groups; 4812 char str[256]; 4813 4814 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 4815 cpumask_clear(groupmask); 4816 4817 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 4818 4819 if (!(sd->flags & SD_LOAD_BALANCE)) { 4820 printk("does not load-balance\n"); 4821 if (sd->parent) 4822 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 4823 " has parent"); 4824 return -1; 4825 } 4826 4827 printk(KERN_CONT "span %s level %s\n", str, sd->name); 4828 4829 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 4830 printk(KERN_ERR "ERROR: domain->span does not contain " 4831 "CPU%d\n", cpu); 4832 } 4833 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 4834 printk(KERN_ERR "ERROR: domain->groups does not contain" 4835 " CPU%d\n", cpu); 4836 } 4837 4838 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 4839 do { 4840 if (!group) { 4841 printk("\n"); 4842 printk(KERN_ERR "ERROR: group is NULL\n"); 4843 break; 4844 } 4845 4846 /* 4847 * Even though we initialize ->power to something semi-sane, 4848 * we leave power_orig unset. This allows us to detect if 4849 * domain iteration is still funny without causing /0 traps. 
4850 */ 4851 if (!group->sgp->power_orig) { 4852 printk(KERN_CONT "\n"); 4853 printk(KERN_ERR "ERROR: domain->cpu_power not " 4854 "set\n"); 4855 break; 4856 } 4857 4858 if (!cpumask_weight(sched_group_cpus(group))) { 4859 printk(KERN_CONT "\n"); 4860 printk(KERN_ERR "ERROR: empty group\n"); 4861 break; 4862 } 4863 4864 if (!(sd->flags & SD_OVERLAP) && 4865 cpumask_intersects(groupmask, sched_group_cpus(group))) { 4866 printk(KERN_CONT "\n"); 4867 printk(KERN_ERR "ERROR: repeated CPUs\n"); 4868 break; 4869 } 4870 4871 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 4872 4873 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 4874 4875 printk(KERN_CONT " %s", str); 4876 if (group->sgp->power != SCHED_POWER_SCALE) { 4877 printk(KERN_CONT " (cpu_power = %d)", 4878 group->sgp->power); 4879 } 4880 4881 group = group->next; 4882 } while (group != sd->groups); 4883 printk(KERN_CONT "\n"); 4884 4885 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 4886 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 4887 4888 if (sd->parent && 4889 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 4890 printk(KERN_ERR "ERROR: parent span is not a superset " 4891 "of domain->span\n"); 4892 return 0; 4893 } 4894 4895 static void sched_domain_debug(struct sched_domain *sd, int cpu) 4896 { 4897 int level = 0; 4898 4899 if (!sched_debug_enabled) 4900 return; 4901 4902 if (!sd) { 4903 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 4904 return; 4905 } 4906 4907 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4908 4909 for (;;) { 4910 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 4911 break; 4912 level++; 4913 sd = sd->parent; 4914 if (!sd) 4915 break; 4916 } 4917 } 4918 #else /* !CONFIG_SCHED_DEBUG */ 4919 # define sched_domain_debug(sd, cpu) do { } while (0) 4920 static inline bool sched_debug(void) 4921 { 4922 return false; 4923 } 4924 #endif /* CONFIG_SCHED_DEBUG */ 4925 4926 static int sd_degenerate(struct sched_domain *sd) 4927 { 4928 if (cpumask_weight(sched_domain_span(sd)) == 1) 4929 return 1; 4930 4931 /* Following flags need at least 2 groups */ 4932 if (sd->flags & (SD_LOAD_BALANCE | 4933 SD_BALANCE_NEWIDLE | 4934 SD_BALANCE_FORK | 4935 SD_BALANCE_EXEC | 4936 SD_SHARE_CPUPOWER | 4937 SD_SHARE_PKG_RESOURCES)) { 4938 if (sd->groups != sd->groups->next) 4939 return 0; 4940 } 4941 4942 /* Following flags don't use groups */ 4943 if (sd->flags & (SD_WAKE_AFFINE)) 4944 return 0; 4945 4946 return 1; 4947 } 4948 4949 static int 4950 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 4951 { 4952 unsigned long cflags = sd->flags, pflags = parent->flags; 4953 4954 if (sd_degenerate(parent)) 4955 return 1; 4956 4957 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 4958 return 0; 4959 4960 /* Flags needing groups don't count if only 1 group in parent */ 4961 if (parent->groups == parent->groups->next) { 4962 pflags &= ~(SD_LOAD_BALANCE | 4963 SD_BALANCE_NEWIDLE | 4964 SD_BALANCE_FORK | 4965 SD_BALANCE_EXEC | 4966 SD_SHARE_CPUPOWER | 4967 SD_SHARE_PKG_RESOURCES); 4968 if (nr_node_ids == 1) 4969 pflags &= ~SD_SERIALIZE; 4970 } 4971 if (~cflags & pflags) 4972 return 0; 4973 4974 return 1; 4975 } 4976 4977 static void free_rootdomain(struct rcu_head *rcu) 4978 { 4979 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 4980 4981 cpupri_cleanup(&rd->cpupri); 4982 free_cpumask_var(rd->rto_mask); 4983 free_cpumask_var(rd->online); 4984 free_cpumask_var(rd->span); 4985 
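	/*
	 * Teardown mirrors init_rootdomain(): the cpupri state and the three
	 * cpumasks have been released above, the root_domain itself goes
	 * last.
	 */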
kfree(rd); 4986 } 4987 4988 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 4989 { 4990 struct root_domain *old_rd = NULL; 4991 unsigned long flags; 4992 4993 raw_spin_lock_irqsave(&rq->lock, flags); 4994 4995 if (rq->rd) { 4996 old_rd = rq->rd; 4997 4998 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 4999 set_rq_offline(rq); 5000 5001 cpumask_clear_cpu(rq->cpu, old_rd->span); 5002 5003 /* 5004 * If we dont want to free the old_rt yet then 5005 * set old_rd to NULL to skip the freeing later 5006 * in this function: 5007 */ 5008 if (!atomic_dec_and_test(&old_rd->refcount)) 5009 old_rd = NULL; 5010 } 5011 5012 atomic_inc(&rd->refcount); 5013 rq->rd = rd; 5014 5015 cpumask_set_cpu(rq->cpu, rd->span); 5016 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5017 set_rq_online(rq); 5018 5019 raw_spin_unlock_irqrestore(&rq->lock, flags); 5020 5021 if (old_rd) 5022 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5023 } 5024 5025 static int init_rootdomain(struct root_domain *rd) 5026 { 5027 memset(rd, 0, sizeof(*rd)); 5028 5029 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5030 goto out; 5031 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5032 goto free_span; 5033 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5034 goto free_online; 5035 5036 if (cpupri_init(&rd->cpupri) != 0) 5037 goto free_rto_mask; 5038 return 0; 5039 5040 free_rto_mask: 5041 free_cpumask_var(rd->rto_mask); 5042 free_online: 5043 free_cpumask_var(rd->online); 5044 free_span: 5045 free_cpumask_var(rd->span); 5046 out: 5047 return -ENOMEM; 5048 } 5049 5050 /* 5051 * By default the system creates a single root-domain with all cpus as 5052 * members (mimicking the global state we have today). 5053 */ 5054 struct root_domain def_root_domain; 5055 5056 static void init_defrootdomain(void) 5057 { 5058 init_rootdomain(&def_root_domain); 5059 5060 atomic_set(&def_root_domain.refcount, 1); 5061 } 5062 5063 static struct root_domain *alloc_rootdomain(void) 5064 { 5065 struct root_domain *rd; 5066 5067 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5068 if (!rd) 5069 return NULL; 5070 5071 if (init_rootdomain(rd) != 0) { 5072 kfree(rd); 5073 return NULL; 5074 } 5075 5076 return rd; 5077 } 5078 5079 static void free_sched_groups(struct sched_group *sg, int free_sgp) 5080 { 5081 struct sched_group *tmp, *first; 5082 5083 if (!sg) 5084 return; 5085 5086 first = sg; 5087 do { 5088 tmp = sg->next; 5089 5090 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5091 kfree(sg->sgp); 5092 5093 kfree(sg); 5094 sg = tmp; 5095 } while (sg != first); 5096 } 5097 5098 static void free_sched_domain(struct rcu_head *rcu) 5099 { 5100 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5101 5102 /* 5103 * If its an overlapping domain it has private groups, iterate and 5104 * nuke them all. 5105 */ 5106 if (sd->flags & SD_OVERLAP) { 5107 free_sched_groups(sd->groups, 1); 5108 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5109 kfree(sd->groups->sgp); 5110 kfree(sd->groups); 5111 } 5112 kfree(sd); 5113 } 5114 5115 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5116 { 5117 call_rcu(&sd->rcu, free_sched_domain); 5118 } 5119 5120 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5121 { 5122 for (; sd; sd = sd->parent) 5123 destroy_sched_domain(sd, cpu); 5124 } 5125 5126 /* 5127 * Keep a special pointer to the highest sched_domain that has 5128 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5129 * allows us to avoid some pointer chasing select_idle_sibling(). 
5130 * 5131 * Also keep a unique ID per domain (we use the first cpu number in 5132 * the cpumask of the domain), this allows us to quickly tell if 5133 * two cpus are in the same cache domain, see cpus_share_cache(). 5134 */ 5135 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5136 DEFINE_PER_CPU(int, sd_llc_id); 5137 5138 static void update_top_cache_domain(int cpu) 5139 { 5140 struct sched_domain *sd; 5141 int id = cpu; 5142 5143 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5144 if (sd) 5145 id = cpumask_first(sched_domain_span(sd)); 5146 5147 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5148 per_cpu(sd_llc_id, cpu) = id; 5149 } 5150 5151 /* 5152 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5153 * hold the hotplug lock. 5154 */ 5155 static void 5156 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5157 { 5158 struct rq *rq = cpu_rq(cpu); 5159 struct sched_domain *tmp; 5160 5161 /* Remove the sched domains which do not contribute to scheduling. */ 5162 for (tmp = sd; tmp; ) { 5163 struct sched_domain *parent = tmp->parent; 5164 if (!parent) 5165 break; 5166 5167 if (sd_parent_degenerate(tmp, parent)) { 5168 tmp->parent = parent->parent; 5169 if (parent->parent) 5170 parent->parent->child = tmp; 5171 destroy_sched_domain(parent, cpu); 5172 } else 5173 tmp = tmp->parent; 5174 } 5175 5176 if (sd && sd_degenerate(sd)) { 5177 tmp = sd; 5178 sd = sd->parent; 5179 destroy_sched_domain(tmp, cpu); 5180 if (sd) 5181 sd->child = NULL; 5182 } 5183 5184 sched_domain_debug(sd, cpu); 5185 5186 rq_attach_root(rq, rd); 5187 tmp = rq->sd; 5188 rcu_assign_pointer(rq->sd, sd); 5189 destroy_sched_domains(tmp, cpu); 5190 5191 update_top_cache_domain(cpu); 5192 } 5193 5194 /* cpus with isolated domains */ 5195 static cpumask_var_t cpu_isolated_map; 5196 5197 /* Setup the mask of cpus configured for isolated domains */ 5198 static int __init isolated_cpu_setup(char *str) 5199 { 5200 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5201 cpulist_parse(str, cpu_isolated_map); 5202 return 1; 5203 } 5204 5205 __setup("isolcpus=", isolated_cpu_setup); 5206 5207 static const struct cpumask *cpu_cpu_mask(int cpu) 5208 { 5209 return cpumask_of_node(cpu_to_node(cpu)); 5210 } 5211 5212 struct sd_data { 5213 struct sched_domain **__percpu sd; 5214 struct sched_group **__percpu sg; 5215 struct sched_group_power **__percpu sgp; 5216 }; 5217 5218 struct s_data { 5219 struct sched_domain ** __percpu sd; 5220 struct root_domain *rd; 5221 }; 5222 5223 enum s_alloc { 5224 sa_rootdomain, 5225 sa_sd, 5226 sa_sd_storage, 5227 sa_none, 5228 }; 5229 5230 struct sched_domain_topology_level; 5231 5232 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5233 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5234 5235 #define SDTL_OVERLAP 0x01 5236 5237 struct sched_domain_topology_level { 5238 sched_domain_init_f init; 5239 sched_domain_mask_f mask; 5240 int flags; 5241 int numa_level; 5242 struct sd_data data; 5243 }; 5244 5245 /* 5246 * Build an iteration mask that can exclude certain CPUs from the upwards 5247 * domain traversal. 5248 * 5249 * Asymmetric node setups can result in situations where the domain tree is of 5250 * unequal depth, make sure to skip domains that already cover the entire 5251 * range. 5252 * 5253 * In that case build_sched_domains() will have terminated the iteration early 5254 * and our sibling sd spans will be empty. 
Domains should always include the 5255 * cpu they're built on, so check that. 5256 * 5257 */ 5258 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5259 { 5260 const struct cpumask *span = sched_domain_span(sd); 5261 struct sd_data *sdd = sd->private; 5262 struct sched_domain *sibling; 5263 int i; 5264 5265 for_each_cpu(i, span) { 5266 sibling = *per_cpu_ptr(sdd->sd, i); 5267 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5268 continue; 5269 5270 cpumask_set_cpu(i, sched_group_mask(sg)); 5271 } 5272 } 5273 5274 /* 5275 * Return the canonical balance cpu for this group, this is the first cpu 5276 * of this group that's also in the iteration mask. 5277 */ 5278 int group_balance_cpu(struct sched_group *sg) 5279 { 5280 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5281 } 5282 5283 static int 5284 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5285 { 5286 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5287 const struct cpumask *span = sched_domain_span(sd); 5288 struct cpumask *covered = sched_domains_tmpmask; 5289 struct sd_data *sdd = sd->private; 5290 struct sched_domain *child; 5291 int i; 5292 5293 cpumask_clear(covered); 5294 5295 for_each_cpu(i, span) { 5296 struct cpumask *sg_span; 5297 5298 if (cpumask_test_cpu(i, covered)) 5299 continue; 5300 5301 child = *per_cpu_ptr(sdd->sd, i); 5302 5303 /* See the comment near build_group_mask(). */ 5304 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5305 continue; 5306 5307 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5308 GFP_KERNEL, cpu_to_node(cpu)); 5309 5310 if (!sg) 5311 goto fail; 5312 5313 sg_span = sched_group_cpus(sg); 5314 if (child->child) { 5315 child = child->child; 5316 cpumask_copy(sg_span, sched_domain_span(child)); 5317 } else 5318 cpumask_set_cpu(i, sg_span); 5319 5320 cpumask_or(covered, covered, sg_span); 5321 5322 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5323 if (atomic_inc_return(&sg->sgp->ref) == 1) 5324 build_group_mask(sd, sg); 5325 5326 /* 5327 * Initialize sgp->power such that even if we mess up the 5328 * domains and no possible iteration will get us here, we won't 5329 * die on a /0 trap. 5330 */ 5331 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5332 5333 /* 5334 * Make sure the first group of this domain contains the 5335 * canonical balance cpu. Otherwise the sched_domain iteration 5336 * breaks. See update_sg_lb_stats(). 5337 */ 5338 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5339 group_balance_cpu(sg) == cpu) 5340 groups = sg; 5341 5342 if (!first) 5343 first = sg; 5344 if (last) 5345 last->next = sg; 5346 last = sg; 5347 last->next = first; 5348 } 5349 sd->groups = groups; 5350 5351 return 0; 5352 5353 fail: 5354 free_sched_groups(first, 0); 5355 5356 return -ENOMEM; 5357 } 5358 5359 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5360 { 5361 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5362 struct sched_domain *child = sd->child; 5363 5364 if (child) 5365 cpu = cpumask_first(sched_domain_span(child)); 5366 5367 if (sg) { 5368 *sg = *per_cpu_ptr(sdd->sg, cpu); 5369 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5370 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5371 } 5372 5373 return cpu; 5374 } 5375 5376 /* 5377 * build_sched_groups will build a circular linked list of the groups 5378 * covered by the given span, and will set each group's ->cpumask correctly, 5379 * and ->cpu_power to 0. 
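 *
 * The group list is built with the usual first/last idiom and closed into a
 * ring at the end; stripped to its bones, the pattern used below and in
 * build_overlap_sched_groups() is (illustrative sketch, bookkeeping elided):
 *
 *	first = last = NULL;
 *	for_each_cpu(i, span) {
 *		...obtain the group sg that covers i...
 *		if (!first)
 *			first = sg;
 *		if (last)
 *			last->next = sg;
 *		last = sg;
 *	}
 *	last->next = first;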
5380 * 5381 * Assumes the sched_domain tree is fully constructed 5382 */ 5383 static int 5384 build_sched_groups(struct sched_domain *sd, int cpu) 5385 { 5386 struct sched_group *first = NULL, *last = NULL; 5387 struct sd_data *sdd = sd->private; 5388 const struct cpumask *span = sched_domain_span(sd); 5389 struct cpumask *covered; 5390 int i; 5391 5392 get_group(cpu, sdd, &sd->groups); 5393 atomic_inc(&sd->groups->ref); 5394 5395 if (cpu != cpumask_first(span)) 5396 return 0; 5397 5398 lockdep_assert_held(&sched_domains_mutex); 5399 covered = sched_domains_tmpmask; 5400 5401 cpumask_clear(covered); 5402 5403 for_each_cpu(i, span) { 5404 struct sched_group *sg; 5405 int group, j; 5406 5407 if (cpumask_test_cpu(i, covered)) 5408 continue; 5409 5410 group = get_group(i, sdd, &sg); 5411 cpumask_clear(sched_group_cpus(sg)); 5412 sg->sgp->power = 0; 5413 cpumask_setall(sched_group_mask(sg)); 5414 5415 for_each_cpu(j, span) { 5416 if (get_group(j, sdd, NULL) != group) 5417 continue; 5418 5419 cpumask_set_cpu(j, covered); 5420 cpumask_set_cpu(j, sched_group_cpus(sg)); 5421 } 5422 5423 if (!first) 5424 first = sg; 5425 if (last) 5426 last->next = sg; 5427 last = sg; 5428 } 5429 last->next = first; 5430 5431 return 0; 5432 } 5433 5434 /* 5435 * Initialize sched groups cpu_power. 5436 * 5437 * cpu_power indicates the capacity of sched group, which is used while 5438 * distributing the load between different sched groups in a sched domain. 5439 * Typically cpu_power for all the groups in a sched domain will be same unless 5440 * there are asymmetries in the topology. If there are asymmetries, group 5441 * having more cpu_power will pickup more load compared to the group having 5442 * less cpu_power. 5443 */ 5444 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5445 { 5446 struct sched_group *sg = sd->groups; 5447 5448 WARN_ON(!sg); 5449 5450 do { 5451 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5452 sg = sg->next; 5453 } while (sg != sd->groups); 5454 5455 if (cpu != group_balance_cpu(sg)) 5456 return; 5457 5458 update_group_power(sd, cpu); 5459 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5460 } 5461 5462 int __weak arch_sd_sibling_asym_packing(void) 5463 { 5464 return 0*SD_ASYM_PACKING; 5465 } 5466 5467 /* 5468 * Initializers for schedule domains 5469 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5470 */ 5471 5472 #ifdef CONFIG_SCHED_DEBUG 5473 # define SD_INIT_NAME(sd, type) sd->name = #type 5474 #else 5475 # define SD_INIT_NAME(sd, type) do { } while (0) 5476 #endif 5477 5478 #define SD_INIT_FUNC(type) \ 5479 static noinline struct sched_domain * \ 5480 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5481 { \ 5482 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5483 *sd = SD_##type##_INIT; \ 5484 SD_INIT_NAME(sd, type); \ 5485 sd->private = &tl->data; \ 5486 return sd; \ 5487 } 5488 5489 SD_INIT_FUNC(CPU) 5490 #ifdef CONFIG_SCHED_SMT 5491 SD_INIT_FUNC(SIBLING) 5492 #endif 5493 #ifdef CONFIG_SCHED_MC 5494 SD_INIT_FUNC(MC) 5495 #endif 5496 #ifdef CONFIG_SCHED_BOOK 5497 SD_INIT_FUNC(BOOK) 5498 #endif 5499 5500 static int default_relax_domain_level = -1; 5501 int sched_domain_level_max; 5502 5503 static int __init setup_relax_domain_level(char *str) 5504 { 5505 if (kstrtoint(str, 0, &default_relax_domain_level)) 5506 pr_warn("Unable to set relax_domain_level\n"); 5507 5508 return 1; 5509 } 5510 __setup("relax_domain_level=", setup_relax_domain_level); 5511 5512 static void 
set_domain_attribute(struct sched_domain *sd, 5513 struct sched_domain_attr *attr) 5514 { 5515 int request; 5516 5517 if (!attr || attr->relax_domain_level < 0) { 5518 if (default_relax_domain_level < 0) 5519 return; 5520 else 5521 request = default_relax_domain_level; 5522 } else 5523 request = attr->relax_domain_level; 5524 if (request < sd->level) { 5525 /* turn off idle balance on this domain */ 5526 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5527 } else { 5528 /* turn on idle balance on this domain */ 5529 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5530 } 5531 } 5532 5533 static void __sdt_free(const struct cpumask *cpu_map); 5534 static int __sdt_alloc(const struct cpumask *cpu_map); 5535 5536 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5537 const struct cpumask *cpu_map) 5538 { 5539 switch (what) { 5540 case sa_rootdomain: 5541 if (!atomic_read(&d->rd->refcount)) 5542 free_rootdomain(&d->rd->rcu); /* fall through */ 5543 case sa_sd: 5544 free_percpu(d->sd); /* fall through */ 5545 case sa_sd_storage: 5546 __sdt_free(cpu_map); /* fall through */ 5547 case sa_none: 5548 break; 5549 } 5550 } 5551 5552 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5553 const struct cpumask *cpu_map) 5554 { 5555 memset(d, 0, sizeof(*d)); 5556 5557 if (__sdt_alloc(cpu_map)) 5558 return sa_sd_storage; 5559 d->sd = alloc_percpu(struct sched_domain *); 5560 if (!d->sd) 5561 return sa_sd_storage; 5562 d->rd = alloc_rootdomain(); 5563 if (!d->rd) 5564 return sa_sd; 5565 return sa_rootdomain; 5566 } 5567 5568 /* 5569 * NULL the sd_data elements we've used to build the sched_domain and 5570 * sched_group structure so that the subsequent __free_domain_allocs() 5571 * will not free the data we're using. 5572 */ 5573 static void claim_allocations(int cpu, struct sched_domain *sd) 5574 { 5575 struct sd_data *sdd = sd->private; 5576 5577 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5578 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5579 5580 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5581 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5582 5583 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5584 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5585 } 5586 5587 #ifdef CONFIG_SCHED_SMT 5588 static const struct cpumask *cpu_smt_mask(int cpu) 5589 { 5590 return topology_thread_cpumask(cpu); 5591 } 5592 #endif 5593 5594 /* 5595 * Topology list, bottom-up. 
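 *
 * With all options enabled, the per-cpu hierarchy built from this table
 * nests roughly as:
 *
 *	SIBLING (SMT threads) -> MC (cores sharing a cache) -> BOOK ->
 *	CPU (one NUMA node) -> any NUMA levels appended by sched_init_numa()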
5596 */ 5597 static struct sched_domain_topology_level default_topology[] = { 5598 #ifdef CONFIG_SCHED_SMT 5599 { sd_init_SIBLING, cpu_smt_mask, }, 5600 #endif 5601 #ifdef CONFIG_SCHED_MC 5602 { sd_init_MC, cpu_coregroup_mask, }, 5603 #endif 5604 #ifdef CONFIG_SCHED_BOOK 5605 { sd_init_BOOK, cpu_book_mask, }, 5606 #endif 5607 { sd_init_CPU, cpu_cpu_mask, }, 5608 { NULL, }, 5609 }; 5610 5611 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5612 5613 #define for_each_sd_topology(tl) \ 5614 for (tl = sched_domain_topology; tl->init; tl++) 5615 5616 #ifdef CONFIG_NUMA 5617 5618 static int sched_domains_numa_levels; 5619 static int *sched_domains_numa_distance; 5620 static struct cpumask ***sched_domains_numa_masks; 5621 static int sched_domains_curr_level; 5622 5623 static inline int sd_local_flags(int level) 5624 { 5625 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5626 return 0; 5627 5628 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5629 } 5630 5631 static struct sched_domain * 5632 sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 5633 { 5634 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 5635 int level = tl->numa_level; 5636 int sd_weight = cpumask_weight( 5637 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 5638 5639 *sd = (struct sched_domain){ 5640 .min_interval = sd_weight, 5641 .max_interval = 2*sd_weight, 5642 .busy_factor = 32, 5643 .imbalance_pct = 125, 5644 .cache_nice_tries = 2, 5645 .busy_idx = 3, 5646 .idle_idx = 2, 5647 .newidle_idx = 0, 5648 .wake_idx = 0, 5649 .forkexec_idx = 0, 5650 5651 .flags = 1*SD_LOAD_BALANCE 5652 | 1*SD_BALANCE_NEWIDLE 5653 | 0*SD_BALANCE_EXEC 5654 | 0*SD_BALANCE_FORK 5655 | 0*SD_BALANCE_WAKE 5656 | 0*SD_WAKE_AFFINE 5657 | 0*SD_SHARE_CPUPOWER 5658 | 0*SD_SHARE_PKG_RESOURCES 5659 | 1*SD_SERIALIZE 5660 | 0*SD_PREFER_SIBLING 5661 | sd_local_flags(level) 5662 , 5663 .last_balance = jiffies, 5664 .balance_interval = sd_weight, 5665 }; 5666 SD_INIT_NAME(sd, NUMA); 5667 sd->private = &tl->data; 5668 5669 /* 5670 * Ugly hack to pass state to sd_numa_mask()... 
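 * The mask callback (sched_domain_mask_f) only gets a cpu argument, so the
 * NUMA level being instantiated has to be handed to sd_numa_mask() through
 * this file-scope variable. That is safe because domain construction is
 * serialized by sched_domains_mutex.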
5671 */ 5672 sched_domains_curr_level = tl->numa_level; 5673 5674 return sd; 5675 } 5676 5677 static const struct cpumask *sd_numa_mask(int cpu) 5678 { 5679 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 5680 } 5681 5682 static void sched_numa_warn(const char *str) 5683 { 5684 static int done = false; 5685 int i,j; 5686 5687 if (done) 5688 return; 5689 5690 done = true; 5691 5692 printk(KERN_WARNING "ERROR: %s\n\n", str); 5693 5694 for (i = 0; i < nr_node_ids; i++) { 5695 printk(KERN_WARNING " "); 5696 for (j = 0; j < nr_node_ids; j++) 5697 printk(KERN_CONT "%02d ", node_distance(i,j)); 5698 printk(KERN_CONT "\n"); 5699 } 5700 printk(KERN_WARNING "\n"); 5701 } 5702 5703 static bool find_numa_distance(int distance) 5704 { 5705 int i; 5706 5707 if (distance == node_distance(0, 0)) 5708 return true; 5709 5710 for (i = 0; i < sched_domains_numa_levels; i++) { 5711 if (sched_domains_numa_distance[i] == distance) 5712 return true; 5713 } 5714 5715 return false; 5716 } 5717 5718 static void sched_init_numa(void) 5719 { 5720 int next_distance, curr_distance = node_distance(0, 0); 5721 struct sched_domain_topology_level *tl; 5722 int level = 0; 5723 int i, j, k; 5724 5725 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 5726 if (!sched_domains_numa_distance) 5727 return; 5728 5729 /* 5730 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 5731 * unique distances in the node_distance() table. 5732 * 5733 * Assumes node_distance(0,j) includes all distances in 5734 * node_distance(i,j) in order to avoid cubic time. 5735 */ 5736 next_distance = curr_distance; 5737 for (i = 0; i < nr_node_ids; i++) { 5738 for (j = 0; j < nr_node_ids; j++) { 5739 for (k = 0; k < nr_node_ids; k++) { 5740 int distance = node_distance(i, k); 5741 5742 if (distance > curr_distance && 5743 (distance < next_distance || 5744 next_distance == curr_distance)) 5745 next_distance = distance; 5746 5747 /* 5748 * While not a strong assumption it would be nice to know 5749 * about cases where if node A is connected to B, B is not 5750 * equally connected to A. 5751 */ 5752 if (sched_debug() && node_distance(k, i) != distance) 5753 sched_numa_warn("Node-distance not symmetric"); 5754 5755 if (sched_debug() && i && !find_numa_distance(distance)) 5756 sched_numa_warn("Node-0 not representative"); 5757 } 5758 if (next_distance != curr_distance) { 5759 sched_domains_numa_distance[level++] = next_distance; 5760 sched_domains_numa_levels = level; 5761 curr_distance = next_distance; 5762 } else break; 5763 } 5764 5765 /* 5766 * In case of sched_debug() we verify the above assumption. 5767 */ 5768 if (!sched_debug()) 5769 break; 5770 } 5771 /* 5772 * 'level' contains the number of unique distances, excluding the 5773 * identity distance node_distance(i,i). 5774 * 5775 * The sched_domains_numa_distance[] array includes the actual distance 5776 * numbers. 5777 */ 5778 5779 /* 5780 * Here, we should temporarily reset sched_domains_numa_levels to 0. 5781 * If it fails to allocate memory for array sched_domains_numa_masks[][], 5782 * the array will contain less then 'level' members. This could be 5783 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 5784 * in other functions. 5785 * 5786 * We reset it to 'level' at the end of this function. 
5787 */ 5788 sched_domains_numa_levels = 0; 5789 5790 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 5791 if (!sched_domains_numa_masks) 5792 return; 5793 5794 /* 5795 * Now for each level, construct a mask per node which contains all 5796 * cpus of nodes that are that many hops away from us. 5797 */ 5798 for (i = 0; i < level; i++) { 5799 sched_domains_numa_masks[i] = 5800 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 5801 if (!sched_domains_numa_masks[i]) 5802 return; 5803 5804 for (j = 0; j < nr_node_ids; j++) { 5805 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 5806 if (!mask) 5807 return; 5808 5809 sched_domains_numa_masks[i][j] = mask; 5810 5811 for (k = 0; k < nr_node_ids; k++) { 5812 if (node_distance(j, k) > sched_domains_numa_distance[i]) 5813 continue; 5814 5815 cpumask_or(mask, mask, cpumask_of_node(k)); 5816 } 5817 } 5818 } 5819 5820 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 5821 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 5822 if (!tl) 5823 return; 5824 5825 /* 5826 * Copy the default topology bits.. 5827 */ 5828 for (i = 0; default_topology[i].init; i++) 5829 tl[i] = default_topology[i]; 5830 5831 /* 5832 * .. and append 'j' levels of NUMA goodness. 5833 */ 5834 for (j = 0; j < level; i++, j++) { 5835 tl[i] = (struct sched_domain_topology_level){ 5836 .init = sd_numa_init, 5837 .mask = sd_numa_mask, 5838 .flags = SDTL_OVERLAP, 5839 .numa_level = j, 5840 }; 5841 } 5842 5843 sched_domain_topology = tl; 5844 5845 sched_domains_numa_levels = level; 5846 } 5847 5848 static void sched_domains_numa_masks_set(int cpu) 5849 { 5850 int i, j; 5851 int node = cpu_to_node(cpu); 5852 5853 for (i = 0; i < sched_domains_numa_levels; i++) { 5854 for (j = 0; j < nr_node_ids; j++) { 5855 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 5856 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 5857 } 5858 } 5859 } 5860 5861 static void sched_domains_numa_masks_clear(int cpu) 5862 { 5863 int i, j; 5864 for (i = 0; i < sched_domains_numa_levels; i++) { 5865 for (j = 0; j < nr_node_ids; j++) 5866 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 5867 } 5868 } 5869 5870 /* 5871 * Update sched_domains_numa_masks[level][node] array when new cpus 5872 * are onlined. 
5873 */ 5874 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 5875 unsigned long action, 5876 void *hcpu) 5877 { 5878 int cpu = (long)hcpu; 5879 5880 switch (action & ~CPU_TASKS_FROZEN) { 5881 case CPU_ONLINE: 5882 sched_domains_numa_masks_set(cpu); 5883 break; 5884 5885 case CPU_DEAD: 5886 sched_domains_numa_masks_clear(cpu); 5887 break; 5888 5889 default: 5890 return NOTIFY_DONE; 5891 } 5892 5893 return NOTIFY_OK; 5894 } 5895 #else 5896 static inline void sched_init_numa(void) 5897 { 5898 } 5899 5900 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 5901 unsigned long action, 5902 void *hcpu) 5903 { 5904 return 0; 5905 } 5906 #endif /* CONFIG_NUMA */ 5907 5908 static int __sdt_alloc(const struct cpumask *cpu_map) 5909 { 5910 struct sched_domain_topology_level *tl; 5911 int j; 5912 5913 for_each_sd_topology(tl) { 5914 struct sd_data *sdd = &tl->data; 5915 5916 sdd->sd = alloc_percpu(struct sched_domain *); 5917 if (!sdd->sd) 5918 return -ENOMEM; 5919 5920 sdd->sg = alloc_percpu(struct sched_group *); 5921 if (!sdd->sg) 5922 return -ENOMEM; 5923 5924 sdd->sgp = alloc_percpu(struct sched_group_power *); 5925 if (!sdd->sgp) 5926 return -ENOMEM; 5927 5928 for_each_cpu(j, cpu_map) { 5929 struct sched_domain *sd; 5930 struct sched_group *sg; 5931 struct sched_group_power *sgp; 5932 5933 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 5934 GFP_KERNEL, cpu_to_node(j)); 5935 if (!sd) 5936 return -ENOMEM; 5937 5938 *per_cpu_ptr(sdd->sd, j) = sd; 5939 5940 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5941 GFP_KERNEL, cpu_to_node(j)); 5942 if (!sg) 5943 return -ENOMEM; 5944 5945 sg->next = sg; 5946 5947 *per_cpu_ptr(sdd->sg, j) = sg; 5948 5949 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 5950 GFP_KERNEL, cpu_to_node(j)); 5951 if (!sgp) 5952 return -ENOMEM; 5953 5954 *per_cpu_ptr(sdd->sgp, j) = sgp; 5955 } 5956 } 5957 5958 return 0; 5959 } 5960 5961 static void __sdt_free(const struct cpumask *cpu_map) 5962 { 5963 struct sched_domain_topology_level *tl; 5964 int j; 5965 5966 for_each_sd_topology(tl) { 5967 struct sd_data *sdd = &tl->data; 5968 5969 for_each_cpu(j, cpu_map) { 5970 struct sched_domain *sd; 5971 5972 if (sdd->sd) { 5973 sd = *per_cpu_ptr(sdd->sd, j); 5974 if (sd && (sd->flags & SD_OVERLAP)) 5975 free_sched_groups(sd->groups, 0); 5976 kfree(*per_cpu_ptr(sdd->sd, j)); 5977 } 5978 5979 if (sdd->sg) 5980 kfree(*per_cpu_ptr(sdd->sg, j)); 5981 if (sdd->sgp) 5982 kfree(*per_cpu_ptr(sdd->sgp, j)); 5983 } 5984 free_percpu(sdd->sd); 5985 sdd->sd = NULL; 5986 free_percpu(sdd->sg); 5987 sdd->sg = NULL; 5988 free_percpu(sdd->sgp); 5989 sdd->sgp = NULL; 5990 } 5991 } 5992 5993 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5994 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 5995 struct sched_domain *child, int cpu) 5996 { 5997 struct sched_domain *sd = tl->init(tl, cpu); 5998 if (!sd) 5999 return child; 6000 6001 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6002 if (child) { 6003 sd->level = child->level + 1; 6004 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6005 child->parent = sd; 6006 sd->child = child; 6007 } 6008 set_domain_attribute(sd, attr); 6009 6010 return sd; 6011 } 6012 6013 /* 6014 * Build sched domains for a given set of cpus and attach the sched domains 6015 * to the individual cpus 6016 */ 6017 static int build_sched_domains(const struct cpumask *cpu_map, 6018 struct sched_domain_attr *attr) 6019 { 6020 
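	/*
	 * Rough order of operations below: allocate the per-cpu sd pointers
	 * and a root_domain, build every CPU's domain hierarchy from the
	 * topology levels, then the groups for each domain, initialize group
	 * power, and finally attach the finished hierarchies under
	 * rcu_read_lock().
	 */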
enum s_alloc alloc_state; 6021 struct sched_domain *sd; 6022 struct s_data d; 6023 int i, ret = -ENOMEM; 6024 6025 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6026 if (alloc_state != sa_rootdomain) 6027 goto error; 6028 6029 /* Set up domains for cpus specified by the cpu_map. */ 6030 for_each_cpu(i, cpu_map) { 6031 struct sched_domain_topology_level *tl; 6032 6033 sd = NULL; 6034 for_each_sd_topology(tl) { 6035 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6036 if (tl == sched_domain_topology) 6037 *per_cpu_ptr(d.sd, i) = sd; 6038 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6039 sd->flags |= SD_OVERLAP; 6040 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6041 break; 6042 } 6043 } 6044 6045 /* Build the groups for the domains */ 6046 for_each_cpu(i, cpu_map) { 6047 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6048 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6049 if (sd->flags & SD_OVERLAP) { 6050 if (build_overlap_sched_groups(sd, i)) 6051 goto error; 6052 } else { 6053 if (build_sched_groups(sd, i)) 6054 goto error; 6055 } 6056 } 6057 } 6058 6059 /* Calculate CPU power for physical packages and nodes */ 6060 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6061 if (!cpumask_test_cpu(i, cpu_map)) 6062 continue; 6063 6064 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6065 claim_allocations(i, sd); 6066 init_sched_groups_power(i, sd); 6067 } 6068 } 6069 6070 /* Attach the domains */ 6071 rcu_read_lock(); 6072 for_each_cpu(i, cpu_map) { 6073 sd = *per_cpu_ptr(d.sd, i); 6074 cpu_attach_domain(sd, d.rd, i); 6075 } 6076 rcu_read_unlock(); 6077 6078 ret = 0; 6079 error: 6080 __free_domain_allocs(&d, alloc_state, cpu_map); 6081 return ret; 6082 } 6083 6084 static cpumask_var_t *doms_cur; /* current sched domains */ 6085 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6086 static struct sched_domain_attr *dattr_cur; 6087 /* attribues of custom domains in 'doms_cur' */ 6088 6089 /* 6090 * Special case: If a kmalloc of a doms_cur partition (array of 6091 * cpumask) fails, then fallback to a single sched domain, 6092 * as determined by the single cpumask fallback_doms. 6093 */ 6094 static cpumask_var_t fallback_doms; 6095 6096 /* 6097 * arch_update_cpu_topology lets virtualized architectures update the 6098 * cpu core maps. It is supposed to return 1 if the topology changed 6099 * or 0 if it stayed the same. 6100 */ 6101 int __attribute__((weak)) arch_update_cpu_topology(void) 6102 { 6103 return 0; 6104 } 6105 6106 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6107 { 6108 int i; 6109 cpumask_var_t *doms; 6110 6111 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6112 if (!doms) 6113 return NULL; 6114 for (i = 0; i < ndoms; i++) { 6115 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6116 free_sched_domains(doms, i); 6117 return NULL; 6118 } 6119 } 6120 return doms; 6121 } 6122 6123 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6124 { 6125 unsigned int i; 6126 for (i = 0; i < ndoms; i++) 6127 free_cpumask_var(doms[i]); 6128 kfree(doms); 6129 } 6130 6131 /* 6132 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6133 * For now this just excludes isolated cpus, but could be used to 6134 * exclude other special cases in the future. 
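 *
 * This runs once, from sched_init_smp(), with the hotplug lock held. If
 * allocating doms_cur fails it falls back to the static fallback_doms
 * declared above, so the doms_cur bookkeeping itself never depends on that
 * allocation succeeding.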
6135 */ 6136 static int init_sched_domains(const struct cpumask *cpu_map) 6137 { 6138 int err; 6139 6140 arch_update_cpu_topology(); 6141 ndoms_cur = 1; 6142 doms_cur = alloc_sched_domains(ndoms_cur); 6143 if (!doms_cur) 6144 doms_cur = &fallback_doms; 6145 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6146 err = build_sched_domains(doms_cur[0], NULL); 6147 register_sched_domain_sysctl(); 6148 6149 return err; 6150 } 6151 6152 /* 6153 * Detach sched domains from a group of cpus specified in cpu_map 6154 * These cpus will now be attached to the NULL domain 6155 */ 6156 static void detach_destroy_domains(const struct cpumask *cpu_map) 6157 { 6158 int i; 6159 6160 rcu_read_lock(); 6161 for_each_cpu(i, cpu_map) 6162 cpu_attach_domain(NULL, &def_root_domain, i); 6163 rcu_read_unlock(); 6164 } 6165 6166 /* handle null as "default" */ 6167 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6168 struct sched_domain_attr *new, int idx_new) 6169 { 6170 struct sched_domain_attr tmp; 6171 6172 /* fast path */ 6173 if (!new && !cur) 6174 return 1; 6175 6176 tmp = SD_ATTR_INIT; 6177 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6178 new ? (new + idx_new) : &tmp, 6179 sizeof(struct sched_domain_attr)); 6180 } 6181 6182 /* 6183 * Partition sched domains as specified by the 'ndoms_new' 6184 * cpumasks in the array doms_new[] of cpumasks. This compares 6185 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6186 * It destroys each deleted domain and builds each new domain. 6187 * 6188 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6189 * The masks don't intersect (don't overlap.) We should setup one 6190 * sched domain for each mask. CPUs not in any of the cpumasks will 6191 * not be load balanced. If the same cpumask appears both in the 6192 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6193 * it as it is. 6194 * 6195 * The passed in 'doms_new' should be allocated using 6196 * alloc_sched_domains. This routine takes ownership of it and will 6197 * free_sched_domains it when done with it. If the caller failed the 6198 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6199 * and partition_sched_domains() will fallback to the single partition 6200 * 'fallback_doms', it also forces the domains to be rebuilt. 6201 * 6202 * If doms_new == NULL it will be replaced with cpu_online_mask. 6203 * ndoms_new == 0 is a special case for destroying existing domains, 6204 * and it will not create the default domain. 6205 * 6206 * Call with hotplug lock held 6207 */ 6208 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6209 struct sched_domain_attr *dattr_new) 6210 { 6211 int i, j, n; 6212 int new_topology; 6213 6214 mutex_lock(&sched_domains_mutex); 6215 6216 /* always unregister in case we don't destroy any domains */ 6217 unregister_sched_domain_sysctl(); 6218 6219 /* Let architecture update cpu core mappings. */ 6220 new_topology = arch_update_cpu_topology(); 6221 6222 n = doms_new ? 
ndoms_new : 0; 6223 6224 /* Destroy deleted domains */ 6225 for (i = 0; i < ndoms_cur; i++) { 6226 for (j = 0; j < n && !new_topology; j++) { 6227 if (cpumask_equal(doms_cur[i], doms_new[j]) 6228 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6229 goto match1; 6230 } 6231 /* no match - a current sched domain not in new doms_new[] */ 6232 detach_destroy_domains(doms_cur[i]); 6233 match1: 6234 ; 6235 } 6236 6237 if (doms_new == NULL) { 6238 ndoms_cur = 0; 6239 doms_new = &fallback_doms; 6240 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6241 WARN_ON_ONCE(dattr_new); 6242 } 6243 6244 /* Build new domains */ 6245 for (i = 0; i < ndoms_new; i++) { 6246 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6247 if (cpumask_equal(doms_new[i], doms_cur[j]) 6248 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6249 goto match2; 6250 } 6251 /* no match - add a new doms_new */ 6252 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6253 match2: 6254 ; 6255 } 6256 6257 /* Remember the new sched domains */ 6258 if (doms_cur != &fallback_doms) 6259 free_sched_domains(doms_cur, ndoms_cur); 6260 kfree(dattr_cur); /* kfree(NULL) is safe */ 6261 doms_cur = doms_new; 6262 dattr_cur = dattr_new; 6263 ndoms_cur = ndoms_new; 6264 6265 register_sched_domain_sysctl(); 6266 6267 mutex_unlock(&sched_domains_mutex); 6268 } 6269 6270 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6271 6272 /* 6273 * Update cpusets according to cpu_active mask. If cpusets are 6274 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6275 * around partition_sched_domains(). 6276 * 6277 * If we come here as part of a suspend/resume, don't touch cpusets because we 6278 * want to restore it back to its original state upon resume anyway. 6279 */ 6280 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6281 void *hcpu) 6282 { 6283 switch (action) { 6284 case CPU_ONLINE_FROZEN: 6285 case CPU_DOWN_FAILED_FROZEN: 6286 6287 /* 6288 * num_cpus_frozen tracks how many CPUs are involved in suspend 6289 * resume sequence. As long as this is not the last online 6290 * operation in the resume sequence, just build a single sched 6291 * domain, ignoring cpusets. 6292 */ 6293 num_cpus_frozen--; 6294 if (likely(num_cpus_frozen)) { 6295 partition_sched_domains(1, NULL, NULL); 6296 break; 6297 } 6298 6299 /* 6300 * This is the last CPU online operation. So fall through and 6301 * restore the original sched domains by considering the 6302 * cpuset configurations. 
6303 */ 6304 6305 case CPU_ONLINE: 6306 case CPU_DOWN_FAILED: 6307 cpuset_update_active_cpus(true); 6308 break; 6309 default: 6310 return NOTIFY_DONE; 6311 } 6312 return NOTIFY_OK; 6313 } 6314 6315 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6316 void *hcpu) 6317 { 6318 switch (action) { 6319 case CPU_DOWN_PREPARE: 6320 cpuset_update_active_cpus(false); 6321 break; 6322 case CPU_DOWN_PREPARE_FROZEN: 6323 num_cpus_frozen++; 6324 partition_sched_domains(1, NULL, NULL); 6325 break; 6326 default: 6327 return NOTIFY_DONE; 6328 } 6329 return NOTIFY_OK; 6330 } 6331 6332 void __init sched_init_smp(void) 6333 { 6334 cpumask_var_t non_isolated_cpus; 6335 6336 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6337 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6338 6339 sched_init_numa(); 6340 6341 get_online_cpus(); 6342 mutex_lock(&sched_domains_mutex); 6343 init_sched_domains(cpu_active_mask); 6344 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6345 if (cpumask_empty(non_isolated_cpus)) 6346 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6347 mutex_unlock(&sched_domains_mutex); 6348 put_online_cpus(); 6349 6350 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6351 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6352 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6353 6354 init_hrtick(); 6355 6356 /* Move init over to a non-isolated CPU */ 6357 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6358 BUG(); 6359 sched_init_granularity(); 6360 free_cpumask_var(non_isolated_cpus); 6361 6362 init_sched_rt_class(); 6363 } 6364 #else 6365 void __init sched_init_smp(void) 6366 { 6367 sched_init_granularity(); 6368 } 6369 #endif /* CONFIG_SMP */ 6370 6371 const_debug unsigned int sysctl_timer_migration = 1; 6372 6373 int in_sched_functions(unsigned long addr) 6374 { 6375 return in_lock_functions(addr) || 6376 (addr >= (unsigned long)__sched_text_start 6377 && addr < (unsigned long)__sched_text_end); 6378 } 6379 6380 #ifdef CONFIG_CGROUP_SCHED 6381 /* 6382 * Default task group. 6383 * Every task in system belongs to this group at bootup. 
6384 */ 6385 struct task_group root_task_group; 6386 LIST_HEAD(task_groups); 6387 #endif 6388 6389 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6390 6391 void __init sched_init(void) 6392 { 6393 int i, j; 6394 unsigned long alloc_size = 0, ptr; 6395 6396 #ifdef CONFIG_FAIR_GROUP_SCHED 6397 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6398 #endif 6399 #ifdef CONFIG_RT_GROUP_SCHED 6400 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6401 #endif 6402 #ifdef CONFIG_CPUMASK_OFFSTACK 6403 alloc_size += num_possible_cpus() * cpumask_size(); 6404 #endif 6405 if (alloc_size) { 6406 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6407 6408 #ifdef CONFIG_FAIR_GROUP_SCHED 6409 root_task_group.se = (struct sched_entity **)ptr; 6410 ptr += nr_cpu_ids * sizeof(void **); 6411 6412 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6413 ptr += nr_cpu_ids * sizeof(void **); 6414 6415 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6416 #ifdef CONFIG_RT_GROUP_SCHED 6417 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6418 ptr += nr_cpu_ids * sizeof(void **); 6419 6420 root_task_group.rt_rq = (struct rt_rq **)ptr; 6421 ptr += nr_cpu_ids * sizeof(void **); 6422 6423 #endif /* CONFIG_RT_GROUP_SCHED */ 6424 #ifdef CONFIG_CPUMASK_OFFSTACK 6425 for_each_possible_cpu(i) { 6426 per_cpu(load_balance_mask, i) = (void *)ptr; 6427 ptr += cpumask_size(); 6428 } 6429 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6430 } 6431 6432 #ifdef CONFIG_SMP 6433 init_defrootdomain(); 6434 #endif 6435 6436 init_rt_bandwidth(&def_rt_bandwidth, 6437 global_rt_period(), global_rt_runtime()); 6438 6439 #ifdef CONFIG_RT_GROUP_SCHED 6440 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6441 global_rt_period(), global_rt_runtime()); 6442 #endif /* CONFIG_RT_GROUP_SCHED */ 6443 6444 #ifdef CONFIG_CGROUP_SCHED 6445 list_add(&root_task_group.list, &task_groups); 6446 INIT_LIST_HEAD(&root_task_group.children); 6447 INIT_LIST_HEAD(&root_task_group.siblings); 6448 autogroup_init(&init_task); 6449 6450 #endif /* CONFIG_CGROUP_SCHED */ 6451 6452 for_each_possible_cpu(i) { 6453 struct rq *rq; 6454 6455 rq = cpu_rq(i); 6456 raw_spin_lock_init(&rq->lock); 6457 rq->nr_running = 0; 6458 rq->calc_load_active = 0; 6459 rq->calc_load_update = jiffies + LOAD_FREQ; 6460 init_cfs_rq(&rq->cfs); 6461 init_rt_rq(&rq->rt, rq); 6462 #ifdef CONFIG_FAIR_GROUP_SCHED 6463 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6464 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6465 /* 6466 * How much cpu bandwidth does root_task_group get? 6467 * 6468 * In the case of task-groups formed through the cgroup filesystem, it 6469 * gets 100% of the cpu resources in the system. This overall 6470 * system cpu resource is divided among the tasks of 6471 * root_task_group and its child task-groups in a fair manner, 6472 * based on each entity's (task or task-group's) weight 6473 * (se->load.weight). 6474 * 6475 * In other words, if root_task_group has 10 tasks (of weight 6476 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6477 * then A0's share of the cpu resource is: 6478 * 6479 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6480 * 6481 * We achieve this by letting root_task_group's tasks sit 6482 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
6483 */ 6484 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6485 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6486 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6487 6488 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6489 #ifdef CONFIG_RT_GROUP_SCHED 6490 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6491 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6492 #endif 6493 6494 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6495 rq->cpu_load[j] = 0; 6496 6497 rq->last_load_update_tick = jiffies; 6498 6499 #ifdef CONFIG_SMP 6500 rq->sd = NULL; 6501 rq->rd = NULL; 6502 rq->cpu_power = SCHED_POWER_SCALE; 6503 rq->post_schedule = 0; 6504 rq->active_balance = 0; 6505 rq->next_balance = jiffies; 6506 rq->push_cpu = 0; 6507 rq->cpu = i; 6508 rq->online = 0; 6509 rq->idle_stamp = 0; 6510 rq->avg_idle = 2*sysctl_sched_migration_cost; 6511 6512 INIT_LIST_HEAD(&rq->cfs_tasks); 6513 6514 rq_attach_root(rq, &def_root_domain); 6515 #ifdef CONFIG_NO_HZ_COMMON 6516 rq->nohz_flags = 0; 6517 #endif 6518 #ifdef CONFIG_NO_HZ_FULL 6519 rq->last_sched_tick = 0; 6520 #endif 6521 #endif 6522 init_rq_hrtick(rq); 6523 atomic_set(&rq->nr_iowait, 0); 6524 } 6525 6526 set_load_weight(&init_task); 6527 6528 #ifdef CONFIG_PREEMPT_NOTIFIERS 6529 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6530 #endif 6531 6532 #ifdef CONFIG_RT_MUTEXES 6533 plist_head_init(&init_task.pi_waiters); 6534 #endif 6535 6536 /* 6537 * The boot idle thread does lazy MMU switching as well: 6538 */ 6539 atomic_inc(&init_mm.mm_count); 6540 enter_lazy_tlb(&init_mm, current); 6541 6542 /* 6543 * Make us the idle thread. Technically, schedule() should not be 6544 * called from this thread, however somewhere below it might be, 6545 * but because we are the idle thread, we just pick up running again 6546 * when this runqueue becomes "idle". 6547 */ 6548 init_idle(current, smp_processor_id()); 6549 6550 calc_load_update = jiffies + LOAD_FREQ; 6551 6552 /* 6553 * During early bootup we pretend to be a normal task: 6554 */ 6555 current->sched_class = &fair_sched_class; 6556 6557 #ifdef CONFIG_SMP 6558 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6559 /* May be allocated at isolcpus cmdline parse time */ 6560 if (cpu_isolated_map == NULL) 6561 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6562 idle_thread_set_boot_cpu(); 6563 #endif 6564 init_sched_fair_class(); 6565 6566 scheduler_running = 1; 6567 } 6568 6569 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6570 static inline int preempt_count_equals(int preempt_offset) 6571 { 6572 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6573 6574 return (nested == preempt_offset); 6575 } 6576 6577 void __might_sleep(const char *file, int line, int preempt_offset) 6578 { 6579 static unsigned long prev_jiffy; /* ratelimiting */ 6580 6581 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ 6582 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6583 system_state != SYSTEM_RUNNING || oops_in_progress) 6584 return; 6585 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6586 return; 6587 prev_jiffy = jiffies; 6588 6589 printk(KERN_ERR 6590 "BUG: sleeping function called from invalid context at %s:%d\n", 6591 file, line); 6592 printk(KERN_ERR 6593 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6594 in_atomic(), irqs_disabled(), 6595 current->pid, current->comm); 6596 6597 debug_show_held_locks(current); 6598 if (irqs_disabled()) 6599 print_irqtrace_events(current); 6600 dump_stack(); 6601 } 6602 EXPORT_SYMBOL(__might_sleep); 6603 #endif 6604 6605 #ifdef CONFIG_MAGIC_SYSRQ 6606 static void normalize_task(struct rq *rq, struct task_struct *p) 6607 { 6608 const struct sched_class *prev_class = p->sched_class; 6609 int old_prio = p->prio; 6610 int on_rq; 6611 6612 on_rq = p->on_rq; 6613 if (on_rq) 6614 dequeue_task(rq, p, 0); 6615 __setscheduler(rq, p, SCHED_NORMAL, 0); 6616 if (on_rq) { 6617 enqueue_task(rq, p, 0); 6618 resched_task(rq->curr); 6619 } 6620 6621 check_class_changed(rq, p, prev_class, old_prio); 6622 } 6623 6624 void normalize_rt_tasks(void) 6625 { 6626 struct task_struct *g, *p; 6627 unsigned long flags; 6628 struct rq *rq; 6629 6630 read_lock_irqsave(&tasklist_lock, flags); 6631 do_each_thread(g, p) { 6632 /* 6633 * Only normalize user tasks: 6634 */ 6635 if (!p->mm) 6636 continue; 6637 6638 p->se.exec_start = 0; 6639 #ifdef CONFIG_SCHEDSTATS 6640 p->se.statistics.wait_start = 0; 6641 p->se.statistics.sleep_start = 0; 6642 p->se.statistics.block_start = 0; 6643 #endif 6644 6645 if (!rt_task(p)) { 6646 /* 6647 * Renice negative nice level userspace 6648 * tasks back to 0: 6649 */ 6650 if (TASK_NICE(p) < 0 && p->mm) 6651 set_user_nice(p, 0); 6652 continue; 6653 } 6654 6655 raw_spin_lock(&p->pi_lock); 6656 rq = __task_rq_lock(p); 6657 6658 normalize_task(rq, p); 6659 6660 __task_rq_unlock(rq); 6661 raw_spin_unlock(&p->pi_lock); 6662 } while_each_thread(g, p); 6663 6664 read_unlock_irqrestore(&tasklist_lock, flags); 6665 } 6666 6667 #endif /* CONFIG_MAGIC_SYSRQ */ 6668 6669 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 6670 /* 6671 * These functions are only useful for the IA64 MCA handling, or kdb. 6672 * 6673 * They can only be called when the whole system has been 6674 * stopped - every CPU needs to be quiescent, and no scheduling 6675 * activity can take place. Using them for anything else would 6676 * be a serious bug, and as a result, they aren't even visible 6677 * under any other configuration. 6678 */ 6679 6680 /** 6681 * curr_task - return the current task for a given cpu. 6682 * @cpu: the processor in question. 6683 * 6684 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6685 * 6686 * Return: The current task for @cpu. 6687 */ 6688 struct task_struct *curr_task(int cpu) 6689 { 6690 return cpu_curr(cpu); 6691 } 6692 6693 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 6694 6695 #ifdef CONFIG_IA64 6696 /** 6697 * set_curr_task - set the current task for a given cpu. 6698 * @cpu: the processor in question. 6699 * @p: the task pointer to set. 6700 * 6701 * Description: This function must only be used when non-maskable interrupts 6702 * are serviced on a separate stack. It allows the architecture to switch the 6703 * notion of the current task on a cpu in a non-blocking manner. 
This function 6704 * must be called with all CPUs synchronized and interrupts disabled, and the 6705 * caller must save the original value of the current task (see 6706 * curr_task() above) and restore that value before reenabling interrupts and 6707 * re-starting the system. 6708 * 6709 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6710 */ 6711 void set_curr_task(int cpu, struct task_struct *p) 6712 { 6713 cpu_curr(cpu) = p; 6714 } 6715 6716 #endif 6717 6718 #ifdef CONFIG_CGROUP_SCHED 6719 /* task_group_lock serializes the addition/removal of task groups */ 6720 static DEFINE_SPINLOCK(task_group_lock); 6721 6722 static void free_sched_group(struct task_group *tg) 6723 { 6724 free_fair_sched_group(tg); 6725 free_rt_sched_group(tg); 6726 autogroup_free(tg); 6727 kfree(tg); 6728 } 6729 6730 /* allocate runqueue etc for a new task group */ 6731 struct task_group *sched_create_group(struct task_group *parent) 6732 { 6733 struct task_group *tg; 6734 6735 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 6736 if (!tg) 6737 return ERR_PTR(-ENOMEM); 6738 6739 if (!alloc_fair_sched_group(tg, parent)) 6740 goto err; 6741 6742 if (!alloc_rt_sched_group(tg, parent)) 6743 goto err; 6744 6745 return tg; 6746 6747 err: 6748 free_sched_group(tg); 6749 return ERR_PTR(-ENOMEM); 6750 } 6751 6752 void sched_online_group(struct task_group *tg, struct task_group *parent) 6753 { 6754 unsigned long flags; 6755 6756 spin_lock_irqsave(&task_group_lock, flags); 6757 list_add_rcu(&tg->list, &task_groups); 6758 6759 WARN_ON(!parent); /* root should already exist */ 6760 6761 tg->parent = parent; 6762 INIT_LIST_HEAD(&tg->children); 6763 list_add_rcu(&tg->siblings, &parent->children); 6764 spin_unlock_irqrestore(&task_group_lock, flags); 6765 } 6766 6767 /* rcu callback to free various structures associated with a task group */ 6768 static void free_sched_group_rcu(struct rcu_head *rhp) 6769 { 6770 /* now it should be safe to free those cfs_rqs */ 6771 free_sched_group(container_of(rhp, struct task_group, rcu)); 6772 } 6773 6774 /* Destroy runqueue etc associated with a task group */ 6775 void sched_destroy_group(struct task_group *tg) 6776 { 6777 /* wait for possible concurrent references to cfs_rqs to complete */ 6778 call_rcu(&tg->rcu, free_sched_group_rcu); 6779 } 6780 6781 void sched_offline_group(struct task_group *tg) 6782 { 6783 unsigned long flags; 6784 int i; 6785 6786 /* end participation in shares distribution */ 6787 for_each_possible_cpu(i) 6788 unregister_fair_sched_group(tg, i); 6789 6790 spin_lock_irqsave(&task_group_lock, flags); 6791 list_del_rcu(&tg->list); 6792 list_del_rcu(&tg->siblings); 6793 spin_unlock_irqrestore(&task_group_lock, flags); 6794 } 6795 6796 /* Change a task's runqueue when it moves between groups. 6797 * The caller of this function should have put the task in its new group 6798 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 6799 * reflect its new group.
6800 */ 6801 void sched_move_task(struct task_struct *tsk) 6802 { 6803 struct task_group *tg; 6804 int on_rq, running; 6805 unsigned long flags; 6806 struct rq *rq; 6807 6808 rq = task_rq_lock(tsk, &flags); 6809 6810 running = task_current(rq, tsk); 6811 on_rq = tsk->on_rq; 6812 6813 if (on_rq) 6814 dequeue_task(rq, tsk, 0); 6815 if (unlikely(running)) 6816 tsk->sched_class->put_prev_task(rq, tsk); 6817 6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6819 lockdep_is_held(&tsk->sighand->siglock)), 6820 struct task_group, css); 6821 tg = autogroup_task_group(tsk, tg); 6822 tsk->sched_task_group = tg; 6823 6824 #ifdef CONFIG_FAIR_GROUP_SCHED 6825 if (tsk->sched_class->task_move_group) 6826 tsk->sched_class->task_move_group(tsk, on_rq); 6827 else 6828 #endif 6829 set_task_rq(tsk, task_cpu(tsk)); 6830 6831 if (unlikely(running)) 6832 tsk->sched_class->set_curr_task(rq); 6833 if (on_rq) 6834 enqueue_task(rq, tsk, 0); 6835 6836 task_rq_unlock(rq, tsk, &flags); 6837 } 6838 #endif /* CONFIG_CGROUP_SCHED */ 6839 6840 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 6841 static unsigned long to_ratio(u64 period, u64 runtime) 6842 { 6843 if (runtime == RUNTIME_INF) 6844 return 1ULL << 20; 6845 6846 return div64_u64(runtime << 20, period); 6847 } 6848 #endif 6849 6850 #ifdef CONFIG_RT_GROUP_SCHED 6851 /* 6852 * Ensure that the real time constraints are schedulable. 6853 */ 6854 static DEFINE_MUTEX(rt_constraints_mutex); 6855 6856 /* Must be called with tasklist_lock held */ 6857 static inline int tg_has_rt_tasks(struct task_group *tg) 6858 { 6859 struct task_struct *g, *p; 6860 6861 do_each_thread(g, p) { 6862 if (rt_task(p) && task_rq(p)->rt.tg == tg) 6863 return 1; 6864 } while_each_thread(g, p); 6865 6866 return 0; 6867 } 6868 6869 struct rt_schedulable_data { 6870 struct task_group *tg; 6871 u64 rt_period; 6872 u64 rt_runtime; 6873 }; 6874 6875 static int tg_rt_schedulable(struct task_group *tg, void *data) 6876 { 6877 struct rt_schedulable_data *d = data; 6878 struct task_group *child; 6879 unsigned long total, sum = 0; 6880 u64 period, runtime; 6881 6882 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 6883 runtime = tg->rt_bandwidth.rt_runtime; 6884 6885 if (tg == d->tg) { 6886 period = d->rt_period; 6887 runtime = d->rt_runtime; 6888 } 6889 6890 /* 6891 * Cannot have more runtime than the period. 6892 */ 6893 if (runtime > period && runtime != RUNTIME_INF) 6894 return -EINVAL; 6895 6896 /* 6897 * Ensure we don't starve existing RT tasks. 6898 */ 6899 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 6900 return -EBUSY; 6901 6902 total = to_ratio(period, runtime); 6903 6904 /* 6905 * Nobody can have more than the global setting allows. 6906 */ 6907 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 6908 return -EINVAL; 6909 6910 /* 6911 * The sum of our children's runtime should not exceed our own. 
6912 */ 6913 list_for_each_entry_rcu(child, &tg->children, siblings) { 6914 period = ktime_to_ns(child->rt_bandwidth.rt_period); 6915 runtime = child->rt_bandwidth.rt_runtime; 6916 6917 if (child == d->tg) { 6918 period = d->rt_period; 6919 runtime = d->rt_runtime; 6920 } 6921 6922 sum += to_ratio(period, runtime); 6923 } 6924 6925 if (sum > total) 6926 return -EINVAL; 6927 6928 return 0; 6929 } 6930 6931 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 6932 { 6933 int ret; 6934 6935 struct rt_schedulable_data data = { 6936 .tg = tg, 6937 .rt_period = period, 6938 .rt_runtime = runtime, 6939 }; 6940 6941 rcu_read_lock(); 6942 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 6943 rcu_read_unlock(); 6944 6945 return ret; 6946 } 6947 6948 static int tg_set_rt_bandwidth(struct task_group *tg, 6949 u64 rt_period, u64 rt_runtime) 6950 { 6951 int i, err = 0; 6952 6953 mutex_lock(&rt_constraints_mutex); 6954 read_lock(&tasklist_lock); 6955 err = __rt_schedulable(tg, rt_period, rt_runtime); 6956 if (err) 6957 goto unlock; 6958 6959 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 6960 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 6961 tg->rt_bandwidth.rt_runtime = rt_runtime; 6962 6963 for_each_possible_cpu(i) { 6964 struct rt_rq *rt_rq = tg->rt_rq[i]; 6965 6966 raw_spin_lock(&rt_rq->rt_runtime_lock); 6967 rt_rq->rt_runtime = rt_runtime; 6968 raw_spin_unlock(&rt_rq->rt_runtime_lock); 6969 } 6970 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 6971 unlock: 6972 read_unlock(&tasklist_lock); 6973 mutex_unlock(&rt_constraints_mutex); 6974 6975 return err; 6976 } 6977 6978 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 6979 { 6980 u64 rt_runtime, rt_period; 6981 6982 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 6983 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 6984 if (rt_runtime_us < 0) 6985 rt_runtime = RUNTIME_INF; 6986 6987 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 6988 } 6989 6990 static long sched_group_rt_runtime(struct task_group *tg) 6991 { 6992 u64 rt_runtime_us; 6993 6994 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 6995 return -1; 6996 6997 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 6998 do_div(rt_runtime_us, NSEC_PER_USEC); 6999 return rt_runtime_us; 7000 } 7001 7002 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7003 { 7004 u64 rt_runtime, rt_period; 7005 7006 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7007 rt_runtime = tg->rt_bandwidth.rt_runtime; 7008 7009 if (rt_period == 0) 7010 return -EINVAL; 7011 7012 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7013 } 7014 7015 static long sched_group_rt_period(struct task_group *tg) 7016 { 7017 u64 rt_period_us; 7018 7019 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7020 do_div(rt_period_us, NSEC_PER_USEC); 7021 return rt_period_us; 7022 } 7023 7024 static int sched_rt_global_constraints(void) 7025 { 7026 u64 runtime, period; 7027 int ret = 0; 7028 7029 if (sysctl_sched_rt_period <= 0) 7030 return -EINVAL; 7031 7032 runtime = global_rt_runtime(); 7033 period = global_rt_period(); 7034 7035 /* 7036 * Sanity check on the sysctl variables. 
7037 */ 7038 if (runtime > period && runtime != RUNTIME_INF) 7039 return -EINVAL; 7040 7041 mutex_lock(&rt_constraints_mutex); 7042 read_lock(&tasklist_lock); 7043 ret = __rt_schedulable(NULL, 0, 0); 7044 read_unlock(&tasklist_lock); 7045 mutex_unlock(&rt_constraints_mutex); 7046 7047 return ret; 7048 } 7049 7050 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7051 { 7052 /* Don't accept realtime tasks when there is no way for them to run */ 7053 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7054 return 0; 7055 7056 return 1; 7057 } 7058 7059 #else /* !CONFIG_RT_GROUP_SCHED */ 7060 static int sched_rt_global_constraints(void) 7061 { 7062 unsigned long flags; 7063 int i; 7064 7065 if (sysctl_sched_rt_period <= 0) 7066 return -EINVAL; 7067 7068 /* 7069 * There are always some RT tasks in the root group 7070 * -- migration, kstopmachine etc. 7071 */ 7072 if (sysctl_sched_rt_runtime == 0) 7073 return -EBUSY; 7074 7075 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7076 for_each_possible_cpu(i) { 7077 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7078 7079 raw_spin_lock(&rt_rq->rt_runtime_lock); 7080 rt_rq->rt_runtime = global_rt_runtime(); 7081 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7082 } 7083 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7084 7085 return 0; 7086 } 7087 #endif /* CONFIG_RT_GROUP_SCHED */ 7088 7089 int sched_rr_handler(struct ctl_table *table, int write, 7090 void __user *buffer, size_t *lenp, 7091 loff_t *ppos) 7092 { 7093 int ret; 7094 static DEFINE_MUTEX(mutex); 7095 7096 mutex_lock(&mutex); 7097 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7098 /* make sure that internally we keep jiffies */ 7099 /* also, writing zero resets timeslice to default */ 7100 if (!ret && write) { 7101 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7102 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7103 } 7104 mutex_unlock(&mutex); 7105 return ret; 7106 } 7107 7108 int sched_rt_handler(struct ctl_table *table, int write, 7109 void __user *buffer, size_t *lenp, 7110 loff_t *ppos) 7111 { 7112 int ret; 7113 int old_period, old_runtime; 7114 static DEFINE_MUTEX(mutex); 7115 7116 mutex_lock(&mutex); 7117 old_period = sysctl_sched_rt_period; 7118 old_runtime = sysctl_sched_rt_runtime; 7119 7120 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7121 7122 if (!ret && write) { 7123 ret = sched_rt_global_constraints(); 7124 if (ret) { 7125 sysctl_sched_rt_period = old_period; 7126 sysctl_sched_rt_runtime = old_runtime; 7127 } else { 7128 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7129 def_rt_bandwidth.rt_period = 7130 ns_to_ktime(global_rt_period()); 7131 } 7132 } 7133 mutex_unlock(&mutex); 7134 7135 return ret; 7136 } 7137 7138 #ifdef CONFIG_CGROUP_SCHED 7139 7140 /* return corresponding task_group object of a cgroup */ 7141 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7142 { 7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7144 struct task_group, css); 7145 } 7146 7147 static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7148 { 7149 struct task_group *tg, *parent; 7150 7151 if (!cgrp->parent) { 7152 /* This is early initialization for the top cgroup */ 7153 return &root_task_group.css; 7154 } 7155 7156 parent = cgroup_tg(cgrp->parent); 7157 tg = sched_create_group(parent); 7158 if (IS_ERR(tg)) 7159 return ERR_PTR(-ENOMEM); 7160 7161 return &tg->css; 7162 } 7163 7164 static int cpu_cgroup_css_online(struct cgroup *cgrp) 7165 { 7166 struct task_group *tg = cgroup_tg(cgrp); 7167 struct task_group *parent; 7168 7169 if (!cgrp->parent) 7170 return 0; 7171 7172 parent = cgroup_tg(cgrp->parent); 7173 sched_online_group(tg, parent); 7174 return 0; 7175 } 7176 7177 static void cpu_cgroup_css_free(struct cgroup *cgrp) 7178 { 7179 struct task_group *tg = cgroup_tg(cgrp); 7180 7181 sched_destroy_group(tg); 7182 } 7183 7184 static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7185 { 7186 struct task_group *tg = cgroup_tg(cgrp); 7187 7188 sched_offline_group(tg); 7189 } 7190 7191 static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7192 struct cgroup_taskset *tset) 7193 { 7194 struct task_struct *task; 7195 7196 cgroup_taskset_for_each(task, cgrp, tset) { 7197 #ifdef CONFIG_RT_GROUP_SCHED 7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7199 return -EINVAL; 7200 #else 7201 /* We don't support RT-tasks being in separate groups */ 7202 if (task->sched_class != &fair_sched_class) 7203 return -EINVAL; 7204 #endif 7205 } 7206 return 0; 7207 } 7208 7209 static void cpu_cgroup_attach(struct cgroup *cgrp, 7210 struct cgroup_taskset *tset) 7211 { 7212 struct task_struct *task; 7213 7214 cgroup_taskset_for_each(task, cgrp, tset) 7215 sched_move_task(task); 7216 } 7217 7218 static void 7219 cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7220 struct task_struct *task) 7221 { 7222 /* 7223 * cgroup_exit() is called in the copy_process() failure path. 7224 * Ignore this case since the task hasn't run yet; this avoids 7225 * trying to poke a half-freed task state from generic code.
7226 */ 7227 if (!(task->flags & PF_EXITING)) 7228 return; 7229 7230 sched_move_task(task); 7231 } 7232 7233 #ifdef CONFIG_FAIR_GROUP_SCHED 7234 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7235 u64 shareval) 7236 { 7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7238 } 7239 7240 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7241 { 7242 struct task_group *tg = cgroup_tg(cgrp); 7243 7244 return (u64) scale_load_down(tg->shares); 7245 } 7246 7247 #ifdef CONFIG_CFS_BANDWIDTH 7248 static DEFINE_MUTEX(cfs_constraints_mutex); 7249 7250 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7251 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7252 7253 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7254 7255 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7256 { 7257 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7258 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7259 7260 if (tg == &root_task_group) 7261 return -EINVAL; 7262 7263 /* 7264 * Ensure we have at least some amount of bandwidth every period. This is 7265 * to prevent reaching a state of large arrears when throttled via 7266 * entity_tick() resulting in prolonged exit starvation. 7267 */ 7268 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7269 return -EINVAL; 7270 7271 /* 7272 * Likewise, bound things on the other side by preventing insane quota 7273 * periods. This also allows us to normalize in computing quota 7274 * feasibility. 7275 */ 7276 if (period > max_cfs_quota_period) 7277 return -EINVAL; 7278 7279 mutex_lock(&cfs_constraints_mutex); 7280 ret = __cfs_schedulable(tg, period, quota); 7281 if (ret) 7282 goto out_unlock; 7283 7284 runtime_enabled = quota != RUNTIME_INF; 7285 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7286 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7287 raw_spin_lock_irq(&cfs_b->lock); 7288 cfs_b->period = ns_to_ktime(period); 7289 cfs_b->quota = quota; 7290 7291 __refill_cfs_bandwidth_runtime(cfs_b); 7292 /* restart the period timer (if active) to handle new period expiry */ 7293 if (runtime_enabled && cfs_b->timer_active) { 7294 /* force a reprogram */ 7295 cfs_b->timer_active = 0; 7296 __start_cfs_bandwidth(cfs_b); 7297 } 7298 raw_spin_unlock_irq(&cfs_b->lock); 7299 7300 for_each_possible_cpu(i) { 7301 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7302 struct rq *rq = cfs_rq->rq; 7303 7304 raw_spin_lock_irq(&rq->lock); 7305 cfs_rq->runtime_enabled = runtime_enabled; 7306 cfs_rq->runtime_remaining = 0; 7307 7308 if (cfs_rq->throttled) 7309 unthrottle_cfs_rq(cfs_rq); 7310 raw_spin_unlock_irq(&rq->lock); 7311 } 7312 out_unlock: 7313 mutex_unlock(&cfs_constraints_mutex); 7314 7315 return ret; 7316 } 7317 7318 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7319 { 7320 u64 quota, period; 7321 7322 period = ktime_to_ns(tg->cfs_bandwidth.period); 7323 if (cfs_quota_us < 0) 7324 quota = RUNTIME_INF; 7325 else 7326 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7327 7328 return tg_set_cfs_bandwidth(tg, period, quota); 7329 } 7330 7331 long tg_get_cfs_quota(struct task_group *tg) 7332 { 7333 u64 quota_us; 7334 7335 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7336 return -1; 7337 7338 quota_us = tg->cfs_bandwidth.quota; 7339 do_div(quota_us, NSEC_PER_USEC); 7340 7341 return quota_us; 7342 } 7343 7344 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7345 { 7346 u64 quota, period; 7347
7348 period = (u64)cfs_period_us * NSEC_PER_USEC; 7349 quota = tg->cfs_bandwidth.quota; 7350 7351 return tg_set_cfs_bandwidth(tg, period, quota); 7352 } 7353 7354 long tg_get_cfs_period(struct task_group *tg) 7355 { 7356 u64 cfs_period_us; 7357 7358 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7359 do_div(cfs_period_us, NSEC_PER_USEC); 7360 7361 return cfs_period_us; 7362 } 7363 7364 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7365 { 7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7367 } 7368 7369 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7370 s64 cfs_quota_us) 7371 { 7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7373 } 7374 7375 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7376 { 7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7378 } 7379 7380 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7381 u64 cfs_period_us) 7382 { 7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7384 } 7385 7386 struct cfs_schedulable_data { 7387 struct task_group *tg; 7388 u64 period, quota; 7389 }; 7390 7391 /* 7392 * normalize group quota/period to be quota/max_period 7393 * note: units are usecs 7394 */ 7395 static u64 normalize_cfs_quota(struct task_group *tg, 7396 struct cfs_schedulable_data *d) 7397 { 7398 u64 quota, period; 7399 7400 if (tg == d->tg) { 7401 period = d->period; 7402 quota = d->quota; 7403 } else { 7404 period = tg_get_cfs_period(tg); 7405 quota = tg_get_cfs_quota(tg); 7406 } 7407 7408 /* note: these should typically be equivalent */ 7409 if (quota == RUNTIME_INF || quota == -1) 7410 return RUNTIME_INF; 7411 7412 return to_ratio(period, quota); 7413 } 7414 7415 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7416 { 7417 struct cfs_schedulable_data *d = data; 7418 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7419 s64 quota = 0, parent_quota = -1; 7420 7421 if (!tg->parent) { 7422 quota = RUNTIME_INF; 7423 } else { 7424 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7425 7426 quota = normalize_cfs_quota(tg, d); 7427 parent_quota = parent_b->hierarchal_quota; 7428 7429 /* 7430 * ensure max(child_quota) <= parent_quota, inherit when no 7431 * limit is set 7432 */ 7433 if (quota == RUNTIME_INF) 7434 quota = parent_quota; 7435 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7436 return -EINVAL; 7437 } 7438 cfs_b->hierarchal_quota = quota; 7439 7440 return 0; 7441 } 7442 7443 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7444 { 7445 int ret; 7446 struct cfs_schedulable_data data = { 7447 .tg = tg, 7448 .period = period, 7449 .quota = quota, 7450 }; 7451 7452 if (quota != RUNTIME_INF) { 7453 do_div(data.period, NSEC_PER_USEC); 7454 do_div(data.quota, NSEC_PER_USEC); 7455 } 7456 7457 rcu_read_lock(); 7458 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7459 rcu_read_unlock(); 7460 7461 return ret; 7462 } 7463 7464 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7465 struct cgroup_map_cb *cb) 7466 { 7467 struct task_group *tg = cgroup_tg(cgrp); 7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7469 7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7471 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7472 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7473 7474 return 0; 7475 } 7476 #endif /* CONFIG_CFS_BANDWIDTH */ 7477 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7478 7479 #ifdef CONFIG_RT_GROUP_SCHED 7480 static 
int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7481 s64 val) 7482 { 7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7484 } 7485 7486 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7487 { 7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7489 } 7490 7491 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7492 u64 rt_period_us) 7493 { 7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7495 } 7496 7497 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7498 { 7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7500 } 7501 #endif /* CONFIG_RT_GROUP_SCHED */ 7502 7503 static struct cftype cpu_files[] = { 7504 #ifdef CONFIG_FAIR_GROUP_SCHED 7505 { 7506 .name = "shares", 7507 .read_u64 = cpu_shares_read_u64, 7508 .write_u64 = cpu_shares_write_u64, 7509 }, 7510 #endif 7511 #ifdef CONFIG_CFS_BANDWIDTH 7512 { 7513 .name = "cfs_quota_us", 7514 .read_s64 = cpu_cfs_quota_read_s64, 7515 .write_s64 = cpu_cfs_quota_write_s64, 7516 }, 7517 { 7518 .name = "cfs_period_us", 7519 .read_u64 = cpu_cfs_period_read_u64, 7520 .write_u64 = cpu_cfs_period_write_u64, 7521 }, 7522 { 7523 .name = "stat", 7524 .read_map = cpu_stats_show, 7525 }, 7526 #endif 7527 #ifdef CONFIG_RT_GROUP_SCHED 7528 { 7529 .name = "rt_runtime_us", 7530 .read_s64 = cpu_rt_runtime_read, 7531 .write_s64 = cpu_rt_runtime_write, 7532 }, 7533 { 7534 .name = "rt_period_us", 7535 .read_u64 = cpu_rt_period_read_uint, 7536 .write_u64 = cpu_rt_period_write_uint, 7537 }, 7538 #endif 7539 { } /* terminate */ 7540 }; 7541 7542 struct cgroup_subsys cpu_cgroup_subsys = { 7543 .name = "cpu", 7544 .css_alloc = cpu_cgroup_css_alloc, 7545 .css_free = cpu_cgroup_css_free, 7546 .css_online = cpu_cgroup_css_online, 7547 .css_offline = cpu_cgroup_css_offline, 7548 .can_attach = cpu_cgroup_can_attach, 7549 .attach = cpu_cgroup_attach, 7550 .exit = cpu_cgroup_exit, 7551 .subsys_id = cpu_cgroup_subsys_id, 7552 .base_cftypes = cpu_files, 7553 .early_init = 1, 7554 }; 7555 7556 #endif /* CONFIG_CGROUP_SCHED */ 7557 7558 void dump_cpu_task(int cpu) 7559 { 7560 pr_info("Task dump for CPU %d:\n", cpu); 7561 sched_show_task(cpu_curr(cpu)); 7562 } 7563
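
/*
 * Illustrative sketch only, compiled out via #if 0 and not part of the
 * scheduler proper: a minimal user-space program showing how the
 * cpu.cfs_quota_us and cpu.cfs_period_us control files created from
 * cpu_files[] above might be written. It assumes the legacy cpu cgroup
 * controller is mounted at /sys/fs/cgroup/cpu and that a group named "demo"
 * already exists; both the mount point and the group name are assumptions,
 * not something this file defines. The microsecond values written here are
 * handled by tg_set_cfs_period()/tg_set_cfs_quota(), which scale them by
 * NSEC_PER_USEC and pass them to tg_set_cfs_bandwidth().
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

/* Write a single decimal value followed by a newline to a cgroup file. */
static int write_cgroup_value(const char *path, long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fprintf(f, "%lld\n", val) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Hypothetical paths; adjust to the actual cgroup mount and group. */
	const char *period = "/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us";
	const char *quota  = "/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us";

	/* Allow 50ms of runtime per 100ms period, i.e. half of one CPU. */
	if (write_cgroup_value(period, 100000) ||
	    write_cgroup_value(quota, 50000)) {
		perror("cgroup write");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
#endif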