/*
 * linux/kernel/time/tick-sched.c
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 * No idle tick implementation for low and high resolution timers
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 *
 * Distribute under GPLv2.
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per cpu nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
 * The time, when the last jiffy update happened. Protected by jiffies_lock.
 */
static ktime_t last_jiffies_update;

/*
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 0;
	ktime_t delta;

	/*
	 * Do a quick check without holding jiffies_lock:
	 */
	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 < tick_period.tv64)
		return;

	/* Reevaluate with jiffies_lock held */
	write_seqlock(&jiffies_lock);

	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 >= tick_period.tv64) {

		delta = ktime_sub(delta, tick_period);
		last_jiffies_update = ktime_add(last_jiffies_update,
						tick_period);

		/* Slow path for long timeouts */
		if (unlikely(delta.tv64 >= tick_period.tv64)) {
			s64 incr = ktime_to_ns(tick_period);

			ticks = ktime_divns(delta, incr);

			last_jiffies_update = ktime_add_ns(last_jiffies_update,
							   incr * ticks);
		}
		do_timer(++ticks);

		/* Keep the tick_next_period variable up to date */
		tick_next_period = ktime_add(last_jiffies_update, tick_period);
	} else {
		write_sequnlock(&jiffies_lock);
		return;
	}
	write_sequnlock(&jiffies_lock);
	update_wall_time();
}

/*
 * Initialize and return the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	write_seqlock(&jiffies_lock);
	/* Did we start the jiffies update yet ? */
	if (last_jiffies_update.tv64 == 0)
		last_jiffies_update = tick_next_period;
	period = last_jiffies_update;
	write_sequnlock(&jiffies_lock);
	return period;
}


static void tick_sched_do_timer(ktime_t now)
{
	int cpu = smp_processor_id();

#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * Check if the do_timer duty was dropped. We don't care about
	 * concurrency: This happens only when the cpu in charge went
	 * into a long sleep. If two cpus happen to assign themselves to
	 * this duty, then the jiffies update is still serialized by
	 * jiffies_lock.
	 */
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
	    && !tick_nohz_full_cpu(cpu))
		tick_do_timer_cpu = cpu;
#endif

	/* Check, if the jiffies need an update */
	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on complete idle SMP systems while
	 * waiting on the login prompt. We also increment the "start of
	 * idle" jiffy stamp so the idle accounting adjustment we do
	 * when we go busy again does not account too many ticks.
	 */
	if (ts->tick_stopped) {
		touch_softlockup_watchdog_sched();
		if (is_idle_task(current))
			ts->idle_jiffies++;
	}
#endif
	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);
}
#endif

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static unsigned long tick_dep_mask;

static void trace_tick_dependency(unsigned long dep)
{
	if (dep & TICK_DEP_MASK_POSIX_TIMER) {
		trace_tick_stop(0, "posix timers running\n");
		return;
	}

	if (dep & TICK_DEP_MASK_PERF_EVENTS) {
		trace_tick_stop(0, "perf events running\n");
		return;
	}

	if (dep & TICK_DEP_MASK_SCHED) {
		trace_tick_stop(0, "more than 1 task in runqueue\n");
		return;
	}

	if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
		trace_tick_stop(0, "unstable sched clock\n");
}

static bool can_stop_full_tick(struct tick_sched *ts)
{
	WARN_ON_ONCE(!irqs_disabled());

	if (tick_dep_mask) {
		trace_tick_dependency(tick_dep_mask);
		return false;
	}

	if (ts->tick_dep_mask) {
		trace_tick_dependency(ts->tick_dep_mask);
		return false;
	}

	if (current->tick_dep_mask) {
		trace_tick_dependency(current->tick_dep_mask);
		return false;
	}

	if (current->signal->tick_dep_mask) {
		trace_tick_dependency(current->signal->tick_dep_mask);
		return false;
	}

	if (!sched_can_stop_tick()) {
		trace_tick_stop(0, "more than 1 task in runqueue\n");
		return false;
	}

	if (!posix_cpu_timers_can_stop_tick(current)) {
		trace_tick_stop(0, "posix timers running\n");
		return false;
	}

	if (!perf_event_can_stop_tick()) {
		trace_tick_stop(0, "perf events running\n");
		return false;
	}

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
	/*
	 * sched_clock_tick() needs us?
	 *
	 * TODO: kick full dynticks CPUs when
	 * sched_clock_stable is set.
	 */
	if (!sched_clock_stable()) {
		trace_tick_stop(0, "unstable sched clock\n");
		/*
		 * Don't allow the user to think they can get
		 * full NO_HZ with this machine.
		 */
		WARN_ONCE(tick_nohz_full_running,
			  "NO_HZ FULL will not work with unstable sched clock");
		return false;
	}
#endif

	return true;
}

static void nohz_full_kick_func(struct irq_work *work)
{
	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
	.func = nohz_full_kick_func,
};

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
 */
void tick_nohz_full_kick(void)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
	if (!tick_nohz_full_cpu(cpu))
		return;

	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

/*
 * Kick all full dynticks CPUs in order to force these to re-evaluate
 * their dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_all(void)
{
	int cpu;

	if (!tick_nohz_full_running)
		return;

	preempt_disable();
	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

static void tick_nohz_dep_set_all(unsigned long *dep,
				  enum tick_dep_bits bit)
{
	unsigned long prev;

	prev = fetch_or(dep, BIT_MASK(bit));
	if (!prev)
		tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * by unstable clock.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
	tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
	clear_bit(bit, &tick_dep_mask);
}

/*
 * Set per-CPU tick dependency. Used by scheduler and perf events in order to
 * manage events throttling.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
	unsigned long prev;
	struct tick_sched *ts;

	ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
	if (!prev) {
		preempt_disable();
		/* Perf needs local kick that is NMI safe */
		if (cpu == smp_processor_id()) {
			tick_nohz_full_kick();
		} else {
			/* Remote irq work not NMI-safe */
			if (!WARN_ON_ONCE(in_nmi()))
				tick_nohz_full_kick_cpu(cpu);
		}
		preempt_enable();
	}
}

void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	clear_bit(bit, &ts->tick_dep_mask);
}

/*
 * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
 * per task timers.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	/*
	 * We could optimize this with just kicking the target running the task
	 * if that noise matters for nohz full users.
	 */
	tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}

void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	clear_bit(bit, &tsk->tick_dep_mask);
}

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
 * per process timers.
 */
void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
	tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
}

void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
	clear_bit(bit, &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
 * perf events, posix cpu timers, ...
 */
void __tick_nohz_task_switch(void)
{
	unsigned long flags;
	struct tick_sched *ts;

	local_irq_save(flags);

	if (!tick_nohz_full_cpu(smp_processor_id()))
		goto out;

	ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->tick_stopped) {
		if (current->tick_dep_mask || current->signal->tick_dep_mask)
			tick_nohz_full_kick();
	}
out:
	local_irq_restore(flags);
}

/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
		pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
		free_bootmem_cpumask_var(tick_nohz_full_mask);
		return 1;
	}
	tick_nohz_full_running = true;

	return 1;
}
__setup("nohz_full=", tick_nohz_full_setup);

static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		/*
		 * The boot CPU handles housekeeping duty (unbound timers,
		 * workqueues, timekeeping, ...) on behalf of full dynticks
		 * CPUs. It must remain online when nohz full is enabled.
		 */
		if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
			return NOTIFY_BAD;
		break;
	}
	return NOTIFY_OK;
}

static int tick_nohz_init_all(void)
{
	int err = -1;

#ifdef CONFIG_NO_HZ_FULL_ALL
	if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
		WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
		return err;
	}
	err = 0;
	cpumask_setall(tick_nohz_full_mask);
	tick_nohz_full_running = true;
#endif
	return err;
}

void __init tick_nohz_init(void)
{
	int cpu;

	if (!tick_nohz_full_running) {
		if (tick_nohz_init_all() < 0)
			return;
	}

	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
		cpumask_clear(tick_nohz_full_mask);
		tick_nohz_full_running = false;
		return;
	}

	/*
	 * Full dynticks uses irq work to drive the tick rescheduling on safe
	 * locking contexts. But then we need irq work to raise its own
	 * interrupts to avoid circular dependency on the tick
	 */
	if (!arch_irq_work_has_interrupt()) {
		pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
			   "support irq work self-IPIs\n");
		cpumask_clear(tick_nohz_full_mask);
		cpumask_copy(housekeeping_mask, cpu_possible_mask);
		tick_nohz_full_running = false;
		return;
	}

	cpu = smp_processor_id();

	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
	}

	cpumask_andnot(housekeeping_mask,
		       cpu_possible_mask, tick_nohz_full_mask);

	for_each_cpu(cpu, tick_nohz_full_mask)
		context_tracking_cpu_set(cpu);

	cpu_notifier(tick_nohz_cpu_down_callback, 0);
	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
		cpumask_pr_args(tick_nohz_full_mask));

	/*
	 * We need at least one CPU to handle housekeeping work such
	 * as timekeeping, unbound timers, workqueues, ...
	 */
	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled ?
 */
int tick_nohz_enabled __read_mostly = 1;
unsigned long tick_nohz_active __read_mostly;
/*
 * Enable / Disable tickless mode
 */
static int __init setup_tick_nohz(char *str)
{
	if (!strcmp(str, "off"))
		tick_nohz_enabled = 0;
	else if (!strcmp(str, "on"))
		tick_nohz_enabled = 1;
	else
		return 0;
	return 1;
}

__setup("nohz=", setup_tick_nohz);

int tick_nohz_tick_stopped(void)
{
	return __this_cpu_read(tick_cpu_sched.tick_stopped);
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any cpu, as we don't know whether the
 * cpu, which has the update task assigned is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
	unsigned long flags;

	__this_cpu_write(tick_cpu_sched.idle_waketime, now);

	local_irq_save(flags);
	tick_do_update_jiffies64(now);
	local_irq_restore(flags);

	touch_softlockup_watchdog_sched();
}

/*
 * Updates the per cpu time idle statistics counters
 */
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
	ktime_t delta;

	if (ts->idle_active) {
		delta = ktime_sub(now, ts->idle_entrytime);
		if (nr_iowait_cpu(cpu) > 0)
			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
		else
			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		ts->idle_entrytime = now;
	}

	if (last_update_time)
		*last_update_time = ktime_to_us(now);

}

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
	ts->idle_active = 0;

	sched_clock_idle_wakeup_event(0);
}

static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
	ktime_t now = ktime_get();

	ts->idle_entrytime = now;
	ts->idle_active = 1;
	sched_clock_idle_sleep_event();
	return now;
}

/**
 * get_cpu_idle_time_us - get the total idle time of a cpu
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * This function returns -1 if NOHZ is not enabled.
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t now, idle;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time) {
		update_ts_time_stats(cpu, ts, now, last_update_time);
		idle = ts->idle_sleeptime;
	} else {
		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			idle = ktime_add(ts->idle_sleeptime, delta);
		} else {
			idle = ts->idle_sleeptime;
		}
	}

	return ktime_to_us(idle);

}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a cpu
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * This function returns -1 if NOHZ is not enabled.
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t now, iowait;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time) {
		update_ts_time_stats(cpu, ts, now, last_update_time);
		iowait = ts->iowait_sleeptime;
	} else {
		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			iowait = ktime_add(ts->iowait_sleeptime, delta);
		} else {
			iowait = ts->iowait_sleeptime;
		}
	}

	return ktime_to_us(iowait);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
	hrtimer_cancel(&ts->sched_timer);
	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);

	/* Forward the time to expire in the future */
	hrtimer_forward(&ts->sched_timer, now, tick_period);

	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
	else
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
					 ktime_t now, int cpu)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
	unsigned long seq, basejiff;
	ktime_t tick;

	/* Read jiffies and the time when jiffies were updated last */
	do {
		seq = read_seqbegin(&jiffies_lock);
		basemono = last_jiffies_update.tv64;
		basejiff = jiffies;
	} while (read_seqretry(&jiffies_lock, seq));
	ts->last_jiffies = basejiff;

	if (rcu_needs_cpu(basemono, &next_rcu) ||
	    arch_needs_cpu() || irq_work_needs_cpu()) {
		next_tick = basemono + TICK_NSEC;
	} else {
		/*
		 * Get the next pending timer. If high resolution
		 * timers are enabled this only takes the timer wheel
		 * timers into account. If high resolution timers are
		 * disabled this also looks at the next expiring
		 * hrtimer.
		 */
		next_tmr = get_next_timer_interrupt(basejiff, basemono);
		ts->next_timer = next_tmr;
		/* Take the next rcu event into account */
		next_tick = next_rcu < next_tmr ?
			    next_rcu : next_tmr;
	}

	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
	 */
	delta = next_tick - basemono;
	if (delta <= (u64)TICK_NSEC) {
		tick.tv64 = 0;
		/*
		 * We've not stopped the tick yet, and there's a timer in the
		 * next period, so no point in stopping it either, bail.
		 */
		if (!ts->tick_stopped)
			goto out;

		/*
		 * If, OTOH, we did stop it, but there's a pending (expired)
		 * timer reprogram the timer hardware to fire now.
		 *
		 * We will not restart the tick proper, just prod the timer
		 * hardware into firing an interrupt to process the pending
		 * timers. Just like tick_irq_exit() will not restart the tick
		 * for 'normal' interrupts.
		 *
		 * Only once we exit the idle loop will we re-enable the tick,
		 * see tick_nohz_idle_exit().
		 */
		if (delta == 0) {
			tick_nohz_restart(ts, now);
			goto out;
		}
	}

	/*
	 * If this cpu is the one which updates jiffies, then give up
	 * the assignment and let it be taken by the cpu which runs
	 * the tick timer next, which might be this cpu as well. If we
	 * don't drop this here the jiffies might be stale and
	 * do_timer() never invoked. Keep track of the fact that it
	 * was the one which had the do_timer() duty last. If this cpu
	 * is the one which had the do_timer() duty last, we limit the
	 * sleep time to the timekeeping max_deferment value.
	 * Otherwise we can sleep as long as we want.
	 */
	delta = timekeeping_max_deferment();
	if (cpu == tick_do_timer_cpu) {
		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
		ts->do_timer_last = 1;
	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
		delta = KTIME_MAX;
		ts->do_timer_last = 0;
	} else if (!ts->do_timer_last) {
		delta = KTIME_MAX;
	}

#ifdef CONFIG_NO_HZ_FULL
	/* Limit the tick delta to the maximum scheduler deferment */
	if (!ts->inidle)
		delta = min(delta, scheduler_tick_max_deferment());
#endif

	/* Calculate the next expiry time */
	if (delta < (KTIME_MAX - basemono))
		expires = basemono + delta;
	else
		expires = KTIME_MAX;

	expires = min_t(u64, expires, next_tick);
	tick.tv64 = expires;

	/* Skip reprogram of event if it's not changed */
	if (ts->tick_stopped && (expires == dev->next_event.tv64))
		goto out;

	/*
	 * nohz_stop_sched_tick can be called several times before
	 * the nohz_restart_sched_tick is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the
	 * first call we save the current tick time, so we can restart
	 * the scheduler tick in nohz_restart_sched_tick.
	 */
	if (!ts->tick_stopped) {
		nohz_balance_enter_idle(cpu);
		calc_load_enter_idle();

		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
		ts->tick_stopped = 1;
		trace_tick_stop(1, " ");
	}

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
	 * the tick timer.
	 */
	if (unlikely(expires == KTIME_MAX)) {
		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
			hrtimer_cancel(&ts->sched_timer);
		goto out;
	}

	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
	else
		tick_program_event(tick, 1);
out:
	/* Update the estimated sleep length */
	ts->sleep_length = ktime_sub(dev->next_event, now);
	return tick;
}

static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
{
	/* Update jiffies first */
	tick_do_update_jiffies64(now);
	update_cpu_load_nohz(active);

	calc_load_exit_idle();
	touch_softlockup_watchdog_sched();
	/*
	 * Cancel the scheduled timer and restore the tick
	 */
	ts->tick_stopped = 0;
	ts->idle_exittime = now;

	tick_nohz_restart(ts, now);
}

static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (!tick_nohz_full_cpu(cpu))
		return;

	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
		return;

	if (can_stop_full_tick(ts))
		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
	else if (ts->tick_stopped)
		tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
#endif
}

static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
	/*
	 * If this cpu is offline and it is the one which updates
	 * jiffies, then give up the assignment and let it be taken by
	 * the cpu which runs the tick timer next. If we don't drop
	 * this here the jiffies might be stale and do_timer() never
	 * invoked.
	 */
	if (unlikely(!cpu_online(cpu))) {
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
		return false;
	}

	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
		ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ };
		return false;
	}

	if (need_resched())
		return false;

	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
		static int ratelimit;

		if (ratelimit < 10 &&
		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
			pr_warn("NOHZ: local_softirq_pending %02x\n",
				(unsigned int) local_softirq_pending());
			ratelimit++;
		}
		return false;
	}

	if (tick_nohz_full_enabled()) {
		/*
		 * Keep the tick alive to guarantee timekeeping progression
		 * if there are full dynticks CPUs around
		 */
		if (tick_do_timer_cpu == cpu)
			return false;
		/*
		 * Boot safety: make sure the timekeeping duty has been
		 * assigned before entering dyntick-idle mode.
		 */
		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
			return false;
	}

	return true;
}

static void __tick_nohz_idle_enter(struct tick_sched *ts)
{
	ktime_t now, expires;
	int cpu = smp_processor_id();

	now = tick_nohz_start_idle(ts);

	if (can_stop_idle_tick(cpu, ts)) {
		int was_stopped = ts->tick_stopped;

		ts->idle_calls++;

		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
		if (expires.tv64 > 0LL) {
			ts->idle_sleeps++;
			ts->idle_expires = expires;
		}

		if (!was_stopped && ts->tick_stopped)
			ts->idle_jiffies = ts->last_jiffies;
	}
}

/**
 * tick_nohz_idle_enter - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick.
 * Called when we start the idle loop.
 *
 * The arch is responsible for calling:
 *
 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 *  to sleep.
 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	WARN_ON_ONCE(irqs_disabled());

	/*
	 * Update the idle state in the scheduler domain hierarchy
	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
	 * State will be updated to busy during the first busy tick after
	 * exiting idle.
	 */
	set_cpu_sd_state_idle();

	local_irq_disable();

	ts = this_cpu_ptr(&tick_cpu_sched);
	ts->inidle = 1;
	__tick_nohz_idle_enter(ts);

	local_irq_enable();
}

/**
 * tick_nohz_irq_exit - update next tick event from interrupt exit
 *
 * When an interrupt fires while we are idle and it doesn't cause
 * a reschedule, it may still add, modify or delete a timer, enqueue
 * an RCU callback, etc...
 * So we need to re-calculate and reprogram the next tick event.
 */
void tick_nohz_irq_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->inidle)
		__tick_nohz_idle_enter(ts);
	else
		tick_nohz_full_update_tick(ts);
}

/**
 * tick_nohz_get_sleep_length - return the length of the current sleep
 *
 * Called from power state control code with interrupts disabled
 */
ktime_t tick_nohz_get_sleep_length(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	return ts->sleep_length;
}

static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	unsigned long ticks;

	if (vtime_accounting_cpu_enabled())
		return;
	/*
	 * We stopped the tick in idle. update_process_times() would miss the
	 * time we slept, as it only does a one-tick accounting.
	 * Enforce that this is accounted to idle !
	 */
	ticks = jiffies - ts->idle_jiffies;
	/*
	 * We might be one off. Do not randomly account a huge number of ticks!
	 */
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
#endif
}

/**
 * tick_nohz_idle_exit - restart the idle tick from the idle task
 *
 * Restart the idle tick when the CPU is woken up from idle
 * This also exits the RCU extended quiescent state. The CPU
 * can use RCU again after this function is called.
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	local_irq_disable();

	WARN_ON_ONCE(!ts->inidle);

	ts->inidle = 0;

	if (ts->idle_active || ts->tick_stopped)
		now = ktime_get();

	if (ts->idle_active)
		tick_nohz_stop_idle(ts, now);

	if (ts->tick_stopped) {
		tick_nohz_restart_sched_tick(ts, now, 0);
		tick_nohz_account_idle_ticks(ts);
	}

	local_irq_enable();
}

/*
 * The nohz low res interrupt handler
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	dev->next_event.tv64 = KTIME_MAX;

	tick_sched_do_timer(now);
	tick_sched_handle(ts, regs);

	/* No need to reprogram if we are running tickless */
	if (unlikely(ts->tick_stopped))
		return;

	hrtimer_forward(&ts->sched_timer, now, tick_period);
	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
{
	if (!tick_nohz_enabled)
		return;
	ts->nohz_mode = mode;
	/* One update is enough */
	if (!test_and_set_bit(0, &tick_nohz_active))
		timers_update_migration(true);
}

/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t next;

	if (!tick_nohz_enabled)
		return;

	if (tick_switch_to_oneshot(tick_nohz_handler))
		return;

	/*
	 * Recycle the hrtimer in ts, so we can share the
	 * hrtimer_forward with the highres code.
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	/* Get the next period */
	next = tick_init_jiffy_update();

	hrtimer_set_expires(&ts->sched_timer, next);
	hrtimer_forward_now(&ts->sched_timer, tick_period);
	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}

/*
 * When NOHZ is enabled and the tick is stopped, we need to kick the
 * tick timer from irq_enter() so that the jiffies update is kept
 * alive during long running softirqs. That's ugly as hell, but
 * correctness is key even if we need to fix the offending softirq in
 * the first place.
 *
 * Note, this is different from tick_nohz_restart. We just kick the
 * timer and do not touch the other magic bits which need to be done
 * when idle is left.
 */
static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
{
#if 0
	/* Switch back to 2.6.27 behaviour */
	ktime_t delta;

	/*
	 * Do not touch the tick device, when the next expiry is either
	 * already reached or less than or equal to the tick period.
	 */
	delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
	if (delta.tv64 <= tick_period.tv64)
		return;

	tick_nohz_restart(ts, now);
#endif
}

static inline void tick_nohz_irq_enter(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	if (!ts->idle_active && !ts->tick_stopped)
		return;
	now = ktime_get();
	if (ts->idle_active)
		tick_nohz_stop_idle(ts, now);
	if (ts->tick_stopped) {
		tick_nohz_update_jiffies(now);
		tick_nohz_kick_tick(ts, now);
	}
}

#else

static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}

/*
 * High resolution timer specific code
 */
#ifdef CONFIG_HIGH_RES_TIMERS
/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts =
		container_of(timer, struct tick_sched, sched_timer);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(now);

	/*
	 * Do not call, when we are not in irq context and have
	 * no valid regs pointer
	 */
	if (regs)
		tick_sched_handle(ts, regs);

	/* No need to reprogram if we are in idle or full dynticks mode */
	if (unlikely(ts->tick_stopped))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, tick_period);

	return HRTIMER_RESTART;
}

static int sched_skew_tick;

static int __init skew_tick(char *str)
{
	get_option(&str, &sched_skew_tick);

	return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ts->sched_timer.function = tick_sched_timer;

	/* Get the next period (per cpu) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert jiffies_lock contention. */
	if (sched_skew_tick) {
		u64 offset = ktime_to_ns(tick_period) >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}

	hrtimer_forward(&ts->sched_timer, now, tick_period);
	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */

#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

# ifdef CONFIG_HIGH_RES_TIMERS
	if (ts->sched_timer.base)
		hrtimer_cancel(&ts->sched_timer);
# endif

	memset(ts, 0, sizeof(*ts));
}
#endif

/**
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/**
 * Check if a change happened which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
 * mode, because high resolution timers are disabled (either at compile
 * time or at runtime). Called with interrupts disabled.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}