// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
 *
 * No idle tick implementation for low and high resolution timers
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
 * The time, when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 */
static ktime_t last_jiffies_update;

/*
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 1;
	ktime_t delta, nextp;

	/*
	 * 64bit can do a quick check without holding jiffies lock and
	 * without looking at the sequence count. The smp_load_acquire()
	 * pairs with the update done later in this function.
	 *
	 * 32bit cannot do that because the store of tick_next_period
	 * consists of two 32bit stores and the first store could move it
	 * to a random point in the future.
	 */
	if (IS_ENABLED(CONFIG_64BIT)) {
		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
			return;
	} else {
		unsigned int seq;

		/*
		 * Avoid contention on jiffies_lock and protect the quick
		 * check with the sequence count.
		 */
		do {
			seq = read_seqcount_begin(&jiffies_seq);
			nextp = tick_next_period;
		} while (read_seqcount_retry(&jiffies_seq, seq));

		if (ktime_before(now, nextp))
			return;
	}

	/* Quick check failed, i.e. update is required. */
	raw_spin_lock(&jiffies_lock);
	/*
	 * Reevaluate with the lock held. Another CPU might have done the
	 * update already.
	 */
	if (ktime_before(now, tick_next_period)) {
		raw_spin_unlock(&jiffies_lock);
		return;
	}

	write_seqcount_begin(&jiffies_seq);

	delta = ktime_sub(now, tick_next_period);
	if (unlikely(delta >= TICK_NSEC)) {
		/* Slow path for long idle sleep times */
		s64 incr = TICK_NSEC;

		ticks += ktime_divns(delta, incr);

		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   incr * ticks);
	} else {
		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   TICK_NSEC);
	}

	/* Advance jiffies to complete the jiffies_seq protected job */
	jiffies_64 += ticks;

	/*
	 * Keep the tick_next_period variable up to date.
	 */
	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

	if (IS_ENABLED(CONFIG_64BIT)) {
		/*
		 * Pairs with smp_load_acquire() in the lockless quick
		 * check above and ensures that the update to jiffies_64 is
		 * not reordered vs. the store to tick_next_period, neither
		 * by the compiler nor by the CPU.
		 */
		smp_store_release(&tick_next_period, nextp);
	} else {
		/*
		 * A plain store is good enough on 32bit as the quick check
		 * above is protected by the sequence count.
		 */
		tick_next_period = nextp;
	}

	/*
	 * Release the sequence count. calc_global_load() below is not
	 * protected by it, but jiffies_lock needs to be held to prevent
	 * concurrent invocations.
	 */
	write_seqcount_end(&jiffies_seq);

	calc_global_load();

	raw_spin_unlock(&jiffies_lock);
	update_wall_time();
}

/*
 * Initialize and return the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);
	/* Did we start the jiffies update yet? */
	if (last_jiffies_update == 0)
		last_jiffies_update = tick_next_period;
	period = last_jiffies_update;
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
	return period;
}

static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
	int cpu = smp_processor_id();

#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * Check if the do_timer duty was dropped. We don't care about
	 * concurrency: This happens only when the CPU in charge went
	 * into a long sleep. If two CPUs happen to assign themselves to
	 * this duty, then the jiffies update is still serialized by
	 * jiffies_lock.
	 *
	 * If nohz_full is enabled, this should not happen because the
	 * tick_do_timer_cpu never relinquishes.
	 */
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
		WARN_ON(tick_nohz_full_running);
#endif
		tick_do_timer_cpu = cpu;
	}
#endif

	/* Check if the jiffies need an update */
	if (tick_do_timer_cpu == cpu)
		tick_do_update_jiffies64(now);

	if (ts->inidle)
		ts->got_idle_tick = 1;
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
#ifdef CONFIG_NO_HZ_COMMON
	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on complete idle SMP systems while
	 * waiting on the login prompt. We also increment the "start of
	 * idle" jiffy stamp so the idle accounting adjustment we do
	 * when we go busy again does not account too many ticks.
	 */
	if (ts->tick_stopped) {
		touch_softlockup_watchdog_sched();
		if (is_idle_task(current))
			ts->idle_jiffies++;
		/*
		 * In case the current tick fired too early past its expected
		 * expiration, make sure we don't bypass the next clock reprogramming
		 * to the same deadline.
		 */
		ts->next_tick = 0;
	}
#endif
	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);
}
#endif

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;

static bool check_tick_dependency(atomic_t *dep)
{
	int val = atomic_read(dep);

	if (val & TICK_DEP_MASK_POSIX_TIMER) {
		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
		return true;
	}

	if (val & TICK_DEP_MASK_PERF_EVENTS) {
		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
		return true;
	}

	if (val & TICK_DEP_MASK_SCHED) {
		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
		return true;
	}

	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
		return true;
	}

	if (val & TICK_DEP_MASK_RCU) {
		trace_tick_stop(0, TICK_DEP_MASK_RCU);
		return true;
	}

	return false;
}

static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
	lockdep_assert_irqs_disabled();

	if (unlikely(!cpu_online(cpu)))
		return false;

	if (check_tick_dependency(&tick_dep_mask))
		return false;

	if (check_tick_dependency(&ts->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->signal->tick_dep_mask))
		return false;

	return true;
}

static void nohz_full_kick_func(struct irq_work *work)
{
	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
	IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
 */
static void tick_nohz_full_kick(void)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
	if (!tick_nohz_full_cpu(cpu))
		return;

	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

static void tick_nohz_kick_task(struct task_struct *tsk)
{
	int cpu;

	/*
	 * If the task is not running, run_posix_cpu_timers()
	 * has nothing to elapse, IPI can then be spared.
	 *
	 * activate_task()                      STORE p->tick_dep_mask
	 *   STORE p->on_rq
	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
	 *   LOCK rq->lock                      LOAD p->on_rq
	 *   smp_mb__after_spin_lock()
	 *   tick_nohz_task_switch()
	 *     LOAD p->tick_dep_mask
	 */
	if (!sched_task_on_rq(tsk))
		return;

	/*
	 * If the task concurrently migrates to another CPU,
	 * we guarantee it sees the new tick dependency upon
	 * schedule.
	 *
	 * set_task_cpu(p, cpu);
	 *   STORE p->cpu = @cpu
	 * __schedule() (switch to task 'p')
	 *   LOCK rq->lock
	 *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
	 *     LOAD p->tick_dep_mask            LOAD p->cpu
	 */
	cpu = task_cpu(tsk);

	preempt_disable();
	if (cpu_online(cpu))
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

/*
 * Kick all full dynticks CPUs in order to force these to re-evaluate
 * their dependency on the tick and restart it if necessary.
 */
static void tick_nohz_full_kick_all(void)
{
	int cpu;

	if (!tick_nohz_full_running)
		return;

	preempt_disable();
	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

static void tick_nohz_dep_set_all(atomic_t *dep,
				  enum tick_dep_bits bit)
{
	int prev;

	prev = atomic_fetch_or(BIT(bit), dep);
	if (!prev)
		tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * by unstable clock.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
	tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set per-CPU tick dependency. Used by scheduler and perf events in order to
 * manage events throttling.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
	int prev;
	struct tick_sched *ts;

	ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
	if (!prev) {
		preempt_disable();
		/* Perf needs local kick that is NMI safe */
		if (cpu == smp_processor_id()) {
			tick_nohz_full_kick();
		} else {
			/* Remote irq work not NMI-safe */
			if (!WARN_ON_ONCE(in_nmi()))
				tick_nohz_full_kick_cpu(cpu);
		}
		preempt_enable();
	}
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this. Posix CPU timers also
 * need it in order to elapse per-task timers.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
		tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
 * per process timers.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
			      enum tick_dep_bits bit)
{
	int prev;
	struct signal_struct *sig = tsk->signal;

	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
	if (!prev) {
		struct task_struct *t;

		lockdep_assert_held(&tsk->sighand->siglock);
		__for_each_thread(sig, t)
			tick_nohz_kick_task(t);
	}
}

void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 */
void __tick_nohz_task_switch(void)
{
	struct tick_sched *ts;

	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->tick_stopped) {
		if (atomic_read(&current->tick_dep_mask) ||
		    atomic_read(&current->signal->tick_dep_mask))
			tick_nohz_full_kick();
	}
}

/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
	cpumask_copy(tick_nohz_full_mask, cpumask);
	tick_nohz_full_running = true;
}
EXPORT_SYMBOL_GPL(tick_nohz_full_setup);

static int tick_nohz_cpu_down(unsigned int cpu)
{
	/*
	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
	 * CPUs. It must remain online when nohz full is enabled.
	 */
	if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
		return -EBUSY;
	return 0;
}

void __init tick_nohz_init(void)
{
	int cpu, ret;

	if (!tick_nohz_full_running)
		return;

	/*
	 * Full dynticks uses irq work to drive the tick rescheduling on safe
	 * locking contexts. But then we need irq work to raise its own
	 * interrupts to avoid circular dependency on the tick
	 */
	if (!arch_irq_work_has_interrupt()) {
		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
		cpumask_clear(tick_nohz_full_mask);
		tick_nohz_full_running = false;
		return;
	}

	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
	    !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
		cpu = smp_processor_id();

		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
			pr_warn("NO_HZ: Clearing %d from nohz_full range "
				"for timekeeping\n", cpu);
			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
		}
	}

	for_each_cpu(cpu, tick_nohz_full_mask)
		context_tracking_cpu_set(cpu);

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"kernel/nohz:predown", NULL,
					tick_nohz_cpu_down);
	WARN_ON(ret < 0);
	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
		cpumask_pr_args(tick_nohz_full_mask));
}
#endif

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled ?
 */
bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
 * Enable / Disable tickless mode
 */
static int __init setup_tick_nohz(char *str)
{
	return (kstrtobool(str, &tick_nohz_enabled) == 0);
}

__setup("nohz=", setup_tick_nohz);

bool tick_nohz_tick_stopped(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	return ts->tick_stopped;
}

bool tick_nohz_tick_stopped_cpu(int cpu)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	return ts->tick_stopped;
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any CPU, as we don't know whether the
 * CPU which has the update task assigned is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
	unsigned long flags;

	__this_cpu_write(tick_cpu_sched.idle_waketime, now);

	local_irq_save(flags);
	tick_do_update_jiffies64(now);
	local_irq_restore(flags);

	touch_softlockup_watchdog_sched();
}

/*
 * Updates the per-CPU time idle statistics counters
 */
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
	ktime_t delta;

	if (ts->idle_active) {
		delta = ktime_sub(now, ts->idle_entrytime);
		if (nr_iowait_cpu(cpu) > 0)
			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
		else
			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
		ts->idle_entrytime = now;
	}

	if (last_update_time)
		*last_update_time = ktime_to_us(now);

}

static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
	ts->idle_active = 0;

	sched_clock_idle_wakeup_event();
}

static void tick_nohz_start_idle(struct tick_sched *ts)
{
	ts->idle_entrytime = ktime_get();
	ts->idle_active = 1;
	sched_clock_idle_sleep_event();
}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * This function returns -1 if NOHZ is not enabled.
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t now, idle;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time) {
		update_ts_time_stats(cpu, ts, now, last_update_time);
		idle = ts->idle_sleeptime;
	} else {
		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			idle = ktime_add(ts->idle_sleeptime, delta);
		} else {
			idle = ts->idle_sleeptime;
		}
	}

	return ktime_to_us(idle);

}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * This function returns -1 if NOHZ is not enabled.
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t now, iowait;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time) {
		update_ts_time_stats(cpu, ts, now, last_update_time);
		iowait = ts->iowait_sleeptime;
	} else {
		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			iowait = ktime_add(ts->iowait_sleeptime, delta);
		} else {
			iowait = ts->iowait_sleeptime;
		}
	}

	return ktime_to_us(iowait);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
	hrtimer_cancel(&ts->sched_timer);
	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);

	/* Forward the time to expire in the future */
	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);

	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	}

	/*
	 * Reset to make sure next tick stop doesn't get fooled by past
	 * cached clock deadline.
	 */
	ts->next_tick = 0;
}

static inline bool local_timer_softirq_pending(void)
{
	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
}

static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
	unsigned long basejiff;
	unsigned int seq;

	/* Read jiffies and the time when jiffies were updated last */
	do {
		seq = read_seqcount_begin(&jiffies_seq);
		basemono = last_jiffies_update;
		basejiff = jiffies;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	ts->last_jiffies = basejiff;
	ts->timer_expires_base = basemono;

	/*
	 * Keep the periodic tick, when RCU, architecture or irq_work
	 * requests it.
	 * Aside of that check whether the local timer softirq is
	 * pending. If so it's a bad idea to call get_next_timer_interrupt()
	 * because there is an already expired timer, so it will request
	 * immediate expiry, which rearms the hardware timer with a
	 * minimal delta which brings us back to this place
	 * immediately. Lather, rinse and repeat...
	 */
	if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
		next_tick = basemono + TICK_NSEC;
	} else {
		/*
		 * Get the next pending timer. If high resolution
		 * timers are enabled this only takes the timer wheel
		 * timers into account. If high resolution timers are
		 * disabled this also looks at the next expiring
		 * hrtimer.
		 */
		next_tmr = get_next_timer_interrupt(basejiff, basemono);
		ts->next_timer = next_tmr;
		/* Take the next rcu event into account */
		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
	}

	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
	 */
	delta = next_tick - basemono;
	if (delta <= (u64)TICK_NSEC) {
		/*
		 * Tell the timer code that the base is not idle, i.e. undo
		 * the effect of get_next_timer_interrupt():
		 */
		timer_clear_idle();
		/*
		 * We've not stopped the tick yet, and there's a timer in the
		 * next period, so no point in stopping it either, bail.
		 */
		if (!ts->tick_stopped) {
			ts->timer_expires = 0;
			goto out;
		}
	}

	/*
	 * If this CPU is the one which had the do_timer() duty last, we limit
	 * the sleep time to the timekeeping max_deferment value.
	 * Otherwise we can sleep as long as we want.
	 */
	delta = timekeeping_max_deferment();
	if (cpu != tick_do_timer_cpu &&
	    (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
		delta = KTIME_MAX;

	/* Calculate the next expiry time */
	if (delta < (KTIME_MAX - basemono))
		expires = basemono + delta;
	else
		expires = KTIME_MAX;

	ts->timer_expires = min_t(u64, expires, next_tick);

out:
	return ts->timer_expires;
}

static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	u64 basemono = ts->timer_expires_base;
	u64 expires = ts->timer_expires;
	ktime_t tick = expires;

	/* Make sure we won't be trying to stop it twice in a row. */
	ts->timer_expires_base = 0;

	/*
	 * If this CPU is the one which updates jiffies, then give up
	 * the assignment and let it be taken by the CPU which runs
	 * the tick timer next, which might be this CPU as well. If we
	 * don't drop this here the jiffies might be stale and
	 * do_timer() never invoked. Keep track of the fact that it
	 * was the one which had the do_timer() duty last.
	 */
	if (cpu == tick_do_timer_cpu) {
		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
		ts->do_timer_last = 1;
	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
		ts->do_timer_last = 0;
	}

	/* Skip reprogram of event if its not changed */
	if (ts->tick_stopped && (expires == ts->next_tick)) {
		/* Sanity check: make sure clockevent is actually programmed */
		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
			return;

		WARN_ON_ONCE(1);
		printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
			    basemono, ts->next_tick, dev->next_event,
			    hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
	}

	/*
	 * nohz_stop_sched_tick can be called several times before
	 * the nohz_restart_sched_tick is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the
	 * first call we save the current tick time, so we can restart
	 * the scheduler tick in nohz_restart_sched_tick.
	 */
	if (!ts->tick_stopped) {
		calc_load_nohz_start();
		quiet_vmstat();

		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
		ts->tick_stopped = 1;
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}

	ts->next_tick = tick;

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
	 * the tick timer.
	 */
	if (unlikely(expires == KTIME_MAX)) {
		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
			hrtimer_cancel(&ts->sched_timer);
		return;
	}

	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
		hrtimer_start(&ts->sched_timer, tick,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		hrtimer_set_expires(&ts->sched_timer, tick);
		tick_program_event(tick, 1);
	}
}

static void tick_nohz_retain_tick(struct tick_sched *ts)
{
	ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
{
	if (tick_nohz_next_event(ts, cpu))
		tick_nohz_stop_tick(ts, cpu);
	else
		tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */

static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
	/* Update jiffies first */
	tick_do_update_jiffies64(now);

	/*
	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
	 * the clock forward checks in the enqueue path:
	 */
	timer_clear_idle();

	calc_load_nohz_stop();
	touch_softlockup_watchdog_sched();
	/*
	 * Cancel the scheduled timer and restore the tick
	 */
	ts->tick_stopped = 0;
	tick_nohz_restart(ts, now);
}

static void __tick_nohz_full_update_tick(struct tick_sched *ts,
					 ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (can_stop_full_tick(cpu, ts))
		tick_nohz_stop_sched_tick(ts, cpu);
	else if (ts->tick_stopped)
		tick_nohz_restart_sched_tick(ts, now);
#endif
}

static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
		return;

	__tick_nohz_full_update_tick(ts, ktime_get());
}

static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
	/*
	 * If this CPU is offline and it is the one which updates
	 * jiffies, then give up the assignment and let it be taken by
	 * the CPU which runs the tick timer next. If we don't drop
	 * this here the jiffies might be stale and do_timer() never
	 * invoked.
	 */
	if (unlikely(!cpu_online(cpu))) {
		if (cpu == tick_do_timer_cpu)
			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
		/*
		 * Make sure the CPU doesn't get fooled by obsolete tick
		 * deadline if it comes back online later.
		 */
		ts->next_tick = 0;
		return false;
	}

	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
		return false;

	if (need_resched())
		return false;

	if (unlikely(local_softirq_pending())) {
		static int ratelimit;

		if (ratelimit < 10 && !local_bh_blocked() &&
		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
			pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
				(unsigned int) local_softirq_pending());
			ratelimit++;
		}
		return false;
	}

	if (tick_nohz_full_enabled()) {
		/*
		 * Keep the tick alive to guarantee timekeeping progression
		 * if there are full dynticks CPUs around
		 */
		if (tick_do_timer_cpu == cpu)
			return false;

		/* Should not happen for nohz-full */
		if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
			return false;
	}

	return true;
}

static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
	ktime_t expires;
	int cpu = smp_processor_id();

	/*
	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
	 * tick timer expiration time is known already.
	 */
	if (ts->timer_expires_base)
		expires = ts->timer_expires;
	else if (can_stop_idle_tick(cpu, ts))
		expires = tick_nohz_next_event(ts, cpu);
	else
		return;

	ts->idle_calls++;

	if (expires > 0LL) {
		int was_stopped = ts->tick_stopped;

		tick_nohz_stop_tick(ts, cpu);

		ts->idle_sleeps++;
		ts->idle_expires = expires;

		if (!was_stopped && ts->tick_stopped) {
			ts->idle_jiffies = ts->last_jiffies;
			nohz_balance_enter_idle(cpu);
		}
	} else {
		tick_nohz_retain_tick(ts);
	}
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
}

void tick_nohz_idle_retain_tick(void)
{
	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
	/*
	 * Undo the effect of get_next_timer_interrupt() called from
	 * tick_nohz_next_event().
	 */
	timer_clear_idle();
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	lockdep_assert_irqs_enabled();

	local_irq_disable();

	ts = this_cpu_ptr(&tick_cpu_sched);

	WARN_ON_ONCE(ts->timer_expires_base);

	ts->inidle = 1;
	tick_nohz_start_idle(ts);

	local_irq_enable();
}

/**
 * tick_nohz_irq_exit - update next tick event from interrupt exit
 *
 * When an interrupt fires while we are idle and it doesn't cause
 * a reschedule, it may still add, modify or delete a timer, enqueue
 * an RCU callback, etc...
 * So we need to re-calculate and reprogram the next tick event.
 */
void tick_nohz_irq_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->inidle)
		tick_nohz_start_idle(ts);
	else
		tick_nohz_full_update_tick(ts);
}

/**
 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
 */
bool tick_nohz_idle_got_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->got_idle_tick) {
		ts->got_idle_tick = 0;
		return true;
	}
	return false;
}

/**
 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 * or the tick, whichever expires first. Note that, if the tick has been
 * stopped, it returns the next hrtimer.
 *
 * Called from power state control code with interrupts disabled
 */
ktime_t tick_nohz_get_next_hrtimer(void)
{
	return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}

/**
 * tick_nohz_get_sleep_length - return the expected length of the current sleep
 * @delta_next: duration until the next event if the tick cannot be stopped
 *
 * Called from power state control code with interrupts disabled.
 *
 * The return value of this function and/or the value returned by it through the
 * @delta_next pointer can be negative which must be taken into account by its
 * callers.
 */
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
	/*
	 * The idle entry time is expected to be a sufficient approximation of
	 * the current time at this point.
	 */
	ktime_t now = ts->idle_entrytime;
	ktime_t next_event;

	WARN_ON_ONCE(!ts->inidle);

	*delta_next = ktime_sub(dev->next_event, now);

	if (!can_stop_idle_tick(cpu, ts))
		return *delta_next;

	next_event = tick_nohz_next_event(ts, cpu);
	if (!next_event)
		return *delta_next;

	/*
	 * If the next highres timer to expire is earlier than next_event, the
	 * idle governor needs to know that.
	 */
	next_event = min_t(u64, next_event,
			   hrtimer_next_event_without(&ts->sched_timer));

	return ktime_sub(next_event, now);
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
	struct tick_sched *ts = tick_get_tick_sched(cpu);

	return ts->idle_calls;
}

/**
 * tick_nohz_get_idle_calls - return the current idle calls counter value
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 */
unsigned long tick_nohz_get_idle_calls(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	return ts->idle_calls;
}

static void tick_nohz_account_idle_time(struct tick_sched *ts,
					ktime_t now)
{
	unsigned long ticks;

	ts->idle_exittime = now;

	if (vtime_accounting_enabled_this_cpu())
		return;
	/*
	 * We stopped the tick in idle. update_process_times() would miss the
	 * time we slept, as it only does 1-tick accounting.
	 * Enforce that this is accounted to idle!
	 */
	ticks = jiffies - ts->idle_jiffies;
	/*
	 * We might be one off. Do not randomly account a huge number of ticks!
	 */
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
}

void tick_nohz_idle_restart_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (ts->tick_stopped) {
		ktime_t now = ktime_get();
		tick_nohz_restart_sched_tick(ts, now);
		tick_nohz_account_idle_time(ts, now);
	}
}

static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
	if (tick_nohz_full_cpu(smp_processor_id()))
		__tick_nohz_full_update_tick(ts, now);
	else
		tick_nohz_restart_sched_tick(ts, now);

	tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - restart the idle tick from the idle task
 *
 * Restart the idle tick when the CPU is woken up from idle.
 * This also exits the RCU extended quiescent state. The CPU
 * can use RCU again after this function is called.
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	bool idle_active, tick_stopped;
	ktime_t now;

	local_irq_disable();

	WARN_ON_ONCE(!ts->inidle);
	WARN_ON_ONCE(ts->timer_expires_base);

	ts->inidle = 0;
	idle_active = ts->idle_active;
	tick_stopped = ts->tick_stopped;

	if (idle_active || tick_stopped)
		now = ktime_get();

	if (idle_active)
		tick_nohz_stop_idle(ts, now);

	if (tick_stopped)
		tick_nohz_idle_update_tick(ts, now);

	local_irq_enable();
}

/*
 * The nohz low res interrupt handler
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	dev->next_event = KTIME_MAX;

	tick_sched_do_timer(ts, now);
	tick_sched_handle(ts, regs);

	/* No need to reprogram if we are running tickless */
	if (unlikely(ts->tick_stopped))
		return;

	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
{
	if (!tick_nohz_enabled)
		return;
	ts->nohz_mode = mode;
	/* One update is enough */
	if (!test_and_set_bit(0, &tick_nohz_active))
		timers_update_nohz();
}

/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t next;

	if (!tick_nohz_enabled)
		return;

	if (tick_switch_to_oneshot(tick_nohz_handler))
		return;

	/*
	 * Recycle the hrtimer in ts, so we can share the
	 * hrtimer_forward with the highres code.
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
	/* Get the next period */
	next = tick_init_jiffy_update();

	hrtimer_set_expires(&ts->sched_timer, next);
	hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}

static inline void tick_nohz_irq_enter(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	if (!ts->idle_active && !ts->tick_stopped)
		return;
	now = ktime_get();
	if (ts->idle_active)
		tick_nohz_stop_idle(ts, now);
	if (ts->tick_stopped)
		tick_nohz_update_jiffies(now);
}

#else

static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}

/*
 * High resolution timer specific code
 */
#ifdef CONFIG_HIGH_RES_TIMERS
/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts =
		container_of(timer, struct tick_sched, sched_timer);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(ts, now);

	/*
	 * Do not call when we are not in IRQ context and have
	 * no valid regs pointer
	 */
	if (regs)
		tick_sched_handle(ts, regs);
	else
		ts->next_tick = 0;

	/* No need to reprogram if we are in idle or full dynticks mode */
	if (unlikely(ts->tick_stopped))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, TICK_NSEC);

	return HRTIMER_RESTART;
}

static int sched_skew_tick;

static int __init skew_tick(char *str)
{
	get_option(&str, &sched_skew_tick);

	return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
	ts->sched_timer.function = tick_sched_timer;

	/* Get the next period (per-CPU) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert jiffies_lock contention. */
	if (sched_skew_tick) {
		u64 offset = TICK_NSEC >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}

	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */

#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

# ifdef CONFIG_HIGH_RES_TIMERS
	if (ts->sched_timer.base)
		hrtimer_cancel(&ts->sched_timer);
# endif

	memset(ts, 0, sizeof(*ts));
}
#endif

/**
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/**
 * Check if a change happened which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
 * mode because high resolution timers are disabled (either at compile
 * time or at runtime). Called with interrupts disabled.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}