1 /* 2 * linux/kernel/time/tick-sched.c 3 * 4 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> 5 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner 7 * 8 * No idle tick implementation for low and high resolution timers 9 * 10 * Started by: Thomas Gleixner and Ingo Molnar 11 * 12 * Distribute under GPLv2. 13 */ 14 #include <linux/cpu.h> 15 #include <linux/err.h> 16 #include <linux/hrtimer.h> 17 #include <linux/interrupt.h> 18 #include <linux/kernel_stat.h> 19 #include <linux/percpu.h> 20 #include <linux/profile.h> 21 #include <linux/sched.h> 22 #include <linux/tick.h> 23 24 #include <asm/irq_regs.h> 25 26 #include "tick-internal.h" 27 28 /* 29 * Per cpu nohz control structure 30 */ 31 static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32 33 /* 34 * The time, when the last jiffy update happened. Protected by xtime_lock. 35 */ 36 static ktime_t last_jiffies_update; 37 38 struct tick_sched *tick_get_tick_sched(int cpu) 39 { 40 return &per_cpu(tick_cpu_sched, cpu); 41 } 42 43 /* 44 * Must be called with interrupts disabled ! 45 */ 46 static void tick_do_update_jiffies64(ktime_t now) 47 { 48 unsigned long ticks = 0; 49 ktime_t delta; 50 51 /* Reevalute with xtime_lock held */ 52 write_seqlock(&xtime_lock); 53 54 delta = ktime_sub(now, last_jiffies_update); 55 if (delta.tv64 >= tick_period.tv64) { 56 57 delta = ktime_sub(delta, tick_period); 58 last_jiffies_update = ktime_add(last_jiffies_update, 59 tick_period); 60 61 /* Slow path for long timeouts */ 62 if (unlikely(delta.tv64 >= tick_period.tv64)) { 63 s64 incr = ktime_to_ns(tick_period); 64 65 ticks = ktime_divns(delta, incr); 66 67 last_jiffies_update = ktime_add_ns(last_jiffies_update, 68 incr * ticks); 69 } 70 do_timer(++ticks); 71 } 72 write_sequnlock(&xtime_lock); 73 } 74 75 /* 76 * Initialize and return retrieve the jiffies update. 77 */ 78 static ktime_t tick_init_jiffy_update(void) 79 { 80 ktime_t period; 81 82 write_seqlock(&xtime_lock); 83 /* Did we start the jiffies update yet ? */ 84 if (last_jiffies_update.tv64 == 0) 85 last_jiffies_update = tick_next_period; 86 period = last_jiffies_update; 87 write_sequnlock(&xtime_lock); 88 return period; 89 } 90 91 /* 92 * NOHZ - aka dynamic tick functionality 93 */ 94 #ifdef CONFIG_NO_HZ 95 /* 96 * NO HZ enabled ? 97 */ 98 static int tick_nohz_enabled __read_mostly = 1; 99 100 /* 101 * Enable / Disable tickless mode 102 */ 103 static int __init setup_tick_nohz(char *str) 104 { 105 if (!strcmp(str, "off")) 106 tick_nohz_enabled = 0; 107 else if (!strcmp(str, "on")) 108 tick_nohz_enabled = 1; 109 else 110 return 0; 111 return 1; 112 } 113 114 __setup("nohz=", setup_tick_nohz); 115 116 /** 117 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 118 * 119 * Called from interrupt entry when the CPU was idle 120 * 121 * In case the sched_tick was stopped on this CPU, we have to check if jiffies 122 * must be updated. Otherwise an interrupt handler could use a stale jiffy 123 * value. We do this unconditionally on any cpu, as we don't know whether the 124 * cpu, which has the update task assigned is in a long sleep. 125 */ 126 void tick_nohz_update_jiffies(void) 127 { 128 int cpu = smp_processor_id(); 129 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 130 unsigned long flags; 131 ktime_t now; 132 133 if (!ts->tick_stopped) 134 return; 135 136 touch_softlockup_watchdog(); 137 138 cpu_clear(cpu, nohz_cpu_mask); 139 now = ktime_get(); 140 141 local_irq_save(flags); 142 tick_do_update_jiffies64(now); 143 local_irq_restore(flags); 144 } 145 146 /** 147 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 148 * 149 * When the next event is more than a tick into the future, stop the idle tick 150 * Called either from the idle loop or from irq_exit() when an idle period was 151 * just interrupted by an interrupt which did not cause a reschedule. 152 */ 153 void tick_nohz_stop_sched_tick(void) 154 { 155 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 156 unsigned long rt_jiffies; 157 struct tick_sched *ts; 158 ktime_t last_update, expires, now, delta; 159 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 160 int cpu; 161 162 local_irq_save(flags); 163 164 cpu = smp_processor_id(); 165 ts = &per_cpu(tick_cpu_sched, cpu); 166 167 /* 168 * If this cpu is offline and it is the one which updates 169 * jiffies, then give up the assignment and let it be taken by 170 * the cpu which runs the tick timer next. If we don't drop 171 * this here the jiffies might be stale and do_timer() never 172 * invoked. 173 */ 174 if (unlikely(!cpu_online(cpu))) { 175 if (cpu == tick_do_timer_cpu) 176 tick_do_timer_cpu = -1; 177 } 178 179 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 180 goto end; 181 182 if (need_resched()) 183 goto end; 184 185 cpu = smp_processor_id(); 186 if (unlikely(local_softirq_pending())) { 187 static int ratelimit; 188 189 if (ratelimit < 10) { 190 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 191 local_softirq_pending()); 192 ratelimit++; 193 } 194 } 195 196 now = ktime_get(); 197 /* 198 * When called from irq_exit we need to account the idle sleep time 199 * correctly. 200 */ 201 if (ts->tick_stopped) { 202 delta = ktime_sub(now, ts->idle_entrytime); 203 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 204 } 205 206 ts->idle_entrytime = now; 207 ts->idle_calls++; 208 209 /* Read jiffies and the time when jiffies were updated last */ 210 do { 211 seq = read_seqbegin(&xtime_lock); 212 last_update = last_jiffies_update; 213 last_jiffies = jiffies; 214 } while (read_seqretry(&xtime_lock, seq)); 215 216 /* Get the next timer wheel timer */ 217 next_jiffies = get_next_timer_interrupt(last_jiffies); 218 delta_jiffies = next_jiffies - last_jiffies; 219 220 rt_jiffies = rt_needs_cpu(cpu); 221 if (rt_jiffies && rt_jiffies < delta_jiffies) 222 delta_jiffies = rt_jiffies; 223 224 if (rcu_needs_cpu(cpu)) 225 delta_jiffies = 1; 226 /* 227 * Do not stop the tick, if we are only one off 228 * or if the cpu is required for rcu 229 */ 230 if (!ts->tick_stopped && delta_jiffies == 1) 231 goto out; 232 233 /* Schedule the tick, if we are at least one jiffie off */ 234 if ((long)delta_jiffies >= 1) { 235 236 if (delta_jiffies > 1) 237 cpu_set(cpu, nohz_cpu_mask); 238 /* 239 * nohz_stop_sched_tick can be called several times before 240 * the nohz_restart_sched_tick is called. This happens when 241 * interrupts arrive which do not cause a reschedule. In the 242 * first call we save the current tick time, so we can restart 243 * the scheduler tick in nohz_restart_sched_tick. 244 */ 245 if (!ts->tick_stopped) { 246 if (select_nohz_load_balancer(1)) { 247 /* 248 * sched tick not stopped! 249 */ 250 cpu_clear(cpu, nohz_cpu_mask); 251 goto out; 252 } 253 254 ts->idle_tick = ts->sched_timer.expires; 255 ts->tick_stopped = 1; 256 ts->idle_jiffies = last_jiffies; 257 } 258 259 /* 260 * If this cpu is the one which updates jiffies, then 261 * give up the assignment and let it be taken by the 262 * cpu which runs the tick timer next, which might be 263 * this cpu as well. If we don't drop this here the 264 * jiffies might be stale and do_timer() never 265 * invoked. 266 */ 267 if (cpu == tick_do_timer_cpu) 268 tick_do_timer_cpu = -1; 269 270 ts->idle_sleeps++; 271 272 /* 273 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 274 * there is no timer pending or at least extremly far 275 * into the future (12 days for HZ=1000). In this case 276 * we simply stop the tick timer: 277 */ 278 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 279 ts->idle_expires.tv64 = KTIME_MAX; 280 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 281 hrtimer_cancel(&ts->sched_timer); 282 goto out; 283 } 284 285 /* 286 * calculate the expiry time for the next timer wheel 287 * timer 288 */ 289 expires = ktime_add_ns(last_update, tick_period.tv64 * 290 delta_jiffies); 291 ts->idle_expires = expires; 292 293 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 294 hrtimer_start(&ts->sched_timer, expires, 295 HRTIMER_MODE_ABS); 296 /* Check, if the timer was already in the past */ 297 if (hrtimer_active(&ts->sched_timer)) 298 goto out; 299 } else if (!tick_program_event(expires, 0)) 300 goto out; 301 /* 302 * We are past the event already. So we crossed a 303 * jiffie boundary. Update jiffies and raise the 304 * softirq. 305 */ 306 tick_do_update_jiffies64(ktime_get()); 307 cpu_clear(cpu, nohz_cpu_mask); 308 } 309 raise_softirq_irqoff(TIMER_SOFTIRQ); 310 out: 311 ts->next_jiffies = next_jiffies; 312 ts->last_jiffies = last_jiffies; 313 ts->sleep_length = ktime_sub(dev->next_event, now); 314 end: 315 local_irq_restore(flags); 316 } 317 318 /** 319 * tick_nohz_get_sleep_length - return the length of the current sleep 320 * 321 * Called from power state control code with interrupts disabled 322 */ 323 ktime_t tick_nohz_get_sleep_length(void) 324 { 325 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 326 327 return ts->sleep_length; 328 } 329 330 /** 331 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 332 * 333 * Restart the idle tick when the CPU is woken up from idle 334 */ 335 void tick_nohz_restart_sched_tick(void) 336 { 337 int cpu = smp_processor_id(); 338 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 339 unsigned long ticks; 340 ktime_t now, delta; 341 342 if (!ts->tick_stopped) 343 return; 344 345 /* Update jiffies first */ 346 now = ktime_get(); 347 348 local_irq_disable(); 349 select_nohz_load_balancer(0); 350 tick_do_update_jiffies64(now); 351 cpu_clear(cpu, nohz_cpu_mask); 352 353 /* Account the idle time */ 354 delta = ktime_sub(now, ts->idle_entrytime); 355 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 356 357 /* 358 * We stopped the tick in idle. Update process times would miss the 359 * time we slept as update_process_times does only a 1 tick 360 * accounting. Enforce that this is accounted to idle ! 361 */ 362 ticks = jiffies - ts->idle_jiffies; 363 /* 364 * We might be one off. Do not randomly account a huge number of ticks! 365 */ 366 if (ticks && ticks < LONG_MAX) { 367 add_preempt_count(HARDIRQ_OFFSET); 368 account_system_time(current, HARDIRQ_OFFSET, 369 jiffies_to_cputime(ticks)); 370 sub_preempt_count(HARDIRQ_OFFSET); 371 } 372 373 /* 374 * Cancel the scheduled timer and restore the tick 375 */ 376 ts->tick_stopped = 0; 377 hrtimer_cancel(&ts->sched_timer); 378 ts->sched_timer.expires = ts->idle_tick; 379 380 while (1) { 381 /* Forward the time to expire in the future */ 382 hrtimer_forward(&ts->sched_timer, now, tick_period); 383 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 385 hrtimer_start(&ts->sched_timer, 386 ts->sched_timer.expires, 387 HRTIMER_MODE_ABS); 388 /* Check, if the timer was already in the past */ 389 if (hrtimer_active(&ts->sched_timer)) 390 break; 391 } else { 392 if (!tick_program_event(ts->sched_timer.expires, 0)) 393 break; 394 } 395 /* Update jiffies and reread time */ 396 tick_do_update_jiffies64(now); 397 now = ktime_get(); 398 } 399 local_irq_enable(); 400 } 401 402 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 403 { 404 hrtimer_forward(&ts->sched_timer, now, tick_period); 405 return tick_program_event(ts->sched_timer.expires, 0); 406 } 407 408 /* 409 * The nohz low res interrupt handler 410 */ 411 static void tick_nohz_handler(struct clock_event_device *dev) 412 { 413 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 414 struct pt_regs *regs = get_irq_regs(); 415 int cpu = smp_processor_id(); 416 ktime_t now = ktime_get(); 417 418 dev->next_event.tv64 = KTIME_MAX; 419 420 /* 421 * Check if the do_timer duty was dropped. We don't care about 422 * concurrency: This happens only when the cpu in charge went 423 * into a long sleep. If two cpus happen to assign themself to 424 * this duty, then the jiffies update is still serialized by 425 * xtime_lock. 426 */ 427 if (unlikely(tick_do_timer_cpu == -1)) 428 tick_do_timer_cpu = cpu; 429 430 /* Check, if the jiffies need an update */ 431 if (tick_do_timer_cpu == cpu) 432 tick_do_update_jiffies64(now); 433 434 /* 435 * When we are idle and the tick is stopped, we have to touch 436 * the watchdog as we might not schedule for a really long 437 * time. This happens on complete idle SMP systems while 438 * waiting on the login prompt. We also increment the "start 439 * of idle" jiffy stamp so the idle accounting adjustment we 440 * do when we go busy again does not account too much ticks. 441 */ 442 if (ts->tick_stopped) { 443 touch_softlockup_watchdog(); 444 ts->idle_jiffies++; 445 } 446 447 update_process_times(user_mode(regs)); 448 profile_tick(CPU_PROFILING); 449 450 /* Do not restart, when we are in the idle loop */ 451 if (ts->tick_stopped) 452 return; 453 454 while (tick_nohz_reprogram(ts, now)) { 455 now = ktime_get(); 456 tick_do_update_jiffies64(now); 457 } 458 } 459 460 /** 461 * tick_nohz_switch_to_nohz - switch to nohz mode 462 */ 463 static void tick_nohz_switch_to_nohz(void) 464 { 465 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 466 ktime_t next; 467 468 if (!tick_nohz_enabled) 469 return; 470 471 local_irq_disable(); 472 if (tick_switch_to_oneshot(tick_nohz_handler)) { 473 local_irq_enable(); 474 return; 475 } 476 477 ts->nohz_mode = NOHZ_MODE_LOWRES; 478 479 /* 480 * Recycle the hrtimer in ts, so we can share the 481 * hrtimer_forward with the highres code. 482 */ 483 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 484 /* Get the next period */ 485 next = tick_init_jiffy_update(); 486 487 for (;;) { 488 ts->sched_timer.expires = next; 489 if (!tick_program_event(next, 0)) 490 break; 491 next = ktime_add(next, tick_period); 492 } 493 local_irq_enable(); 494 495 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", 496 smp_processor_id()); 497 } 498 499 #else 500 501 static inline void tick_nohz_switch_to_nohz(void) { } 502 503 #endif /* NO_HZ */ 504 505 /* 506 * High resolution timer specific code 507 */ 508 #ifdef CONFIG_HIGH_RES_TIMERS 509 /* 510 * We rearm the timer until we get disabled by the idle code. 511 * Called with interrupts disabled and timer->base->cpu_base->lock held. 512 */ 513 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 514 { 515 struct tick_sched *ts = 516 container_of(timer, struct tick_sched, sched_timer); 517 struct pt_regs *regs = get_irq_regs(); 518 ktime_t now = ktime_get(); 519 int cpu = smp_processor_id(); 520 521 #ifdef CONFIG_NO_HZ 522 /* 523 * Check if the do_timer duty was dropped. We don't care about 524 * concurrency: This happens only when the cpu in charge went 525 * into a long sleep. If two cpus happen to assign themself to 526 * this duty, then the jiffies update is still serialized by 527 * xtime_lock. 528 */ 529 if (unlikely(tick_do_timer_cpu == -1)) 530 tick_do_timer_cpu = cpu; 531 #endif 532 533 /* Check, if the jiffies need an update */ 534 if (tick_do_timer_cpu == cpu) 535 tick_do_update_jiffies64(now); 536 537 /* 538 * Do not call, when we are not in irq context and have 539 * no valid regs pointer 540 */ 541 if (regs) { 542 /* 543 * When we are idle and the tick is stopped, we have to touch 544 * the watchdog as we might not schedule for a really long 545 * time. This happens on complete idle SMP systems while 546 * waiting on the login prompt. We also increment the "start of 547 * idle" jiffy stamp so the idle accounting adjustment we do 548 * when we go busy again does not account too much ticks. 549 */ 550 if (ts->tick_stopped) { 551 touch_softlockup_watchdog(); 552 ts->idle_jiffies++; 553 } 554 update_process_times(user_mode(regs)); 555 profile_tick(CPU_PROFILING); 556 } 557 558 /* Do not restart, when we are in the idle loop */ 559 if (ts->tick_stopped) 560 return HRTIMER_NORESTART; 561 562 hrtimer_forward(timer, now, tick_period); 563 564 return HRTIMER_RESTART; 565 } 566 567 /** 568 * tick_setup_sched_timer - setup the tick emulation timer 569 */ 570 void tick_setup_sched_timer(void) 571 { 572 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 573 ktime_t now = ktime_get(); 574 u64 offset; 575 576 /* 577 * Emulate tick processing via per-CPU hrtimers: 578 */ 579 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 580 ts->sched_timer.function = tick_sched_timer; 581 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 582 583 /* Get the next period (per cpu) */ 584 ts->sched_timer.expires = tick_init_jiffy_update(); 585 offset = ktime_to_ns(tick_period) >> 1; 586 do_div(offset, num_possible_cpus()); 587 offset *= smp_processor_id(); 588 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); 589 590 for (;;) { 591 hrtimer_forward(&ts->sched_timer, now, tick_period); 592 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, 593 HRTIMER_MODE_ABS); 594 /* Check, if the timer was already in the past */ 595 if (hrtimer_active(&ts->sched_timer)) 596 break; 597 now = ktime_get(); 598 } 599 600 #ifdef CONFIG_NO_HZ 601 if (tick_nohz_enabled) 602 ts->nohz_mode = NOHZ_MODE_HIGHRES; 603 #endif 604 } 605 606 void tick_cancel_sched_timer(int cpu) 607 { 608 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 609 610 if (ts->sched_timer.base) 611 hrtimer_cancel(&ts->sched_timer); 612 ts->tick_stopped = 0; 613 ts->nohz_mode = NOHZ_MODE_INACTIVE; 614 } 615 #endif /* HIGH_RES_TIMERS */ 616 617 /** 618 * Async notification about clocksource changes 619 */ 620 void tick_clock_notify(void) 621 { 622 int cpu; 623 624 for_each_possible_cpu(cpu) 625 set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); 626 } 627 628 /* 629 * Async notification about clock event changes 630 */ 631 void tick_oneshot_notify(void) 632 { 633 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 634 635 set_bit(0, &ts->check_clocks); 636 } 637 638 /** 639 * Check, if a change happened, which makes oneshot possible. 640 * 641 * Called cyclic from the hrtimer softirq (driven by the timer 642 * softirq) allow_nohz signals, that we can switch into low-res nohz 643 * mode, because high resolution timers are disabled (either compile 644 * or runtime). 645 */ 646 int tick_check_oneshot_change(int allow_nohz) 647 { 648 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 649 650 if (!test_and_clear_bit(0, &ts->check_clocks)) 651 return 0; 652 653 if (ts->nohz_mode != NOHZ_MODE_INACTIVE) 654 return 0; 655 656 if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) 657 return 0; 658 659 if (!allow_nohz) 660 return 1; 661 662 tick_nohz_switch_to_nohz(); 663 return 0; 664 } 665