// SPDX-License-Identifier: GPL-2.0-only
/*
 * sched_clock() for unstable CPU clocks
 *
 *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
 *
 *  Updates and enhancements:
 *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
 *
 * Based on code by:
 *   Ingo Molnar <mingo@redhat.com>
 *   Guillaume Chazarain <guichaz@gmail.com>
 *
 *
 * What this file implements:
 *
 * cpu_clock(i) provides a fast (execution time) high resolution
 * clock with bounded drift between CPUs. The value of cpu_clock(i)
 * is monotonic for constant i. The timestamp returned is in nanoseconds.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 *
 * There is no strict promise about the base, although it tends to start
 * at 0 on boot (but people really shouldn't rely on that).
 *
 * cpu_clock(i)       -- can be used from any context, including NMI.
 * local_clock()      -- is cpu_clock() on the current CPU.
 *
 * sched_clock_cpu(i)
 *
 * How it is implemented:
 *
 * When !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK the implementation simply uses
 * sched_clock(), which is then assumed to provide these properties itself
 * (mostly it means the architecture provides a globally synchronized
 * highres time source).
 *
 * Otherwise it tries to create a semi-stable clock from a mixture of other
 * clocks, including:
 *
 *  - GTOD (clock monotonic)
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
 * deltas are filtered to provide monotonicity and to keep the clock within an
 * expected window.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 */
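
/*
 * Usage sketch (illustrative only): given the per-CPU monotonicity promise
 * above, durations should be computed from two timestamps taken for the
 * same CPU, e.g.:
 *
 *      int cpu = raw_smp_processor_id();
 *      u64 t0, t1;
 *
 *      t0 = cpu_clock(cpu);
 *      ...
 *      t1 = cpu_clock(cpu);
 *
 * Here t1 - t0 cannot be negative because both reads used the same @cpu;
 * mixing cpu_clock(i) and cpu_clock(j) for i != j can go backwards, as per
 * the BIG FAT WARNING.
 */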

#include "sched.h"
#include <linux/sched_clock.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __weak sched_clock(void)
{
        return (unsigned long long)(jiffies - INITIAL_JIFFIES)
                                        * (NSEC_PER_SEC / HZ);
}
EXPORT_SYMBOL_GPL(sched_clock);

static DEFINE_STATIC_KEY_FALSE(sched_clock_running);

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * We must start with !__sched_clock_stable because the unstable -> stable
 * transition is accurate, while the stable -> unstable transition is not.
 *
 * Similarly we start with __sched_clock_stable_early, thereby assuming we
 * will become stable, such that there's only a single 1 -> 0 transition.
 */
static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;

/*
 * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
 */
__read_mostly u64 __sched_clock_offset;
static __read_mostly u64 __gtod_offset;
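
/*
 * Rearranging that identity gives the two assignments used below:
 * __set_sched_clock_stable() solves it for __sched_clock_offset,
 *
 *      __sched_clock_offset = (tick_gtod + __gtod_offset) - tick_raw;
 *
 * while __sched_clock_gtod_offset() solves it for __gtod_offset,
 *
 *      __gtod_offset = (tick_raw + __sched_clock_offset) - tick_gtod;
 *
 * where tick_raw/tick_gtod are samples of sched_clock()/ktime_get_ns()
 * taken at (nearly) the same instant by __scd_stamp().
 */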

struct sched_clock_data {
        u64                     tick_raw;
        u64                     tick_gtod;
        u64                     clock;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

static inline struct sched_clock_data *this_scd(void)
{
        return this_cpu_ptr(&sched_clock_data);
}

static inline struct sched_clock_data *cpu_sdc(int cpu)
{
        return &per_cpu(sched_clock_data, cpu);
}

int sched_clock_stable(void)
{
        return static_branch_likely(&__sched_clock_stable);
}

static void __scd_stamp(struct sched_clock_data *scd)
{
        scd->tick_gtod = ktime_get_ns();
        scd->tick_raw = sched_clock();
}

static void __set_sched_clock_stable(void)
{
        struct sched_clock_data *scd;

        /*
         * Since we're still unstable and the tick is already running, we have
         * to disable IRQs in order to get a consistent scd->tick* reading.
         */
        local_irq_disable();
        scd = this_scd();
        /*
         * Attempt to make the (initial) unstable->stable transition continuous.
         */
        __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
        local_irq_enable();

        printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
                        scd->tick_gtod, __gtod_offset,
                        scd->tick_raw,  __sched_clock_offset);

        static_branch_enable(&__sched_clock_stable);
        tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}

/*
 * If we ever get here, we're screwed, because we found out -- typically after
 * the fact -- that TSC wasn't good. This means all our clocksources (including
 * ktime) could have reported wrong values.
 *
 * What we do here is an attempt to fix up and continue sort of where we left
 * off in a coherent manner.
 *
 * The only way to fully avoid random clock jumps is to boot with:
 * "tsc=unstable".
 */
static void __sched_clock_work(struct work_struct *work)
{
        struct sched_clock_data *scd;
        int cpu;

        /* take a current timestamp and set 'now' */
        preempt_disable();
        scd = this_scd();
        __scd_stamp(scd);
        scd->clock = scd->tick_gtod + __gtod_offset;
        preempt_enable();

        /* clone to all CPUs */
        for_each_possible_cpu(cpu)
                per_cpu(sched_clock_data, cpu) = *scd;

        printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
        printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
                        scd->tick_gtod, __gtod_offset,
                        scd->tick_raw,  __sched_clock_offset);

        static_branch_disable(&__sched_clock_stable);
}

static DECLARE_WORK(sched_clock_work, __sched_clock_work);

static void __clear_sched_clock_stable(void)
{
        if (!sched_clock_stable())
                return;

        tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
        schedule_work(&sched_clock_work);
}

void clear_sched_clock_stable(void)
{
        __sched_clock_stable_early = 0;

        smp_mb(); /* matches sched_clock_init_late() */

        if (static_key_count(&sched_clock_running.key) == 2)
                __clear_sched_clock_stable();
}

static void __sched_clock_gtod_offset(void)
{
        struct sched_clock_data *scd = this_scd();

        __scd_stamp(scd);
        __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
}

void __init sched_clock_init(void)
{
        /*
         * Set __gtod_offset such that once we mark sched_clock_running,
         * sched_clock_tick() continues where sched_clock() left off.
         *
         * Even if TSC is buggered, we're still UP at this point so it
         * can't really be out of sync.
         */
        local_irq_disable();
        __sched_clock_gtod_offset();
        local_irq_enable();

        static_branch_inc(&sched_clock_running);
}

/*
 * We run this as late_initcall() such that it runs after all built-in drivers,
 * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
 */
static int __init sched_clock_init_late(void)
{
        static_branch_inc(&sched_clock_running);
        /*
         * Ensure that it is impossible to not do a static_key update.
         *
         * Either {set,clear}_sched_clock_stable() must see sched_clock_running
         * and do the update, or we must see their __sched_clock_stable_early
         * and do the update, or both.
         */
        smp_mb(); /* matches {set,clear}_sched_clock_stable() */

        if (__sched_clock_stable_early)
                __set_sched_clock_stable();

        return 0;
}
late_initcall(sched_clock_init_late);

/*
 * min, max except they take wrapping into account
 */

static inline u64 wrap_min(u64 x, u64 y)
{
        return (s64)(x - y) < 0 ? x : y;
}

static inline u64 wrap_max(u64 x, u64 y)
{
        return (s64)(x - y) > 0 ? x : y;
}
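
/*
 * Example for the helpers above (illustrative): with x == 2 and
 * y == ULLONG_MAX, x - y truncates to 3, so (s64)(x - y) > 0 and
 * wrap_max() returns 2; the wrapped-around y is treated as lying just
 * before x rather than as the huge value a plain max() would pick.
 * wrap_min() makes the mirror-image choice and returns y.
 */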

/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
static u64 sched_clock_local(struct sched_clock_data *scd)
{
        u64 now, clock, old_clock, min_clock, max_clock, gtod;
        s64 delta;

again:
        now = sched_clock();
        delta = now - scd->tick_raw;
        if (unlikely(delta < 0))
                delta = 0;

        old_clock = scd->clock;

        /*
         * scd->clock = clamp(scd->tick_gtod + delta,
         *                    max(scd->tick_gtod, scd->clock),
         *                    scd->tick_gtod + TICK_NSEC);
         */

        gtod = scd->tick_gtod + __gtod_offset;
        clock = gtod + delta;
        min_clock = wrap_max(gtod, old_clock);
        max_clock = wrap_max(old_clock, gtod + TICK_NSEC);

        clock = wrap_max(clock, min_clock);
        clock = wrap_min(clock, max_clock);

        if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
                goto again;

        return clock;
}
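
/*
 * Read the clock of a remote CPU's scd and couple it with the local one:
 * whichever of the two values is further ahead gets written back to the
 * side that lags, so successive readings stay monotonic regardless of
 * which CPU's data they come from.
 */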
static u64 sched_clock_remote(struct sched_clock_data *scd)
{
        struct sched_clock_data *my_scd = this_scd();
        u64 this_clock, remote_clock;
        u64 *ptr, old_val, val;

#if BITS_PER_LONG != 64
again:
        /*
         * Careful here: The local and the remote clock values need to
         * be read out atomically as we need to compare the values and
         * then update either the local or the remote side. So the
         * cmpxchg64 below only protects one readout.
         *
         * We must reread via sched_clock_local() in the retry case on
         * 32-bit kernels as an NMI could use sched_clock_local() via the
         * tracer and hit between the readout of
         * the low 32-bit and the high 32-bit portion.
         */
        this_clock = sched_clock_local(my_scd);
        /*
         * We must enforce atomic readout on 32-bit, otherwise the
         * update on the remote CPU can hit in between the readout of
         * the low 32-bit and the high 32-bit portion.
         */
        remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
        /*
         * On 64-bit kernels the read of [my]scd->clock is atomic versus the
         * update, so we can avoid the above 32-bit dance.
         */
        sched_clock_local(my_scd);
again:
        this_clock = my_scd->clock;
        remote_clock = scd->clock;
#endif

        /*
         * Use the opportunity that we have both clock values in hand
         * to couple the two clocks: we take the
         * larger time as the latest time for both
         * runqueues. (this creates monotonic movement)
         */
        if (likely((s64)(remote_clock - this_clock) < 0)) {
                ptr = &scd->clock;
                old_val = remote_clock;
                val = this_clock;
        } else {
                /*
                 * Should be rare, but possible:
                 */
                ptr = &my_scd->clock;
                old_val = this_clock;
                val = remote_clock;
        }

        if (cmpxchg64(ptr, old_val, val) != old_val)
                goto again;

        return val;
}

/*
 * Similar to cpu_clock(), but requires local IRQs to be disabled.
 *
 * See cpu_clock().
 */
u64 sched_clock_cpu(int cpu)
{
        struct sched_clock_data *scd;
        u64 clock;

        if (sched_clock_stable())
                return sched_clock() + __sched_clock_offset;

        if (!static_branch_likely(&sched_clock_running))
                return sched_clock();

        preempt_disable_notrace();
        scd = cpu_sdc(cpu);

        if (cpu != smp_processor_id())
                clock = sched_clock_remote(scd);
        else
                clock = sched_clock_local(scd);
        preempt_enable_notrace();

        return clock;
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);

void sched_clock_tick(void)
{
        struct sched_clock_data *scd;

        if (sched_clock_stable())
                return;

        if (!static_branch_likely(&sched_clock_running))
                return;

        lockdep_assert_irqs_disabled();

        scd = this_scd();
        __scd_stamp(scd);
        sched_clock_local(scd);
}

void sched_clock_tick_stable(void)
{
        if (!sched_clock_stable())
                return;

        /*
         * Called under watchdog_lock.
         *
         * The watchdog just found this TSC to (still) be stable, so now is a
         * good moment to update our __gtod_offset. Because once we find the
         * TSC to be unstable, any computation will be computing crap.
         */
        local_irq_disable();
        __sched_clock_gtod_offset();
        local_irq_enable();
}

/*
 * We are going deep-idle (irqs are disabled):
 */
void sched_clock_idle_sleep_event(void)
{
        sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled; resync with ktime.
 */
void sched_clock_idle_wakeup_event(void)
{
        unsigned long flags;

        if (sched_clock_stable())
                return;

        if (unlikely(timekeeping_suspended))
                return;

        local_irq_save(flags);
        sched_clock_tick();
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

void __init sched_clock_init(void)
{
        static_branch_inc(&sched_clock_running);
        local_irq_disable();
        generic_sched_clock_init();
        local_irq_enable();
}

u64 sched_clock_cpu(int cpu)
{
        if (!static_branch_likely(&sched_clock_running))
                return 0;

        return sched_clock();
}

#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

/*
 * Running clock - returns the time that has elapsed while a guest has been
 * running.
 * On a guest this value should be local_clock() minus the time the guest was
 * suspended by the hypervisor (for any reason).
 * On bare metal this function should return the same as local_clock().
 * Architectures and sub-architectures can override this.
 */
u64 __weak running_clock(void)
{
        return local_clock();
}
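
/*
 * Sketch of an override (assumptions only, not taken from any real
 * architecture): a paravirtualized port that accumulates the nanoseconds
 * its vCPU was kept from running in a hypothetical per-CPU counter, say
 * hv_suspended_ns, could implement the above as:
 *
 *      u64 running_clock(void)
 *      {
 *              return local_clock() - this_cpu_read(hv_suspended_ns);
 *      }
 *
 * On bare metal the __weak default above already does the right thing.
 */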