/*
 * Queued spinlock
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 *
 * Authors: Waiman Long <waiman.long@hp.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. The paper below provides a good description for this kind
 * of lock.
 *
 * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit the 4 bytes that we assume spinlock_t to be, and to preserve
 * its existing API, we must modify it in a few places.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * A spinlock disables recursion within its own context, and there is a limit
 * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As
 * there are at most 4 nesting levels, the level can be encoded in a 2-bit
 * number, and the tail can be encoded by combining that 2-bit nesting level
 * with the cpu number. With one byte for the lock value and 3 bytes for the
 * tail, only a 32-bit word is now needed. Even though we only need 1 bit for
 * the lock, we extend it to a full byte to achieve better performance on
 * architectures that support atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 * atomic operations on smaller 8-bit and 16-bit data types.
 *
 */

#include "mcs_spinlock.h"

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define MAX_NODES	8
#else
#define MAX_NODES	4
#endif

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
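 *
 * As a worked example (assuming the common configuration where
 * _Q_PENDING_BITS == 8, i.e. _Q_TAIL_IDX_OFFSET == 16 and
 * _Q_TAIL_CPU_OFFSET == 18): cpu 5 queueing at nesting level 2 encodes its
 * tail as ((5 + 1) << 18) | (2 << 16) == 0x1a0000, and decode_tail() below
 * recovers cpu = (0x1a0000 >> 18) - 1 == 5 and idx == 2.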
 */

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(idx > 3);
#endif
	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&mcs_nodes[idx], cpu);
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/*
 * By using the whole 2nd least significant byte for the pending bit, we
 * can allow better optimization of the lock acquisition for the pending
 * bit holder.
 *
 * This internal structure is also used by the set_locked function which
 * is not restricted to _Q_PENDING_BITS == 8.
 */
struct __qspinlock {
	union {
		atomic_t val;
#ifdef __LITTLE_ENDIAN
		struct {
			u8	locked;
			u8	pending;
		};
		struct {
			u16	locked_pending;
			u16	tail;
		};
#else
		struct {
			u16	tail;
			u16	locked_pending;
		};
		struct {
			u8	reserved[2];
			u8	pending;
			u8	locked;
		};
#endif
	};
};
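
/*
 * An illustrative sketch of the resulting 32-bit layout in the
 * little-endian, _Q_PENDING_BITS == 8 case (not an additional definition,
 * just how the fields above line up):
 *
 *	bits  0- 7: locked byte
 *	bits  8-15: pending byte
 *	bits 16-17: tail index (nesting level)
 *	bits 18-31: tail cpu (+1)
 *
 * locked_pending overlays bits 0-15 and tail overlays bits 16-31, which is
 * what lets clear_pending_set_locked() and xchg_tail() below operate on a
 * halfword each.
 */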

#if _Q_PENDING_BITS == 8
/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;

	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
}

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	struct __qspinlock *l = (void *)lock;

	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
}

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
	return old;
}
#endif /* _Q_PENDING_BITS == 8 */
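
/*
 * A note on the two variants above (descriptive only): with
 * _Q_PENDING_BITS == 8 the tail occupies its own 16-bit halfword, so a
 * plain xchg() on l->tail suffices and cannot disturb the locked/pending
 * bytes. With a single pending bit the tail shares bits with the rest of
 * the word, hence the cmpxchg() loop that preserves
 * _Q_LOCKED_PENDING_MASK. Likewise, the generic clear_pending_set_locked()
 * relies on the caller holding the pending bit with the lock free (*,1,0):
 * per the definitions in asm-generic/qspinlock_types.h, adding
 * -_Q_PENDING_VAL + _Q_LOCKED_VAL (i.e. -(1 << _Q_PENDING_OFFSET) + 1)
 * clears pending and sets locked in a single atomic op.
 */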

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;

	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
}

/*
 * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
 * all the PV callbacks.
 */

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
					   struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
					   struct mcs_spinlock *node) { }

#define pv_enabled()		false

#define pv_init_node		__pv_init_node
#define pv_wait_node		__pv_wait_node
#define pv_kick_node		__pv_kick_node
#define pv_wait_head		__pv_wait_head

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
#endif

#endif /* _GEN_PV_LOCK_SLOWPATH */

/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'            |  :
 *   queue               :       | ^--'                           |  :
 *                       :       v                                |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 new, old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto queue;

	if (virt_spin_lock(lock))
		return;

	/*
	 * wait for in-progress pending->locked hand-overs
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
			cpu_relax();
	}

	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
	for (;;) {
		/*
		 * If we observe any contention; queue.
		 */
		if (val & ~_Q_LOCKED_MASK)
			goto queue;

		new = _Q_LOCKED_VAL;
		if (val == new)
			new |= _Q_PENDING_VAL;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}

	/*
	 * we won the trylock
	 */
	if (new == _Q_LOCKED_VAL)
		return;

	/*
	 * we're pending, wait for the owner to go away.
	 *
	 * *,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all clear_pending_set_locked()
	 * implementations imply full barriers.
	 */
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
		cpu_relax();

	/*
	 * take ownership and clear the pending bit.
	 *
	 * *,1,0 -> *,0,1
	 */
	clear_pending_set_locked(lock);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);
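
	/*
	 * A cpu can be interrupted while it is queued here: e.g. a hardirq
	 * that hits while the task context is spinning and itself enters
	 * this slowpath will pick up mcs_nodes[1] of the same cpu. The idx
	 * encoded into @tail keeps those per-context nodes apart;
	 * node->count is only dropped again in the release path below.
	 */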
	node += idx;
	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node);
		arch_mcs_spin_lock_contended(&node->locked);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 */
	pv_wait_head(lock, node);
	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
		cpu_relax();

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,0,0 -> *,0,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail),
	 * clear the tail code and grab the lock. Otherwise, we only need
	 * to grab the lock.
	 */
	for (;;) {
		if (val != tail) {
			set_locked(lock);
			break;
		}
		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */

		val = old;
	}

	/*
	 * contended path; wait for next, release.
	 */
	while (!(next = READ_ONCE(node->next)))
		cpu_relax();

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);

release:
	/*
	 * release the node
	 */
	this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

/*
 * Generate the paravirt code for queued_spin_lock_slowpath().
 */
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"

#endif
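
/*
 * For context (a sketch, not a definition in this file, and possibly
 * differing in detail between kernel versions and architectures): the
 * arch-generic fast path in include/asm-generic/qspinlock.h that leads
 * into the slowpath above looks roughly like
 *
 *	static __always_inline void queued_spin_lock(struct qspinlock *lock)
 *	{
 *		u32 val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
 *
 *		if (likely(val == 0))
 *			return;
 *		queued_spin_lock_slowpath(lock, val);
 *	}
 *
 * i.e. only a cmpxchg that fails against the fully unlocked word ends up
 * in queued_spin_lock_slowpath().
 */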