1a046f1a0SPeter Zijlstra // SPDX-License-Identifier: GPL-2.0-or-later 2a046f1a0SPeter Zijlstra 3a046f1a0SPeter Zijlstra #include <linux/sched/task.h> 4a046f1a0SPeter Zijlstra #include <linux/sched/signal.h> 5a046f1a0SPeter Zijlstra #include <linux/freezer.h> 6a046f1a0SPeter Zijlstra 7a046f1a0SPeter Zijlstra #include "futex.h" 8a046f1a0SPeter Zijlstra 9a046f1a0SPeter Zijlstra /* 10a046f1a0SPeter Zijlstra * READ this before attempting to hack on futexes! 11a046f1a0SPeter Zijlstra * 12a046f1a0SPeter Zijlstra * Basic futex operation and ordering guarantees 13a046f1a0SPeter Zijlstra * ============================================= 14a046f1a0SPeter Zijlstra * 15a046f1a0SPeter Zijlstra * The waiter reads the futex value in user space and calls 16a046f1a0SPeter Zijlstra * futex_wait(). This function computes the hash bucket and acquires 17a046f1a0SPeter Zijlstra * the hash bucket lock. After that it reads the futex user space value 18a046f1a0SPeter Zijlstra * again and verifies that the data has not changed. If it has not changed 19a046f1a0SPeter Zijlstra * it enqueues itself into the hash bucket, releases the hash bucket lock 20a046f1a0SPeter Zijlstra * and schedules. 21a046f1a0SPeter Zijlstra * 22a046f1a0SPeter Zijlstra * The waker side modifies the user space value of the futex and calls 23a046f1a0SPeter Zijlstra * futex_wake(). This function computes the hash bucket and acquires the 24a046f1a0SPeter Zijlstra * hash bucket lock. Then it looks for waiters on that futex in the hash 25a046f1a0SPeter Zijlstra * bucket and wakes them. 26a046f1a0SPeter Zijlstra * 27a046f1a0SPeter Zijlstra * In futex wake up scenarios where no tasks are blocked on a futex, taking 28a046f1a0SPeter Zijlstra * the hb spinlock can be avoided and simply return. In order for this 29a046f1a0SPeter Zijlstra * optimization to work, ordering guarantees must exist so that the waiter 30a046f1a0SPeter Zijlstra * being added to the list is acknowledged when the list is concurrently being 31a046f1a0SPeter Zijlstra * checked by the waker, avoiding scenarios like the following: 32a046f1a0SPeter Zijlstra * 33a046f1a0SPeter Zijlstra * CPU 0 CPU 1 34a046f1a0SPeter Zijlstra * val = *futex; 35a046f1a0SPeter Zijlstra * sys_futex(WAIT, futex, val); 36a046f1a0SPeter Zijlstra * futex_wait(futex, val); 37a046f1a0SPeter Zijlstra * uval = *futex; 38a046f1a0SPeter Zijlstra * *futex = newval; 39a046f1a0SPeter Zijlstra * sys_futex(WAKE, futex); 40a046f1a0SPeter Zijlstra * futex_wake(futex); 41a046f1a0SPeter Zijlstra * if (queue_empty()) 42a046f1a0SPeter Zijlstra * return; 43a046f1a0SPeter Zijlstra * if (uval == val) 44a046f1a0SPeter Zijlstra * lock(hash_bucket(futex)); 45a046f1a0SPeter Zijlstra * queue(); 46a046f1a0SPeter Zijlstra * unlock(hash_bucket(futex)); 47a046f1a0SPeter Zijlstra * schedule(); 48a046f1a0SPeter Zijlstra * 49a046f1a0SPeter Zijlstra * This would cause the waiter on CPU 0 to wait forever because it 50a046f1a0SPeter Zijlstra * missed the transition of the user space value from val to newval 51a046f1a0SPeter Zijlstra * and the waker did not find the waiter in the hash bucket queue. 52a046f1a0SPeter Zijlstra * 53a046f1a0SPeter Zijlstra * The correct serialization ensures that a waiter either observes 54a046f1a0SPeter Zijlstra * the changed user space value before blocking or is woken by a 55a046f1a0SPeter Zijlstra * concurrent waker: 56a046f1a0SPeter Zijlstra * 57a046f1a0SPeter Zijlstra * CPU 0 CPU 1 58a046f1a0SPeter Zijlstra * val = *futex; 59a046f1a0SPeter Zijlstra * sys_futex(WAIT, futex, val); 60a046f1a0SPeter Zijlstra * futex_wait(futex, val); 61a046f1a0SPeter Zijlstra * 62a046f1a0SPeter Zijlstra * waiters++; (a) 63a046f1a0SPeter Zijlstra * smp_mb(); (A) <-- paired with -. 64a046f1a0SPeter Zijlstra * | 65a046f1a0SPeter Zijlstra * lock(hash_bucket(futex)); | 66a046f1a0SPeter Zijlstra * | 67a046f1a0SPeter Zijlstra * uval = *futex; | 68a046f1a0SPeter Zijlstra * | *futex = newval; 69a046f1a0SPeter Zijlstra * | sys_futex(WAKE, futex); 70a046f1a0SPeter Zijlstra * | futex_wake(futex); 71a046f1a0SPeter Zijlstra * | 72a046f1a0SPeter Zijlstra * `--------> smp_mb(); (B) 73a046f1a0SPeter Zijlstra * if (uval == val) 74a046f1a0SPeter Zijlstra * queue(); 75a046f1a0SPeter Zijlstra * unlock(hash_bucket(futex)); 76a046f1a0SPeter Zijlstra * schedule(); if (waiters) 77a046f1a0SPeter Zijlstra * lock(hash_bucket(futex)); 78a046f1a0SPeter Zijlstra * else wake_waiters(futex); 79a046f1a0SPeter Zijlstra * waiters--; (b) unlock(hash_bucket(futex)); 80a046f1a0SPeter Zijlstra * 81a046f1a0SPeter Zijlstra * Where (A) orders the waiters increment and the futex value read through 82a046f1a0SPeter Zijlstra * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write 83a046f1a0SPeter Zijlstra * to futex and the waiters read (see futex_hb_waiters_pending()). 84a046f1a0SPeter Zijlstra * 85a046f1a0SPeter Zijlstra * This yields the following case (where X:=waiters, Y:=futex): 86a046f1a0SPeter Zijlstra * 87a046f1a0SPeter Zijlstra * X = Y = 0 88a046f1a0SPeter Zijlstra * 89a046f1a0SPeter Zijlstra * w[X]=1 w[Y]=1 90a046f1a0SPeter Zijlstra * MB MB 91a046f1a0SPeter Zijlstra * r[Y]=y r[X]=x 92a046f1a0SPeter Zijlstra * 93a046f1a0SPeter Zijlstra * Which guarantees that x==0 && y==0 is impossible; which translates back into 94a046f1a0SPeter Zijlstra * the guarantee that we cannot both miss the futex variable change and the 95a046f1a0SPeter Zijlstra * enqueue. 96a046f1a0SPeter Zijlstra * 97a046f1a0SPeter Zijlstra * Note that a new waiter is accounted for in (a) even when it is possible that 98a046f1a0SPeter Zijlstra * the wait call can return error, in which case we backtrack from it in (b). 99a046f1a0SPeter Zijlstra * Refer to the comment in futex_q_lock(). 100a046f1a0SPeter Zijlstra * 101a046f1a0SPeter Zijlstra * Similarly, in order to account for waiters being requeued on another 102a046f1a0SPeter Zijlstra * address we always increment the waiters for the destination bucket before 103a046f1a0SPeter Zijlstra * acquiring the lock. It then decrements them again after releasing it - 104a046f1a0SPeter Zijlstra * the code that actually moves the futex(es) between hash buckets (requeue_futex) 105a046f1a0SPeter Zijlstra * will do the additional required waiter count housekeeping. This is done for 106a046f1a0SPeter Zijlstra * double_lock_hb() and double_unlock_hb(), respectively. 107a046f1a0SPeter Zijlstra */ 108a046f1a0SPeter Zijlstra 109a046f1a0SPeter Zijlstra /* 110a046f1a0SPeter Zijlstra * The hash bucket lock must be held when this is called. 111a046f1a0SPeter Zijlstra * Afterwards, the futex_q must not be accessed. Callers 112a046f1a0SPeter Zijlstra * must ensure to later call wake_up_q() for the actual 113a046f1a0SPeter Zijlstra * wakeups to occur. 114a046f1a0SPeter Zijlstra */ 115a046f1a0SPeter Zijlstra void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) 116a046f1a0SPeter Zijlstra { 117a046f1a0SPeter Zijlstra struct task_struct *p = q->task; 118a046f1a0SPeter Zijlstra 119a046f1a0SPeter Zijlstra if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) 120a046f1a0SPeter Zijlstra return; 121a046f1a0SPeter Zijlstra 122a046f1a0SPeter Zijlstra get_task_struct(p); 123a046f1a0SPeter Zijlstra __futex_unqueue(q); 124a046f1a0SPeter Zijlstra /* 125a046f1a0SPeter Zijlstra * The waiting task can free the futex_q as soon as q->lock_ptr = NULL 126a046f1a0SPeter Zijlstra * is written, without taking any locks. This is possible in the event 127a046f1a0SPeter Zijlstra * of a spurious wakeup, for example. A memory barrier is required here 128a046f1a0SPeter Zijlstra * to prevent the following store to lock_ptr from getting ahead of the 129a046f1a0SPeter Zijlstra * plist_del in __futex_unqueue(). 130a046f1a0SPeter Zijlstra */ 131a046f1a0SPeter Zijlstra smp_store_release(&q->lock_ptr, NULL); 132a046f1a0SPeter Zijlstra 133a046f1a0SPeter Zijlstra /* 134a046f1a0SPeter Zijlstra * Queue the task for later wakeup for after we've released 135a046f1a0SPeter Zijlstra * the hb->lock. 136a046f1a0SPeter Zijlstra */ 137a046f1a0SPeter Zijlstra wake_q_add_safe(wake_q, p); 138a046f1a0SPeter Zijlstra } 139a046f1a0SPeter Zijlstra 140a046f1a0SPeter Zijlstra /* 141a046f1a0SPeter Zijlstra * Wake up waiters matching bitset queued on this futex (uaddr). 142a046f1a0SPeter Zijlstra */ 143a046f1a0SPeter Zijlstra int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 144a046f1a0SPeter Zijlstra { 145a046f1a0SPeter Zijlstra struct futex_hash_bucket *hb; 146a046f1a0SPeter Zijlstra struct futex_q *this, *next; 147a046f1a0SPeter Zijlstra union futex_key key = FUTEX_KEY_INIT; 148a046f1a0SPeter Zijlstra int ret; 149a046f1a0SPeter Zijlstra DEFINE_WAKE_Q(wake_q); 150a046f1a0SPeter Zijlstra 151a046f1a0SPeter Zijlstra if (!bitset) 152a046f1a0SPeter Zijlstra return -EINVAL; 153a046f1a0SPeter Zijlstra 154a046f1a0SPeter Zijlstra ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); 155a046f1a0SPeter Zijlstra if (unlikely(ret != 0)) 156a046f1a0SPeter Zijlstra return ret; 157a046f1a0SPeter Zijlstra 158a046f1a0SPeter Zijlstra hb = futex_hash(&key); 159a046f1a0SPeter Zijlstra 160a046f1a0SPeter Zijlstra /* Make sure we really have tasks to wakeup */ 161a046f1a0SPeter Zijlstra if (!futex_hb_waiters_pending(hb)) 162a046f1a0SPeter Zijlstra return ret; 163a046f1a0SPeter Zijlstra 164a046f1a0SPeter Zijlstra spin_lock(&hb->lock); 165a046f1a0SPeter Zijlstra 166a046f1a0SPeter Zijlstra plist_for_each_entry_safe(this, next, &hb->chain, list) { 167a046f1a0SPeter Zijlstra if (futex_match (&this->key, &key)) { 168a046f1a0SPeter Zijlstra if (this->pi_state || this->rt_waiter) { 169a046f1a0SPeter Zijlstra ret = -EINVAL; 170a046f1a0SPeter Zijlstra break; 171a046f1a0SPeter Zijlstra } 172a046f1a0SPeter Zijlstra 173a046f1a0SPeter Zijlstra /* Check if one of the bits is set in both bitsets */ 174a046f1a0SPeter Zijlstra if (!(this->bitset & bitset)) 175a046f1a0SPeter Zijlstra continue; 176a046f1a0SPeter Zijlstra 177a046f1a0SPeter Zijlstra futex_wake_mark(&wake_q, this); 178a046f1a0SPeter Zijlstra if (++ret >= nr_wake) 179a046f1a0SPeter Zijlstra break; 180a046f1a0SPeter Zijlstra } 181a046f1a0SPeter Zijlstra } 182a046f1a0SPeter Zijlstra 183a046f1a0SPeter Zijlstra spin_unlock(&hb->lock); 184a046f1a0SPeter Zijlstra wake_up_q(&wake_q); 185a046f1a0SPeter Zijlstra return ret; 186a046f1a0SPeter Zijlstra } 187a046f1a0SPeter Zijlstra 188a046f1a0SPeter Zijlstra static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) 189a046f1a0SPeter Zijlstra { 190a046f1a0SPeter Zijlstra unsigned int op = (encoded_op & 0x70000000) >> 28; 191a046f1a0SPeter Zijlstra unsigned int cmp = (encoded_op & 0x0f000000) >> 24; 192a046f1a0SPeter Zijlstra int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); 193a046f1a0SPeter Zijlstra int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); 194a046f1a0SPeter Zijlstra int oldval, ret; 195a046f1a0SPeter Zijlstra 196a046f1a0SPeter Zijlstra if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { 197a046f1a0SPeter Zijlstra if (oparg < 0 || oparg > 31) { 198a046f1a0SPeter Zijlstra char comm[sizeof(current->comm)]; 199a046f1a0SPeter Zijlstra /* 200a046f1a0SPeter Zijlstra * kill this print and return -EINVAL when userspace 201a046f1a0SPeter Zijlstra * is sane again 202a046f1a0SPeter Zijlstra */ 203a046f1a0SPeter Zijlstra pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", 204a046f1a0SPeter Zijlstra get_task_comm(comm, current), oparg); 205a046f1a0SPeter Zijlstra oparg &= 31; 206a046f1a0SPeter Zijlstra } 207a046f1a0SPeter Zijlstra oparg = 1 << oparg; 208a046f1a0SPeter Zijlstra } 209a046f1a0SPeter Zijlstra 210a046f1a0SPeter Zijlstra pagefault_disable(); 211a046f1a0SPeter Zijlstra ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); 212a046f1a0SPeter Zijlstra pagefault_enable(); 213a046f1a0SPeter Zijlstra if (ret) 214a046f1a0SPeter Zijlstra return ret; 215a046f1a0SPeter Zijlstra 216a046f1a0SPeter Zijlstra switch (cmp) { 217a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_EQ: 218a046f1a0SPeter Zijlstra return oldval == cmparg; 219a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_NE: 220a046f1a0SPeter Zijlstra return oldval != cmparg; 221a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_LT: 222a046f1a0SPeter Zijlstra return oldval < cmparg; 223a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_GE: 224a046f1a0SPeter Zijlstra return oldval >= cmparg; 225a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_LE: 226a046f1a0SPeter Zijlstra return oldval <= cmparg; 227a046f1a0SPeter Zijlstra case FUTEX_OP_CMP_GT: 228a046f1a0SPeter Zijlstra return oldval > cmparg; 229a046f1a0SPeter Zijlstra default: 230a046f1a0SPeter Zijlstra return -ENOSYS; 231a046f1a0SPeter Zijlstra } 232a046f1a0SPeter Zijlstra } 233a046f1a0SPeter Zijlstra 234a046f1a0SPeter Zijlstra /* 235a046f1a0SPeter Zijlstra * Wake up all waiters hashed on the physical page that is mapped 236a046f1a0SPeter Zijlstra * to this virtual address: 237a046f1a0SPeter Zijlstra */ 238a046f1a0SPeter Zijlstra int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, 239a046f1a0SPeter Zijlstra int nr_wake, int nr_wake2, int op) 240a046f1a0SPeter Zijlstra { 241a046f1a0SPeter Zijlstra union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 242a046f1a0SPeter Zijlstra struct futex_hash_bucket *hb1, *hb2; 243a046f1a0SPeter Zijlstra struct futex_q *this, *next; 244a046f1a0SPeter Zijlstra int ret, op_ret; 245a046f1a0SPeter Zijlstra DEFINE_WAKE_Q(wake_q); 246a046f1a0SPeter Zijlstra 247a046f1a0SPeter Zijlstra retry: 248a046f1a0SPeter Zijlstra ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); 249a046f1a0SPeter Zijlstra if (unlikely(ret != 0)) 250a046f1a0SPeter Zijlstra return ret; 251a046f1a0SPeter Zijlstra ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); 252a046f1a0SPeter Zijlstra if (unlikely(ret != 0)) 253a046f1a0SPeter Zijlstra return ret; 254a046f1a0SPeter Zijlstra 255a046f1a0SPeter Zijlstra hb1 = futex_hash(&key1); 256a046f1a0SPeter Zijlstra hb2 = futex_hash(&key2); 257a046f1a0SPeter Zijlstra 258a046f1a0SPeter Zijlstra retry_private: 259a046f1a0SPeter Zijlstra double_lock_hb(hb1, hb2); 260a046f1a0SPeter Zijlstra op_ret = futex_atomic_op_inuser(op, uaddr2); 261a046f1a0SPeter Zijlstra if (unlikely(op_ret < 0)) { 262a046f1a0SPeter Zijlstra double_unlock_hb(hb1, hb2); 263a046f1a0SPeter Zijlstra 264a046f1a0SPeter Zijlstra if (!IS_ENABLED(CONFIG_MMU) || 265a046f1a0SPeter Zijlstra unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 266a046f1a0SPeter Zijlstra /* 267a046f1a0SPeter Zijlstra * we don't get EFAULT from MMU faults if we don't have 268a046f1a0SPeter Zijlstra * an MMU, but we might get them from range checking 269a046f1a0SPeter Zijlstra */ 270a046f1a0SPeter Zijlstra ret = op_ret; 271a046f1a0SPeter Zijlstra return ret; 272a046f1a0SPeter Zijlstra } 273a046f1a0SPeter Zijlstra 274a046f1a0SPeter Zijlstra if (op_ret == -EFAULT) { 275a046f1a0SPeter Zijlstra ret = fault_in_user_writeable(uaddr2); 276a046f1a0SPeter Zijlstra if (ret) 277a046f1a0SPeter Zijlstra return ret; 278a046f1a0SPeter Zijlstra } 279a046f1a0SPeter Zijlstra 280a046f1a0SPeter Zijlstra cond_resched(); 281a046f1a0SPeter Zijlstra if (!(flags & FLAGS_SHARED)) 282a046f1a0SPeter Zijlstra goto retry_private; 283a046f1a0SPeter Zijlstra goto retry; 284a046f1a0SPeter Zijlstra } 285a046f1a0SPeter Zijlstra 286a046f1a0SPeter Zijlstra plist_for_each_entry_safe(this, next, &hb1->chain, list) { 287a046f1a0SPeter Zijlstra if (futex_match (&this->key, &key1)) { 288a046f1a0SPeter Zijlstra if (this->pi_state || this->rt_waiter) { 289a046f1a0SPeter Zijlstra ret = -EINVAL; 290a046f1a0SPeter Zijlstra goto out_unlock; 291a046f1a0SPeter Zijlstra } 292a046f1a0SPeter Zijlstra futex_wake_mark(&wake_q, this); 293a046f1a0SPeter Zijlstra if (++ret >= nr_wake) 294a046f1a0SPeter Zijlstra break; 295a046f1a0SPeter Zijlstra } 296a046f1a0SPeter Zijlstra } 297a046f1a0SPeter Zijlstra 298a046f1a0SPeter Zijlstra if (op_ret > 0) { 299a046f1a0SPeter Zijlstra op_ret = 0; 300a046f1a0SPeter Zijlstra plist_for_each_entry_safe(this, next, &hb2->chain, list) { 301a046f1a0SPeter Zijlstra if (futex_match (&this->key, &key2)) { 302a046f1a0SPeter Zijlstra if (this->pi_state || this->rt_waiter) { 303a046f1a0SPeter Zijlstra ret = -EINVAL; 304a046f1a0SPeter Zijlstra goto out_unlock; 305a046f1a0SPeter Zijlstra } 306a046f1a0SPeter Zijlstra futex_wake_mark(&wake_q, this); 307a046f1a0SPeter Zijlstra if (++op_ret >= nr_wake2) 308a046f1a0SPeter Zijlstra break; 309a046f1a0SPeter Zijlstra } 310a046f1a0SPeter Zijlstra } 311a046f1a0SPeter Zijlstra ret += op_ret; 312a046f1a0SPeter Zijlstra } 313a046f1a0SPeter Zijlstra 314a046f1a0SPeter Zijlstra out_unlock: 315a046f1a0SPeter Zijlstra double_unlock_hb(hb1, hb2); 316a046f1a0SPeter Zijlstra wake_up_q(&wake_q); 317a046f1a0SPeter Zijlstra return ret; 318a046f1a0SPeter Zijlstra } 319a046f1a0SPeter Zijlstra 320a046f1a0SPeter Zijlstra static long futex_wait_restart(struct restart_block *restart); 321a046f1a0SPeter Zijlstra 322a046f1a0SPeter Zijlstra /** 323a046f1a0SPeter Zijlstra * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal 324a046f1a0SPeter Zijlstra * @hb: the futex hash bucket, must be locked by the caller 325a046f1a0SPeter Zijlstra * @q: the futex_q to queue up on 326a046f1a0SPeter Zijlstra * @timeout: the prepared hrtimer_sleeper, or null for no timeout 327a046f1a0SPeter Zijlstra */ 328a046f1a0SPeter Zijlstra void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, 329a046f1a0SPeter Zijlstra struct hrtimer_sleeper *timeout) 330a046f1a0SPeter Zijlstra { 331a046f1a0SPeter Zijlstra /* 332a046f1a0SPeter Zijlstra * The task state is guaranteed to be set before another task can 333a046f1a0SPeter Zijlstra * wake it. set_current_state() is implemented using smp_store_mb() and 334a046f1a0SPeter Zijlstra * futex_queue() calls spin_unlock() upon completion, both serializing 335a046f1a0SPeter Zijlstra * access to the hash list and forcing another memory barrier. 336a046f1a0SPeter Zijlstra */ 337a046f1a0SPeter Zijlstra set_current_state(TASK_INTERRUPTIBLE); 338a046f1a0SPeter Zijlstra futex_queue(q, hb); 339a046f1a0SPeter Zijlstra 340a046f1a0SPeter Zijlstra /* Arm the timer */ 341a046f1a0SPeter Zijlstra if (timeout) 342a046f1a0SPeter Zijlstra hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); 343a046f1a0SPeter Zijlstra 344a046f1a0SPeter Zijlstra /* 345a046f1a0SPeter Zijlstra * If we have been removed from the hash list, then another task 346a046f1a0SPeter Zijlstra * has tried to wake us, and we can skip the call to schedule(). 347a046f1a0SPeter Zijlstra */ 348a046f1a0SPeter Zijlstra if (likely(!plist_node_empty(&q->list))) { 349a046f1a0SPeter Zijlstra /* 350a046f1a0SPeter Zijlstra * If the timer has already expired, current will already be 351a046f1a0SPeter Zijlstra * flagged for rescheduling. Only call schedule if there 352a046f1a0SPeter Zijlstra * is no timeout, or if it has yet to expire. 353a046f1a0SPeter Zijlstra */ 354a046f1a0SPeter Zijlstra if (!timeout || timeout->task) 355a046f1a0SPeter Zijlstra freezable_schedule(); 356a046f1a0SPeter Zijlstra } 357a046f1a0SPeter Zijlstra __set_current_state(TASK_RUNNING); 358a046f1a0SPeter Zijlstra } 359a046f1a0SPeter Zijlstra 360a046f1a0SPeter Zijlstra /** 361*bf69bad3SAndré Almeida * unqueue_multiple - Remove various futexes from their hash bucket 362*bf69bad3SAndré Almeida * @v: The list of futexes to unqueue 363*bf69bad3SAndré Almeida * @count: Number of futexes in the list 364*bf69bad3SAndré Almeida * 365*bf69bad3SAndré Almeida * Helper to unqueue a list of futexes. This can't fail. 366*bf69bad3SAndré Almeida * 367*bf69bad3SAndré Almeida * Return: 368*bf69bad3SAndré Almeida * - >=0 - Index of the last futex that was awoken; 369*bf69bad3SAndré Almeida * - -1 - No futex was awoken 370*bf69bad3SAndré Almeida */ 371*bf69bad3SAndré Almeida static int unqueue_multiple(struct futex_vector *v, int count) 372*bf69bad3SAndré Almeida { 373*bf69bad3SAndré Almeida int ret = -1, i; 374*bf69bad3SAndré Almeida 375*bf69bad3SAndré Almeida for (i = 0; i < count; i++) { 376*bf69bad3SAndré Almeida if (!futex_unqueue(&v[i].q)) 377*bf69bad3SAndré Almeida ret = i; 378*bf69bad3SAndré Almeida } 379*bf69bad3SAndré Almeida 380*bf69bad3SAndré Almeida return ret; 381*bf69bad3SAndré Almeida } 382*bf69bad3SAndré Almeida 383*bf69bad3SAndré Almeida /** 384*bf69bad3SAndré Almeida * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes 385*bf69bad3SAndré Almeida * @vs: The futex list to wait on 386*bf69bad3SAndré Almeida * @count: The size of the list 387*bf69bad3SAndré Almeida * @woken: Index of the last woken futex, if any. Used to notify the 388*bf69bad3SAndré Almeida * caller that it can return this index to userspace (return parameter) 389*bf69bad3SAndré Almeida * 390*bf69bad3SAndré Almeida * Prepare multiple futexes in a single step and enqueue them. This may fail if 391*bf69bad3SAndré Almeida * the futex list is invalid or if any futex was already awoken. On success the 392*bf69bad3SAndré Almeida * task is ready to interruptible sleep. 393*bf69bad3SAndré Almeida * 394*bf69bad3SAndré Almeida * Return: 395*bf69bad3SAndré Almeida * - 1 - One of the futexes was woken by another thread 396*bf69bad3SAndré Almeida * - 0 - Success 397*bf69bad3SAndré Almeida * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL 398*bf69bad3SAndré Almeida */ 399*bf69bad3SAndré Almeida static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) 400*bf69bad3SAndré Almeida { 401*bf69bad3SAndré Almeida struct futex_hash_bucket *hb; 402*bf69bad3SAndré Almeida bool retry = false; 403*bf69bad3SAndré Almeida int ret, i; 404*bf69bad3SAndré Almeida u32 uval; 405*bf69bad3SAndré Almeida 406*bf69bad3SAndré Almeida /* 407*bf69bad3SAndré Almeida * Enqueuing multiple futexes is tricky, because we need to enqueue 408*bf69bad3SAndré Almeida * each futex on the list before dealing with the next one to avoid 409*bf69bad3SAndré Almeida * deadlocking on the hash bucket. But, before enqueuing, we need to 410*bf69bad3SAndré Almeida * make sure that current->state is TASK_INTERRUPTIBLE, so we don't 411*bf69bad3SAndré Almeida * lose any wake events, which cannot be done before the get_futex_key 412*bf69bad3SAndré Almeida * of the next key, because it calls get_user_pages, which can sleep. 413*bf69bad3SAndré Almeida * Thus, we fetch the list of futexes keys in two steps, by first 414*bf69bad3SAndré Almeida * pinning all the memory keys in the futex key, and only then we read 415*bf69bad3SAndré Almeida * each key and queue the corresponding futex. 416*bf69bad3SAndré Almeida * 417*bf69bad3SAndré Almeida * Private futexes doesn't need to recalculate hash in retry, so skip 418*bf69bad3SAndré Almeida * get_futex_key() when retrying. 419*bf69bad3SAndré Almeida */ 420*bf69bad3SAndré Almeida retry: 421*bf69bad3SAndré Almeida for (i = 0; i < count; i++) { 422*bf69bad3SAndré Almeida if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) 423*bf69bad3SAndré Almeida continue; 424*bf69bad3SAndré Almeida 425*bf69bad3SAndré Almeida ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), 426*bf69bad3SAndré Almeida !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), 427*bf69bad3SAndré Almeida &vs[i].q.key, FUTEX_READ); 428*bf69bad3SAndré Almeida 429*bf69bad3SAndré Almeida if (unlikely(ret)) 430*bf69bad3SAndré Almeida return ret; 431*bf69bad3SAndré Almeida } 432*bf69bad3SAndré Almeida 433*bf69bad3SAndré Almeida set_current_state(TASK_INTERRUPTIBLE); 434*bf69bad3SAndré Almeida 435*bf69bad3SAndré Almeida for (i = 0; i < count; i++) { 436*bf69bad3SAndré Almeida u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; 437*bf69bad3SAndré Almeida struct futex_q *q = &vs[i].q; 438*bf69bad3SAndré Almeida u32 val = (u32)vs[i].w.val; 439*bf69bad3SAndré Almeida 440*bf69bad3SAndré Almeida hb = futex_q_lock(q); 441*bf69bad3SAndré Almeida ret = futex_get_value_locked(&uval, uaddr); 442*bf69bad3SAndré Almeida 443*bf69bad3SAndré Almeida if (!ret && uval == val) { 444*bf69bad3SAndré Almeida /* 445*bf69bad3SAndré Almeida * The bucket lock can't be held while dealing with the 446*bf69bad3SAndré Almeida * next futex. Queue each futex at this moment so hb can 447*bf69bad3SAndré Almeida * be unlocked. 448*bf69bad3SAndré Almeida */ 449*bf69bad3SAndré Almeida futex_queue(q, hb); 450*bf69bad3SAndré Almeida continue; 451*bf69bad3SAndré Almeida } 452*bf69bad3SAndré Almeida 453*bf69bad3SAndré Almeida futex_q_unlock(hb); 454*bf69bad3SAndré Almeida __set_current_state(TASK_RUNNING); 455*bf69bad3SAndré Almeida 456*bf69bad3SAndré Almeida /* 457*bf69bad3SAndré Almeida * Even if something went wrong, if we find out that a futex 458*bf69bad3SAndré Almeida * was woken, we don't return error and return this index to 459*bf69bad3SAndré Almeida * userspace 460*bf69bad3SAndré Almeida */ 461*bf69bad3SAndré Almeida *woken = unqueue_multiple(vs, i); 462*bf69bad3SAndré Almeida if (*woken >= 0) 463*bf69bad3SAndré Almeida return 1; 464*bf69bad3SAndré Almeida 465*bf69bad3SAndré Almeida if (ret) { 466*bf69bad3SAndré Almeida /* 467*bf69bad3SAndré Almeida * If we need to handle a page fault, we need to do so 468*bf69bad3SAndré Almeida * without any lock and any enqueued futex (otherwise 469*bf69bad3SAndré Almeida * we could lose some wakeup). So we do it here, after 470*bf69bad3SAndré Almeida * undoing all the work done so far. In success, we 471*bf69bad3SAndré Almeida * retry all the work. 472*bf69bad3SAndré Almeida */ 473*bf69bad3SAndré Almeida if (get_user(uval, uaddr)) 474*bf69bad3SAndré Almeida return -EFAULT; 475*bf69bad3SAndré Almeida 476*bf69bad3SAndré Almeida retry = true; 477*bf69bad3SAndré Almeida goto retry; 478*bf69bad3SAndré Almeida } 479*bf69bad3SAndré Almeida 480*bf69bad3SAndré Almeida if (uval != val) 481*bf69bad3SAndré Almeida return -EWOULDBLOCK; 482*bf69bad3SAndré Almeida } 483*bf69bad3SAndré Almeida 484*bf69bad3SAndré Almeida return 0; 485*bf69bad3SAndré Almeida } 486*bf69bad3SAndré Almeida 487*bf69bad3SAndré Almeida /** 488*bf69bad3SAndré Almeida * futex_sleep_multiple - Check sleeping conditions and sleep 489*bf69bad3SAndré Almeida * @vs: List of futexes to wait for 490*bf69bad3SAndré Almeida * @count: Length of vs 491*bf69bad3SAndré Almeida * @to: Timeout 492*bf69bad3SAndré Almeida * 493*bf69bad3SAndré Almeida * Sleep if and only if the timeout hasn't expired and no futex on the list has 494*bf69bad3SAndré Almeida * been woken up. 495*bf69bad3SAndré Almeida */ 496*bf69bad3SAndré Almeida static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, 497*bf69bad3SAndré Almeida struct hrtimer_sleeper *to) 498*bf69bad3SAndré Almeida { 499*bf69bad3SAndré Almeida if (to && !to->task) 500*bf69bad3SAndré Almeida return; 501*bf69bad3SAndré Almeida 502*bf69bad3SAndré Almeida for (; count; count--, vs++) { 503*bf69bad3SAndré Almeida if (!READ_ONCE(vs->q.lock_ptr)) 504*bf69bad3SAndré Almeida return; 505*bf69bad3SAndré Almeida } 506*bf69bad3SAndré Almeida 507*bf69bad3SAndré Almeida freezable_schedule(); 508*bf69bad3SAndré Almeida } 509*bf69bad3SAndré Almeida 510*bf69bad3SAndré Almeida /** 511*bf69bad3SAndré Almeida * futex_wait_multiple - Prepare to wait on and enqueue several futexes 512*bf69bad3SAndré Almeida * @vs: The list of futexes to wait on 513*bf69bad3SAndré Almeida * @count: The number of objects 514*bf69bad3SAndré Almeida * @to: Timeout before giving up and returning to userspace 515*bf69bad3SAndré Almeida * 516*bf69bad3SAndré Almeida * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function 517*bf69bad3SAndré Almeida * sleeps on a group of futexes and returns on the first futex that is 518*bf69bad3SAndré Almeida * wake, or after the timeout has elapsed. 519*bf69bad3SAndré Almeida * 520*bf69bad3SAndré Almeida * Return: 521*bf69bad3SAndré Almeida * - >=0 - Hint to the futex that was awoken 522*bf69bad3SAndré Almeida * - <0 - On error 523*bf69bad3SAndré Almeida */ 524*bf69bad3SAndré Almeida int futex_wait_multiple(struct futex_vector *vs, unsigned int count, 525*bf69bad3SAndré Almeida struct hrtimer_sleeper *to) 526*bf69bad3SAndré Almeida { 527*bf69bad3SAndré Almeida int ret, hint = 0; 528*bf69bad3SAndré Almeida 529*bf69bad3SAndré Almeida if (to) 530*bf69bad3SAndré Almeida hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 531*bf69bad3SAndré Almeida 532*bf69bad3SAndré Almeida while (1) { 533*bf69bad3SAndré Almeida ret = futex_wait_multiple_setup(vs, count, &hint); 534*bf69bad3SAndré Almeida if (ret) { 535*bf69bad3SAndré Almeida if (ret > 0) { 536*bf69bad3SAndré Almeida /* A futex was woken during setup */ 537*bf69bad3SAndré Almeida ret = hint; 538*bf69bad3SAndré Almeida } 539*bf69bad3SAndré Almeida return ret; 540*bf69bad3SAndré Almeida } 541*bf69bad3SAndré Almeida 542*bf69bad3SAndré Almeida futex_sleep_multiple(vs, count, to); 543*bf69bad3SAndré Almeida 544*bf69bad3SAndré Almeida __set_current_state(TASK_RUNNING); 545*bf69bad3SAndré Almeida 546*bf69bad3SAndré Almeida ret = unqueue_multiple(vs, count); 547*bf69bad3SAndré Almeida if (ret >= 0) 548*bf69bad3SAndré Almeida return ret; 549*bf69bad3SAndré Almeida 550*bf69bad3SAndré Almeida if (to && !to->task) 551*bf69bad3SAndré Almeida return -ETIMEDOUT; 552*bf69bad3SAndré Almeida else if (signal_pending(current)) 553*bf69bad3SAndré Almeida return -ERESTARTSYS; 554*bf69bad3SAndré Almeida /* 555*bf69bad3SAndré Almeida * The final case is a spurious wakeup, for 556*bf69bad3SAndré Almeida * which just retry. 557*bf69bad3SAndré Almeida */ 558*bf69bad3SAndré Almeida } 559*bf69bad3SAndré Almeida } 560*bf69bad3SAndré Almeida 561*bf69bad3SAndré Almeida /** 562a046f1a0SPeter Zijlstra * futex_wait_setup() - Prepare to wait on a futex 563a046f1a0SPeter Zijlstra * @uaddr: the futex userspace address 564a046f1a0SPeter Zijlstra * @val: the expected value 565a046f1a0SPeter Zijlstra * @flags: futex flags (FLAGS_SHARED, etc.) 566a046f1a0SPeter Zijlstra * @q: the associated futex_q 567a046f1a0SPeter Zijlstra * @hb: storage for hash_bucket pointer to be returned to caller 568a046f1a0SPeter Zijlstra * 569a046f1a0SPeter Zijlstra * Setup the futex_q and locate the hash_bucket. Get the futex value and 570a046f1a0SPeter Zijlstra * compare it with the expected value. Handle atomic faults internally. 571a046f1a0SPeter Zijlstra * Return with the hb lock held on success, and unlocked on failure. 572a046f1a0SPeter Zijlstra * 573a046f1a0SPeter Zijlstra * Return: 574a046f1a0SPeter Zijlstra * - 0 - uaddr contains val and hb has been locked; 575a046f1a0SPeter Zijlstra * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 576a046f1a0SPeter Zijlstra */ 577a046f1a0SPeter Zijlstra int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 578a046f1a0SPeter Zijlstra struct futex_q *q, struct futex_hash_bucket **hb) 579a046f1a0SPeter Zijlstra { 580a046f1a0SPeter Zijlstra u32 uval; 581a046f1a0SPeter Zijlstra int ret; 582a046f1a0SPeter Zijlstra 583a046f1a0SPeter Zijlstra /* 584a046f1a0SPeter Zijlstra * Access the page AFTER the hash-bucket is locked. 585a046f1a0SPeter Zijlstra * Order is important: 586a046f1a0SPeter Zijlstra * 587a046f1a0SPeter Zijlstra * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 588a046f1a0SPeter Zijlstra * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } 589a046f1a0SPeter Zijlstra * 590a046f1a0SPeter Zijlstra * The basic logical guarantee of a futex is that it blocks ONLY 591a046f1a0SPeter Zijlstra * if cond(var) is known to be true at the time of blocking, for 592a046f1a0SPeter Zijlstra * any cond. If we locked the hash-bucket after testing *uaddr, that 593a046f1a0SPeter Zijlstra * would open a race condition where we could block indefinitely with 594a046f1a0SPeter Zijlstra * cond(var) false, which would violate the guarantee. 595a046f1a0SPeter Zijlstra * 596a046f1a0SPeter Zijlstra * On the other hand, we insert q and release the hash-bucket only 597a046f1a0SPeter Zijlstra * after testing *uaddr. This guarantees that futex_wait() will NOT 598a046f1a0SPeter Zijlstra * absorb a wakeup if *uaddr does not match the desired values 599a046f1a0SPeter Zijlstra * while the syscall executes. 600a046f1a0SPeter Zijlstra */ 601a046f1a0SPeter Zijlstra retry: 602a046f1a0SPeter Zijlstra ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); 603a046f1a0SPeter Zijlstra if (unlikely(ret != 0)) 604a046f1a0SPeter Zijlstra return ret; 605a046f1a0SPeter Zijlstra 606a046f1a0SPeter Zijlstra retry_private: 607a046f1a0SPeter Zijlstra *hb = futex_q_lock(q); 608a046f1a0SPeter Zijlstra 609a046f1a0SPeter Zijlstra ret = futex_get_value_locked(&uval, uaddr); 610a046f1a0SPeter Zijlstra 611a046f1a0SPeter Zijlstra if (ret) { 612a046f1a0SPeter Zijlstra futex_q_unlock(*hb); 613a046f1a0SPeter Zijlstra 614a046f1a0SPeter Zijlstra ret = get_user(uval, uaddr); 615a046f1a0SPeter Zijlstra if (ret) 616a046f1a0SPeter Zijlstra return ret; 617a046f1a0SPeter Zijlstra 618a046f1a0SPeter Zijlstra if (!(flags & FLAGS_SHARED)) 619a046f1a0SPeter Zijlstra goto retry_private; 620a046f1a0SPeter Zijlstra 621a046f1a0SPeter Zijlstra goto retry; 622a046f1a0SPeter Zijlstra } 623a046f1a0SPeter Zijlstra 624a046f1a0SPeter Zijlstra if (uval != val) { 625a046f1a0SPeter Zijlstra futex_q_unlock(*hb); 626a046f1a0SPeter Zijlstra ret = -EWOULDBLOCK; 627a046f1a0SPeter Zijlstra } 628a046f1a0SPeter Zijlstra 629a046f1a0SPeter Zijlstra return ret; 630a046f1a0SPeter Zijlstra } 631a046f1a0SPeter Zijlstra 632a046f1a0SPeter Zijlstra int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) 633a046f1a0SPeter Zijlstra { 634a046f1a0SPeter Zijlstra struct hrtimer_sleeper timeout, *to; 635a046f1a0SPeter Zijlstra struct restart_block *restart; 636a046f1a0SPeter Zijlstra struct futex_hash_bucket *hb; 637a046f1a0SPeter Zijlstra struct futex_q q = futex_q_init; 638a046f1a0SPeter Zijlstra int ret; 639a046f1a0SPeter Zijlstra 640a046f1a0SPeter Zijlstra if (!bitset) 641a046f1a0SPeter Zijlstra return -EINVAL; 642a046f1a0SPeter Zijlstra q.bitset = bitset; 643a046f1a0SPeter Zijlstra 644a046f1a0SPeter Zijlstra to = futex_setup_timer(abs_time, &timeout, flags, 645a046f1a0SPeter Zijlstra current->timer_slack_ns); 646a046f1a0SPeter Zijlstra retry: 647a046f1a0SPeter Zijlstra /* 648a046f1a0SPeter Zijlstra * Prepare to wait on uaddr. On success, it holds hb->lock and q 649a046f1a0SPeter Zijlstra * is initialized. 650a046f1a0SPeter Zijlstra */ 651a046f1a0SPeter Zijlstra ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 652a046f1a0SPeter Zijlstra if (ret) 653a046f1a0SPeter Zijlstra goto out; 654a046f1a0SPeter Zijlstra 655a046f1a0SPeter Zijlstra /* futex_queue and wait for wakeup, timeout, or a signal. */ 656a046f1a0SPeter Zijlstra futex_wait_queue(hb, &q, to); 657a046f1a0SPeter Zijlstra 658a046f1a0SPeter Zijlstra /* If we were woken (and unqueued), we succeeded, whatever. */ 659a046f1a0SPeter Zijlstra ret = 0; 660a046f1a0SPeter Zijlstra if (!futex_unqueue(&q)) 661a046f1a0SPeter Zijlstra goto out; 662a046f1a0SPeter Zijlstra ret = -ETIMEDOUT; 663a046f1a0SPeter Zijlstra if (to && !to->task) 664a046f1a0SPeter Zijlstra goto out; 665a046f1a0SPeter Zijlstra 666a046f1a0SPeter Zijlstra /* 667a046f1a0SPeter Zijlstra * We expect signal_pending(current), but we might be the 668a046f1a0SPeter Zijlstra * victim of a spurious wakeup as well. 669a046f1a0SPeter Zijlstra */ 670a046f1a0SPeter Zijlstra if (!signal_pending(current)) 671a046f1a0SPeter Zijlstra goto retry; 672a046f1a0SPeter Zijlstra 673a046f1a0SPeter Zijlstra ret = -ERESTARTSYS; 674a046f1a0SPeter Zijlstra if (!abs_time) 675a046f1a0SPeter Zijlstra goto out; 676a046f1a0SPeter Zijlstra 677a046f1a0SPeter Zijlstra restart = ¤t->restart_block; 678a046f1a0SPeter Zijlstra restart->futex.uaddr = uaddr; 679a046f1a0SPeter Zijlstra restart->futex.val = val; 680a046f1a0SPeter Zijlstra restart->futex.time = *abs_time; 681a046f1a0SPeter Zijlstra restart->futex.bitset = bitset; 682a046f1a0SPeter Zijlstra restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; 683a046f1a0SPeter Zijlstra 684a046f1a0SPeter Zijlstra ret = set_restart_fn(restart, futex_wait_restart); 685a046f1a0SPeter Zijlstra 686a046f1a0SPeter Zijlstra out: 687a046f1a0SPeter Zijlstra if (to) { 688a046f1a0SPeter Zijlstra hrtimer_cancel(&to->timer); 689a046f1a0SPeter Zijlstra destroy_hrtimer_on_stack(&to->timer); 690a046f1a0SPeter Zijlstra } 691a046f1a0SPeter Zijlstra return ret; 692a046f1a0SPeter Zijlstra } 693a046f1a0SPeter Zijlstra 694a046f1a0SPeter Zijlstra static long futex_wait_restart(struct restart_block *restart) 695a046f1a0SPeter Zijlstra { 696a046f1a0SPeter Zijlstra u32 __user *uaddr = restart->futex.uaddr; 697a046f1a0SPeter Zijlstra ktime_t t, *tp = NULL; 698a046f1a0SPeter Zijlstra 699a046f1a0SPeter Zijlstra if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { 700a046f1a0SPeter Zijlstra t = restart->futex.time; 701a046f1a0SPeter Zijlstra tp = &t; 702a046f1a0SPeter Zijlstra } 703a046f1a0SPeter Zijlstra restart->fn = do_no_restart_syscall; 704a046f1a0SPeter Zijlstra 705a046f1a0SPeter Zijlstra return (long)futex_wait(uaddr, restart->futex.flags, 706a046f1a0SPeter Zijlstra restart->futex.val, tp, restart->futex.bitset); 707a046f1a0SPeter Zijlstra } 708a046f1a0SPeter Zijlstra 709