177e52ae3SPeter Zijlstra // SPDX-License-Identifier: GPL-2.0-or-later 277e52ae3SPeter Zijlstra /* 377e52ae3SPeter Zijlstra * Fast Userspace Mutexes (which I call "Futexes!"). 477e52ae3SPeter Zijlstra * (C) Rusty Russell, IBM 2002 577e52ae3SPeter Zijlstra * 677e52ae3SPeter Zijlstra * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 777e52ae3SPeter Zijlstra * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 877e52ae3SPeter Zijlstra * 977e52ae3SPeter Zijlstra * Removed page pinning, fix privately mapped COW pages and other cleanups 1077e52ae3SPeter Zijlstra * (C) Copyright 2003, 2004 Jamie Lokier 1177e52ae3SPeter Zijlstra * 1277e52ae3SPeter Zijlstra * Robust futex support started by Ingo Molnar 1377e52ae3SPeter Zijlstra * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 1477e52ae3SPeter Zijlstra * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 1577e52ae3SPeter Zijlstra * 1677e52ae3SPeter Zijlstra * PI-futex support started by Ingo Molnar and Thomas Gleixner 1777e52ae3SPeter Zijlstra * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 1877e52ae3SPeter Zijlstra * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 1977e52ae3SPeter Zijlstra * 2077e52ae3SPeter Zijlstra * PRIVATE futexes by Eric Dumazet 2177e52ae3SPeter Zijlstra * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 2277e52ae3SPeter Zijlstra * 2377e52ae3SPeter Zijlstra * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> 2477e52ae3SPeter Zijlstra * Copyright (C) IBM Corporation, 2009 2577e52ae3SPeter Zijlstra * Thanks to Thomas Gleixner for conceptual design and careful reviews. 2677e52ae3SPeter Zijlstra * 2777e52ae3SPeter Zijlstra * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 2877e52ae3SPeter Zijlstra * enough at me, Linus for the original (flawed) idea, Matthew 2977e52ae3SPeter Zijlstra * Kirkwood for proof-of-concept implementation. 3077e52ae3SPeter Zijlstra * 3177e52ae3SPeter Zijlstra * "The futexes are also cursed." 3277e52ae3SPeter Zijlstra * "But they come in a choice of three flavours!" 3377e52ae3SPeter Zijlstra */ 3477e52ae3SPeter Zijlstra #include <linux/compat.h> 3577e52ae3SPeter Zijlstra #include <linux/jhash.h> 3677e52ae3SPeter Zijlstra #include <linux/pagemap.h> 3777e52ae3SPeter Zijlstra #include <linux/memblock.h> 3877e52ae3SPeter Zijlstra #include <linux/fault-inject.h> 39af8cc960SPeter Zijlstra #include <linux/slab.h> 4077e52ae3SPeter Zijlstra 41af8cc960SPeter Zijlstra #include "futex.h" 4277e52ae3SPeter Zijlstra #include "../locking/rtmutex_common.h" 4377e52ae3SPeter Zijlstra 44af8cc960SPeter Zijlstra #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 45af8cc960SPeter Zijlstra int __read_mostly futex_cmpxchg_enabled; 4677e52ae3SPeter Zijlstra #endif 4777e52ae3SPeter Zijlstra 4877e52ae3SPeter Zijlstra 4977e52ae3SPeter Zijlstra /* 5077e52ae3SPeter Zijlstra * The base of the bucket array and its size are always used together 51eee5a7bcSPeter Zijlstra * (after initialization only in futex_hash()), so ensure that they 5277e52ae3SPeter Zijlstra * reside in the same cacheline. 5377e52ae3SPeter Zijlstra */ 5477e52ae3SPeter Zijlstra static struct { 5577e52ae3SPeter Zijlstra struct futex_hash_bucket *queues; 5677e52ae3SPeter Zijlstra unsigned long hashsize; 5777e52ae3SPeter Zijlstra } __futex_data __read_mostly __aligned(2*sizeof(long)); 5877e52ae3SPeter Zijlstra #define futex_queues (__futex_data.queues) 5977e52ae3SPeter Zijlstra #define futex_hashsize (__futex_data.hashsize) 6077e52ae3SPeter Zijlstra 6177e52ae3SPeter Zijlstra 6277e52ae3SPeter Zijlstra /* 6377e52ae3SPeter Zijlstra * Fault injections for futexes. 6477e52ae3SPeter Zijlstra */ 6577e52ae3SPeter Zijlstra #ifdef CONFIG_FAIL_FUTEX 6677e52ae3SPeter Zijlstra 6777e52ae3SPeter Zijlstra static struct { 6877e52ae3SPeter Zijlstra struct fault_attr attr; 6977e52ae3SPeter Zijlstra 7077e52ae3SPeter Zijlstra bool ignore_private; 7177e52ae3SPeter Zijlstra } fail_futex = { 7277e52ae3SPeter Zijlstra .attr = FAULT_ATTR_INITIALIZER, 7377e52ae3SPeter Zijlstra .ignore_private = false, 7477e52ae3SPeter Zijlstra }; 7577e52ae3SPeter Zijlstra 7677e52ae3SPeter Zijlstra static int __init setup_fail_futex(char *str) 7777e52ae3SPeter Zijlstra { 7877e52ae3SPeter Zijlstra return setup_fault_attr(&fail_futex.attr, str); 7977e52ae3SPeter Zijlstra } 8077e52ae3SPeter Zijlstra __setup("fail_futex=", setup_fail_futex); 8177e52ae3SPeter Zijlstra 82af8cc960SPeter Zijlstra bool should_fail_futex(bool fshared) 8377e52ae3SPeter Zijlstra { 8477e52ae3SPeter Zijlstra if (fail_futex.ignore_private && !fshared) 8577e52ae3SPeter Zijlstra return false; 8677e52ae3SPeter Zijlstra 8777e52ae3SPeter Zijlstra return should_fail(&fail_futex.attr, 1); 8877e52ae3SPeter Zijlstra } 8977e52ae3SPeter Zijlstra 9077e52ae3SPeter Zijlstra #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 9177e52ae3SPeter Zijlstra 9277e52ae3SPeter Zijlstra static int __init fail_futex_debugfs(void) 9377e52ae3SPeter Zijlstra { 9477e52ae3SPeter Zijlstra umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 9577e52ae3SPeter Zijlstra struct dentry *dir; 9677e52ae3SPeter Zijlstra 9777e52ae3SPeter Zijlstra dir = fault_create_debugfs_attr("fail_futex", NULL, 9877e52ae3SPeter Zijlstra &fail_futex.attr); 9977e52ae3SPeter Zijlstra if (IS_ERR(dir)) 10077e52ae3SPeter Zijlstra return PTR_ERR(dir); 10177e52ae3SPeter Zijlstra 10277e52ae3SPeter Zijlstra debugfs_create_bool("ignore-private", mode, dir, 10377e52ae3SPeter Zijlstra &fail_futex.ignore_private); 10477e52ae3SPeter Zijlstra return 0; 10577e52ae3SPeter Zijlstra } 10677e52ae3SPeter Zijlstra 10777e52ae3SPeter Zijlstra late_initcall(fail_futex_debugfs); 10877e52ae3SPeter Zijlstra 10977e52ae3SPeter Zijlstra #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 11077e52ae3SPeter Zijlstra 11177e52ae3SPeter Zijlstra #endif /* CONFIG_FAIL_FUTEX */ 11277e52ae3SPeter Zijlstra 11377e52ae3SPeter Zijlstra /** 114eee5a7bcSPeter Zijlstra * futex_hash - Return the hash bucket in the global hash 11577e52ae3SPeter Zijlstra * @key: Pointer to the futex key for which the hash is calculated 11677e52ae3SPeter Zijlstra * 11777e52ae3SPeter Zijlstra * We hash on the keys returned from get_futex_key (see below) and return the 11877e52ae3SPeter Zijlstra * corresponding hash bucket in the global hash. 11977e52ae3SPeter Zijlstra */ 12085dc28faSPeter Zijlstra struct futex_hash_bucket *futex_hash(union futex_key *key) 12177e52ae3SPeter Zijlstra { 12277e52ae3SPeter Zijlstra u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, 12377e52ae3SPeter Zijlstra key->both.offset); 12477e52ae3SPeter Zijlstra 12577e52ae3SPeter Zijlstra return &futex_queues[hash & (futex_hashsize - 1)]; 12677e52ae3SPeter Zijlstra } 12777e52ae3SPeter Zijlstra 12877e52ae3SPeter Zijlstra 12977e52ae3SPeter Zijlstra /** 13077e52ae3SPeter Zijlstra * futex_setup_timer - set up the sleeping hrtimer. 13177e52ae3SPeter Zijlstra * @time: ptr to the given timeout value 13277e52ae3SPeter Zijlstra * @timeout: the hrtimer_sleeper structure to be set up 13377e52ae3SPeter Zijlstra * @flags: futex flags 13477e52ae3SPeter Zijlstra * @range_ns: optional range in ns 13577e52ae3SPeter Zijlstra * 13677e52ae3SPeter Zijlstra * Return: Initialized hrtimer_sleeper structure or NULL if no timeout 13777e52ae3SPeter Zijlstra * value given 13877e52ae3SPeter Zijlstra */ 13985dc28faSPeter Zijlstra struct hrtimer_sleeper * 14077e52ae3SPeter Zijlstra futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, 14177e52ae3SPeter Zijlstra int flags, u64 range_ns) 14277e52ae3SPeter Zijlstra { 14377e52ae3SPeter Zijlstra if (!time) 14477e52ae3SPeter Zijlstra return NULL; 14577e52ae3SPeter Zijlstra 14677e52ae3SPeter Zijlstra hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? 14777e52ae3SPeter Zijlstra CLOCK_REALTIME : CLOCK_MONOTONIC, 14877e52ae3SPeter Zijlstra HRTIMER_MODE_ABS); 14977e52ae3SPeter Zijlstra /* 15077e52ae3SPeter Zijlstra * If range_ns is 0, calling hrtimer_set_expires_range_ns() is 15177e52ae3SPeter Zijlstra * effectively the same as calling hrtimer_set_expires(). 15277e52ae3SPeter Zijlstra */ 15377e52ae3SPeter Zijlstra hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); 15477e52ae3SPeter Zijlstra 15577e52ae3SPeter Zijlstra return timeout; 15677e52ae3SPeter Zijlstra } 15777e52ae3SPeter Zijlstra 15877e52ae3SPeter Zijlstra /* 15977e52ae3SPeter Zijlstra * Generate a machine wide unique identifier for this inode. 16077e52ae3SPeter Zijlstra * 16177e52ae3SPeter Zijlstra * This relies on u64 not wrapping in the life-time of the machine; which with 16277e52ae3SPeter Zijlstra * 1ns resolution means almost 585 years. 16377e52ae3SPeter Zijlstra * 16477e52ae3SPeter Zijlstra * This further relies on the fact that a well formed program will not unmap 16577e52ae3SPeter Zijlstra * the file while it has a (shared) futex waiting on it. This mapping will have 16677e52ae3SPeter Zijlstra * a file reference which pins the mount and inode. 16777e52ae3SPeter Zijlstra * 16877e52ae3SPeter Zijlstra * If for some reason an inode gets evicted and read back in again, it will get 16977e52ae3SPeter Zijlstra * a new sequence number and will _NOT_ match, even though it is the exact same 17077e52ae3SPeter Zijlstra * file. 17177e52ae3SPeter Zijlstra * 172f56a76fdSPeter Zijlstra * It is important that futex_match() will never have a false-positive, esp. 17377e52ae3SPeter Zijlstra * for PI futexes that can mess up the state. The above argues that false-negatives 17477e52ae3SPeter Zijlstra * are only possible for malformed programs. 17577e52ae3SPeter Zijlstra */ 17677e52ae3SPeter Zijlstra static u64 get_inode_sequence_number(struct inode *inode) 17777e52ae3SPeter Zijlstra { 17877e52ae3SPeter Zijlstra static atomic64_t i_seq; 17977e52ae3SPeter Zijlstra u64 old; 18077e52ae3SPeter Zijlstra 18177e52ae3SPeter Zijlstra /* Does the inode already have a sequence number? */ 18277e52ae3SPeter Zijlstra old = atomic64_read(&inode->i_sequence); 18377e52ae3SPeter Zijlstra if (likely(old)) 18477e52ae3SPeter Zijlstra return old; 18577e52ae3SPeter Zijlstra 18677e52ae3SPeter Zijlstra for (;;) { 18777e52ae3SPeter Zijlstra u64 new = atomic64_add_return(1, &i_seq); 18877e52ae3SPeter Zijlstra if (WARN_ON_ONCE(!new)) 18977e52ae3SPeter Zijlstra continue; 19077e52ae3SPeter Zijlstra 19177e52ae3SPeter Zijlstra old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); 19277e52ae3SPeter Zijlstra if (old) 19377e52ae3SPeter Zijlstra return old; 19477e52ae3SPeter Zijlstra return new; 19577e52ae3SPeter Zijlstra } 19677e52ae3SPeter Zijlstra } 19777e52ae3SPeter Zijlstra 19877e52ae3SPeter Zijlstra /** 19977e52ae3SPeter Zijlstra * get_futex_key() - Get parameters which are the keys for a futex 20077e52ae3SPeter Zijlstra * @uaddr: virtual address of the futex 20177e52ae3SPeter Zijlstra * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED 20277e52ae3SPeter Zijlstra * @key: address where result is stored. 20377e52ae3SPeter Zijlstra * @rw: mapping needs to be read/write (values: FUTEX_READ, 20477e52ae3SPeter Zijlstra * FUTEX_WRITE) 20577e52ae3SPeter Zijlstra * 20677e52ae3SPeter Zijlstra * Return: a negative error code or 0 20777e52ae3SPeter Zijlstra * 20877e52ae3SPeter Zijlstra * The key words are stored in @key on success. 20977e52ae3SPeter Zijlstra * 21077e52ae3SPeter Zijlstra * For shared mappings (when @fshared), the key is: 21177e52ae3SPeter Zijlstra * 21277e52ae3SPeter Zijlstra * ( inode->i_sequence, page->index, offset_within_page ) 21377e52ae3SPeter Zijlstra * 21477e52ae3SPeter Zijlstra * [ also see get_inode_sequence_number() ] 21577e52ae3SPeter Zijlstra * 21677e52ae3SPeter Zijlstra * For private mappings (or when !@fshared), the key is: 21777e52ae3SPeter Zijlstra * 21877e52ae3SPeter Zijlstra * ( current->mm, address, 0 ) 21977e52ae3SPeter Zijlstra * 22077e52ae3SPeter Zijlstra * This allows (cross process, where applicable) identification of the futex 22177e52ae3SPeter Zijlstra * without keeping the page pinned for the duration of the FUTEX_WAIT. 22277e52ae3SPeter Zijlstra * 22377e52ae3SPeter Zijlstra * lock_page() might sleep, the caller should not hold a spinlock. 22477e52ae3SPeter Zijlstra */ 22585dc28faSPeter Zijlstra int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, 22677e52ae3SPeter Zijlstra enum futex_access rw) 22777e52ae3SPeter Zijlstra { 22877e52ae3SPeter Zijlstra unsigned long address = (unsigned long)uaddr; 22977e52ae3SPeter Zijlstra struct mm_struct *mm = current->mm; 23077e52ae3SPeter Zijlstra struct page *page, *tail; 23177e52ae3SPeter Zijlstra struct address_space *mapping; 23277e52ae3SPeter Zijlstra int err, ro = 0; 23377e52ae3SPeter Zijlstra 23477e52ae3SPeter Zijlstra /* 23577e52ae3SPeter Zijlstra * The futex address must be "naturally" aligned. 23677e52ae3SPeter Zijlstra */ 23777e52ae3SPeter Zijlstra key->both.offset = address % PAGE_SIZE; 23877e52ae3SPeter Zijlstra if (unlikely((address % sizeof(u32)) != 0)) 23977e52ae3SPeter Zijlstra return -EINVAL; 24077e52ae3SPeter Zijlstra address -= key->both.offset; 24177e52ae3SPeter Zijlstra 24277e52ae3SPeter Zijlstra if (unlikely(!access_ok(uaddr, sizeof(u32)))) 24377e52ae3SPeter Zijlstra return -EFAULT; 24477e52ae3SPeter Zijlstra 24577e52ae3SPeter Zijlstra if (unlikely(should_fail_futex(fshared))) 24677e52ae3SPeter Zijlstra return -EFAULT; 24777e52ae3SPeter Zijlstra 24877e52ae3SPeter Zijlstra /* 24977e52ae3SPeter Zijlstra * PROCESS_PRIVATE futexes are fast. 25077e52ae3SPeter Zijlstra * As the mm cannot disappear under us and the 'key' only needs 25177e52ae3SPeter Zijlstra * virtual address, we dont even have to find the underlying vma. 25277e52ae3SPeter Zijlstra * Note : We do have to check 'uaddr' is a valid user address, 25377e52ae3SPeter Zijlstra * but access_ok() should be faster than find_vma() 25477e52ae3SPeter Zijlstra */ 25577e52ae3SPeter Zijlstra if (!fshared) { 25677e52ae3SPeter Zijlstra key->private.mm = mm; 25777e52ae3SPeter Zijlstra key->private.address = address; 25877e52ae3SPeter Zijlstra return 0; 25977e52ae3SPeter Zijlstra } 26077e52ae3SPeter Zijlstra 26177e52ae3SPeter Zijlstra again: 26277e52ae3SPeter Zijlstra /* Ignore any VERIFY_READ mapping (futex common case) */ 26377e52ae3SPeter Zijlstra if (unlikely(should_fail_futex(true))) 26477e52ae3SPeter Zijlstra return -EFAULT; 26577e52ae3SPeter Zijlstra 26677e52ae3SPeter Zijlstra err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); 26777e52ae3SPeter Zijlstra /* 26877e52ae3SPeter Zijlstra * If write access is not required (eg. FUTEX_WAIT), try 26977e52ae3SPeter Zijlstra * and get read-only access. 27077e52ae3SPeter Zijlstra */ 27177e52ae3SPeter Zijlstra if (err == -EFAULT && rw == FUTEX_READ) { 27277e52ae3SPeter Zijlstra err = get_user_pages_fast(address, 1, 0, &page); 27377e52ae3SPeter Zijlstra ro = 1; 27477e52ae3SPeter Zijlstra } 27577e52ae3SPeter Zijlstra if (err < 0) 27677e52ae3SPeter Zijlstra return err; 27777e52ae3SPeter Zijlstra else 27877e52ae3SPeter Zijlstra err = 0; 27977e52ae3SPeter Zijlstra 28077e52ae3SPeter Zijlstra /* 28177e52ae3SPeter Zijlstra * The treatment of mapping from this point on is critical. The page 28277e52ae3SPeter Zijlstra * lock protects many things but in this context the page lock 28377e52ae3SPeter Zijlstra * stabilizes mapping, prevents inode freeing in the shared 28477e52ae3SPeter Zijlstra * file-backed region case and guards against movement to swap cache. 28577e52ae3SPeter Zijlstra * 28677e52ae3SPeter Zijlstra * Strictly speaking the page lock is not needed in all cases being 28777e52ae3SPeter Zijlstra * considered here and page lock forces unnecessarily serialization 28877e52ae3SPeter Zijlstra * From this point on, mapping will be re-verified if necessary and 28977e52ae3SPeter Zijlstra * page lock will be acquired only if it is unavoidable 29077e52ae3SPeter Zijlstra * 29177e52ae3SPeter Zijlstra * Mapping checks require the head page for any compound page so the 29277e52ae3SPeter Zijlstra * head page and mapping is looked up now. For anonymous pages, it 29377e52ae3SPeter Zijlstra * does not matter if the page splits in the future as the key is 29477e52ae3SPeter Zijlstra * based on the address. For filesystem-backed pages, the tail is 29577e52ae3SPeter Zijlstra * required as the index of the page determines the key. For 29677e52ae3SPeter Zijlstra * base pages, there is no tail page and tail == page. 29777e52ae3SPeter Zijlstra */ 29877e52ae3SPeter Zijlstra tail = page; 29977e52ae3SPeter Zijlstra page = compound_head(page); 30077e52ae3SPeter Zijlstra mapping = READ_ONCE(page->mapping); 30177e52ae3SPeter Zijlstra 30277e52ae3SPeter Zijlstra /* 30377e52ae3SPeter Zijlstra * If page->mapping is NULL, then it cannot be a PageAnon 30477e52ae3SPeter Zijlstra * page; but it might be the ZERO_PAGE or in the gate area or 30577e52ae3SPeter Zijlstra * in a special mapping (all cases which we are happy to fail); 30677e52ae3SPeter Zijlstra * or it may have been a good file page when get_user_pages_fast 30777e52ae3SPeter Zijlstra * found it, but truncated or holepunched or subjected to 30877e52ae3SPeter Zijlstra * invalidate_complete_page2 before we got the page lock (also 30977e52ae3SPeter Zijlstra * cases which we are happy to fail). And we hold a reference, 31077e52ae3SPeter Zijlstra * so refcount care in invalidate_complete_page's remove_mapping 31177e52ae3SPeter Zijlstra * prevents drop_caches from setting mapping to NULL beneath us. 31277e52ae3SPeter Zijlstra * 31377e52ae3SPeter Zijlstra * The case we do have to guard against is when memory pressure made 31477e52ae3SPeter Zijlstra * shmem_writepage move it from filecache to swapcache beneath us: 31577e52ae3SPeter Zijlstra * an unlikely race, but we do need to retry for page->mapping. 31677e52ae3SPeter Zijlstra */ 31777e52ae3SPeter Zijlstra if (unlikely(!mapping)) { 31877e52ae3SPeter Zijlstra int shmem_swizzled; 31977e52ae3SPeter Zijlstra 32077e52ae3SPeter Zijlstra /* 32177e52ae3SPeter Zijlstra * Page lock is required to identify which special case above 32277e52ae3SPeter Zijlstra * applies. If this is really a shmem page then the page lock 32377e52ae3SPeter Zijlstra * will prevent unexpected transitions. 32477e52ae3SPeter Zijlstra */ 32577e52ae3SPeter Zijlstra lock_page(page); 32677e52ae3SPeter Zijlstra shmem_swizzled = PageSwapCache(page) || page->mapping; 32777e52ae3SPeter Zijlstra unlock_page(page); 32877e52ae3SPeter Zijlstra put_page(page); 32977e52ae3SPeter Zijlstra 33077e52ae3SPeter Zijlstra if (shmem_swizzled) 33177e52ae3SPeter Zijlstra goto again; 33277e52ae3SPeter Zijlstra 33377e52ae3SPeter Zijlstra return -EFAULT; 33477e52ae3SPeter Zijlstra } 33577e52ae3SPeter Zijlstra 33677e52ae3SPeter Zijlstra /* 33777e52ae3SPeter Zijlstra * Private mappings are handled in a simple way. 33877e52ae3SPeter Zijlstra * 33977e52ae3SPeter Zijlstra * If the futex key is stored on an anonymous page, then the associated 34077e52ae3SPeter Zijlstra * object is the mm which is implicitly pinned by the calling process. 34177e52ae3SPeter Zijlstra * 34277e52ae3SPeter Zijlstra * NOTE: When userspace waits on a MAP_SHARED mapping, even if 34377e52ae3SPeter Zijlstra * it's a read-only handle, it's expected that futexes attach to 34477e52ae3SPeter Zijlstra * the object not the particular process. 34577e52ae3SPeter Zijlstra */ 34677e52ae3SPeter Zijlstra if (PageAnon(page)) { 34777e52ae3SPeter Zijlstra /* 34877e52ae3SPeter Zijlstra * A RO anonymous page will never change and thus doesn't make 34977e52ae3SPeter Zijlstra * sense for futex operations. 35077e52ae3SPeter Zijlstra */ 35177e52ae3SPeter Zijlstra if (unlikely(should_fail_futex(true)) || ro) { 35277e52ae3SPeter Zijlstra err = -EFAULT; 35377e52ae3SPeter Zijlstra goto out; 35477e52ae3SPeter Zijlstra } 35577e52ae3SPeter Zijlstra 35677e52ae3SPeter Zijlstra key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 35777e52ae3SPeter Zijlstra key->private.mm = mm; 35877e52ae3SPeter Zijlstra key->private.address = address; 35977e52ae3SPeter Zijlstra 36077e52ae3SPeter Zijlstra } else { 36177e52ae3SPeter Zijlstra struct inode *inode; 36277e52ae3SPeter Zijlstra 36377e52ae3SPeter Zijlstra /* 36477e52ae3SPeter Zijlstra * The associated futex object in this case is the inode and 36577e52ae3SPeter Zijlstra * the page->mapping must be traversed. Ordinarily this should 36677e52ae3SPeter Zijlstra * be stabilised under page lock but it's not strictly 36777e52ae3SPeter Zijlstra * necessary in this case as we just want to pin the inode, not 36877e52ae3SPeter Zijlstra * update the radix tree or anything like that. 36977e52ae3SPeter Zijlstra * 37077e52ae3SPeter Zijlstra * The RCU read lock is taken as the inode is finally freed 37177e52ae3SPeter Zijlstra * under RCU. If the mapping still matches expectations then the 37277e52ae3SPeter Zijlstra * mapping->host can be safely accessed as being a valid inode. 37377e52ae3SPeter Zijlstra */ 37477e52ae3SPeter Zijlstra rcu_read_lock(); 37577e52ae3SPeter Zijlstra 37677e52ae3SPeter Zijlstra if (READ_ONCE(page->mapping) != mapping) { 37777e52ae3SPeter Zijlstra rcu_read_unlock(); 37877e52ae3SPeter Zijlstra put_page(page); 37977e52ae3SPeter Zijlstra 38077e52ae3SPeter Zijlstra goto again; 38177e52ae3SPeter Zijlstra } 38277e52ae3SPeter Zijlstra 38377e52ae3SPeter Zijlstra inode = READ_ONCE(mapping->host); 38477e52ae3SPeter Zijlstra if (!inode) { 38577e52ae3SPeter Zijlstra rcu_read_unlock(); 38677e52ae3SPeter Zijlstra put_page(page); 38777e52ae3SPeter Zijlstra 38877e52ae3SPeter Zijlstra goto again; 38977e52ae3SPeter Zijlstra } 39077e52ae3SPeter Zijlstra 39177e52ae3SPeter Zijlstra key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 39277e52ae3SPeter Zijlstra key->shared.i_seq = get_inode_sequence_number(inode); 39377e52ae3SPeter Zijlstra key->shared.pgoff = page_to_pgoff(tail); 39477e52ae3SPeter Zijlstra rcu_read_unlock(); 39577e52ae3SPeter Zijlstra } 39677e52ae3SPeter Zijlstra 39777e52ae3SPeter Zijlstra out: 39877e52ae3SPeter Zijlstra put_page(page); 39977e52ae3SPeter Zijlstra return err; 40077e52ae3SPeter Zijlstra } 40177e52ae3SPeter Zijlstra 40277e52ae3SPeter Zijlstra /** 40377e52ae3SPeter Zijlstra * fault_in_user_writeable() - Fault in user address and verify RW access 40477e52ae3SPeter Zijlstra * @uaddr: pointer to faulting user space address 40577e52ae3SPeter Zijlstra * 40677e52ae3SPeter Zijlstra * Slow path to fixup the fault we just took in the atomic write 40777e52ae3SPeter Zijlstra * access to @uaddr. 40877e52ae3SPeter Zijlstra * 40977e52ae3SPeter Zijlstra * We have no generic implementation of a non-destructive write to the 41077e52ae3SPeter Zijlstra * user address. We know that we faulted in the atomic pagefault 41177e52ae3SPeter Zijlstra * disabled section so we can as well avoid the #PF overhead by 41277e52ae3SPeter Zijlstra * calling get_user_pages() right away. 41377e52ae3SPeter Zijlstra */ 41485dc28faSPeter Zijlstra int fault_in_user_writeable(u32 __user *uaddr) 41577e52ae3SPeter Zijlstra { 41677e52ae3SPeter Zijlstra struct mm_struct *mm = current->mm; 41777e52ae3SPeter Zijlstra int ret; 41877e52ae3SPeter Zijlstra 41977e52ae3SPeter Zijlstra mmap_read_lock(mm); 42077e52ae3SPeter Zijlstra ret = fixup_user_fault(mm, (unsigned long)uaddr, 42177e52ae3SPeter Zijlstra FAULT_FLAG_WRITE, NULL); 42277e52ae3SPeter Zijlstra mmap_read_unlock(mm); 42377e52ae3SPeter Zijlstra 42477e52ae3SPeter Zijlstra return ret < 0 ? ret : 0; 42577e52ae3SPeter Zijlstra } 42677e52ae3SPeter Zijlstra 42777e52ae3SPeter Zijlstra /** 42877e52ae3SPeter Zijlstra * futex_top_waiter() - Return the highest priority waiter on a futex 42977e52ae3SPeter Zijlstra * @hb: the hash bucket the futex_q's reside in 43077e52ae3SPeter Zijlstra * @key: the futex key (to distinguish it from other futex futex_q's) 43177e52ae3SPeter Zijlstra * 43277e52ae3SPeter Zijlstra * Must be called with the hb lock held. 43377e52ae3SPeter Zijlstra */ 43485dc28faSPeter Zijlstra struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) 43577e52ae3SPeter Zijlstra { 43677e52ae3SPeter Zijlstra struct futex_q *this; 43777e52ae3SPeter Zijlstra 43877e52ae3SPeter Zijlstra plist_for_each_entry(this, &hb->chain, list) { 439f56a76fdSPeter Zijlstra if (futex_match(&this->key, key)) 44077e52ae3SPeter Zijlstra return this; 44177e52ae3SPeter Zijlstra } 44277e52ae3SPeter Zijlstra return NULL; 44377e52ae3SPeter Zijlstra } 44477e52ae3SPeter Zijlstra 44585dc28faSPeter Zijlstra int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) 44677e52ae3SPeter Zijlstra { 44777e52ae3SPeter Zijlstra int ret; 44877e52ae3SPeter Zijlstra 44977e52ae3SPeter Zijlstra pagefault_disable(); 45077e52ae3SPeter Zijlstra ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); 45177e52ae3SPeter Zijlstra pagefault_enable(); 45277e52ae3SPeter Zijlstra 45377e52ae3SPeter Zijlstra return ret; 45477e52ae3SPeter Zijlstra } 45577e52ae3SPeter Zijlstra 45685dc28faSPeter Zijlstra int futex_get_value_locked(u32 *dest, u32 __user *from) 45777e52ae3SPeter Zijlstra { 45877e52ae3SPeter Zijlstra int ret; 45977e52ae3SPeter Zijlstra 46077e52ae3SPeter Zijlstra pagefault_disable(); 46177e52ae3SPeter Zijlstra ret = __get_user(*dest, from); 46277e52ae3SPeter Zijlstra pagefault_enable(); 46377e52ae3SPeter Zijlstra 46477e52ae3SPeter Zijlstra return ret ? -EFAULT : 0; 46577e52ae3SPeter Zijlstra } 46677e52ae3SPeter Zijlstra 46777e52ae3SPeter Zijlstra /** 46877e52ae3SPeter Zijlstra * wait_for_owner_exiting - Block until the owner has exited 46977e52ae3SPeter Zijlstra * @ret: owner's current futex lock status 47077e52ae3SPeter Zijlstra * @exiting: Pointer to the exiting task 47177e52ae3SPeter Zijlstra * 47277e52ae3SPeter Zijlstra * Caller must hold a refcount on @exiting. 47377e52ae3SPeter Zijlstra */ 47485dc28faSPeter Zijlstra void wait_for_owner_exiting(int ret, struct task_struct *exiting) 47577e52ae3SPeter Zijlstra { 47677e52ae3SPeter Zijlstra if (ret != -EBUSY) { 47777e52ae3SPeter Zijlstra WARN_ON_ONCE(exiting); 47877e52ae3SPeter Zijlstra return; 47977e52ae3SPeter Zijlstra } 48077e52ae3SPeter Zijlstra 48177e52ae3SPeter Zijlstra if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) 48277e52ae3SPeter Zijlstra return; 48377e52ae3SPeter Zijlstra 48477e52ae3SPeter Zijlstra mutex_lock(&exiting->futex_exit_mutex); 48577e52ae3SPeter Zijlstra /* 48677e52ae3SPeter Zijlstra * No point in doing state checking here. If the waiter got here 48777e52ae3SPeter Zijlstra * while the task was in exec()->exec_futex_release() then it can 48877e52ae3SPeter Zijlstra * have any FUTEX_STATE_* value when the waiter has acquired the 48977e52ae3SPeter Zijlstra * mutex. OK, if running, EXITING or DEAD if it reached exit() 49077e52ae3SPeter Zijlstra * already. Highly unlikely and not a problem. Just one more round 49177e52ae3SPeter Zijlstra * through the futex maze. 49277e52ae3SPeter Zijlstra */ 49377e52ae3SPeter Zijlstra mutex_unlock(&exiting->futex_exit_mutex); 49477e52ae3SPeter Zijlstra 49577e52ae3SPeter Zijlstra put_task_struct(exiting); 49677e52ae3SPeter Zijlstra } 49777e52ae3SPeter Zijlstra 49877e52ae3SPeter Zijlstra /** 499af92dceaSPeter Zijlstra * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket 50077e52ae3SPeter Zijlstra * @q: The futex_q to unqueue 50177e52ae3SPeter Zijlstra * 50277e52ae3SPeter Zijlstra * The q->lock_ptr must not be NULL and must be held by the caller. 50377e52ae3SPeter Zijlstra */ 504e5c68284SPeter Zijlstra void __futex_unqueue(struct futex_q *q) 50577e52ae3SPeter Zijlstra { 50677e52ae3SPeter Zijlstra struct futex_hash_bucket *hb; 50777e52ae3SPeter Zijlstra 50877e52ae3SPeter Zijlstra if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) 50977e52ae3SPeter Zijlstra return; 51077e52ae3SPeter Zijlstra lockdep_assert_held(q->lock_ptr); 51177e52ae3SPeter Zijlstra 51277e52ae3SPeter Zijlstra hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 51377e52ae3SPeter Zijlstra plist_del(&q->list, &hb->chain); 514832c0542SPeter Zijlstra futex_hb_waiters_dec(hb); 51577e52ae3SPeter Zijlstra } 51677e52ae3SPeter Zijlstra 51777e52ae3SPeter Zijlstra /* The key must be already stored in q->key. */ 51885dc28faSPeter Zijlstra struct futex_hash_bucket *futex_q_lock(struct futex_q *q) 51977e52ae3SPeter Zijlstra __acquires(&hb->lock) 52077e52ae3SPeter Zijlstra { 52177e52ae3SPeter Zijlstra struct futex_hash_bucket *hb; 52277e52ae3SPeter Zijlstra 523eee5a7bcSPeter Zijlstra hb = futex_hash(&q->key); 52477e52ae3SPeter Zijlstra 52577e52ae3SPeter Zijlstra /* 52677e52ae3SPeter Zijlstra * Increment the counter before taking the lock so that 52777e52ae3SPeter Zijlstra * a potential waker won't miss a to-be-slept task that is 528e7ba9c8fSPeter Zijlstra * waiting for the spinlock. This is safe as all futex_q_lock() 529bce760d3SPeter Zijlstra * users end up calling futex_queue(). Similarly, for housekeeping, 530e7ba9c8fSPeter Zijlstra * decrement the counter at futex_q_unlock() when some error has 53177e52ae3SPeter Zijlstra * occurred and we don't end up adding the task to the list. 53277e52ae3SPeter Zijlstra */ 533832c0542SPeter Zijlstra futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ 53477e52ae3SPeter Zijlstra 53577e52ae3SPeter Zijlstra q->lock_ptr = &hb->lock; 53677e52ae3SPeter Zijlstra 53777e52ae3SPeter Zijlstra spin_lock(&hb->lock); 53877e52ae3SPeter Zijlstra return hb; 53977e52ae3SPeter Zijlstra } 54077e52ae3SPeter Zijlstra 54185dc28faSPeter Zijlstra void futex_q_unlock(struct futex_hash_bucket *hb) 54277e52ae3SPeter Zijlstra __releases(&hb->lock) 54377e52ae3SPeter Zijlstra { 54477e52ae3SPeter Zijlstra spin_unlock(&hb->lock); 545832c0542SPeter Zijlstra futex_hb_waiters_dec(hb); 54677e52ae3SPeter Zijlstra } 54777e52ae3SPeter Zijlstra 54885dc28faSPeter Zijlstra void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) 54977e52ae3SPeter Zijlstra { 55077e52ae3SPeter Zijlstra int prio; 55177e52ae3SPeter Zijlstra 55277e52ae3SPeter Zijlstra /* 55377e52ae3SPeter Zijlstra * The priority used to register this element is 55477e52ae3SPeter Zijlstra * - either the real thread-priority for the real-time threads 55577e52ae3SPeter Zijlstra * (i.e. threads with a priority lower than MAX_RT_PRIO) 55677e52ae3SPeter Zijlstra * - or MAX_RT_PRIO for non-RT threads. 55777e52ae3SPeter Zijlstra * Thus, all RT-threads are woken first in priority order, and 55877e52ae3SPeter Zijlstra * the others are woken last, in FIFO order. 55977e52ae3SPeter Zijlstra */ 56077e52ae3SPeter Zijlstra prio = min(current->normal_prio, MAX_RT_PRIO); 56177e52ae3SPeter Zijlstra 56277e52ae3SPeter Zijlstra plist_node_init(&q->list, prio); 56377e52ae3SPeter Zijlstra plist_add(&q->list, &hb->chain); 56477e52ae3SPeter Zijlstra q->task = current; 56577e52ae3SPeter Zijlstra } 56677e52ae3SPeter Zijlstra 56777e52ae3SPeter Zijlstra /** 568bce760d3SPeter Zijlstra * futex_unqueue() - Remove the futex_q from its futex_hash_bucket 56977e52ae3SPeter Zijlstra * @q: The futex_q to unqueue 57077e52ae3SPeter Zijlstra * 571bce760d3SPeter Zijlstra * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must 572bce760d3SPeter Zijlstra * be paired with exactly one earlier call to futex_queue(). 57377e52ae3SPeter Zijlstra * 57477e52ae3SPeter Zijlstra * Return: 57577e52ae3SPeter Zijlstra * - 1 - if the futex_q was still queued (and we removed unqueued it); 57677e52ae3SPeter Zijlstra * - 0 - if the futex_q was already removed by the waking thread 57777e52ae3SPeter Zijlstra */ 578a046f1a0SPeter Zijlstra int futex_unqueue(struct futex_q *q) 57977e52ae3SPeter Zijlstra { 58077e52ae3SPeter Zijlstra spinlock_t *lock_ptr; 58177e52ae3SPeter Zijlstra int ret = 0; 58277e52ae3SPeter Zijlstra 58377e52ae3SPeter Zijlstra /* In the common case we don't take the spinlock, which is nice. */ 58477e52ae3SPeter Zijlstra retry: 58577e52ae3SPeter Zijlstra /* 58677e52ae3SPeter Zijlstra * q->lock_ptr can change between this read and the following spin_lock. 58777e52ae3SPeter Zijlstra * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and 58877e52ae3SPeter Zijlstra * optimizing lock_ptr out of the logic below. 58977e52ae3SPeter Zijlstra */ 59077e52ae3SPeter Zijlstra lock_ptr = READ_ONCE(q->lock_ptr); 59177e52ae3SPeter Zijlstra if (lock_ptr != NULL) { 59277e52ae3SPeter Zijlstra spin_lock(lock_ptr); 59377e52ae3SPeter Zijlstra /* 59477e52ae3SPeter Zijlstra * q->lock_ptr can change between reading it and 59577e52ae3SPeter Zijlstra * spin_lock(), causing us to take the wrong lock. This 59677e52ae3SPeter Zijlstra * corrects the race condition. 59777e52ae3SPeter Zijlstra * 59877e52ae3SPeter Zijlstra * Reasoning goes like this: if we have the wrong lock, 59977e52ae3SPeter Zijlstra * q->lock_ptr must have changed (maybe several times) 60077e52ae3SPeter Zijlstra * between reading it and the spin_lock(). It can 60177e52ae3SPeter Zijlstra * change again after the spin_lock() but only if it was 60277e52ae3SPeter Zijlstra * already changed before the spin_lock(). It cannot, 60377e52ae3SPeter Zijlstra * however, change back to the original value. Therefore 60477e52ae3SPeter Zijlstra * we can detect whether we acquired the correct lock. 60577e52ae3SPeter Zijlstra */ 60677e52ae3SPeter Zijlstra if (unlikely(lock_ptr != q->lock_ptr)) { 60777e52ae3SPeter Zijlstra spin_unlock(lock_ptr); 60877e52ae3SPeter Zijlstra goto retry; 60977e52ae3SPeter Zijlstra } 610af92dceaSPeter Zijlstra __futex_unqueue(q); 61177e52ae3SPeter Zijlstra 61277e52ae3SPeter Zijlstra BUG_ON(q->pi_state); 61377e52ae3SPeter Zijlstra 61477e52ae3SPeter Zijlstra spin_unlock(lock_ptr); 61577e52ae3SPeter Zijlstra ret = 1; 61677e52ae3SPeter Zijlstra } 61777e52ae3SPeter Zijlstra 61877e52ae3SPeter Zijlstra return ret; 61977e52ae3SPeter Zijlstra } 62077e52ae3SPeter Zijlstra 62177e52ae3SPeter Zijlstra /* 62277e52ae3SPeter Zijlstra * PI futexes can not be requeued and must remove themselves from the 62377e52ae3SPeter Zijlstra * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. 62477e52ae3SPeter Zijlstra */ 62585dc28faSPeter Zijlstra void futex_unqueue_pi(struct futex_q *q) 62677e52ae3SPeter Zijlstra { 627af92dceaSPeter Zijlstra __futex_unqueue(q); 62877e52ae3SPeter Zijlstra 62977e52ae3SPeter Zijlstra BUG_ON(!q->pi_state); 63077e52ae3SPeter Zijlstra put_pi_state(q->pi_state); 63177e52ae3SPeter Zijlstra q->pi_state = NULL; 63277e52ae3SPeter Zijlstra } 63377e52ae3SPeter Zijlstra 63477e52ae3SPeter Zijlstra /* Constants for the pending_op argument of handle_futex_death */ 63577e52ae3SPeter Zijlstra #define HANDLE_DEATH_PENDING true 63677e52ae3SPeter Zijlstra #define HANDLE_DEATH_LIST false 63777e52ae3SPeter Zijlstra 63877e52ae3SPeter Zijlstra /* 63977e52ae3SPeter Zijlstra * Process a futex-list entry, check whether it's owned by the 64077e52ae3SPeter Zijlstra * dying task, and do notification if so: 64177e52ae3SPeter Zijlstra */ 64277e52ae3SPeter Zijlstra static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, 64377e52ae3SPeter Zijlstra bool pi, bool pending_op) 64477e52ae3SPeter Zijlstra { 64577e52ae3SPeter Zijlstra u32 uval, nval, mval; 64677e52ae3SPeter Zijlstra int err; 64777e52ae3SPeter Zijlstra 64877e52ae3SPeter Zijlstra /* Futex address must be 32bit aligned */ 64977e52ae3SPeter Zijlstra if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) 65077e52ae3SPeter Zijlstra return -1; 65177e52ae3SPeter Zijlstra 65277e52ae3SPeter Zijlstra retry: 65377e52ae3SPeter Zijlstra if (get_user(uval, uaddr)) 65477e52ae3SPeter Zijlstra return -1; 65577e52ae3SPeter Zijlstra 65677e52ae3SPeter Zijlstra /* 65777e52ae3SPeter Zijlstra * Special case for regular (non PI) futexes. The unlock path in 65877e52ae3SPeter Zijlstra * user space has two race scenarios: 65977e52ae3SPeter Zijlstra * 66077e52ae3SPeter Zijlstra * 1. The unlock path releases the user space futex value and 66177e52ae3SPeter Zijlstra * before it can execute the futex() syscall to wake up 66277e52ae3SPeter Zijlstra * waiters it is killed. 66377e52ae3SPeter Zijlstra * 66477e52ae3SPeter Zijlstra * 2. A woken up waiter is killed before it can acquire the 66577e52ae3SPeter Zijlstra * futex in user space. 66677e52ae3SPeter Zijlstra * 66777e52ae3SPeter Zijlstra * In both cases the TID validation below prevents a wakeup of 66877e52ae3SPeter Zijlstra * potential waiters which can cause these waiters to block 66977e52ae3SPeter Zijlstra * forever. 67077e52ae3SPeter Zijlstra * 67177e52ae3SPeter Zijlstra * In both cases the following conditions are met: 67277e52ae3SPeter Zijlstra * 67377e52ae3SPeter Zijlstra * 1) task->robust_list->list_op_pending != NULL 67477e52ae3SPeter Zijlstra * @pending_op == true 67577e52ae3SPeter Zijlstra * 2) User space futex value == 0 67677e52ae3SPeter Zijlstra * 3) Regular futex: @pi == false 67777e52ae3SPeter Zijlstra * 67877e52ae3SPeter Zijlstra * If these conditions are met, it is safe to attempt waking up a 67977e52ae3SPeter Zijlstra * potential waiter without touching the user space futex value and 68077e52ae3SPeter Zijlstra * trying to set the OWNER_DIED bit. The user space futex value is 68177e52ae3SPeter Zijlstra * uncontended and the rest of the user space mutex state is 68277e52ae3SPeter Zijlstra * consistent, so a woken waiter will just take over the 68377e52ae3SPeter Zijlstra * uncontended futex. Setting the OWNER_DIED bit would create 68477e52ae3SPeter Zijlstra * inconsistent state and malfunction of the user space owner died 68577e52ae3SPeter Zijlstra * handling. 68677e52ae3SPeter Zijlstra */ 68777e52ae3SPeter Zijlstra if (pending_op && !pi && !uval) { 68877e52ae3SPeter Zijlstra futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 68977e52ae3SPeter Zijlstra return 0; 69077e52ae3SPeter Zijlstra } 69177e52ae3SPeter Zijlstra 69277e52ae3SPeter Zijlstra if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) 69377e52ae3SPeter Zijlstra return 0; 69477e52ae3SPeter Zijlstra 69577e52ae3SPeter Zijlstra /* 69677e52ae3SPeter Zijlstra * Ok, this dying thread is truly holding a futex 69777e52ae3SPeter Zijlstra * of interest. Set the OWNER_DIED bit atomically 69877e52ae3SPeter Zijlstra * via cmpxchg, and if the value had FUTEX_WAITERS 69977e52ae3SPeter Zijlstra * set, wake up a waiter (if any). (We have to do a 70077e52ae3SPeter Zijlstra * futex_wake() even if OWNER_DIED is already set - 70177e52ae3SPeter Zijlstra * to handle the rare but possible case of recursive 70277e52ae3SPeter Zijlstra * thread-death.) The rest of the cleanup is done in 70377e52ae3SPeter Zijlstra * userspace. 70477e52ae3SPeter Zijlstra */ 70577e52ae3SPeter Zijlstra mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 70677e52ae3SPeter Zijlstra 70777e52ae3SPeter Zijlstra /* 70877e52ae3SPeter Zijlstra * We are not holding a lock here, but we want to have 70977e52ae3SPeter Zijlstra * the pagefault_disable/enable() protection because 71077e52ae3SPeter Zijlstra * we want to handle the fault gracefully. If the 71177e52ae3SPeter Zijlstra * access fails we try to fault in the futex with R/W 71277e52ae3SPeter Zijlstra * verification via get_user_pages. get_user() above 71377e52ae3SPeter Zijlstra * does not guarantee R/W access. If that fails we 71477e52ae3SPeter Zijlstra * give up and leave the futex locked. 71577e52ae3SPeter Zijlstra */ 716966cb75fSPeter Zijlstra if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { 71777e52ae3SPeter Zijlstra switch (err) { 71877e52ae3SPeter Zijlstra case -EFAULT: 71977e52ae3SPeter Zijlstra if (fault_in_user_writeable(uaddr)) 72077e52ae3SPeter Zijlstra return -1; 72177e52ae3SPeter Zijlstra goto retry; 72277e52ae3SPeter Zijlstra 72377e52ae3SPeter Zijlstra case -EAGAIN: 72477e52ae3SPeter Zijlstra cond_resched(); 72577e52ae3SPeter Zijlstra goto retry; 72677e52ae3SPeter Zijlstra 72777e52ae3SPeter Zijlstra default: 72877e52ae3SPeter Zijlstra WARN_ON_ONCE(1); 72977e52ae3SPeter Zijlstra return err; 73077e52ae3SPeter Zijlstra } 73177e52ae3SPeter Zijlstra } 73277e52ae3SPeter Zijlstra 73377e52ae3SPeter Zijlstra if (nval != uval) 73477e52ae3SPeter Zijlstra goto retry; 73577e52ae3SPeter Zijlstra 73677e52ae3SPeter Zijlstra /* 73777e52ae3SPeter Zijlstra * Wake robust non-PI futexes here. The wakeup of 73877e52ae3SPeter Zijlstra * PI futexes happens in exit_pi_state(): 73977e52ae3SPeter Zijlstra */ 74077e52ae3SPeter Zijlstra if (!pi && (uval & FUTEX_WAITERS)) 74177e52ae3SPeter Zijlstra futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); 74277e52ae3SPeter Zijlstra 74377e52ae3SPeter Zijlstra return 0; 74477e52ae3SPeter Zijlstra } 74577e52ae3SPeter Zijlstra 74677e52ae3SPeter Zijlstra /* 74777e52ae3SPeter Zijlstra * Fetch a robust-list pointer. Bit 0 signals PI futexes: 74877e52ae3SPeter Zijlstra */ 74977e52ae3SPeter Zijlstra static inline int fetch_robust_entry(struct robust_list __user **entry, 75077e52ae3SPeter Zijlstra struct robust_list __user * __user *head, 75177e52ae3SPeter Zijlstra unsigned int *pi) 75277e52ae3SPeter Zijlstra { 75377e52ae3SPeter Zijlstra unsigned long uentry; 75477e52ae3SPeter Zijlstra 75577e52ae3SPeter Zijlstra if (get_user(uentry, (unsigned long __user *)head)) 75677e52ae3SPeter Zijlstra return -EFAULT; 75777e52ae3SPeter Zijlstra 75877e52ae3SPeter Zijlstra *entry = (void __user *)(uentry & ~1UL); 75977e52ae3SPeter Zijlstra *pi = uentry & 1; 76077e52ae3SPeter Zijlstra 76177e52ae3SPeter Zijlstra return 0; 76277e52ae3SPeter Zijlstra } 76377e52ae3SPeter Zijlstra 76477e52ae3SPeter Zijlstra /* 76577e52ae3SPeter Zijlstra * Walk curr->robust_list (very carefully, it's a userspace list!) 76677e52ae3SPeter Zijlstra * and mark any locks found there dead, and notify any waiters. 76777e52ae3SPeter Zijlstra * 76877e52ae3SPeter Zijlstra * We silently return on any sign of list-walking problem. 76977e52ae3SPeter Zijlstra */ 77077e52ae3SPeter Zijlstra static void exit_robust_list(struct task_struct *curr) 77177e52ae3SPeter Zijlstra { 77277e52ae3SPeter Zijlstra struct robust_list_head __user *head = curr->robust_list; 77377e52ae3SPeter Zijlstra struct robust_list __user *entry, *next_entry, *pending; 77477e52ae3SPeter Zijlstra unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 77577e52ae3SPeter Zijlstra unsigned int next_pi; 77677e52ae3SPeter Zijlstra unsigned long futex_offset; 77777e52ae3SPeter Zijlstra int rc; 77877e52ae3SPeter Zijlstra 77977e52ae3SPeter Zijlstra if (!futex_cmpxchg_enabled) 78077e52ae3SPeter Zijlstra return; 78177e52ae3SPeter Zijlstra 78277e52ae3SPeter Zijlstra /* 78377e52ae3SPeter Zijlstra * Fetch the list head (which was registered earlier, via 78477e52ae3SPeter Zijlstra * sys_set_robust_list()): 78577e52ae3SPeter Zijlstra */ 78677e52ae3SPeter Zijlstra if (fetch_robust_entry(&entry, &head->list.next, &pi)) 78777e52ae3SPeter Zijlstra return; 78877e52ae3SPeter Zijlstra /* 78977e52ae3SPeter Zijlstra * Fetch the relative futex offset: 79077e52ae3SPeter Zijlstra */ 79177e52ae3SPeter Zijlstra if (get_user(futex_offset, &head->futex_offset)) 79277e52ae3SPeter Zijlstra return; 79377e52ae3SPeter Zijlstra /* 79477e52ae3SPeter Zijlstra * Fetch any possibly pending lock-add first, and handle it 79577e52ae3SPeter Zijlstra * if it exists: 79677e52ae3SPeter Zijlstra */ 79777e52ae3SPeter Zijlstra if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) 79877e52ae3SPeter Zijlstra return; 79977e52ae3SPeter Zijlstra 80077e52ae3SPeter Zijlstra next_entry = NULL; /* avoid warning with gcc */ 80177e52ae3SPeter Zijlstra while (entry != &head->list) { 80277e52ae3SPeter Zijlstra /* 80377e52ae3SPeter Zijlstra * Fetch the next entry in the list before calling 80477e52ae3SPeter Zijlstra * handle_futex_death: 80577e52ae3SPeter Zijlstra */ 80677e52ae3SPeter Zijlstra rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); 80777e52ae3SPeter Zijlstra /* 80877e52ae3SPeter Zijlstra * A pending lock might already be on the list, so 80977e52ae3SPeter Zijlstra * don't process it twice: 81077e52ae3SPeter Zijlstra */ 81177e52ae3SPeter Zijlstra if (entry != pending) { 81277e52ae3SPeter Zijlstra if (handle_futex_death((void __user *)entry + futex_offset, 81377e52ae3SPeter Zijlstra curr, pi, HANDLE_DEATH_LIST)) 81477e52ae3SPeter Zijlstra return; 81577e52ae3SPeter Zijlstra } 81677e52ae3SPeter Zijlstra if (rc) 81777e52ae3SPeter Zijlstra return; 81877e52ae3SPeter Zijlstra entry = next_entry; 81977e52ae3SPeter Zijlstra pi = next_pi; 82077e52ae3SPeter Zijlstra /* 82177e52ae3SPeter Zijlstra * Avoid excessively long or circular lists: 82277e52ae3SPeter Zijlstra */ 82377e52ae3SPeter Zijlstra if (!--limit) 82477e52ae3SPeter Zijlstra break; 82577e52ae3SPeter Zijlstra 82677e52ae3SPeter Zijlstra cond_resched(); 82777e52ae3SPeter Zijlstra } 82877e52ae3SPeter Zijlstra 82977e52ae3SPeter Zijlstra if (pending) { 83077e52ae3SPeter Zijlstra handle_futex_death((void __user *)pending + futex_offset, 83177e52ae3SPeter Zijlstra curr, pip, HANDLE_DEATH_PENDING); 83277e52ae3SPeter Zijlstra } 83377e52ae3SPeter Zijlstra } 83477e52ae3SPeter Zijlstra 835af8cc960SPeter Zijlstra #ifdef CONFIG_COMPAT 836af8cc960SPeter Zijlstra static void __user *futex_uaddr(struct robust_list __user *entry, 837af8cc960SPeter Zijlstra compat_long_t futex_offset) 838af8cc960SPeter Zijlstra { 839af8cc960SPeter Zijlstra compat_uptr_t base = ptr_to_compat(entry); 840af8cc960SPeter Zijlstra void __user *uaddr = compat_ptr(base + futex_offset); 841af8cc960SPeter Zijlstra 842af8cc960SPeter Zijlstra return uaddr; 843af8cc960SPeter Zijlstra } 844af8cc960SPeter Zijlstra 845af8cc960SPeter Zijlstra /* 846af8cc960SPeter Zijlstra * Fetch a robust-list pointer. Bit 0 signals PI futexes: 847af8cc960SPeter Zijlstra */ 848af8cc960SPeter Zijlstra static inline int 849af8cc960SPeter Zijlstra compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 850af8cc960SPeter Zijlstra compat_uptr_t __user *head, unsigned int *pi) 851af8cc960SPeter Zijlstra { 852af8cc960SPeter Zijlstra if (get_user(*uentry, head)) 853af8cc960SPeter Zijlstra return -EFAULT; 854af8cc960SPeter Zijlstra 855af8cc960SPeter Zijlstra *entry = compat_ptr((*uentry) & ~1); 856af8cc960SPeter Zijlstra *pi = (unsigned int)(*uentry) & 1; 857af8cc960SPeter Zijlstra 858af8cc960SPeter Zijlstra return 0; 859af8cc960SPeter Zijlstra } 860af8cc960SPeter Zijlstra 861af8cc960SPeter Zijlstra /* 862af8cc960SPeter Zijlstra * Walk curr->robust_list (very carefully, it's a userspace list!) 863af8cc960SPeter Zijlstra * and mark any locks found there dead, and notify any waiters. 864af8cc960SPeter Zijlstra * 865af8cc960SPeter Zijlstra * We silently return on any sign of list-walking problem. 866af8cc960SPeter Zijlstra */ 867af8cc960SPeter Zijlstra static void compat_exit_robust_list(struct task_struct *curr) 868af8cc960SPeter Zijlstra { 869af8cc960SPeter Zijlstra struct compat_robust_list_head __user *head = curr->compat_robust_list; 870af8cc960SPeter Zijlstra struct robust_list __user *entry, *next_entry, *pending; 871af8cc960SPeter Zijlstra unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; 872af8cc960SPeter Zijlstra unsigned int next_pi; 873af8cc960SPeter Zijlstra compat_uptr_t uentry, next_uentry, upending; 874af8cc960SPeter Zijlstra compat_long_t futex_offset; 875af8cc960SPeter Zijlstra int rc; 876af8cc960SPeter Zijlstra 877af8cc960SPeter Zijlstra if (!futex_cmpxchg_enabled) 878af8cc960SPeter Zijlstra return; 879af8cc960SPeter Zijlstra 880af8cc960SPeter Zijlstra /* 881af8cc960SPeter Zijlstra * Fetch the list head (which was registered earlier, via 882af8cc960SPeter Zijlstra * sys_set_robust_list()): 883af8cc960SPeter Zijlstra */ 884af8cc960SPeter Zijlstra if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) 885af8cc960SPeter Zijlstra return; 886af8cc960SPeter Zijlstra /* 887af8cc960SPeter Zijlstra * Fetch the relative futex offset: 888af8cc960SPeter Zijlstra */ 889af8cc960SPeter Zijlstra if (get_user(futex_offset, &head->futex_offset)) 890af8cc960SPeter Zijlstra return; 891af8cc960SPeter Zijlstra /* 892af8cc960SPeter Zijlstra * Fetch any possibly pending lock-add first, and handle it 893af8cc960SPeter Zijlstra * if it exists: 894af8cc960SPeter Zijlstra */ 895af8cc960SPeter Zijlstra if (compat_fetch_robust_entry(&upending, &pending, 896af8cc960SPeter Zijlstra &head->list_op_pending, &pip)) 897af8cc960SPeter Zijlstra return; 898af8cc960SPeter Zijlstra 899af8cc960SPeter Zijlstra next_entry = NULL; /* avoid warning with gcc */ 900af8cc960SPeter Zijlstra while (entry != (struct robust_list __user *) &head->list) { 901af8cc960SPeter Zijlstra /* 902af8cc960SPeter Zijlstra * Fetch the next entry in the list before calling 903af8cc960SPeter Zijlstra * handle_futex_death: 904af8cc960SPeter Zijlstra */ 905af8cc960SPeter Zijlstra rc = compat_fetch_robust_entry(&next_uentry, &next_entry, 906af8cc960SPeter Zijlstra (compat_uptr_t __user *)&entry->next, &next_pi); 907af8cc960SPeter Zijlstra /* 908af8cc960SPeter Zijlstra * A pending lock might already be on the list, so 909af8cc960SPeter Zijlstra * dont process it twice: 910af8cc960SPeter Zijlstra */ 911af8cc960SPeter Zijlstra if (entry != pending) { 912af8cc960SPeter Zijlstra void __user *uaddr = futex_uaddr(entry, futex_offset); 913af8cc960SPeter Zijlstra 914af8cc960SPeter Zijlstra if (handle_futex_death(uaddr, curr, pi, 915af8cc960SPeter Zijlstra HANDLE_DEATH_LIST)) 916af8cc960SPeter Zijlstra return; 917af8cc960SPeter Zijlstra } 918af8cc960SPeter Zijlstra if (rc) 919af8cc960SPeter Zijlstra return; 920af8cc960SPeter Zijlstra uentry = next_uentry; 921af8cc960SPeter Zijlstra entry = next_entry; 922af8cc960SPeter Zijlstra pi = next_pi; 923af8cc960SPeter Zijlstra /* 924af8cc960SPeter Zijlstra * Avoid excessively long or circular lists: 925af8cc960SPeter Zijlstra */ 926af8cc960SPeter Zijlstra if (!--limit) 927af8cc960SPeter Zijlstra break; 928af8cc960SPeter Zijlstra 929af8cc960SPeter Zijlstra cond_resched(); 930af8cc960SPeter Zijlstra } 931af8cc960SPeter Zijlstra if (pending) { 932af8cc960SPeter Zijlstra void __user *uaddr = futex_uaddr(pending, futex_offset); 933af8cc960SPeter Zijlstra 934af8cc960SPeter Zijlstra handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); 935af8cc960SPeter Zijlstra } 936af8cc960SPeter Zijlstra } 937af8cc960SPeter Zijlstra #endif 938af8cc960SPeter Zijlstra 93985dc28faSPeter Zijlstra #ifdef CONFIG_FUTEX_PI 94085dc28faSPeter Zijlstra 94185dc28faSPeter Zijlstra /* 94285dc28faSPeter Zijlstra * This task is holding PI mutexes at exit time => bad. 94385dc28faSPeter Zijlstra * Kernel cleans up PI-state, but userspace is likely hosed. 94485dc28faSPeter Zijlstra * (Robust-futex cleanup is separate and might save the day for userspace.) 94585dc28faSPeter Zijlstra */ 94685dc28faSPeter Zijlstra static void exit_pi_state_list(struct task_struct *curr) 94785dc28faSPeter Zijlstra { 94885dc28faSPeter Zijlstra struct list_head *next, *head = &curr->pi_state_list; 94985dc28faSPeter Zijlstra struct futex_pi_state *pi_state; 95085dc28faSPeter Zijlstra struct futex_hash_bucket *hb; 95185dc28faSPeter Zijlstra union futex_key key = FUTEX_KEY_INIT; 95285dc28faSPeter Zijlstra 95385dc28faSPeter Zijlstra if (!futex_cmpxchg_enabled) 95485dc28faSPeter Zijlstra return; 95585dc28faSPeter Zijlstra /* 95685dc28faSPeter Zijlstra * We are a ZOMBIE and nobody can enqueue itself on 95785dc28faSPeter Zijlstra * pi_state_list anymore, but we have to be careful 95885dc28faSPeter Zijlstra * versus waiters unqueueing themselves: 95985dc28faSPeter Zijlstra */ 96085dc28faSPeter Zijlstra raw_spin_lock_irq(&curr->pi_lock); 96185dc28faSPeter Zijlstra while (!list_empty(head)) { 96285dc28faSPeter Zijlstra next = head->next; 96385dc28faSPeter Zijlstra pi_state = list_entry(next, struct futex_pi_state, list); 96485dc28faSPeter Zijlstra key = pi_state->key; 96585dc28faSPeter Zijlstra hb = futex_hash(&key); 96685dc28faSPeter Zijlstra 96785dc28faSPeter Zijlstra /* 96885dc28faSPeter Zijlstra * We can race against put_pi_state() removing itself from the 96985dc28faSPeter Zijlstra * list (a waiter going away). put_pi_state() will first 97085dc28faSPeter Zijlstra * decrement the reference count and then modify the list, so 97185dc28faSPeter Zijlstra * its possible to see the list entry but fail this reference 97285dc28faSPeter Zijlstra * acquire. 97385dc28faSPeter Zijlstra * 97485dc28faSPeter Zijlstra * In that case; drop the locks to let put_pi_state() make 97585dc28faSPeter Zijlstra * progress and retry the loop. 97685dc28faSPeter Zijlstra */ 97785dc28faSPeter Zijlstra if (!refcount_inc_not_zero(&pi_state->refcount)) { 97885dc28faSPeter Zijlstra raw_spin_unlock_irq(&curr->pi_lock); 97985dc28faSPeter Zijlstra cpu_relax(); 98085dc28faSPeter Zijlstra raw_spin_lock_irq(&curr->pi_lock); 98185dc28faSPeter Zijlstra continue; 98285dc28faSPeter Zijlstra } 98385dc28faSPeter Zijlstra raw_spin_unlock_irq(&curr->pi_lock); 98485dc28faSPeter Zijlstra 98585dc28faSPeter Zijlstra spin_lock(&hb->lock); 98685dc28faSPeter Zijlstra raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 98785dc28faSPeter Zijlstra raw_spin_lock(&curr->pi_lock); 98885dc28faSPeter Zijlstra /* 98985dc28faSPeter Zijlstra * We dropped the pi-lock, so re-check whether this 99085dc28faSPeter Zijlstra * task still owns the PI-state: 99185dc28faSPeter Zijlstra */ 99285dc28faSPeter Zijlstra if (head->next != next) { 99385dc28faSPeter Zijlstra /* retain curr->pi_lock for the loop invariant */ 99485dc28faSPeter Zijlstra raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 99585dc28faSPeter Zijlstra spin_unlock(&hb->lock); 99685dc28faSPeter Zijlstra put_pi_state(pi_state); 99785dc28faSPeter Zijlstra continue; 99885dc28faSPeter Zijlstra } 99985dc28faSPeter Zijlstra 100085dc28faSPeter Zijlstra WARN_ON(pi_state->owner != curr); 100185dc28faSPeter Zijlstra WARN_ON(list_empty(&pi_state->list)); 100285dc28faSPeter Zijlstra list_del_init(&pi_state->list); 100385dc28faSPeter Zijlstra pi_state->owner = NULL; 100485dc28faSPeter Zijlstra 100585dc28faSPeter Zijlstra raw_spin_unlock(&curr->pi_lock); 100685dc28faSPeter Zijlstra raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 100785dc28faSPeter Zijlstra spin_unlock(&hb->lock); 100885dc28faSPeter Zijlstra 100985dc28faSPeter Zijlstra rt_mutex_futex_unlock(&pi_state->pi_mutex); 101085dc28faSPeter Zijlstra put_pi_state(pi_state); 101185dc28faSPeter Zijlstra 101285dc28faSPeter Zijlstra raw_spin_lock_irq(&curr->pi_lock); 101385dc28faSPeter Zijlstra } 101485dc28faSPeter Zijlstra raw_spin_unlock_irq(&curr->pi_lock); 101585dc28faSPeter Zijlstra } 101685dc28faSPeter Zijlstra #else 101785dc28faSPeter Zijlstra static inline void exit_pi_state_list(struct task_struct *curr) { } 101885dc28faSPeter Zijlstra #endif 101985dc28faSPeter Zijlstra 102077e52ae3SPeter Zijlstra static void futex_cleanup(struct task_struct *tsk) 102177e52ae3SPeter Zijlstra { 102277e52ae3SPeter Zijlstra if (unlikely(tsk->robust_list)) { 102377e52ae3SPeter Zijlstra exit_robust_list(tsk); 102477e52ae3SPeter Zijlstra tsk->robust_list = NULL; 102577e52ae3SPeter Zijlstra } 102677e52ae3SPeter Zijlstra 102777e52ae3SPeter Zijlstra #ifdef CONFIG_COMPAT 102877e52ae3SPeter Zijlstra if (unlikely(tsk->compat_robust_list)) { 102977e52ae3SPeter Zijlstra compat_exit_robust_list(tsk); 103077e52ae3SPeter Zijlstra tsk->compat_robust_list = NULL; 103177e52ae3SPeter Zijlstra } 103277e52ae3SPeter Zijlstra #endif 103377e52ae3SPeter Zijlstra 103477e52ae3SPeter Zijlstra if (unlikely(!list_empty(&tsk->pi_state_list))) 103577e52ae3SPeter Zijlstra exit_pi_state_list(tsk); 103677e52ae3SPeter Zijlstra } 103777e52ae3SPeter Zijlstra 103877e52ae3SPeter Zijlstra /** 103977e52ae3SPeter Zijlstra * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD 104077e52ae3SPeter Zijlstra * @tsk: task to set the state on 104177e52ae3SPeter Zijlstra * 104277e52ae3SPeter Zijlstra * Set the futex exit state of the task lockless. The futex waiter code 104377e52ae3SPeter Zijlstra * observes that state when a task is exiting and loops until the task has 104477e52ae3SPeter Zijlstra * actually finished the futex cleanup. The worst case for this is that the 104577e52ae3SPeter Zijlstra * waiter runs through the wait loop until the state becomes visible. 104677e52ae3SPeter Zijlstra * 1047*05ea0424SEric W. Biederman * This is called from the recursive fault handling path in make_task_dead(). 104877e52ae3SPeter Zijlstra * 104977e52ae3SPeter Zijlstra * This is best effort. Either the futex exit code has run already or 105077e52ae3SPeter Zijlstra * not. If the OWNER_DIED bit has been set on the futex then the waiter can 105177e52ae3SPeter Zijlstra * take it over. If not, the problem is pushed back to user space. If the 105277e52ae3SPeter Zijlstra * futex exit code did not run yet, then an already queued waiter might 105377e52ae3SPeter Zijlstra * block forever, but there is nothing which can be done about that. 105477e52ae3SPeter Zijlstra */ 105577e52ae3SPeter Zijlstra void futex_exit_recursive(struct task_struct *tsk) 105677e52ae3SPeter Zijlstra { 105777e52ae3SPeter Zijlstra /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ 105877e52ae3SPeter Zijlstra if (tsk->futex_state == FUTEX_STATE_EXITING) 105977e52ae3SPeter Zijlstra mutex_unlock(&tsk->futex_exit_mutex); 106077e52ae3SPeter Zijlstra tsk->futex_state = FUTEX_STATE_DEAD; 106177e52ae3SPeter Zijlstra } 106277e52ae3SPeter Zijlstra 106377e52ae3SPeter Zijlstra static void futex_cleanup_begin(struct task_struct *tsk) 106477e52ae3SPeter Zijlstra { 106577e52ae3SPeter Zijlstra /* 106677e52ae3SPeter Zijlstra * Prevent various race issues against a concurrent incoming waiter 106777e52ae3SPeter Zijlstra * including live locks by forcing the waiter to block on 106877e52ae3SPeter Zijlstra * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in 106977e52ae3SPeter Zijlstra * attach_to_pi_owner(). 107077e52ae3SPeter Zijlstra */ 107177e52ae3SPeter Zijlstra mutex_lock(&tsk->futex_exit_mutex); 107277e52ae3SPeter Zijlstra 107377e52ae3SPeter Zijlstra /* 107477e52ae3SPeter Zijlstra * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 107577e52ae3SPeter Zijlstra * 107677e52ae3SPeter Zijlstra * This ensures that all subsequent checks of tsk->futex_state in 107777e52ae3SPeter Zijlstra * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with 107877e52ae3SPeter Zijlstra * tsk->pi_lock held. 107977e52ae3SPeter Zijlstra * 108077e52ae3SPeter Zijlstra * It guarantees also that a pi_state which was queued right before 108177e52ae3SPeter Zijlstra * the state change under tsk->pi_lock by a concurrent waiter must 108277e52ae3SPeter Zijlstra * be observed in exit_pi_state_list(). 108377e52ae3SPeter Zijlstra */ 108477e52ae3SPeter Zijlstra raw_spin_lock_irq(&tsk->pi_lock); 108577e52ae3SPeter Zijlstra tsk->futex_state = FUTEX_STATE_EXITING; 108677e52ae3SPeter Zijlstra raw_spin_unlock_irq(&tsk->pi_lock); 108777e52ae3SPeter Zijlstra } 108877e52ae3SPeter Zijlstra 108977e52ae3SPeter Zijlstra static void futex_cleanup_end(struct task_struct *tsk, int state) 109077e52ae3SPeter Zijlstra { 109177e52ae3SPeter Zijlstra /* 109277e52ae3SPeter Zijlstra * Lockless store. The only side effect is that an observer might 109377e52ae3SPeter Zijlstra * take another loop until it becomes visible. 109477e52ae3SPeter Zijlstra */ 109577e52ae3SPeter Zijlstra tsk->futex_state = state; 109677e52ae3SPeter Zijlstra /* 109777e52ae3SPeter Zijlstra * Drop the exit protection. This unblocks waiters which observed 109877e52ae3SPeter Zijlstra * FUTEX_STATE_EXITING to reevaluate the state. 109977e52ae3SPeter Zijlstra */ 110077e52ae3SPeter Zijlstra mutex_unlock(&tsk->futex_exit_mutex); 110177e52ae3SPeter Zijlstra } 110277e52ae3SPeter Zijlstra 110377e52ae3SPeter Zijlstra void futex_exec_release(struct task_struct *tsk) 110477e52ae3SPeter Zijlstra { 110577e52ae3SPeter Zijlstra /* 110677e52ae3SPeter Zijlstra * The state handling is done for consistency, but in the case of 110777e52ae3SPeter Zijlstra * exec() there is no way to prevent further damage as the PID stays 110877e52ae3SPeter Zijlstra * the same. But for the unlikely and arguably buggy case that a 110977e52ae3SPeter Zijlstra * futex is held on exec(), this provides at least as much state 111077e52ae3SPeter Zijlstra * consistency protection which is possible. 111177e52ae3SPeter Zijlstra */ 111277e52ae3SPeter Zijlstra futex_cleanup_begin(tsk); 111377e52ae3SPeter Zijlstra futex_cleanup(tsk); 111477e52ae3SPeter Zijlstra /* 111577e52ae3SPeter Zijlstra * Reset the state to FUTEX_STATE_OK. The task is alive and about 111677e52ae3SPeter Zijlstra * exec a new binary. 111777e52ae3SPeter Zijlstra */ 111877e52ae3SPeter Zijlstra futex_cleanup_end(tsk, FUTEX_STATE_OK); 111977e52ae3SPeter Zijlstra } 112077e52ae3SPeter Zijlstra 112177e52ae3SPeter Zijlstra void futex_exit_release(struct task_struct *tsk) 112277e52ae3SPeter Zijlstra { 112377e52ae3SPeter Zijlstra futex_cleanup_begin(tsk); 112477e52ae3SPeter Zijlstra futex_cleanup(tsk); 112577e52ae3SPeter Zijlstra futex_cleanup_end(tsk, FUTEX_STATE_DEAD); 112677e52ae3SPeter Zijlstra } 112777e52ae3SPeter Zijlstra 112877e52ae3SPeter Zijlstra static void __init futex_detect_cmpxchg(void) 112977e52ae3SPeter Zijlstra { 113077e52ae3SPeter Zijlstra #ifndef CONFIG_HAVE_FUTEX_CMPXCHG 113177e52ae3SPeter Zijlstra u32 curval; 113277e52ae3SPeter Zijlstra 113377e52ae3SPeter Zijlstra /* 113477e52ae3SPeter Zijlstra * This will fail and we want it. Some arch implementations do 113577e52ae3SPeter Zijlstra * runtime detection of the futex_atomic_cmpxchg_inatomic() 113677e52ae3SPeter Zijlstra * functionality. We want to know that before we call in any 113777e52ae3SPeter Zijlstra * of the complex code paths. Also we want to prevent 113877e52ae3SPeter Zijlstra * registration of robust lists in that case. NULL is 113977e52ae3SPeter Zijlstra * guaranteed to fault and we get -EFAULT on functional 114077e52ae3SPeter Zijlstra * implementation, the non-functional ones will return 114177e52ae3SPeter Zijlstra * -ENOSYS. 114277e52ae3SPeter Zijlstra */ 1143966cb75fSPeter Zijlstra if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) 114477e52ae3SPeter Zijlstra futex_cmpxchg_enabled = 1; 114577e52ae3SPeter Zijlstra #endif 114677e52ae3SPeter Zijlstra } 114777e52ae3SPeter Zijlstra 114877e52ae3SPeter Zijlstra static int __init futex_init(void) 114977e52ae3SPeter Zijlstra { 115077e52ae3SPeter Zijlstra unsigned int futex_shift; 115177e52ae3SPeter Zijlstra unsigned long i; 115277e52ae3SPeter Zijlstra 115377e52ae3SPeter Zijlstra #if CONFIG_BASE_SMALL 115477e52ae3SPeter Zijlstra futex_hashsize = 16; 115577e52ae3SPeter Zijlstra #else 115677e52ae3SPeter Zijlstra futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 115777e52ae3SPeter Zijlstra #endif 115877e52ae3SPeter Zijlstra 115977e52ae3SPeter Zijlstra futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 116077e52ae3SPeter Zijlstra futex_hashsize, 0, 116177e52ae3SPeter Zijlstra futex_hashsize < 256 ? HASH_SMALL : 0, 116277e52ae3SPeter Zijlstra &futex_shift, NULL, 116377e52ae3SPeter Zijlstra futex_hashsize, futex_hashsize); 116477e52ae3SPeter Zijlstra futex_hashsize = 1UL << futex_shift; 116577e52ae3SPeter Zijlstra 116677e52ae3SPeter Zijlstra futex_detect_cmpxchg(); 116777e52ae3SPeter Zijlstra 116877e52ae3SPeter Zijlstra for (i = 0; i < futex_hashsize; i++) { 116977e52ae3SPeter Zijlstra atomic_set(&futex_queues[i].waiters, 0); 117077e52ae3SPeter Zijlstra plist_head_init(&futex_queues[i].chain); 117177e52ae3SPeter Zijlstra spin_lock_init(&futex_queues[i].lock); 117277e52ae3SPeter Zijlstra } 117377e52ae3SPeter Zijlstra 117477e52ae3SPeter Zijlstra return 0; 117577e52ae3SPeter Zijlstra } 117677e52ae3SPeter Zijlstra core_initcall(futex_init); 1177