xref: /openbmc/linux/ipc/sem.c (revision 78c5335b)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * linux/ipc/sem.c
4   * Copyright (C) 1992 Krishna Balasubramanian
5   * Copyright (C) 1995 Eric Schenk, Bruno Haible
6   *
7   * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
8   *
9   * SMP-threaded, sysctl's added
10   * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
11   * Enforced range limit on SEM_UNDO
12   * (c) 2001 Red Hat Inc
13   * Lockless wakeup
14   * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
15   * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
16   * Further wakeup optimizations, documentation
17   * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
18   *
19   * support for audit of ipc object properties and permission changes
20   * Dustin Kirkland <dustin.kirkland@us.ibm.com>
21   *
22   * namespaces support
23   * OpenVZ, SWsoft Inc.
24   * Pavel Emelianov <xemul@openvz.org>
25   *
26   * Implementation notes: (May 2010)
27   * This file implements System V semaphores.
28   *
29   * User space visible behavior:
30   * - FIFO ordering for semop() operations (just FIFO, not starvation
31   *   protection)
32   * - multiple semaphore operations that alter the same semaphore in
33   *   one semop() are handled.
34   * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
35   *   SETALL calls.
36   * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
37   * - undo adjustments at process exit are limited to 0..SEMVMX.
38   * - namespaces are supported.
39   * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
40   *   to /proc/sys/kernel/sem.
41   * - statistics about the usage are reported in /proc/sysvipc/sem.
42   *
43   * Internals:
44   * - scalability:
45   *   - all global variables are read-mostly.
46   *   - semop() calls and semctl(RMID) are synchronized by RCU.
47   *   - most operations do write operations (actually: spin_lock calls) to
48   *     the per-semaphore array structure.
49   *   Thus: Perfect SMP scaling between independent semaphore arrays.
50   *         If multiple semaphores in one array are used, then cache line
51   *         thrashing on the semaphore array spinlock will limit the scaling.
52   * - semncnt and semzcnt are calculated on demand in count_semcnt()
53   * - the task that performs a successful semop() scans the list of all
54   *   sleeping tasks and completes any pending operations that can be fulfilled.
55   *   Semaphores are actively given to waiting tasks (necessary for FIFO).
56   *   (see update_queue())
57   * - To improve the scalability, the actual wake-up calls are performed after
58   *   dropping all locks. (see wake_up_sem_queue_prepare())
59   * - All work is done by the waker, the woken up task does not have to do
60   *   anything - not even acquiring a lock or dropping a refcount.
61   * - A woken up task may not even touch the semaphore array anymore, it may
62   *   have been destroyed already by a semctl(RMID).
63   * - UNDO values are stored in an array (one per process and per
64   *   semaphore array, lazily allocated). For backwards compatibility, multiple
65   *   modes for the UNDO variables are supported (per process, per thread)
66   *   (see copy_semundo, CLONE_SYSVSEM)
67   * - There are two lists of the pending operations: a per-array list
68   *   and a per-semaphore list (stored in the array). This makes it possible
69   *   to achieve FIFO ordering without always scanning all pending operations.
70   *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
71   */
72  
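/*
 * Editor's note: purely illustrative user-space sketch of the behavior
 * described above (hypothetical example, not part of this file). It creates
 * a two-semaphore set, performs one atomic multi-sop semop() with SEM_UNDO
 * so the kernel reverts the adjustments at process exit, and removes the set.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
	/* Create a private set with two semaphores. */
	int semid = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);
	struct sembuf ops[2] = {
		{ .sem_num = 0, .sem_op = +1, .sem_flg = SEM_UNDO },
		{ .sem_num = 1, .sem_op = +1, .sem_flg = SEM_UNDO },
	};

	if (semid < 0)
		return 1;

	/* Both increments are applied atomically, or not at all. */
	if (semop(semid, ops, 2) < 0)
		perror("semop");

	/* The set survives until explicitly removed (or namespace teardown). */
	semctl(semid, 0, IPC_RMID);
	return 0;
}
#endif
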
73  #include <linux/compat.h>
74  #include <linux/slab.h>
75  #include <linux/spinlock.h>
76  #include <linux/init.h>
77  #include <linux/proc_fs.h>
78  #include <linux/time.h>
79  #include <linux/security.h>
80  #include <linux/syscalls.h>
81  #include <linux/audit.h>
82  #include <linux/capability.h>
83  #include <linux/seq_file.h>
84  #include <linux/rwsem.h>
85  #include <linux/nsproxy.h>
86  #include <linux/ipc_namespace.h>
87  #include <linux/sched/wake_q.h>
88  #include <linux/nospec.h>
89  #include <linux/rhashtable.h>
90  
91  #include <linux/uaccess.h>
92  #include "util.h"
93  
94  /* One semaphore structure for each semaphore in the system. */
95  struct sem {
96  	int	semval;		/* current value */
97  	/*
98  	 * PID of the process that last modified the semaphore. On
99  	 * Linux, it is updated by:
100  	 *  - semop
101  	 *  - semctl, via SETVAL and SETALL.
102  	 *  - at task exit when performing undo adjustments (see exit_sem).
103  	 */
104  	struct pid *sempid;
105  	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
106  	struct list_head pending_alter; /* pending single-sop operations */
107  					/* that alter the semaphore */
108  	struct list_head pending_const; /* pending single-sop operations */
109  					/* that do not alter the semaphore*/
110  	time64_t	 sem_otime;	/* candidate for sem_otime */
111  } ____cacheline_aligned_in_smp;
112  
113  /* One sem_array data structure for each set of semaphores in the system. */
114  struct sem_array {
115  	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
116  	time64_t		sem_ctime;	/* create/last semctl() time */
117  	struct list_head	pending_alter;	/* pending operations */
118  						/* that alter the array */
119  	struct list_head	pending_const;	/* pending complex operations */
120  						/* that do not alter semvals */
121  	struct list_head	list_id;	/* undo requests on this array */
122  	int			sem_nsems;	/* no. of semaphores in array */
123  	int			complex_count;	/* pending complex operations */
124  	unsigned int		use_global_lock;/* >0: global lock required */
125  
126  	struct sem		sems[];
127  } __randomize_layout;
128  
129  /* One queue for each sleeping process in the system. */
130  struct sem_queue {
131  	struct list_head	list;	 /* queue of pending operations */
132  	struct task_struct	*sleeper; /* this process */
133  	struct sem_undo		*undo;	 /* undo structure */
134  	struct pid		*pid;	 /* process id of requesting process */
135  	int			status;	 /* completion status of operation */
136  	struct sembuf		*sops;	 /* array of pending operations */
137  	struct sembuf		*blocking; /* the operation that blocked */
138  	int			nsops;	 /* number of operations */
139  	bool			alter;	 /* does *sops alter the array? */
140  	bool                    dupsop;	 /* sops on more than one sem_num */
141  };
142  
143  /* Each task has a list of undo requests. They are executed automatically
144   * when the process exits.
145   */
146  struct sem_undo {
147  	struct list_head	list_proc;	/* per-process list: *
148  						 * all undos from one process
149  						 * rcu protected */
150  	struct rcu_head		rcu;		/* rcu struct for sem_undo */
151  	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
152  	struct list_head	list_id;	/* per semaphore array list:
153  						 * all undos for one array */
154  	int			semid;		/* semaphore set identifier */
155  	short			*semadj;	/* array of adjustments */
156  						/* one per semaphore */
157  };
158  
159  /* sem_undo_list controls shared access to the list of sem_undo structures
160   * that may be shared among all tasks of a CLONE_SYSVSEM task group.
161   */
162  struct sem_undo_list {
163  	refcount_t		refcnt;
164  	spinlock_t		lock;
165  	struct list_head	list_proc;
166  };
167  
168  
169  #define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
170  
171  static int newary(struct ipc_namespace *, struct ipc_params *);
172  static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
173  #ifdef CONFIG_PROC_FS
174  static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
175  #endif
176  
177  #define SEMMSL_FAST	256 /* 512 bytes on stack */
178  #define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
179  
180  /*
181   * Switching from the mode suitable for simple ops
182   * to the mode for complex ops is costly. Therefore:
183   * use some hysteresis
184   */
185  #define USE_GLOBAL_LOCK_HYSTERESIS	10
186  
187  /*
188   * Locking:
189   * a) global sem_lock() for read/write
190   *	sem_undo.id_next,
191   *	sem_array.complex_count,
192   *	sem_array.pending{_alter,_const},
193   *	sem_array.sem_undo
194   *
195   * b) global or semaphore sem_lock() for read/write:
196   *	sem_array.sems[i].pending_{const,alter}:
197   *
198   * c) special:
199   *	sem_undo_list.list_proc:
200   *	* undo_list->lock for write
201   *	* rcu for read
202   *	use_global_lock:
203   *	* global sem_lock() for write
204   *	* either local or global sem_lock() for read.
205   *
206   * Memory ordering:
207   * Most ordering is enforced by using spin_lock() and spin_unlock().
208   *
209   * Exceptions:
210   * 1) use_global_lock: (SEM_BARRIER_1)
211   * Setting it from non-zero to 0 is a RELEASE, this is ensured by
212   * using smp_store_release(): Immediately after setting it to 0,
213   * a simple op can start.
214   * Testing if it is non-zero is an ACQUIRE, this is ensured by using
215   * smp_load_acquire().
216   * Setting it from 0 to non-zero must be ordered with regards to
217   * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
218   * is inside a spin_lock() and after a write from 0 to non-zero a
219   * spin_lock()+spin_unlock() is done.
220   * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
221   * READ_ONCE()/WRITE_ONCE() is used.
222   *
223   * 2) queue.status: (SEM_BARRIER_2)
224   * Initialization is done while holding sem_lock(), so no further barrier is
225   * required.
226   * Setting it to a result code is a RELEASE; this is ensured both by
227   * smp_store_release() (for case a) and by holding sem_lock()
228   * (for case b).
229   * The ACQUIRE when reading the result code without holding sem_lock() is
230   * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
231   * (case a above).
232   * Reading the result code while holding sem_lock() needs no further barriers,
233   * the locks inside sem_lock() enforce ordering (case b above)
234   *
235   * 3) current->state:
236   * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
237   * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
238   * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
239   * when holding sem_lock(), no further barriers are required.
240   *
241   * See also ipc/mqueue.c for more details on the covered races.
242   */
243  
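/*
 * Editor's note: minimal sketch (not part of this file) of the SEM_BARRIER_1
 * pairing described above. Function names are illustrative; the real code is
 * in complexmode_tryleave() and sem_lock() below.
 */
#if 0	/* example only, never compiled as part of the kernel */
/* Writer side: leaving complex mode publishes all prior updates. */
static void leave_complex_mode_sketch(struct sem_array *sma)
{
	/* RELEASE: everything written under the global lock is visible
	 * to a simple op that observes use_global_lock == 0. */
	smp_store_release(&sma->use_global_lock, 0);
}

/* Reader side: a simple op may take only the per-semaphore lock. */
static bool simple_op_fast_path_sketch(struct sem_array *sma, struct sem *sem)
{
	spin_lock(&sem->lock);
	/* ACQUIRE: pairs with the smp_store_release() above. */
	if (!smp_load_acquire(&sma->use_global_lock))
		return true;	/* fast path, caller keeps sem->lock */
	spin_unlock(&sem->lock);
	return false;		/* fall back to the global lock */
}
#endif
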
244  #define sc_semmsl	sem_ctls[0]
245  #define sc_semmns	sem_ctls[1]
246  #define sc_semopm	sem_ctls[2]
247  #define sc_semmni	sem_ctls[3]
248  
249  void sem_init_ns(struct ipc_namespace *ns)
250  {
251  	ns->sc_semmsl = SEMMSL;
252  	ns->sc_semmns = SEMMNS;
253  	ns->sc_semopm = SEMOPM;
254  	ns->sc_semmni = SEMMNI;
255  	ns->used_sems = 0;
256  	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
257  }
258  
259  #ifdef CONFIG_IPC_NS
260  void sem_exit_ns(struct ipc_namespace *ns)
261  {
262  	free_ipcs(ns, &sem_ids(ns), freeary);
263  	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
264  	rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
265  }
266  #endif
267  
268  void __init sem_init(void)
269  {
270  	sem_init_ns(&init_ipc_ns);
271  	ipc_init_proc_interface("sysvipc/sem",
272  				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
273  				IPC_SEM_IDS, sysvipc_sem_proc_show);
274  }
275  
276  /**
277   * unmerge_queues - unmerge queues, if possible.
278   * @sma: semaphore array
279   *
280   * The function unmerges the wait queues if complex_count is 0.
281   * It must be called prior to dropping the global semaphore array lock.
282   */
283  static void unmerge_queues(struct sem_array *sma)
284  {
285  	struct sem_queue *q, *tq;
286  
287  	/* complex operations still around? */
288  	if (sma->complex_count)
289  		return;
290  	/*
291  	 * We will switch back to simple mode.
292  	 * Move all pending operation back into the per-semaphore
293  	 * queues.
294  	 */
295  	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
296  		struct sem *curr;
297  		curr = &sma->sems[q->sops[0].sem_num];
298  
299  		list_add_tail(&q->list, &curr->pending_alter);
300  	}
301  	INIT_LIST_HEAD(&sma->pending_alter);
302  }
303  
304  /**
305   * merge_queues - merge single semop queues into global queue
306   * @sma: semaphore array
307   *
308   * This function merges all per-semaphore queues into the global queue.
309   * It is necessary to achieve FIFO ordering for the pending single-sop
310   * operations when a multi-semop operation must sleep.
311   * Only the alter operations must be moved, the const operations can stay.
312   */
313  static void merge_queues(struct sem_array *sma)
314  {
315  	int i;
316  	for (i = 0; i < sma->sem_nsems; i++) {
317  		struct sem *sem = &sma->sems[i];
318  
319  		list_splice_init(&sem->pending_alter, &sma->pending_alter);
320  	}
321  }
322  
323  static void sem_rcu_free(struct rcu_head *head)
324  {
325  	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
326  	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);
327  
328  	security_sem_free(&sma->sem_perm);
329  	kvfree(sma);
330  }
331  
332  /*
333   * Enter the mode suitable for non-simple operations:
334   * Caller must own sem_perm.lock.
335   */
336  static void complexmode_enter(struct sem_array *sma)
337  {
338  	int i;
339  	struct sem *sem;
340  
341  	if (sma->use_global_lock > 0)  {
342  		/*
343  		 * We are already in global lock mode.
344  		 * Nothing to do, just reset the
345  		 * counter until we return to simple mode.
346  		 */
347  		WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
348  		return;
349  	}
350  	WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
351  
352  	for (i = 0; i < sma->sem_nsems; i++) {
353  		sem = &sma->sems[i];
354  		spin_lock(&sem->lock);
355  		spin_unlock(&sem->lock);
356  	}
357  }
358  
359  /*
360   * Try to leave the mode that disallows simple operations:
361   * Caller must own sem_perm.lock.
362   */
363  static void complexmode_tryleave(struct sem_array *sma)
364  {
365  	if (sma->complex_count)  {
366  		/* Complex ops are sleeping.
367  		 * We must stay in complex mode
368  		 */
369  		return;
370  	}
371  	if (sma->use_global_lock == 1) {
372  
373  		/* See SEM_BARRIER_1 for purpose/pairing */
374  		smp_store_release(&sma->use_global_lock, 0);
375  	} else {
376  		WRITE_ONCE(sma->use_global_lock,
377  				sma->use_global_lock-1);
378  	}
379  }
380  
381  #define SEM_GLOBAL_LOCK	(-1)
382  /*
383   * If the request contains only one semaphore operation, and there are
384   * no complex transactions pending, lock only the semaphore involved.
385   * Otherwise, lock the entire semaphore array, since we either have
386   * multiple semaphores in our own semops, or we need to look at
387   * semaphores from other pending complex operations.
388   */
389  static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
390  			      int nsops)
391  {
392  	struct sem *sem;
393  	int idx;
394  
395  	if (nsops != 1) {
396  		/* Complex operation - acquire a full lock */
397  		ipc_lock_object(&sma->sem_perm);
398  
399  		/* Prevent parallel simple ops */
400  		complexmode_enter(sma);
401  		return SEM_GLOBAL_LOCK;
402  	}
403  
404  	/*
405  	 * Only one semaphore affected - try to optimize locking.
406  	 * Optimized locking is possible if no complex operation
407  	 * is either enqueued or processed right now.
408  	 *
409  	 * Both facts are tracked by use_global_lock.
410  	 */
411  	idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
412  	sem = &sma->sems[idx];
413  
414  	/*
415  	 * Initial check for use_global_lock. Just an optimization,
416  	 * no locking, no memory barrier.
417  	 */
418  	if (!READ_ONCE(sma->use_global_lock)) {
419  		/*
420  		 * It appears that no complex operation is around.
421  		 * Acquire the per-semaphore lock.
422  		 */
423  		spin_lock(&sem->lock);
424  
425  		/* see SEM_BARRIER_1 for purpose/pairing */
426  		if (!smp_load_acquire(&sma->use_global_lock)) {
427  			/* fast path successful! */
428  			return sops->sem_num;
429  		}
430  		spin_unlock(&sem->lock);
431  	}
432  
433  	/* slow path: acquire the full lock */
434  	ipc_lock_object(&sma->sem_perm);
435  
436  	if (sma->use_global_lock == 0) {
437  		/*
438  		 * The use_global_lock mode ended while we waited for
439  		 * sma->sem_perm.lock. Thus we must switch to locking
440  		 * with sem->lock.
441  		 * Unlike in the fast path, there is no need to recheck
442  		 * sma->use_global_lock after we have acquired sem->lock:
443  		 * We own sma->sem_perm.lock, thus use_global_lock cannot
444  		 * change.
445  		 */
446  		spin_lock(&sem->lock);
447  
448  		ipc_unlock_object(&sma->sem_perm);
449  		return sops->sem_num;
450  	} else {
451  		/*
452  		 * Not a false alarm, thus continue to use the global lock
453  		 * mode. No need for complexmode_enter(), this was done by
454  		 * the caller that has set use_global_lock to non-zero.
455  		 */
456  		return SEM_GLOBAL_LOCK;
457  	}
458  }
459  
460  static inline void sem_unlock(struct sem_array *sma, int locknum)
461  {
462  	if (locknum == SEM_GLOBAL_LOCK) {
463  		unmerge_queues(sma);
464  		complexmode_tryleave(sma);
465  		ipc_unlock_object(&sma->sem_perm);
466  	} else {
467  		struct sem *sem = &sma->sems[locknum];
468  		spin_unlock(&sem->lock);
469  	}
470  }
471  
472  /*
473   * sem_lock_(check_) routines are called in the paths where the rwsem
474   * is not held.
475   *
476   * The caller holds the RCU read lock.
477   */
478  static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
479  {
480  	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
481  
482  	if (IS_ERR(ipcp))
483  		return ERR_CAST(ipcp);
484  
485  	return container_of(ipcp, struct sem_array, sem_perm);
486  }
487  
488  static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
489  							int id)
490  {
491  	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
492  
493  	if (IS_ERR(ipcp))
494  		return ERR_CAST(ipcp);
495  
496  	return container_of(ipcp, struct sem_array, sem_perm);
497  }
498  
499  static inline void sem_lock_and_putref(struct sem_array *sma)
500  {
501  	sem_lock(sma, NULL, -1);
502  	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
503  }
504  
505  static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
506  {
507  	ipc_rmid(&sem_ids(ns), &s->sem_perm);
508  }
509  
510  static struct sem_array *sem_alloc(size_t nsems)
511  {
512  	struct sem_array *sma;
513  
514  	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
515  		return NULL;
516  
517  	sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
518  	if (unlikely(!sma))
519  		return NULL;
520  
521  	return sma;
522  }
523  
524  /**
525   * newary - Create a new semaphore set
526   * @ns: namespace
527   * @params: ptr to the structure that contains key, semflg and nsems
528   *
529   * Called with sem_ids.rwsem held (as a writer)
530   */
531  static int newary(struct ipc_namespace *ns, struct ipc_params *params)
532  {
533  	int retval;
534  	struct sem_array *sma;
535  	key_t key = params->key;
536  	int nsems = params->u.nsems;
537  	int semflg = params->flg;
538  	int i;
539  
540  	if (!nsems)
541  		return -EINVAL;
542  	if (ns->used_sems + nsems > ns->sc_semmns)
543  		return -ENOSPC;
544  
545  	sma = sem_alloc(nsems);
546  	if (!sma)
547  		return -ENOMEM;
548  
549  	sma->sem_perm.mode = (semflg & S_IRWXUGO);
550  	sma->sem_perm.key = key;
551  
552  	sma->sem_perm.security = NULL;
553  	retval = security_sem_alloc(&sma->sem_perm);
554  	if (retval) {
555  		kvfree(sma);
556  		return retval;
557  	}
558  
559  	for (i = 0; i < nsems; i++) {
560  		INIT_LIST_HEAD(&sma->sems[i].pending_alter);
561  		INIT_LIST_HEAD(&sma->sems[i].pending_const);
562  		spin_lock_init(&sma->sems[i].lock);
563  	}
564  
565  	sma->complex_count = 0;
566  	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
567  	INIT_LIST_HEAD(&sma->pending_alter);
568  	INIT_LIST_HEAD(&sma->pending_const);
569  	INIT_LIST_HEAD(&sma->list_id);
570  	sma->sem_nsems = nsems;
571  	sma->sem_ctime = ktime_get_real_seconds();
572  
573  	/* ipc_addid() locks sma upon success. */
574  	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
575  	if (retval < 0) {
576  		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
577  		return retval;
578  	}
579  	ns->used_sems += nsems;
580  
581  	sem_unlock(sma, -1);
582  	rcu_read_unlock();
583  
584  	return sma->sem_perm.id;
585  }
586  
587  
588  /*
589   * Called with sem_ids.rwsem and ipcp locked.
590   */
591  static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
592  {
593  	struct sem_array *sma;
594  
595  	sma = container_of(ipcp, struct sem_array, sem_perm);
596  	if (params->u.nsems > sma->sem_nsems)
597  		return -EINVAL;
598  
599  	return 0;
600  }
601  
602  long ksys_semget(key_t key, int nsems, int semflg)
603  {
604  	struct ipc_namespace *ns;
605  	static const struct ipc_ops sem_ops = {
606  		.getnew = newary,
607  		.associate = security_sem_associate,
608  		.more_checks = sem_more_checks,
609  	};
610  	struct ipc_params sem_params;
611  
612  	ns = current->nsproxy->ipc_ns;
613  
614  	if (nsems < 0 || nsems > ns->sc_semmsl)
615  		return -EINVAL;
616  
617  	sem_params.key = key;
618  	sem_params.flg = semflg;
619  	sem_params.u.nsems = nsems;
620  
621  	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
622  }
623  
624  SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
625  {
626  	return ksys_semget(key, nsems, semflg);
627  }
628  
629  /**
630   * perform_atomic_semop[_slow] - Attempt to perform semaphore
631   *                               operations on a given array.
632   * @sma: semaphore array
633   * @q: struct sem_queue that describes the operation
634   *
635   * Whether the caller blocks is determined as follows, based on the value
636   * indicated by the semaphore operation (sem_op):
637   *
638   *  (1) >0 never blocks.
639   *  (2)  0 (wait-for-zero operation): semval is non-zero.
640   *  (3) <0 attempting to decrement semval to a value smaller than zero.
641   *
642   * Returns 0 if the operation was possible.
643   * Returns 1 if the operation is impossible, the caller must sleep.
644   * Returns <0 for error codes.
645   */
646  static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
647  {
648  	int result, sem_op, nsops;
649  	struct pid *pid;
650  	struct sembuf *sop;
651  	struct sem *curr;
652  	struct sembuf *sops;
653  	struct sem_undo *un;
654  
655  	sops = q->sops;
656  	nsops = q->nsops;
657  	un = q->undo;
658  
659  	for (sop = sops; sop < sops + nsops; sop++) {
660  		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
661  		curr = &sma->sems[idx];
662  		sem_op = sop->sem_op;
663  		result = curr->semval;
664  
665  		if (!sem_op && result)
666  			goto would_block;
667  
668  		result += sem_op;
669  		if (result < 0)
670  			goto would_block;
671  		if (result > SEMVMX)
672  			goto out_of_range;
673  
674  		if (sop->sem_flg & SEM_UNDO) {
675  			int undo = un->semadj[sop->sem_num] - sem_op;
676  			/* Exceeding the undo range is an error. */
677  			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
678  				goto out_of_range;
679  			un->semadj[sop->sem_num] = undo;
680  		}
681  
682  		curr->semval = result;
683  	}
684  
685  	sop--;
686  	pid = q->pid;
687  	while (sop >= sops) {
688  		ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid);
689  		sop--;
690  	}
691  
692  	return 0;
693  
694  out_of_range:
695  	result = -ERANGE;
696  	goto undo;
697  
698  would_block:
699  	q->blocking = sop;
700  
701  	if (sop->sem_flg & IPC_NOWAIT)
702  		result = -EAGAIN;
703  	else
704  		result = 1;
705  
706  undo:
707  	sop--;
708  	while (sop >= sops) {
709  		sem_op = sop->sem_op;
710  		sma->sems[sop->sem_num].semval -= sem_op;
711  		if (sop->sem_flg & SEM_UNDO)
712  			un->semadj[sop->sem_num] += sem_op;
713  		sop--;
714  	}
715  
716  	return result;
717  }
718  
719  static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
720  {
721  	int result, sem_op, nsops;
722  	struct sembuf *sop;
723  	struct sem *curr;
724  	struct sembuf *sops;
725  	struct sem_undo *un;
726  
727  	sops = q->sops;
728  	nsops = q->nsops;
729  	un = q->undo;
730  
731  	if (unlikely(q->dupsop))
732  		return perform_atomic_semop_slow(sma, q);
733  
734  	/*
735  	 * We scan the semaphore set twice: first to ensure that the entire
736  	 * operation can succeed, thus avoiding any pointless writes to shared
737  	 * memory that would only have to be undone again if we end up having
738  	 * to block until the operations can go through.
739  	 */
740  	for (sop = sops; sop < sops + nsops; sop++) {
741  		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
742  
743  		curr = &sma->sems[idx];
744  		sem_op = sop->sem_op;
745  		result = curr->semval;
746  
747  		if (!sem_op && result)
748  			goto would_block; /* wait-for-zero */
749  
750  		result += sem_op;
751  		if (result < 0)
752  			goto would_block;
753  
754  		if (result > SEMVMX)
755  			return -ERANGE;
756  
757  		if (sop->sem_flg & SEM_UNDO) {
758  			int undo = un->semadj[sop->sem_num] - sem_op;
759  
760  			/* Exceeding the undo range is an error. */
761  			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
762  				return -ERANGE;
763  		}
764  	}
765  
766  	for (sop = sops; sop < sops + nsops; sop++) {
767  		curr = &sma->sems[sop->sem_num];
768  		sem_op = sop->sem_op;
769  		result = curr->semval;
770  
771  		if (sop->sem_flg & SEM_UNDO) {
772  			int undo = un->semadj[sop->sem_num] - sem_op;
773  
774  			un->semadj[sop->sem_num] = undo;
775  		}
776  		curr->semval += sem_op;
777  		ipc_update_pid(&curr->sempid, q->pid);
778  	}
779  
780  	return 0;
781  
782  would_block:
783  	q->blocking = sop;
784  	return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
785  }
786  
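/*
 * Editor's note: illustrative user-space call (hypothetical, not part of
 * this file) that sets q->dupsop and therefore takes the
 * perform_atomic_semop_slow() path: two operations in one semop() name the
 * same semaphore number, so the undo-on-failure logic is required.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <sys/sem.h>

static int wait_for_zero_then_lock(int semid)
{
	struct sembuf ops[2] = {
		{ .sem_num = 0, .sem_op = 0,  .sem_flg = 0 },	/* wait for zero */
		{ .sem_num = 0, .sem_op = +1, .sem_flg = SEM_UNDO },
	};

	/* Both sops refer to sem_num 0, hence dupsop is true in the kernel. */
	return semop(semid, ops, 2);
}
#endif
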
787  static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
788  					     struct wake_q_head *wake_q)
789  {
790  	struct task_struct *sleeper;
791  
792  	sleeper = get_task_struct(q->sleeper);
793  
794  	/* see SEM_BARRIER_2 for purpose/pairing */
795  	smp_store_release(&q->status, error);
796  
797  	wake_q_add_safe(wake_q, sleeper);
798  }
799  
800  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
801  {
802  	list_del(&q->list);
803  	if (q->nsops > 1)
804  		sma->complex_count--;
805  }
806  
807  /** check_restart(sma, q)
808   * @sma: semaphore array
809   * @q: the operation that just completed
810   *
811   * update_queue is O(N^2) when it restarts scanning the whole queue of
812   * waiting operations. Therefore this function checks if the restart is
813   * really necessary. It is called after a previously waiting operation
814   * modified the array.
815   * Note that wait-for-zero operations are handled without restart.
816   */
817  static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
818  {
819  	/* pending complex alter operations are too difficult to analyse */
820  	if (!list_empty(&sma->pending_alter))
821  		return 1;
822  
823  	/* we were a sleeping complex operation. Too difficult */
824  	if (q->nsops > 1)
825  		return 1;
826  
827  	/* It is impossible that someone waits for the new value:
828  	 * - complex operations always restart.
829  	 * - wait-for-zero are handled separately.
830  	 * - q is a previously sleeping simple operation that
831  	 *   altered the array. It must be a decrement, because
832  	 *   simple increments never sleep.
833  	 * - If there are older (higher priority) decrements
834  	 *   in the queue, then they have observed the original
835  	 *   semval value and couldn't proceed. The operation
836   *   decremented the value further - thus they won't proceed either.
837  	 */
838  	return 0;
839  }
840  
841  /**
842   * wake_const_ops - wake up non-alter tasks
843   * @sma: semaphore array.
844   * @semnum: semaphore that was modified.
845   * @wake_q: lockless wake-queue head.
846   *
847   * wake_const_ops must be called after a semaphore in a semaphore array
848   * was set to 0. If complex const operations are pending, wake_const_ops must
849   * be called with semnum = -1, as well as with the number of each modified
850   * semaphore.
851   * The tasks that must be woken up are added to @wake_q. The return code
852   * is stored in q->status.
853   * The function returns 1 if at least one operation was completed successfully.
854   */
855  static int wake_const_ops(struct sem_array *sma, int semnum,
856  			  struct wake_q_head *wake_q)
857  {
858  	struct sem_queue *q, *tmp;
859  	struct list_head *pending_list;
860  	int semop_completed = 0;
861  
862  	if (semnum == -1)
863  		pending_list = &sma->pending_const;
864  	else
865  		pending_list = &sma->sems[semnum].pending_const;
866  
867  	list_for_each_entry_safe(q, tmp, pending_list, list) {
868  		int error = perform_atomic_semop(sma, q);
869  
870  		if (error > 0)
871  			continue;
872  		/* operation completed, remove from queue & wakeup */
873  		unlink_queue(sma, q);
874  
875  		wake_up_sem_queue_prepare(q, error, wake_q);
876  		if (error == 0)
877  			semop_completed = 1;
878  	}
879  
880  	return semop_completed;
881  }
882  
883  /**
884   * do_smart_wakeup_zero - wake up all wait-for-zero tasks
885   * @sma: semaphore array
886   * @sops: operations that were performed
887   * @nsops: number of operations
888   * @wake_q: lockless wake-queue head
889   *
890   * Checks all required queues for wait-for-zero operations, based
891   * on the actual changes that were performed on the semaphore array.
892   * The function returns 1 if at least one operation was completed successfully.
893   */
894  static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
895  				int nsops, struct wake_q_head *wake_q)
896  {
897  	int i;
898  	int semop_completed = 0;
899  	int got_zero = 0;
900  
901  	/* first: the per-semaphore queues, if known */
902  	if (sops) {
903  		for (i = 0; i < nsops; i++) {
904  			int num = sops[i].sem_num;
905  
906  			if (sma->sems[num].semval == 0) {
907  				got_zero = 1;
908  				semop_completed |= wake_const_ops(sma, num, wake_q);
909  			}
910  		}
911  	} else {
912  		/*
913  		 * No sops means modified semaphores not known.
914  		 * Assume all were changed.
915  		 */
916  		for (i = 0; i < sma->sem_nsems; i++) {
917  			if (sma->sems[i].semval == 0) {
918  				got_zero = 1;
919  				semop_completed |= wake_const_ops(sma, i, wake_q);
920  			}
921  		}
922  	}
923  	/*
924  	 * If one of the modified semaphores got 0,
925  	 * then check the global queue, too.
926  	 */
927  	if (got_zero)
928  		semop_completed |= wake_const_ops(sma, -1, wake_q);
929  
930  	return semop_completed;
931  }
932  
933  
934  /**
935   * update_queue - look for tasks that can be completed.
936   * @sma: semaphore array.
937   * @semnum: semaphore that was modified.
938   * @wake_q: lockless wake-queue head.
939   *
940   * update_queue must be called after a semaphore in a semaphore array
941   * was modified. If multiple semaphores were modified, update_queue must
942   * be called with semnum = -1, as well as with the number of each modified
943   * semaphore.
944   * The tasks that must be woken up are added to @wake_q. The return code
945   * is stored in q->status.
946   * The function internally checks if const operations can now succeed.
947   *
948   * The function returns 1 if at least one semop was completed successfully.
949   */
950  static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
951  {
952  	struct sem_queue *q, *tmp;
953  	struct list_head *pending_list;
954  	int semop_completed = 0;
955  
956  	if (semnum == -1)
957  		pending_list = &sma->pending_alter;
958  	else
959  		pending_list = &sma->sems[semnum].pending_alter;
960  
961  again:
962  	list_for_each_entry_safe(q, tmp, pending_list, list) {
963  		int error, restart;
964  
965  		/* If we are scanning the single-sop, per-semaphore list of
966  		 * one semaphore and that semaphore is 0, then it is not
967  		 * necessary to scan further: simple increments
968  		 * that affect only one entry succeed immediately and cannot
969  		 * be in the per-semaphore pending queue, and decrements
970  		 * cannot be successful if the value is already 0.
971  		 */
972  		if (semnum != -1 && sma->sems[semnum].semval == 0)
973  			break;
974  
975  		error = perform_atomic_semop(sma, q);
976  
977  		/* Does q->sleeper still need to sleep? */
978  		if (error > 0)
979  			continue;
980  
981  		unlink_queue(sma, q);
982  
983  		if (error) {
984  			restart = 0;
985  		} else {
986  			semop_completed = 1;
987  			do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
988  			restart = check_restart(sma, q);
989  		}
990  
991  		wake_up_sem_queue_prepare(q, error, wake_q);
992  		if (restart)
993  			goto again;
994  	}
995  	return semop_completed;
996  }
997  
998  /**
999   * set_semotime - set sem_otime
1000   * @sma: semaphore array
1001   * @sops: operations that modified the array, may be NULL
1002   *
1003   * sem_otime is replicated to avoid cache line thrashing.
1004   * This function sets one instance to the current time.
1005   */
1006  static void set_semotime(struct sem_array *sma, struct sembuf *sops)
1007  {
1008  	if (sops == NULL) {
1009  		sma->sems[0].sem_otime = ktime_get_real_seconds();
1010  	} else {
1011  		sma->sems[sops[0].sem_num].sem_otime =
1012  						ktime_get_real_seconds();
1013  	}
1014  }
1015  
1016  /**
1017   * do_smart_update - optimized update_queue
1018   * @sma: semaphore array
1019   * @sops: operations that were performed
1020   * @nsops: number of operations
1021   * @otime: force setting otime
1022   * @wake_q: lockless wake-queue head
1023   *
1024   * do_smart_update() does the required calls to update_queue and wakeup_zero,
1025   * based on the actual changes that were performed on the semaphore array.
1026   * Note that the function does not do the actual wake-up: the caller is
1027   * responsible for calling wake_up_q().
1028   * It is safe to perform this call after dropping all locks.
1029   */
1030  static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
1031  			    int otime, struct wake_q_head *wake_q)
1032  {
1033  	int i;
1034  
1035  	otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
1036  
1037  	if (!list_empty(&sma->pending_alter)) {
1038  		/* semaphore array uses the global queue - just process it. */
1039  		otime |= update_queue(sma, -1, wake_q);
1040  	} else {
1041  		if (!sops) {
1042  			/*
1043  			 * No sops, thus the modified semaphores are not
1044  			 * known. Check all.
1045  			 */
1046  			for (i = 0; i < sma->sem_nsems; i++)
1047  				otime |= update_queue(sma, i, wake_q);
1048  		} else {
1049  			/*
1050  			 * Check the semaphores that were increased:
1051  			 * - No complex ops, thus all sleeping ops are
1052  			 *   decrements.
1053  			 * - if we decreased the value, then any sleeping
1054  			 *   semaphore ops won't be able to run: If the
1055  			 *   previous value was too small, then the new
1056  			 *   value will be too small, too.
1057  			 */
1058  			for (i = 0; i < nsops; i++) {
1059  				if (sops[i].sem_op > 0) {
1060  					otime |= update_queue(sma,
1061  							      sops[i].sem_num, wake_q);
1062  				}
1063  			}
1064  		}
1065  	}
1066  	if (otime)
1067  		set_semotime(sma, sops);
1068  }
1069  
1070  /*
1071   * check_qop: Test if a queued operation sleeps on the semaphore semnum
1072   */
1073  static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
1074  			bool count_zero)
1075  {
1076  	struct sembuf *sop = q->blocking;
1077  
1078  	/*
1079  	 * Linux always (since 0.99.10) reported a task as sleeping on all
1080  	 * semaphores. This violates SUS, therefore it was changed to the
1081  	 * standard compliant behavior.
1082  	 * Give the administrators a chance to notice that an application
1083  	 * might misbehave because it relies on the Linux behavior.
1084  	 */
1085  	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
1086  			"The task %s (%d) triggered the difference, watch for misbehavior.\n",
1087  			current->comm, task_pid_nr(current));
1088  
1089  	if (sop->sem_num != semnum)
1090  		return 0;
1091  
1092  	if (count_zero && sop->sem_op == 0)
1093  		return 1;
1094  	if (!count_zero && sop->sem_op < 0)
1095  		return 1;
1096  
1097  	return 0;
1098  }
1099  
1100  /* The following counts are associated to each semaphore:
1101   *   semncnt        number of tasks waiting on semval being nonzero
1102   *   semzcnt        number of tasks waiting on semval being zero
1103   *
1104   * By definition, a task waits only on the semaphore of the first semop
1105   * that cannot proceed, even if additional operations would block, too.
1106   */
1107  static int count_semcnt(struct sem_array *sma, ushort semnum,
1108  			bool count_zero)
1109  {
1110  	struct list_head *l;
1111  	struct sem_queue *q;
1112  	int semcnt;
1113  
1114  	semcnt = 0;
1115  	/* First: check the simple operations. They are easy to evaluate */
1116  	if (count_zero)
1117  		l = &sma->sems[semnum].pending_const;
1118  	else
1119  		l = &sma->sems[semnum].pending_alter;
1120  
1121  	list_for_each_entry(q, l, list) {
1122  		/* all tasks on a per-semaphore list sleep on exactly
1123  		 * that semaphore
1124  		 */
1125  		semcnt++;
1126  	}
1127  
1128  	/* Then: check the complex operations. */
1129  	list_for_each_entry(q, &sma->pending_alter, list) {
1130  		semcnt += check_qop(sma, semnum, q, count_zero);
1131  	}
1132  	if (count_zero) {
1133  		list_for_each_entry(q, &sma->pending_const, list) {
1134  			semcnt += check_qop(sma, semnum, q, count_zero);
1135  		}
1136  	}
1137  	return semcnt;
1138  }
1139  
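/*
 * Editor's note: hypothetical user-space sketch (not part of this file) of
 * the counts computed by count_semcnt(): GETNCNT reports tasks waiting for
 * the value to increase, GETZCNT reports tasks waiting for it to become zero.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <stdio.h>
#include <sys/sem.h>

static void report_waiters(int semid, int semnum)
{
	int ncnt = semctl(semid, semnum, GETNCNT);
	int zcnt = semctl(semid, semnum, GETZCNT);

	printf("sem %d: %d waiting for increase, %d waiting for zero\n",
	       semnum, ncnt, zcnt);
}
#endif
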
1140  /* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
1141   * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
1142   * remains locked on exit.
1143   */
1144  static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1145  {
1146  	struct sem_undo *un, *tu;
1147  	struct sem_queue *q, *tq;
1148  	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1149  	int i;
1150  	DEFINE_WAKE_Q(wake_q);
1151  
1152  	/* Free the existing undo structures for this semaphore set.  */
1153  	ipc_assert_locked_object(&sma->sem_perm);
1154  	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
1155  		list_del(&un->list_id);
1156  		spin_lock(&un->ulp->lock);
1157  		un->semid = -1;
1158  		list_del_rcu(&un->list_proc);
1159  		spin_unlock(&un->ulp->lock);
1160  		kvfree_rcu(un, rcu);
1161  	}
1162  
1163  	/* Wake up all pending processes and let them fail with EIDRM. */
1164  	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1165  		unlink_queue(sma, q);
1166  		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1167  	}
1168  
1169  	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1170  		unlink_queue(sma, q);
1171  		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1172  	}
1173  	for (i = 0; i < sma->sem_nsems; i++) {
1174  		struct sem *sem = &sma->sems[i];
1175  		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1176  			unlink_queue(sma, q);
1177  			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1178  		}
1179  		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1180  			unlink_queue(sma, q);
1181  			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1182  		}
1183  		ipc_update_pid(&sem->sempid, NULL);
1184  	}
1185  
1186  	/* Remove the semaphore set from the IDR */
1187  	sem_rmid(ns, sma);
1188  	sem_unlock(sma, -1);
1189  	rcu_read_unlock();
1190  
1191  	wake_up_q(&wake_q);
1192  	ns->used_sems -= sma->sem_nsems;
1193  	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1194  }
1195  
1196  static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
1197  {
1198  	switch (version) {
1199  	case IPC_64:
1200  		return copy_to_user(buf, in, sizeof(*in));
1201  	case IPC_OLD:
1202  	    {
1203  		struct semid_ds out;
1204  
1205  		memset(&out, 0, sizeof(out));
1206  
1207  		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
1208  
1209  		out.sem_otime	= in->sem_otime;
1210  		out.sem_ctime	= in->sem_ctime;
1211  		out.sem_nsems	= in->sem_nsems;
1212  
1213  		return copy_to_user(buf, &out, sizeof(out));
1214  	    }
1215  	default:
1216  		return -EINVAL;
1217  	}
1218  }
1219  
1220  static time64_t get_semotime(struct sem_array *sma)
1221  {
1222  	int i;
1223  	time64_t res;
1224  
1225  	res = sma->sems[0].sem_otime;
1226  	for (i = 1; i < sma->sem_nsems; i++) {
1227  		time64_t to = sma->sems[i].sem_otime;
1228  
1229  		if (to > res)
1230  			res = to;
1231  	}
1232  	return res;
1233  }
1234  
1235  static int semctl_stat(struct ipc_namespace *ns, int semid,
1236  			 int cmd, struct semid64_ds *semid64)
1237  {
1238  	struct sem_array *sma;
1239  	time64_t semotime;
1240  	int err;
1241  
1242  	memset(semid64, 0, sizeof(*semid64));
1243  
1244  	rcu_read_lock();
1245  	if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) {
1246  		sma = sem_obtain_object(ns, semid);
1247  		if (IS_ERR(sma)) {
1248  			err = PTR_ERR(sma);
1249  			goto out_unlock;
1250  		}
1251  	} else { /* IPC_STAT */
1252  		sma = sem_obtain_object_check(ns, semid);
1253  		if (IS_ERR(sma)) {
1254  			err = PTR_ERR(sma);
1255  			goto out_unlock;
1256  		}
1257  	}
1258  
1259  	/* see comment for SHM_STAT_ANY */
1260  	if (cmd == SEM_STAT_ANY)
1261  		audit_ipc_obj(&sma->sem_perm);
1262  	else {
1263  		err = -EACCES;
1264  		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
1265  			goto out_unlock;
1266  	}
1267  
1268  	err = security_sem_semctl(&sma->sem_perm, cmd);
1269  	if (err)
1270  		goto out_unlock;
1271  
1272  	ipc_lock_object(&sma->sem_perm);
1273  
1274  	if (!ipc_valid_object(&sma->sem_perm)) {
1275  		ipc_unlock_object(&sma->sem_perm);
1276  		err = -EIDRM;
1277  		goto out_unlock;
1278  	}
1279  
1280  	kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
1281  	semotime = get_semotime(sma);
1282  	semid64->sem_otime = semotime;
1283  	semid64->sem_ctime = sma->sem_ctime;
1284  #ifndef CONFIG_64BIT
1285  	semid64->sem_otime_high = semotime >> 32;
1286  	semid64->sem_ctime_high = sma->sem_ctime >> 32;
1287  #endif
1288  	semid64->sem_nsems = sma->sem_nsems;
1289  
1290  	if (cmd == IPC_STAT) {
1291  		/*
1292  		 * As defined in SUS:
1293  		 * Return 0 on success
1294  		 */
1295  		err = 0;
1296  	} else {
1297  		/*
1298  		 * SEM_STAT and SEM_STAT_ANY (both Linux specific)
1299  		 * Return the full id, including the sequence number
1300  		 */
1301  		err = sma->sem_perm.id;
1302  	}
1303  	ipc_unlock_object(&sma->sem_perm);
1304  out_unlock:
1305  	rcu_read_unlock();
1306  	return err;
1307  }
1308  
1309  static int semctl_info(struct ipc_namespace *ns, int semid,
1310  			 int cmd, void __user *p)
1311  {
1312  	struct seminfo seminfo;
1313  	int max_idx;
1314  	int err;
1315  
1316  	err = security_sem_semctl(NULL, cmd);
1317  	if (err)
1318  		return err;
1319  
1320  	memset(&seminfo, 0, sizeof(seminfo));
1321  	seminfo.semmni = ns->sc_semmni;
1322  	seminfo.semmns = ns->sc_semmns;
1323  	seminfo.semmsl = ns->sc_semmsl;
1324  	seminfo.semopm = ns->sc_semopm;
1325  	seminfo.semvmx = SEMVMX;
1326  	seminfo.semmnu = SEMMNU;
1327  	seminfo.semmap = SEMMAP;
1328  	seminfo.semume = SEMUME;
1329  	down_read(&sem_ids(ns).rwsem);
1330  	if (cmd == SEM_INFO) {
1331  		seminfo.semusz = sem_ids(ns).in_use;
1332  		seminfo.semaem = ns->used_sems;
1333  	} else {
1334  		seminfo.semusz = SEMUSZ;
1335  		seminfo.semaem = SEMAEM;
1336  	}
1337  	max_idx = ipc_get_maxidx(&sem_ids(ns));
1338  	up_read(&sem_ids(ns).rwsem);
1339  	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
1340  		return -EFAULT;
1341  	return (max_idx < 0) ? 0 : max_idx;
1342  }
1343  
1344  static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1345  		int val)
1346  {
1347  	struct sem_undo *un;
1348  	struct sem_array *sma;
1349  	struct sem *curr;
1350  	int err;
1351  	DEFINE_WAKE_Q(wake_q);
1352  
1353  	if (val > SEMVMX || val < 0)
1354  		return -ERANGE;
1355  
1356  	rcu_read_lock();
1357  	sma = sem_obtain_object_check(ns, semid);
1358  	if (IS_ERR(sma)) {
1359  		rcu_read_unlock();
1360  		return PTR_ERR(sma);
1361  	}
1362  
1363  	if (semnum < 0 || semnum >= sma->sem_nsems) {
1364  		rcu_read_unlock();
1365  		return -EINVAL;
1366  	}
1367  
1368  
1369  	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
1370  		rcu_read_unlock();
1371  		return -EACCES;
1372  	}
1373  
1374  	err = security_sem_semctl(&sma->sem_perm, SETVAL);
1375  	if (err) {
1376  		rcu_read_unlock();
1377  		return -EACCES;
1378  	}
1379  
1380  	sem_lock(sma, NULL, -1);
1381  
1382  	if (!ipc_valid_object(&sma->sem_perm)) {
1383  		sem_unlock(sma, -1);
1384  		rcu_read_unlock();
1385  		return -EIDRM;
1386  	}
1387  
1388  	semnum = array_index_nospec(semnum, sma->sem_nsems);
1389  	curr = &sma->sems[semnum];
1390  
1391  	ipc_assert_locked_object(&sma->sem_perm);
1392  	list_for_each_entry(un, &sma->list_id, list_id)
1393  		un->semadj[semnum] = 0;
1394  
1395  	curr->semval = val;
1396  	ipc_update_pid(&curr->sempid, task_tgid(current));
1397  	sma->sem_ctime = ktime_get_real_seconds();
1398  	/* maybe some queued-up processes were waiting for this */
1399  	do_smart_update(sma, NULL, 0, 0, &wake_q);
1400  	sem_unlock(sma, -1);
1401  	rcu_read_unlock();
1402  	wake_up_q(&wake_q);
1403  	return 0;
1404  }
1405  
1406  static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1407  		int cmd, void __user *p)
1408  {
1409  	struct sem_array *sma;
1410  	struct sem *curr;
1411  	int err, nsems;
1412  	ushort fast_sem_io[SEMMSL_FAST];
1413  	ushort *sem_io = fast_sem_io;
1414  	DEFINE_WAKE_Q(wake_q);
1415  
1416  	rcu_read_lock();
1417  	sma = sem_obtain_object_check(ns, semid);
1418  	if (IS_ERR(sma)) {
1419  		rcu_read_unlock();
1420  		return PTR_ERR(sma);
1421  	}
1422  
1423  	nsems = sma->sem_nsems;
1424  
1425  	err = -EACCES;
1426  	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
1427  		goto out_rcu_wakeup;
1428  
1429  	err = security_sem_semctl(&sma->sem_perm, cmd);
1430  	if (err)
1431  		goto out_rcu_wakeup;
1432  
1433  	err = -EACCES;
1434  	switch (cmd) {
1435  	case GETALL:
1436  	{
1437  		ushort __user *array = p;
1438  		int i;
1439  
1440  		sem_lock(sma, NULL, -1);
1441  		if (!ipc_valid_object(&sma->sem_perm)) {
1442  			err = -EIDRM;
1443  			goto out_unlock;
1444  		}
1445  		if (nsems > SEMMSL_FAST) {
1446  			if (!ipc_rcu_getref(&sma->sem_perm)) {
1447  				err = -EIDRM;
1448  				goto out_unlock;
1449  			}
1450  			sem_unlock(sma, -1);
1451  			rcu_read_unlock();
1452  			sem_io = kvmalloc_array(nsems, sizeof(ushort),
1453  						GFP_KERNEL);
1454  			if (sem_io == NULL) {
1455  				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1456  				return -ENOMEM;
1457  			}
1458  
1459  			rcu_read_lock();
1460  			sem_lock_and_putref(sma);
1461  			if (!ipc_valid_object(&sma->sem_perm)) {
1462  				err = -EIDRM;
1463  				goto out_unlock;
1464  			}
1465  		}
1466  		for (i = 0; i < sma->sem_nsems; i++)
1467  			sem_io[i] = sma->sems[i].semval;
1468  		sem_unlock(sma, -1);
1469  		rcu_read_unlock();
1470  		err = 0;
1471  		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
1472  			err = -EFAULT;
1473  		goto out_free;
1474  	}
1475  	case SETALL:
1476  	{
1477  		int i;
1478  		struct sem_undo *un;
1479  
1480  		if (!ipc_rcu_getref(&sma->sem_perm)) {
1481  			err = -EIDRM;
1482  			goto out_rcu_wakeup;
1483  		}
1484  		rcu_read_unlock();
1485  
1486  		if (nsems > SEMMSL_FAST) {
1487  			sem_io = kvmalloc_array(nsems, sizeof(ushort),
1488  						GFP_KERNEL);
1489  			if (sem_io == NULL) {
1490  				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1491  				return -ENOMEM;
1492  			}
1493  		}
1494  
1495  		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
1496  			ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1497  			err = -EFAULT;
1498  			goto out_free;
1499  		}
1500  
1501  		for (i = 0; i < nsems; i++) {
1502  			if (sem_io[i] > SEMVMX) {
1503  				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1504  				err = -ERANGE;
1505  				goto out_free;
1506  			}
1507  		}
1508  		rcu_read_lock();
1509  		sem_lock_and_putref(sma);
1510  		if (!ipc_valid_object(&sma->sem_perm)) {
1511  			err = -EIDRM;
1512  			goto out_unlock;
1513  		}
1514  
1515  		for (i = 0; i < nsems; i++) {
1516  			sma->sems[i].semval = sem_io[i];
1517  			ipc_update_pid(&sma->sems[i].sempid, task_tgid(current));
1518  		}
1519  
1520  		ipc_assert_locked_object(&sma->sem_perm);
1521  		list_for_each_entry(un, &sma->list_id, list_id) {
1522  			for (i = 0; i < nsems; i++)
1523  				un->semadj[i] = 0;
1524  		}
1525  		sma->sem_ctime = ktime_get_real_seconds();
1526  		/* maybe some queued-up processes were waiting for this */
1527  		do_smart_update(sma, NULL, 0, 0, &wake_q);
1528  		err = 0;
1529  		goto out_unlock;
1530  	}
1531  	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
1532  	}
1533  	err = -EINVAL;
1534  	if (semnum < 0 || semnum >= nsems)
1535  		goto out_rcu_wakeup;
1536  
1537  	sem_lock(sma, NULL, -1);
1538  	if (!ipc_valid_object(&sma->sem_perm)) {
1539  		err = -EIDRM;
1540  		goto out_unlock;
1541  	}
1542  
1543  	semnum = array_index_nospec(semnum, nsems);
1544  	curr = &sma->sems[semnum];
1545  
1546  	switch (cmd) {
1547  	case GETVAL:
1548  		err = curr->semval;
1549  		goto out_unlock;
1550  	case GETPID:
1551  		err = pid_vnr(curr->sempid);
1552  		goto out_unlock;
1553  	case GETNCNT:
1554  		err = count_semcnt(sma, semnum, 0);
1555  		goto out_unlock;
1556  	case GETZCNT:
1557  		err = count_semcnt(sma, semnum, 1);
1558  		goto out_unlock;
1559  	}
1560  
1561  out_unlock:
1562  	sem_unlock(sma, -1);
1563  out_rcu_wakeup:
1564  	rcu_read_unlock();
1565  	wake_up_q(&wake_q);
1566  out_free:
1567  	if (sem_io != fast_sem_io)
1568  		kvfree(sem_io);
1569  	return err;
1570  }
1571  
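/*
 * Editor's note: hypothetical user-space sketch (not part of this file) of
 * the SETALL path handled by semctl_main(). Per SUSv3 the caller must define
 * union semun itself.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <sys/sem.h>

union semun {
	int val;
	struct semid_ds *buf;
	unsigned short *array;
};

static int reset_all_to_one(int semid, int nsems)
{
	unsigned short vals[nsems];
	union semun arg = { .array = vals };
	int i;

	for (i = 0; i < nsems; i++)
		vals[i] = 1;

	/* Updates every semval and clears all undo adjustments. */
	return semctl(semid, 0, SETALL, arg);
}
#endif
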
1572  static inline unsigned long
1573  copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
1574  {
1575  	switch (version) {
1576  	case IPC_64:
1577  		if (copy_from_user(out, buf, sizeof(*out)))
1578  			return -EFAULT;
1579  		return 0;
1580  	case IPC_OLD:
1581  	    {
1582  		struct semid_ds tbuf_old;
1583  
1584  		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
1585  			return -EFAULT;
1586  
1587  		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
1588  		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
1589  		out->sem_perm.mode	= tbuf_old.sem_perm.mode;
1590  
1591  		return 0;
1592  	    }
1593  	default:
1594  		return -EINVAL;
1595  	}
1596  }
1597  
1598  /*
1599   * This function handles some semctl commands which require the rwsem
1600   * to be held in write mode.
1601   * NOTE: no locks must be held, the rwsem is taken inside this function.
1602   */
1603  static int semctl_down(struct ipc_namespace *ns, int semid,
1604  		       int cmd, struct semid64_ds *semid64)
1605  {
1606  	struct sem_array *sma;
1607  	int err;
1608  	struct kern_ipc_perm *ipcp;
1609  
1610  	down_write(&sem_ids(ns).rwsem);
1611  	rcu_read_lock();
1612  
1613  	ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
1614  				      &semid64->sem_perm, 0);
1615  	if (IS_ERR(ipcp)) {
1616  		err = PTR_ERR(ipcp);
1617  		goto out_unlock1;
1618  	}
1619  
1620  	sma = container_of(ipcp, struct sem_array, sem_perm);
1621  
1622  	err = security_sem_semctl(&sma->sem_perm, cmd);
1623  	if (err)
1624  		goto out_unlock1;
1625  
1626  	switch (cmd) {
1627  	case IPC_RMID:
1628  		sem_lock(sma, NULL, -1);
1629  		/* freeary unlocks the ipc object and rcu */
1630  		freeary(ns, ipcp);
1631  		goto out_up;
1632  	case IPC_SET:
1633  		sem_lock(sma, NULL, -1);
1634  		err = ipc_update_perm(&semid64->sem_perm, ipcp);
1635  		if (err)
1636  			goto out_unlock0;
1637  		sma->sem_ctime = ktime_get_real_seconds();
1638  		break;
1639  	default:
1640  		err = -EINVAL;
1641  		goto out_unlock1;
1642  	}
1643  
1644  out_unlock0:
1645  	sem_unlock(sma, -1);
1646  out_unlock1:
1647  	rcu_read_unlock();
1648  out_up:
1649  	up_write(&sem_ids(ns).rwsem);
1650  	return err;
1651  }
1652  
1653  static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
1654  {
1655  	struct ipc_namespace *ns;
1656  	void __user *p = (void __user *)arg;
1657  	struct semid64_ds semid64;
1658  	int err;
1659  
1660  	if (semid < 0)
1661  		return -EINVAL;
1662  
1663  	ns = current->nsproxy->ipc_ns;
1664  
1665  	switch (cmd) {
1666  	case IPC_INFO:
1667  	case SEM_INFO:
1668  		return semctl_info(ns, semid, cmd, p);
1669  	case IPC_STAT:
1670  	case SEM_STAT:
1671  	case SEM_STAT_ANY:
1672  		err = semctl_stat(ns, semid, cmd, &semid64);
1673  		if (err < 0)
1674  			return err;
1675  		if (copy_semid_to_user(p, &semid64, version))
1676  			err = -EFAULT;
1677  		return err;
1678  	case GETALL:
1679  	case GETVAL:
1680  	case GETPID:
1681  	case GETNCNT:
1682  	case GETZCNT:
1683  	case SETALL:
1684  		return semctl_main(ns, semid, semnum, cmd, p);
1685  	case SETVAL: {
1686  		int val;
1687  #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1688  		/* big-endian 64bit */
1689  		val = arg >> 32;
1690  #else
1691  		/* 32bit or little-endian 64bit */
1692  		val = arg;
1693  #endif
1694  		return semctl_setval(ns, semid, semnum, val);
1695  	}
1696  	case IPC_SET:
1697  		if (copy_semid_from_user(&semid64, p, version))
1698  			return -EFAULT;
1699  		fallthrough;
1700  	case IPC_RMID:
1701  		return semctl_down(ns, semid, cmd, &semid64);
1702  	default:
1703  		return -EINVAL;
1704  	}
1705  }
1706  
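/*
 * Editor's note: hypothetical user-space counterpart (not part of this file)
 * of the SETVAL handling above. The C library passes the union semun argument
 * in a register-sized slot, which is why the kernel recovers 'val' from the
 * low (or, on big-endian 64-bit, high) half of 'arg'; see the #if block in
 * ksys_semctl() above.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <sys/sem.h>

union semun {
	int val;
	struct semid_ds *buf;
	unsigned short *array;
};

static int set_to_zero(int semid, int semnum)
{
	union semun arg = { .val = 0 };

	return semctl(semid, semnum, SETVAL, arg);
}
#endif
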
1707  SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
1708  {
1709  	return ksys_semctl(semid, semnum, cmd, arg, IPC_64);
1710  }
1711  
1712  #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
1713  long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
1714  {
1715  	int version = ipc_parse_version(&cmd);
1716  
1717  	return ksys_semctl(semid, semnum, cmd, arg, version);
1718  }
1719  
1720  SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
1721  {
1722  	return ksys_old_semctl(semid, semnum, cmd, arg);
1723  }
1724  #endif
1725  
1726  #ifdef CONFIG_COMPAT
1727  
1728  struct compat_semid_ds {
1729  	struct compat_ipc_perm sem_perm;
1730  	old_time32_t sem_otime;
1731  	old_time32_t sem_ctime;
1732  	compat_uptr_t sem_base;
1733  	compat_uptr_t sem_pending;
1734  	compat_uptr_t sem_pending_last;
1735  	compat_uptr_t undo;
1736  	unsigned short sem_nsems;
1737  };
1738  
1739  static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf,
1740  					int version)
1741  {
1742  	memset(out, 0, sizeof(*out));
1743  	if (version == IPC_64) {
1744  		struct compat_semid64_ds __user *p = buf;
1745  		return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm);
1746  	} else {
1747  		struct compat_semid_ds __user *p = buf;
1748  		return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm);
1749  	}
1750  }
1751  
1752  static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
1753  					int version)
1754  {
1755  	if (version == IPC_64) {
1756  		struct compat_semid64_ds v;
1757  		memset(&v, 0, sizeof(v));
1758  		to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
1759  		v.sem_otime	 = lower_32_bits(in->sem_otime);
1760  		v.sem_otime_high = upper_32_bits(in->sem_otime);
1761  		v.sem_ctime	 = lower_32_bits(in->sem_ctime);
1762  		v.sem_ctime_high = upper_32_bits(in->sem_ctime);
1763  		v.sem_nsems = in->sem_nsems;
1764  		return copy_to_user(buf, &v, sizeof(v));
1765  	} else {
1766  		struct compat_semid_ds v;
1767  		memset(&v, 0, sizeof(v));
1768  		to_compat_ipc_perm(&v.sem_perm, &in->sem_perm);
1769  		v.sem_otime = in->sem_otime;
1770  		v.sem_ctime = in->sem_ctime;
1771  		v.sem_nsems = in->sem_nsems;
1772  		return copy_to_user(buf, &v, sizeof(v));
1773  	}
1774  }
1775  
1776  static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
1777  {
1778  	void __user *p = compat_ptr(arg);
1779  	struct ipc_namespace *ns;
1780  	struct semid64_ds semid64;
1781  	int err;
1782  
1783  	ns = current->nsproxy->ipc_ns;
1784  
1785  	if (semid < 0)
1786  		return -EINVAL;
1787  
1788  	switch (cmd & (~IPC_64)) {
1789  	case IPC_INFO:
1790  	case SEM_INFO:
1791  		return semctl_info(ns, semid, cmd, p);
1792  	case IPC_STAT:
1793  	case SEM_STAT:
1794  	case SEM_STAT_ANY:
1795  		err = semctl_stat(ns, semid, cmd, &semid64);
1796  		if (err < 0)
1797  			return err;
1798  		if (copy_compat_semid_to_user(p, &semid64, version))
1799  			err = -EFAULT;
1800  		return err;
1801  	case GETVAL:
1802  	case GETPID:
1803  	case GETNCNT:
1804  	case GETZCNT:
1805  	case GETALL:
1806  	case SETALL:
1807  		return semctl_main(ns, semid, semnum, cmd, p);
1808  	case SETVAL:
1809  		return semctl_setval(ns, semid, semnum, arg);
1810  	case IPC_SET:
1811  		if (copy_compat_semid_from_user(&semid64, p, version))
1812  			return -EFAULT;
1813  		fallthrough;
1814  	case IPC_RMID:
1815  		return semctl_down(ns, semid, cmd, &semid64);
1816  	default:
1817  		return -EINVAL;
1818  	}
1819  }
1820  
1821  COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
1822  {
1823  	return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64);
1824  }
1825  
1826  #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
1827  long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
1828  {
1829  	int version = compat_ipc_parse_version(&cmd);
1830  
1831  	return compat_ksys_semctl(semid, semnum, cmd, arg, version);
1832  }
1833  
1834  COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
1835  {
1836  	return compat_ksys_old_semctl(semid, semnum, cmd, arg);
1837  }
1838  #endif
1839  #endif
1840  
1841  /* If the task doesn't already have an undo_list, then allocate one
1842   * here.  We guarantee there is only one thread using this undo list,
1843   * and current is THE ONE
1844   *
1845   * If this allocation and assignment succeeds, but later
1846   * portions of this code fail, there is no need to free the sem_undo_list.
1847   * Just let it stay associated with the task, and it'll be freed later
1848   * at exit time.
1849   *
1850   * This can block, so callers must hold no locks.
1851   */
1852  static inline int get_undo_list(struct sem_undo_list **undo_listp)
1853  {
1854  	struct sem_undo_list *undo_list;
1855  
1856  	undo_list = current->sysvsem.undo_list;
1857  	if (!undo_list) {
1858  		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
1859  		if (undo_list == NULL)
1860  			return -ENOMEM;
1861  		spin_lock_init(&undo_list->lock);
1862  		refcount_set(&undo_list->refcnt, 1);
1863  		INIT_LIST_HEAD(&undo_list->list_proc);
1864  
1865  		current->sysvsem.undo_list = undo_list;
1866  	}
1867  	*undo_listp = undo_list;
1868  	return 0;
1869  }
1870  
1871  static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
1872  {
1873  	struct sem_undo *un;
1874  
1875  	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
1876  				spin_is_locked(&ulp->lock)) {
1877  		if (un->semid == semid)
1878  			return un;
1879  	}
1880  	return NULL;
1881  }
1882  
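/*
 * Like __lookup_undo(), but a hit is additionally moved to the front of
 * ulp->list_proc (a simple move-to-front heuristic so that repeated
 * operations on the same semaphore array find their undo entry quickly).
 * Must be called with ulp->lock held.
 */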
1883  static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
1884  {
1885  	struct sem_undo *un;
1886  
1887  	assert_spin_locked(&ulp->lock);
1888  
1889  	un = __lookup_undo(ulp, semid);
1890  	if (un) {
1891  		list_del_rcu(&un->list_proc);
1892  		list_add_rcu(&un->list_proc, &ulp->list_proc);
1893  	}
1894  	return un;
1895  }
1896  
1897  /**
1898   * find_alloc_undo - lookup (and if not present create) undo array
1899   * @ns: namespace
1900   * @semid: semaphore array id
1901   *
1902   * The function looks up (and if not present creates) the undo structure.
1903   * The size of the undo structure depends on the size of the semaphore
1904   * array, thus the alloc path is not that straightforward.
1905   * Lifetime-rules: sem_undo is rcu-protected; on success, the function
1906   * returns with rcu_read_lock() held.
1907   */
1908  static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
1909  {
1910  	struct sem_array *sma;
1911  	struct sem_undo_list *ulp;
1912  	struct sem_undo *un, *new;
1913  	int nsems, error;
1914  
1915  	error = get_undo_list(&ulp);
1916  	if (error)
1917  		return ERR_PTR(error);
1918  
1919  	rcu_read_lock();
1920  	spin_lock(&ulp->lock);
1921  	un = lookup_undo(ulp, semid);
1922  	spin_unlock(&ulp->lock);
1923  	if (likely(un != NULL))
1924  		goto out;
1925  
1926  	/* no undo structure around - allocate one. */
1927  	/* step 1: figure out the size of the semaphore array */
1928  	sma = sem_obtain_object_check(ns, semid);
1929  	if (IS_ERR(sma)) {
1930  		rcu_read_unlock();
1931  		return ERR_CAST(sma);
1932  	}
1933  
1934  	nsems = sma->sem_nsems;
1935  	if (!ipc_rcu_getref(&sma->sem_perm)) {
1936  		rcu_read_unlock();
1937  		un = ERR_PTR(-EIDRM);
1938  		goto out;
1939  	}
1940  	rcu_read_unlock();
1941  
1942  	/* step 2: allocate new undo structure */
1943  	new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
1944  		       GFP_KERNEL_ACCOUNT);
1945  	if (!new) {
1946  		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
1947  		return ERR_PTR(-ENOMEM);
1948  	}
1949  
1950  	/* step 3: Acquire the lock on semaphore array */
1951  	rcu_read_lock();
1952  	sem_lock_and_putref(sma);
1953  	if (!ipc_valid_object(&sma->sem_perm)) {
1954  		sem_unlock(sma, -1);
1955  		rcu_read_unlock();
1956  		kvfree(new);
1957  		un = ERR_PTR(-EIDRM);
1958  		goto out;
1959  	}
1960  	spin_lock(&ulp->lock);
1961  
1962  	/*
1963  	 * step 4: check for races: did someone else allocate the undo struct?
1964  	 */
1965  	un = lookup_undo(ulp, semid);
1966  	if (un) {
1967  		kvfree(new);
1968  		goto success;
1969  	}
1970  	/* step 5: initialize & link new undo structure */
1971  	new->semadj = (short *) &new[1];
1972  	new->ulp = ulp;
1973  	new->semid = semid;
1974  	assert_spin_locked(&ulp->lock);
1975  	list_add_rcu(&new->list_proc, &ulp->list_proc);
1976  	ipc_assert_locked_object(&sma->sem_perm);
1977  	list_add(&new->list_id, &sma->list_id);
1978  	un = new;
1979  
1980  success:
1981  	spin_unlock(&ulp->lock);
1982  	sem_unlock(sma, -1);
1983  out:
1984  	return un;
1985  }
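
/*
 * Minimal user-space sketch (assumed example, not part of this file) of what
 * the undo structure allocated above is for: a semop() issued with SEM_UNDO
 * records a per-semaphore adjustment that exit_sem() replays when the task
 * dies, so a killed lock holder does not leave the semaphore decremented
 * forever.
 *
 *	#include <sys/ipc.h>
 *	#include <sys/sem.h>
 *
 *	int lock_sem(int semid)
 *	{
 *		struct sembuf op = {
 *			.sem_num = 0,
 *			.sem_op  = -1,		// acquire
 *			.sem_flg = SEM_UNDO,	// rolled back in exit_sem()
 *		};
 *
 *		return semop(semid, &op, 1);
 *	}
 */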
1986  
1987  long __do_semtimedop(int semid, struct sembuf *sops,
1988  		unsigned nsops, const struct timespec64 *timeout,
1989  		struct ipc_namespace *ns)
1990  {
1991  	int error = -EINVAL;
1992  	struct sem_array *sma;
1993  	struct sembuf *sop;
1994  	struct sem_undo *un;
1995  	int max, locknum;
1996  	bool undos = false, alter = false, dupsop = false;
1997  	struct sem_queue queue;
1998  	unsigned long dup = 0, jiffies_left = 0;
1999  
2000  	if (nsops < 1 || semid < 0)
2001  		return -EINVAL;
2002  	if (nsops > ns->sc_semopm)
2003  		return -E2BIG;
2004  
2005  	if (timeout) {
2006  		if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 ||
2007  			timeout->tv_nsec >= 1000000000L) {
2008  			error = -EINVAL;
2009  			goto out;
2010  		}
2011  		jiffies_left = timespec64_to_jiffies(timeout);
2012  	}
2013  
2014  
2015  	max = 0;
2016  	for (sop = sops; sop < sops + nsops; sop++) {
2017  		unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
2018  
2019  		if (sop->sem_num >= max)
2020  			max = sop->sem_num;
2021  		if (sop->sem_flg & SEM_UNDO)
2022  			undos = true;
2023  		if (dup & mask) {
2024  			/*
2025  			 * There was a previous alter access that appears
2026  			 * to have accessed the same semaphore, thus use
2027  			 * the dupsop logic. "appears", because the detection
2028  			 * can only check % BITS_PER_LONG.
2029  			 */
2030  			dupsop = true;
2031  		}
2032  		if (sop->sem_op != 0) {
2033  			alter = true;
2034  			dup |= mask;
2035  		}
2036  	}
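
	/*
	 * Worked example of the duplicate detection above (illustrative): on
	 * a 64-bit kernel, alter operations on sem_num 1 and sem_num 65 both
	 * map to bit 1 of "dup", so dupsop is set even though no semaphore is
	 * really touched twice.  Such a false positive only costs the slower
	 * dupsop path in perform_atomic_semop(), never correctness.
	 */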
2037  
2038  	if (undos) {
2039  		/* On success, find_alloc_undo takes the rcu_read_lock */
2040  		un = find_alloc_undo(ns, semid);
2041  		if (IS_ERR(un)) {
2042  			error = PTR_ERR(un);
2043  			goto out;
2044  		}
2045  	} else {
2046  		un = NULL;
2047  		rcu_read_lock();
2048  	}
2049  
2050  	sma = sem_obtain_object_check(ns, semid);
2051  	if (IS_ERR(sma)) {
2052  		rcu_read_unlock();
2053  		error = PTR_ERR(sma);
2054  		goto out;
2055  	}
2056  
2057  	error = -EFBIG;
2058  	if (max >= sma->sem_nsems) {
2059  		rcu_read_unlock();
2060  		goto out;
2061  	}
2062  
2063  	error = -EACCES;
2064  	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
2065  		rcu_read_unlock();
2066  		goto out;
2067  	}
2068  
2069  	error = security_sem_semop(&sma->sem_perm, sops, nsops, alter);
2070  	if (error) {
2071  		rcu_read_unlock();
2072  		goto out;
2073  	}
2074  
2075  	error = -EIDRM;
2076  	locknum = sem_lock(sma, sops, nsops);
2077  	/*
2078  	 * We eventually might perform the following check in a lockless
2079  	 * fashion, considering ipc_valid_object() locking constraints.
2080  	 * If nsops == 1 and there is no contention for sem_perm.lock, then
2081  	 * only a per-semaphore lock is held and it's OK to proceed with the
2082  	 * check below. More details on the fine-grained locking scheme
2083  	 * entangled here, and why it is RMID race safe, are in the comments at sem_lock().
2084  	 */
2085  	if (!ipc_valid_object(&sma->sem_perm))
2086  		goto out_unlock;
2087  	/*
2088  	 * semid identifiers are not unique - find_alloc_undo may have
2089  	 * allocated an undo structure, it was invalidated by an RMID
2090  	 * and now a new array has received the same id. Check and fail.
2091  	 * This case can be detected by checking un->semid. The existence of
2092  	 * "un" itself is guaranteed by rcu.
2093  	 */
2094  	if (un && un->semid == -1)
2095  		goto out_unlock;
2096  
2097  	queue.sops = sops;
2098  	queue.nsops = nsops;
2099  	queue.undo = un;
2100  	queue.pid = task_tgid(current);
2101  	queue.alter = alter;
2102  	queue.dupsop = dupsop;
2103  
2104  	error = perform_atomic_semop(sma, &queue);
2105  	if (error == 0) { /* non-blocking successful path */
2106  		DEFINE_WAKE_Q(wake_q);
2107  
2108  		/*
2109  		 * If the operation was successful, then do
2110  		 * the required updates.
2111  		 */
2112  		if (alter)
2113  			do_smart_update(sma, sops, nsops, 1, &wake_q);
2114  		else
2115  			set_semotime(sma, sops);
2116  
2117  		sem_unlock(sma, locknum);
2118  		rcu_read_unlock();
2119  		wake_up_q(&wake_q);
2120  
2121  		goto out;
2122  	}
2123  	if (error < 0) /* non-blocking error path */
2124  		goto out_unlock;
2125  
2126  	/*
2127  	 * We need to sleep on this operation, so we put the current
2128  	 * task into the pending queue and go to sleep.
2129  	 */
2130  	if (nsops == 1) {
2131  		struct sem *curr;
2132  		int idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
2133  		curr = &sma->sems[idx];
2134  
2135  		if (alter) {
2136  			if (sma->complex_count) {
2137  				list_add_tail(&queue.list,
2138  						&sma->pending_alter);
2139  			} else {
2140  
2141  				list_add_tail(&queue.list,
2142  						&curr->pending_alter);
2143  			}
2144  		} else {
2145  			list_add_tail(&queue.list, &curr->pending_const);
2146  		}
2147  	} else {
2148  		if (!sma->complex_count)
2149  			merge_queues(sma);
2150  
2151  		if (alter)
2152  			list_add_tail(&queue.list, &sma->pending_alter);
2153  		else
2154  			list_add_tail(&queue.list, &sma->pending_const);
2155  
2156  		sma->complex_count++;
2157  	}
2158  
2159  	do {
2160  		/* memory ordering ensured by the lock in sem_lock() */
2161  		WRITE_ONCE(queue.status, -EINTR);
2162  		queue.sleeper = current;
2163  
2164  		/* memory ordering is ensured by the lock in sem_lock() */
2165  		__set_current_state(TASK_INTERRUPTIBLE);
2166  		sem_unlock(sma, locknum);
2167  		rcu_read_unlock();
2168  
2169  		if (timeout)
2170  			jiffies_left = schedule_timeout(jiffies_left);
2171  		else
2172  			schedule();
2173  
2174  		/*
2175  		 * fastpath: the semop has completed; whether it did so
2176  		 * successfully or not is, from the syscall pov, irrelevant to
2177  		 * us at this point; we're done.
2178  		 *
2179  		 * We _do_ care, nonetheless, about being awoken by a signal or
2180  		 * spuriously.  The queue.status is checked again in the
2181  		 * slowpath (aka after taking sem_lock), such that we can detect
2182  		 * scenarios where we were awakened externally, during the
2183  		 * window between wake_q_add() and wake_up_q().
2184  		 */
2185  		error = READ_ONCE(queue.status);
2186  		if (error != -EINTR) {
2187  			/* see SEM_BARRIER_2 for purpose/pairing */
2188  			smp_acquire__after_ctrl_dep();
2189  			goto out;
2190  		}
2191  
2192  		rcu_read_lock();
2193  		locknum = sem_lock(sma, sops, nsops);
2194  
2195  		if (!ipc_valid_object(&sma->sem_perm))
2196  			goto out_unlock;
2197  
2198  		/*
2199  		 * No necessity for any barrier: we are protected by sem_lock().
2200  		 */
2201  		error = READ_ONCE(queue.status);
2202  
2203  		/*
2204  		 * If queue.status != -EINTR we are woken up by another process.
2205  		 * Leave without unlink_queue(), but with sem_unlock().
2206  		 */
2207  		if (error != -EINTR)
2208  			goto out_unlock;
2209  
2210  		/*
2211  		 * If an interrupt occurred we have to clean up the queue.
2212  		 */
2213  		if (timeout && jiffies_left == 0)
2214  			error = -EAGAIN;
2215  	} while (error == -EINTR && !signal_pending(current)); /* spurious */
2216  
2217  	unlink_queue(sma, &queue);
2218  
2219  out_unlock:
2220  	sem_unlock(sma, locknum);
2221  	rcu_read_unlock();
2222  out:
2223  	return error;
2224  }
2225  
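/*
 * Copy the user's sembuf array (using an on-stack buffer for up to
 * SEMOPM_FAST operations to avoid an allocation) and hand off to
 * __do_semtimedop() in the caller's ipc namespace.
 */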
2226  static long do_semtimedop(int semid, struct sembuf __user *tsops,
2227  		unsigned nsops, const struct timespec64 *timeout)
2228  {
2229  	struct sembuf fast_sops[SEMOPM_FAST];
2230  	struct sembuf *sops = fast_sops;
2231  	struct ipc_namespace *ns;
2232  	int ret;
2233  
2234  	ns = current->nsproxy->ipc_ns;
2235  	if (nsops > ns->sc_semopm)
2236  		return -E2BIG;
2237  	if (nsops < 1)
2238  		return -EINVAL;
2239  
2240  	if (nsops > SEMOPM_FAST) {
2241  		sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
2242  		if (sops == NULL)
2243  			return -ENOMEM;
2244  	}
2245  
2246  	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
2247  		ret =  -EFAULT;
2248  		goto out_free;
2249  	}
2250  
2251  	ret = __do_semtimedop(semid, sops, nsops, timeout, ns);
2252  
2253  out_free:
2254  	if (sops != fast_sops)
2255  		kvfree(sops);
2256  
2257  	return ret;
2258  }
2259  
2260  long ksys_semtimedop(int semid, struct sembuf __user *tsops,
2261  		     unsigned int nsops, const struct __kernel_timespec __user *timeout)
2262  {
2263  	if (timeout) {
2264  		struct timespec64 ts;
2265  		if (get_timespec64(&ts, timeout))
2266  			return -EFAULT;
2267  		return do_semtimedop(semid, tsops, nsops, &ts);
2268  	}
2269  	return do_semtimedop(semid, tsops, nsops, NULL);
2270  }
2271  
2272  SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
2273  		unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
2274  {
2275  	return ksys_semtimedop(semid, tsops, nsops, timeout);
2276  }
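
/*
 * Hedged user-space sketch of the syscall above (assumed example): wait at
 * most one second for the semaphore; on expiry the call fails with EAGAIN,
 * i.e. the -EAGAIN that __do_semtimedop() sets once jiffies_left reaches 0.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/sem.h>
 *	#include <time.h>
 *
 *	int try_lock_1s(int semid)
 *	{
 *		struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
 *		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *		return semtimedop(semid, &op, 1, &ts);	// -1 and errno on failure
 *	}
 */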
2277  
2278  #ifdef CONFIG_COMPAT_32BIT_TIME
2279  long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
2280  			    unsigned int nsops,
2281  			    const struct old_timespec32 __user *timeout)
2282  {
2283  	if (timeout) {
2284  		struct timespec64 ts;
2285  		if (get_old_timespec32(&ts, timeout))
2286  			return -EFAULT;
2287  		return do_semtimedop(semid, tsems, nsops, &ts);
2288  	}
2289  	return do_semtimedop(semid, tsems, nsops, NULL);
2290  }
2291  
2292  SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
2293  		       unsigned int, nsops,
2294  		       const struct old_timespec32 __user *, timeout)
2295  {
2296  	return compat_ksys_semtimedop(semid, tsems, nsops, timeout);
2297  }
2298  #endif
2299  
2300  SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
2301  		unsigned, nsops)
2302  {
2303  	return do_semtimedop(semid, tsops, nsops, NULL);
2304  }
2305  
2306  /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
2307   * parent and child tasks.
2308   */
2309  
2310  int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
2311  {
2312  	struct sem_undo_list *undo_list;
2313  	int error;
2314  
2315  	if (clone_flags & CLONE_SYSVSEM) {
2316  		error = get_undo_list(&undo_list);
2317  		if (error)
2318  			return error;
2319  		refcount_inc(&undo_list->refcnt);
2320  		tsk->sysvsem.undo_list = undo_list;
2321  	} else
2322  		tsk->sysvsem.undo_list = NULL;
2323  
2324  	return 0;
2325  }
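
/*
 * Illustrative sketch (assumed example, not part of this file): a caller
 * that wants its child to share SEM_UNDO state passes CLONE_SYSVSEM, so
 * both tasks end up pointing at the same refcounted undo_list handled
 * above.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <signal.h>
 *
 *	int spawn_sharing_undo(int (*fn)(void *), void *stack_top, void *arg)
 *	{
 *		return clone(fn, stack_top, CLONE_SYSVSEM | SIGCHLD, arg);
 *	}
 */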
2326  
2327  /*
2328   * add semadj values to semaphores, free undo structures.
2329   * undo structures are not freed when semaphore arrays are destroyed
2330   * so some of them may be out of date.
2331   * IMPLEMENTATION NOTE: There is some confusion over whether the set of
2332   * adjustments should be applied atomically or not. That is, if we are
2333   * attempting to decrement the semval, should we queue up and wait until
2334   * we can do so legally?
2335   * The original implementation attempted to do this (queue and wait).
2336   * The current implementation does not do so. The POSIX standard
2337   * and SVID should be consulted to determine what behavior is mandated.
2338   */
2339  void exit_sem(struct task_struct *tsk)
2340  {
2341  	struct sem_undo_list *ulp;
2342  
2343  	ulp = tsk->sysvsem.undo_list;
2344  	if (!ulp)
2345  		return;
2346  	tsk->sysvsem.undo_list = NULL;
2347  
2348  	if (!refcount_dec_and_test(&ulp->refcnt))
2349  		return;
2350  
2351  	for (;;) {
2352  		struct sem_array *sma;
2353  		struct sem_undo *un;
2354  		int semid, i;
2355  		DEFINE_WAKE_Q(wake_q);
2356  
2357  		cond_resched();
2358  
2359  		rcu_read_lock();
2360  		un = list_entry_rcu(ulp->list_proc.next,
2361  				    struct sem_undo, list_proc);
2362  		if (&un->list_proc == &ulp->list_proc) {
2363  			/*
2364  			 * We must wait for freeary() before freeing this ulp,
2365  			 * in case we raced with the last sem_undo. There is a small
2366  			 * possibility that we exit while freeary() hasn't yet
2367  			 * finished unlocking the sem_undo_list.
2368  			 */
2369  			spin_lock(&ulp->lock);
2370  			spin_unlock(&ulp->lock);
2371  			rcu_read_unlock();
2372  			break;
2373  		}
2374  		spin_lock(&ulp->lock);
2375  		semid = un->semid;
2376  		spin_unlock(&ulp->lock);
2377  
2378  		/* exit_sem raced with IPC_RMID, nothing to do */
2379  		if (semid == -1) {
2380  			rcu_read_unlock();
2381  			continue;
2382  		}
2383  
2384  		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
2385  		/* exit_sem raced with IPC_RMID, nothing to do */
2386  		if (IS_ERR(sma)) {
2387  			rcu_read_unlock();
2388  			continue;
2389  		}
2390  
2391  		sem_lock(sma, NULL, -1);
2392  		/* exit_sem raced with IPC_RMID, nothing to do */
2393  		if (!ipc_valid_object(&sma->sem_perm)) {
2394  			sem_unlock(sma, -1);
2395  			rcu_read_unlock();
2396  			continue;
2397  		}
2398  		un = __lookup_undo(ulp, semid);
2399  		if (un == NULL) {
2400  			/* exit_sem raced with IPC_RMID+semget() that created
2401  			 * exactly the same semid. Nothing to do.
2402  			 */
2403  			sem_unlock(sma, -1);
2404  			rcu_read_unlock();
2405  			continue;
2406  		}
2407  
2408  		/* remove un from the linked lists */
2409  		ipc_assert_locked_object(&sma->sem_perm);
2410  		list_del(&un->list_id);
2411  
2412  		spin_lock(&ulp->lock);
2413  		list_del_rcu(&un->list_proc);
2414  		spin_unlock(&ulp->lock);
2415  
2416  		/* perform adjustments registered in un */
2417  		for (i = 0; i < sma->sem_nsems; i++) {
2418  			struct sem *semaphore = &sma->sems[i];
2419  			if (un->semadj[i]) {
2420  				semaphore->semval += un->semadj[i];
2421  				/*
2422  				 * Range checks of the new semaphore value,
2423  				 * not defined by SUS:
2424  				 * - Some unices ignore the undo entirely
2425  				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
2426  				 * - Some cap the value (e.g. FreeBSD caps
2427  				 *   at 0, but doesn't enforce SEMVMX)
2428  				 *
2429  				 * Linux caps the semaphore value, both at 0
2430  				 * and at SEMVMX.
2431  				 *
2432  				 *	Manfred <manfred@colorfullife.com>
2433  				 */
2434  				if (semaphore->semval < 0)
2435  					semaphore->semval = 0;
2436  				if (semaphore->semval > SEMVMX)
2437  					semaphore->semval = SEMVMX;
2438  				ipc_update_pid(&semaphore->sempid, task_tgid(current));
2439  			}
2440  		}
2441  		/* maybe some queued-up processes were waiting for this */
2442  		do_smart_update(sma, NULL, 0, 1, &wake_q);
2443  		sem_unlock(sma, -1);
2444  		rcu_read_unlock();
2445  		wake_up_q(&wake_q);
2446  
2447  		kvfree_rcu(un, rcu);
2448  	}
2449  	kfree(ulp);
2450  }
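
/*
 * Worked example of the clamping above (illustrative numbers): a task that
 * did three "V" operations with SEM_UNDO has un->semadj == -3; if semval is
 * 1 at exit, 1 + (-3) = -2 is clamped to 0.  Symmetrically, an adjustment
 * that would push semval beyond SEMVMX is capped at SEMVMX.
 */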
2451  
2452  #ifdef CONFIG_PROC_FS
2453  static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
2454  {
2455  	struct user_namespace *user_ns = seq_user_ns(s);
2456  	struct kern_ipc_perm *ipcp = it;
2457  	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
2458  	time64_t sem_otime;
2459  
2460  	/*
2461  	 * The proc interface isn't aware of sem_lock(), it calls
2462  	 * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock).
2463  	 * (in sysvipc_find_ipc)
2464  	 * In order to stay compatible with sem_lock(), we must
2465  	 * enter / leave complex_mode.
2466  	 */
2467  	complexmode_enter(sma);
2468  
2469  	sem_otime = get_semotime(sma);
2470  
2471  	seq_printf(s,
2472  		   "%10d %10d  %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
2473  		   sma->sem_perm.key,
2474  		   sma->sem_perm.id,
2475  		   sma->sem_perm.mode,
2476  		   sma->sem_nsems,
2477  		   from_kuid_munged(user_ns, sma->sem_perm.uid),
2478  		   from_kgid_munged(user_ns, sma->sem_perm.gid),
2479  		   from_kuid_munged(user_ns, sma->sem_perm.cuid),
2480  		   from_kgid_munged(user_ns, sma->sem_perm.cgid),
2481  		   sem_otime,
2482  		   sma->sem_ctime);
2483  
2484  	complexmode_tryleave(sma);
2485  
2486  	return 0;
2487  }
2488  #endif
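
/*
 * For reference, each line that the seq_printf() above emits in
 * /proc/sysvipc/sem has the columns key, semid, perms, nsems, uid, gid,
 * cuid, cgid, otime, ctime; an assumed example (values made up) could look
 * like:
 *
 *	12345      98304   666          4  1000  1000  1000  1000 1700000000 1700000000
 */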
2489