/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *     Thus: Perfect SMP scaling between independent semaphore arrays.
 *     If multiple semaphores in one array are used, then cache line
 *     thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semncnt() and
 *   count_semzcnt().
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare(),
 *   wake_up_sem_queue_do())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - The synchronization between wake-ups due to a timeout/signal and a
 *   wake-up due to a completed semaphore operation is achieved by using an
 *   intermediate state (IN_WAKEUP).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows to achieve FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
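 *
 * Illustrative user-space usage (a minimal sketch, not part of this file;
 * it only assumes the standard <sys/sem.h> wrappers for the syscalls
 * implemented below):
 *
 *	struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
 *	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
 *	semctl(id, 0, SETVAL, 1);	(initialize semaphore 0 to 1)
 *	semop(id, &op, 1);		(P operation, may sleep)
 *	op.sem_op = 1;
 *	semop(id, &op, 1);		(V operation, may wake a sleeper)
 *	semctl(id, 0, IPC_RMID);	(destroy; sleepers then fail with EIDRM)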
 */

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>

#include <asm/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
	int	semval;		/* current value */
	int	sempid;		/* pid of last operation */
	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
	struct list_head pending_alter; /* pending single-sop operations */
					/* that alter the semaphore */
	struct list_head pending_const; /* pending single-sop operations */
					/* that do not alter the semaphore*/
	time_t	sem_otime;	/* candidate for sem_otime */
} ____cacheline_aligned_in_smp;

/* One queue for each sleeping process in the system. */
struct sem_queue {
	struct list_head	list;	 /* queue of pending operations */
	struct task_struct	*sleeper; /* this process */
	struct sem_undo		*undo;	 /* undo structure */
	int			pid;	 /* process id of requesting process */
	int			status;	 /* completion status of operation */
	struct sembuf		*sops;	 /* array of pending operations */
	int			nsops;	 /* number of operations */
	int			alter;	 /* does *sops alter the array? */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
 */
struct sem_undo {
	struct list_head	list_proc;	/* per-process list: *
						 * all undos from one process
						 * rcu protected */
	struct rcu_head		rcu;		/* rcu struct for sem_undo */
	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
	struct list_head	list_id;	/* per semaphore array list:
						 * all undos for one array */
	int			semid;		/* semaphore set identifier */
	short			*semadj;	/* array of adjustments */
						/* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks of a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
	atomic_t		refcnt;
	spinlock_t		lock;
	struct list_head	list_proc;
};


#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

#define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST	256 /* 512 bytes on stack */
#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */

/*
 * Locking:
 *	sem_undo.id_next,
 *	sem_array.complex_count,
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo: global sem_lock() for read/write
 *	sem_undo.proc_next: only "current" is allowed to read/write that field.
 *
 *	sem_array.sem_base[i].pending_{const,alter}:
 *		global or semaphore sem_lock() for read/write
 */

#define sc_semmsl	sem_ctls[0]
#define sc_semmns	sem_ctls[1]
#define sc_semopm	sem_ctls[2]
#define sc_semmni	sem_ctls[3]

void sem_init_ns(struct ipc_namespace *ns)
{
	ns->sc_semmsl = SEMMSL;
	ns->sc_semmns = SEMMNS;
	ns->sc_semopm = SEMOPM;
	ns->sc_semmni = SEMMNI;
	ns->used_sems = 0;
	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
void sem_exit_ns(struct ipc_namespace *ns)
{
	free_ipcs(ns, &sem_ids(ns), freeary);
	idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
}
#endif

void __init sem_init (void)
{
	sem_init_ns(&init_ipc_ns);
	ipc_init_proc_interface("sysvipc/sem",
				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
				IPC_SEM_IDS, sysvipc_sem_proc_show);
}

/**
 * unmerge_queues - unmerge queues, if possible.
 * @sma: semaphore array
 *
 * The function unmerges the wait queues if complex_count is 0.
 * It must be called prior to dropping the global semaphore array lock.
 */
static void unmerge_queues(struct sem_array *sma)
{
	struct sem_queue *q, *tq;

	/* complex operations still around? */
	if (sma->complex_count)
		return;
	/*
	 * We will switch back to simple mode.
	 * Move all pending operations back into the per-semaphore
	 * queues.
	 */
	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		struct sem *curr;
		curr = &sma->sem_base[q->sops[0].sem_num];

		list_add_tail(&q->list, &curr->pending_alter);
	}
	INIT_LIST_HEAD(&sma->pending_alter);
}

/**
 * merge_queues - Merge single semop queues into global queue
 * @sma: semaphore array
 *
 * This function merges all per-semaphore queues into the global queue.
 * It is necessary to achieve FIFO ordering for the pending single-sop
 * operations when a multi-semop operation must sleep.
 * Only the alter operations must be moved, the const operations can stay.
 */
static void merge_queues(struct sem_array *sma)
{
	int i;
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;

		list_splice_init(&sem->pending_alter, &sma->pending_alter);
	}
}

static void sem_rcu_free(struct rcu_head *head)
{
	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
	struct sem_array *sma = ipc_rcu_to_struct(p);

	security_sem_free(sma);
	ipc_rcu_free(head);
}

/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
 *
 * Carefully guard against sma->complex_count changing between zero
 * and non-zero while we are spinning for the lock. The value of
 * sma->complex_count cannot change while we are holding the lock,
 * so sem_unlock should be fine.
 *
 * The global lock path checks that all the local locks have been released,
 * checking each local lock once. This means that the local lock paths
 * cannot start their critical sections while the global lock is held.
 */
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			      int nsops)
{
	int locknum;
 again:
	if (nsops == 1 && !sma->complex_count) {
		struct sem *sem = sma->sem_base + sops->sem_num;

		/* Lock just the semaphore we are interested in. */
		spin_lock(&sem->lock);

		/*
		 * If sma->complex_count was set while we were spinning,
		 * we may need to look at things we did not lock here.
		 */
		if (unlikely(sma->complex_count)) {
			spin_unlock(&sem->lock);
			goto lock_array;
		}

		/*
		 * Another process is holding the global lock on the
		 * sem_array; we cannot enter our critical section,
		 * but have to wait for the global lock to be released.
		 */
		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) {
			spin_unlock(&sem->lock);
			spin_unlock_wait(&sma->sem_perm.lock);
			goto again;
		}

		locknum = sops->sem_num;
	} else {
		int i;
		/*
		 * Lock the semaphore array, and wait for all of the
		 * individual semaphore locks to go away.  The code
		 * above ensures no new single-lock holders will enter
		 * their critical section while the array lock is held.
		 */
 lock_array:
		ipc_lock_object(&sma->sem_perm);
		for (i = 0; i < sma->sem_nsems; i++) {
			struct sem *sem = sma->sem_base + i;
			spin_unlock_wait(&sem->lock);
		}
		locknum = -1;
	}
	return locknum;
}

static inline void sem_unlock(struct sem_array *sma, int locknum)
{
	if (locknum == -1) {
		unmerge_queues(sma);
		ipc_unlock_object(&sma->sem_perm);
	} else {
		struct sem *sem = sma->sem_base + locknum;
		spin_unlock(&sem->lock);
	}
}

/*
 * sem_lock_(check_) routines are called in the paths where the rwsem
 * is not held.
 *
 * The caller holds the RCU read lock.
 */
static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
			int id, struct sembuf *sops, int nsops, int *locknum)
{
	struct kern_ipc_perm *ipcp;
	struct sem_array *sma;

	ipcp = ipc_obtain_object(&sem_ids(ns), id);
	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	sma = container_of(ipcp, struct sem_array, sem_perm);
	*locknum = sem_lock(sma, sops, nsops);

	/* ipc_rmid() may have already freed the ID while sem_lock
	 * was spinning: verify that the structure is still valid
	 */
	if (!ipcp->deleted)
		return container_of(ipcp, struct sem_array, sem_perm);

	sem_unlock(sma, *locknum);
	return ERR_PTR(-EINVAL);
}

static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
							int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline void sem_lock_and_putref(struct sem_array *sma)
{
	sem_lock(sma, NULL, -1);
	ipc_rcu_putref(sma, ipc_rcu_free);
}

static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{
	ipc_rmid(&sem_ids(ns), &s->sem_perm);
}

/*
 * Lockless wakeup algorithm:
 * Without the check/retry algorithm a lockless wakeup is possible:
 * - queue.status is initialized to -EINTR before blocking.
 * - wakeup is performed by
 *	* unlinking the queue entry from the pending list
 *	* setting queue.status to IN_WAKEUP
 *	  This is the notification for the blocked thread that a
 *	  result value is imminent.
 *	* call wake_up_process
 *	* set queue.status to the final value.
 * - the previously blocked thread checks queue.status:
 *	* if it's IN_WAKEUP, then it must wait until the value changes
 *	* if it's not -EINTR, then the operation was completed by
 *	  update_queue. semtimedop can return queue.status without
 *	  performing any operation on the sem array.
 *	* otherwise it must acquire the spinlock and check what's up.
 *
 * The two-stage algorithm is necessary to protect against the following
 * races:
 * - if queue.status is set after wake_up_process, then the woken up idle
 *   thread could race forward and try (and fail) to acquire sma->lock
 *   before update_queue had a chance to set queue.status
 * - if queue.status is written before wake_up_process and if the
 *   blocked process is woken up by a signal between writing
 *   queue.status and the wake_up_process, then the woken up
 *   process could return from semtimedop and die by calling
 *   sys_exit before wake_up_process is called. Then wake_up_process
 *   will oops, because the task structure is already invalid.
 *   (yes, this happened on s390 with sysv msg).
 *
 */
#define IN_WAKEUP	1

/**
 * newary - Create a new semaphore set
 * @ns: namespace
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 */

static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
	int id;
	int retval;
	struct sem_array *sma;
	int size;
	key_t key = params->key;
	int nsems = params->u.nsems;
	int semflg = params->flg;
	int i;

	if (!nsems)
		return -EINVAL;
	if (ns->used_sems + nsems > ns->sc_semmns)
		return -ENOSPC;

	size = sizeof (*sma) + nsems * sizeof (struct sem);
	sma = ipc_rcu_alloc(size);
	if (!sma) {
		return -ENOMEM;
	}
	memset (sma, 0, size);

	sma->sem_perm.mode = (semflg & S_IRWXUGO);
	sma->sem_perm.key = key;

	sma->sem_perm.security = NULL;
	retval = security_sem_alloc(sma);
	if (retval) {
		ipc_rcu_putref(sma, ipc_rcu_free);
		return retval;
	}

	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
	if (id < 0) {
		ipc_rcu_putref(sma, sem_rcu_free);
		return id;
	}
	ns->used_sems += nsems;

	sma->sem_base = (struct sem *) &sma[1];

	for (i = 0; i < nsems; i++) {
		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
		spin_lock_init(&sma->sem_base[i].lock);
	}

	sma->complex_count = 0;
	INIT_LIST_HEAD(&sma->pending_alter);
	INIT_LIST_HEAD(&sma->pending_const);
	INIT_LIST_HEAD(&sma->list_id);
	sma->sem_nsems = nsems;
	sma->sem_ctime = get_seconds();
	sem_unlock(sma, -1);
	rcu_read_unlock();

	return sma->sem_perm.id;
}


/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	return security_sem_associate(sma, semflg);
}

/*
 * Called with sem_ids.rwsem and ipcp locked.
 */
static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
				struct ipc_params *params)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	if (params->u.nsems > sma->sem_nsems)
		return -EINVAL;

	return 0;
}

SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
	struct ipc_namespace *ns;
	struct ipc_ops sem_ops;
	struct ipc_params sem_params;

	ns = current->nsproxy->ipc_ns;

	if (nsems < 0 || nsems > ns->sc_semmsl)
		return -EINVAL;

	sem_ops.getnew = newary;
	sem_ops.associate = sem_security;
	sem_ops.more_checks = sem_more_checks;

	sem_params.key = key;
	sem_params.flg = semflg;
	sem_params.u.nsems = nsems;

	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

/** perform_atomic_semop - Perform (if possible) a semaphore operation
 * @sma: semaphore array
 * @sops: array with operations that should be checked
 * @nsops: number of sops
 * @un: undo array
 * @pid: pid that did the change
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Negative values are error codes.
 */

static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
			     int nsops, struct sem_undo *un, int pid)
{
	int result, sem_op;
	struct sembuf *sop;
	struct sem *curr;

	for (sop = sops; sop < sops + nsops; sop++) {
		curr = sma->sem_base + sop->sem_num;
		sem_op = sop->sem_op;
		result = curr->semval;

		if (!sem_op && result)
			goto would_block;

		result += sem_op;
		if (result < 0)
			goto would_block;
		if (result > SEMVMX)
			goto out_of_range;
		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;
			/*
			 * Exceeding the undo range is an error.
			 */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				goto out_of_range;
		}
		curr->semval = result;
	}

	sop--;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].sempid = pid;
		if (sop->sem_flg & SEM_UNDO)
			un->semadj[sop->sem_num] -= sop->sem_op;
		sop--;
	}

	return 0;

out_of_range:
	result = -ERANGE;
	goto undo;

would_block:
	if (sop->sem_flg & IPC_NOWAIT)
		result = -EAGAIN;
	else
		result = 1;

undo:
	sop--;
	while (sop >= sops) {
		sma->sem_base[sop->sem_num].semval -= sop->sem_op;
		sop--;
	}

	return result;
}

/** wake_up_sem_queue_prepare(pt, q, error): Prepare wake-up
 * @pt: list head of the wake-up list that is filled here
 * @q: queue entry that must be signaled
 * @error: Error value for the signal
 *
 * Prepare the wake-up of the queue entry q.
 */
static void wake_up_sem_queue_prepare(struct list_head *pt,
				struct sem_queue *q, int error)
{
	if (list_empty(pt)) {
		/*
		 * Hold preempt off so that we don't get preempted and have the
		 * wakee busy-wait until we're scheduled back on.
		 */
		preempt_disable();
	}
	q->status = IN_WAKEUP;
	q->pid = error;

	list_add_tail(&q->list, pt);
}

/**
 * wake_up_sem_queue_do(pt) - do the actual wake-up
 * @pt: list of tasks to be woken up
 *
 * Do the actual wake-up.
 * The function is called without any locks held, thus the semaphore array
 * could be destroyed already and the tasks can disappear as soon as the
 * status is set to the actual return code.
 */
static void wake_up_sem_queue_do(struct list_head *pt)
{
	struct sem_queue *q, *t;
	int did_something;

	did_something = !list_empty(pt);
	list_for_each_entry_safe(q, t, pt, list) {
		wake_up_process(q->sleeper);
		/* q can disappear immediately after writing q->status. */
		smp_wmb();
		q->status = q->pid;
	}
	if (did_something)
		preempt_enable();
}

static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{
	list_del(&q->list);
	if (q->nsops > 1)
		sma->complex_count--;
}

/** check_restart(sma, q)
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
 */
static int check_restart(struct sem_array *sma, struct sem_queue *q)
{
	/* pending complex alter operations are too difficult to analyse */
	if (!list_empty(&sma->pending_alter))
		return 1;

	/* we were a sleeping complex operation. Too difficult */
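	/*
	 * (Illustrative note, not an extra rule: a complex request such as
	 * { {0, -1, 0}, {1, -1, 0} } waits on several semaphores at once,
	 * so a single changed semval is not enough to cheaply decide whether
	 * it can proceed - both "too difficult" cases simply force a full
	 * rescan by returning 1.)
	 */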
	if (q->nsops > 1)
		return 1;

	/* It is impossible that someone waits for the new value:
	 * - complex operations always restart.
	 * - wait-for-zero are handled separately.
	 * - q is a previously sleeping simple operation that
	 *   altered the array. It must be a decrement, because
	 *   simple increments never sleep.
	 * - If there are older (higher priority) decrements
	 *   in the queue, then they have observed the original
	 *   semval value and couldn't proceed. The operation
	 *   decremented the value - thus they won't proceed either.
	 */
	return 0;
}

/**
 * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int wake_const_ops(struct sem_array *sma, int semnum,
				struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_const;
	else
		pending_list = &sma->sem_base[semnum].pending_const;

	walk = pending_list->next;
	while (walk != pending_list) {
		int error;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		error = perform_atomic_semop(sma, q->sops, q->nsops,
						 q->undo, q->pid);

		if (error <= 0) {
			/* operation completed, remove from queue & wakeup */

			unlink_queue(sma, q);

			wake_up_sem_queue_prepare(pt, q, error);
			if (error == 0)
				semop_completed = 1;
		}
	}
	return semop_completed;
}

/**
 * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @pt: list head of the tasks that must be woken up.
 *
 * do_smart_wakeup_zero() checks all required queues for wait-for-zero
 * operations, based on the actual changes that were performed on the
 * semaphore array.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
					int nsops, struct list_head *pt)
{
	int i;
	int semop_completed = 0;
	int got_zero = 0;

	/* first: the per-semaphore queues, if known */
	if (sops) {
		for (i = 0; i < nsops; i++) {
			int num = sops[i].sem_num;

			if (sma->sem_base[num].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, num, pt);
			}
		}
	} else {
		/*
		 * No sops means modified semaphores not known.
		 * Assume all were changed.
		 */
		for (i = 0; i < sma->sem_nsems; i++) {
			if (sma->sem_base[i].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, i, pt);
			}
		}
	}
	/*
	 * If one of the modified semaphores got 0,
	 * then check the global queue, too.
	 */
	if (got_zero)
		semop_completed |= wake_const_ops(sma, -1, pt);

	return semop_completed;
}


/**
 * update_queue(sma, semnum): Look for tasks that can be completed.
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @pt: list head for the tasks that must be woken up.
 *
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @pt. The return code
 * is stored in q->pid.
 * The function internally checks if const operations can now succeed.
 *
 * The function returns 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
{
	struct sem_queue *q;
	struct list_head *walk;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_alter;
	else
		pending_list = &sma->sem_base[semnum].pending_alter;

again:
	walk = pending_list->next;
	while (walk != pending_list) {
		int error, restart;

		q = container_of(walk, struct sem_queue, list);
		walk = walk->next;

		/* If we are scanning the single sop, per-semaphore list of
		 * one semaphore and that semaphore is 0, then it is not
		 * necessary to scan further: simple increments
		 * that affect only one entry succeed immediately and cannot
		 * be in the per semaphore pending queue, and decrements
		 * cannot be successful if the value is already 0.
		 */
		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
			break;

		error = perform_atomic_semop(sma, q->sops, q->nsops,
					 q->undo, q->pid);

		/* Does q->sleeper still need to sleep? */
		if (error > 0)
			continue;

		unlink_queue(sma, q);

		if (error) {
			restart = 0;
		} else {
			semop_completed = 1;
			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
			restart = check_restart(sma, q);
		}

		wake_up_sem_queue_prepare(pt, q, error);
		if (restart)
			goto again;
	}
	return semop_completed;
}

/**
 * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @otime: force setting otime
 * @pt: list head of the tasks that must be woken up.
 *
 * do_smart_update() does the required calls to update_queue and wakeup_zero,
 * based on the actual changes that were performed on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_sem_queue_do(@pt).
 * It is safe to perform this call after dropping all locks.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
			int otime, struct list_head *pt)
{
	int i;

	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);

	if (!list_empty(&sma->pending_alter)) {
		/* semaphore array uses the global queue - just process it. */
		otime |= update_queue(sma, -1, pt);
	} else {
		if (!sops) {
			/*
			 * No sops, thus the modified semaphores are not
			 * known. Check all.
			 */
			for (i = 0; i < sma->sem_nsems; i++)
				otime |= update_queue(sma, i, pt);
		} else {
			/*
			 * Check the semaphores that were increased:
			 * - No complex ops, thus all sleeping ops are
			 *   decrease.
			 * - if we decreased the value, then any sleeping
			 *   semaphore ops won't be able to run: If the
			 *   previous value was too small, then the new
			 *   value will be too small, too.
			 */
			for (i = 0; i < nsops; i++) {
				if (sops[i].sem_op > 0) {
					otime |= update_queue(sma,
							      sops[i].sem_num, pt);
				}
			}
		}
	}
	if (otime) {
		if (sops == NULL) {
			sma->sem_base[0].sem_otime = get_seconds();
		} else {
			sma->sem_base[sops[0].sem_num].sem_otime =
								get_seconds();
		}
	}
}


/* The following counts are associated to each semaphore:
 *   semncnt        number of tasks waiting on semval being nonzero
 *   semzcnt        number of tasks waiting on semval being zero
 * This model assumes that a task waits on exactly one semaphore.
 * Since semaphore operations are to be performed atomically, tasks actually
 * wait on a whole sequence of semaphores simultaneously.
 * The counts we return here are a rough approximation, but still
 * warrant that semncnt+semzcnt>0 if the task is on the pending queue.
 */
static int count_semncnt (struct sem_array * sma, ushort semnum)
{
	int semncnt;
	struct sem_queue * q;

	semncnt = 0;
	list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
		struct sembuf * sops = q->sops;
		BUG_ON(sops->sem_num != semnum);
		if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
			semncnt++;
	}

	list_for_each_entry(q, &sma->pending_alter, list) {
		struct sembuf * sops = q->sops;
		int nsops = q->nsops;
		int i;
		for (i = 0; i < nsops; i++)
			if (sops[i].sem_num == semnum
			    && (sops[i].sem_op < 0)
			    && !(sops[i].sem_flg & IPC_NOWAIT))
				semncnt++;
	}
	return semncnt;
}

static int count_semzcnt (struct sem_array * sma, ushort semnum)
{
	int semzcnt;
	struct sem_queue * q;

	semzcnt = 0;
	list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
		struct sembuf * sops = q->sops;
		BUG_ON(sops->sem_num != semnum);
		if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
			semzcnt++;
	}

	list_for_each_entry(q, &sma->pending_const, list) {
		struct sembuf * sops = q->sops;
		int nsops = q->nsops;
		int i;
		for (i = 0; i < nsops; i++)
			if (sops[i].sem_num == semnum
			    && (sops[i].sem_op == 0)
			    && !(sops[i].sem_flg & IPC_NOWAIT))
				semzcnt++;
	}
	return semzcnt;
}

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
 * remains locked on exit.
 */
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct sem_undo *un, *tu;
	struct sem_queue *q, *tq;
	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
	struct list_head tasks;
	int i;

	/* Free the existing undo structures for this semaphore set. */
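	/*
	 * (Descriptive note: the array lock is held here - see the assertion
	 * below - so walking list_id is stable.  Each ulp->lock is taken just
	 * long enough to unhook the per-process side, and kfree_rcu() defers
	 * the actual free until concurrent RCU readers are done.)
	 */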
	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
		list_del(&un->list_id);
		spin_lock(&un->ulp->lock);
		un->semid = -1;
		list_del_rcu(&un->list_proc);
		spin_unlock(&un->ulp->lock);
		kfree_rcu(un, rcu);
	}

	/* Wake up all pending processes and let them fail with EIDRM. */
	INIT_LIST_HEAD(&tasks);
	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}

	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
	}
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = sma->sem_base + i;
		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
		}
	}

	/* Remove the semaphore set from the IDR */
	sem_rmid(ns, sma);
	sem_unlock(sma, -1);
	rcu_read_unlock();

	wake_up_sem_queue_do(&tasks);
	ns->used_sems -= sma->sem_nsems;
	ipc_rcu_putref(sma, sem_rcu_free);
}

static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct semid_ds out;

		memset(&out, 0, sizeof(out));

		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);

		out.sem_otime	= in->sem_otime;
		out.sem_ctime	= in->sem_ctime;
		out.sem_nsems	= in->sem_nsems;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

static time_t get_semotime(struct sem_array *sma)
{
	int i;
	time_t res;

	res = sma->sem_base[0].sem_otime;
	for (i = 1; i < sma->sem_nsems; i++) {
		time_t to = sma->sem_base[i].sem_otime;

		if (to > res)
			res = to;
	}
	return res;
}

static int semctl_nolock(struct ipc_namespace *ns, int semid,
			 int cmd, int version, void __user *p)
{
	int err;
	struct sem_array *sma;

	switch(cmd) {
	case IPC_INFO:
	case SEM_INFO:
	{
		struct seminfo seminfo;
		int max_id;

		err = security_sem_semctl(NULL, cmd);
		if (err)
			return err;

		memset(&seminfo,0,sizeof(seminfo));
		seminfo.semmni = ns->sc_semmni;
		seminfo.semmns = ns->sc_semmns;
		seminfo.semmsl = ns->sc_semmsl;
		seminfo.semopm = ns->sc_semopm;
		seminfo.semvmx = SEMVMX;
		seminfo.semmnu = SEMMNU;
		seminfo.semmap = SEMMAP;
		seminfo.semume = SEMUME;
		down_read(&sem_ids(ns).rwsem);
		if (cmd == SEM_INFO) {
			seminfo.semusz = sem_ids(ns).in_use;
			seminfo.semaem = ns->used_sems;
		} else {
			seminfo.semusz = SEMUSZ;
			seminfo.semaem = SEMAEM;
		}
		max_id = ipc_get_maxid(&sem_ids(ns));
		up_read(&sem_ids(ns).rwsem);
		if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
			return -EFAULT;
		return (max_id < 0) ? 0 : max_id;
	}
	case IPC_STAT:
	case SEM_STAT:
	{
		struct semid64_ds tbuf;
		int id = 0;

		memset(&tbuf, 0, sizeof(tbuf));

		rcu_read_lock();
		if (cmd == SEM_STAT) {
			sma = sem_obtain_object(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
			id = sma->sem_perm.id;
		} else {
			sma = sem_obtain_object_check(ns, semid);
			if (IS_ERR(sma)) {
				err = PTR_ERR(sma);
				goto out_unlock;
			}
		}

		err = -EACCES;
		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
			goto out_unlock;

		err = security_sem_semctl(sma, cmd);
		if (err)
			goto out_unlock;

		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
		tbuf.sem_otime = get_semotime(sma);
		tbuf.sem_ctime = sma->sem_ctime;
		tbuf.sem_nsems = sma->sem_nsems;
		rcu_read_unlock();
		if (copy_semid_to_user(p, &tbuf, version))
			return -EFAULT;
		return id;
	}
	default:
		return -EINVAL;
	}
out_unlock:
	rcu_read_unlock();
	return err;
}

static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
		unsigned long arg)
{
	struct sem_undo *un;
	struct sem_array *sma;
	struct sem* curr;
	int err;
	struct list_head tasks;
	int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
	/* big-endian 64bit */
	val = arg >> 32;
#else
	/* 32bit or little-endian 64bit */
	val = arg;
#endif

	if (val > SEMVMX || val < 0)
		return -ERANGE;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	if (semnum < 0 || semnum >= sma->sem_nsems) {
		rcu_read_unlock();
		return -EINVAL;
	}


	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
		rcu_read_unlock();
		return -EACCES;
	}

	err = security_sem_semctl(sma, SETVAL);
	if (err) {
		rcu_read_unlock();
		return -EACCES;
	}

	sem_lock(sma, NULL, -1);

	curr = &sma->sem_base[semnum];

	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry(un, &sma->list_id, list_id)
		un->semadj[semnum] = 0;

	curr->semval = val;
	curr->sempid = task_tgid_vnr(current);
	sma->sem_ctime = get_seconds();
	/* maybe some queued-up processes were waiting for this */
	do_smart_update(sma, NULL, 0, 0, &tasks);
	sem_unlock(sma, -1);
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
	return 0;
}

static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
		int cmd, void __user *p)
{
	struct sem_array *sma;
	struct sem* curr;
	int err, nsems;
	ushort fast_sem_io[SEMMSL_FAST];
	ushort* sem_io = fast_sem_io;
	struct list_head tasks;

	INIT_LIST_HEAD(&tasks);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	nsems = sma->sem_nsems;

	err = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	err = security_sem_semctl(sma, cmd);
	if (err)
		goto out_rcu_wakeup;

	err = -EACCES;
	switch (cmd) {
	case GETALL:
	{
		ushort __user *array = p;
		int i;

		sem_lock(sma, NULL, -1);
		if(nsems > SEMMSL_FAST) {
			if (!ipc_rcu_getref(sma)) {
				sem_unlock(sma, -1);
				rcu_read_unlock();
				err = -EIDRM;
				goto out_free;
			}
			sem_unlock(sma, -1);
			rcu_read_unlock();
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if(sem_io == NULL) {
				ipc_rcu_putref(sma, ipc_rcu_free);
				return -ENOMEM;
			}

			rcu_read_lock();
			sem_lock_and_putref(sma);
			if (sma->sem_perm.deleted) {
				sem_unlock(sma, -1);
				rcu_read_unlock();
				err = -EIDRM;
				goto out_free;
			}
		}
		for (i = 0; i < sma->sem_nsems; i++)
			sem_io[i] = sma->sem_base[i].semval;
		sem_unlock(sma, -1);
		rcu_read_unlock();
		err = 0;
		if(copy_to_user(array, sem_io, nsems*sizeof(ushort)))
			err = -EFAULT;
		goto out_free;
	}
	case SETALL:
	{
		int i;
		struct sem_undo *un;

		if (!ipc_rcu_getref(sma)) {
			rcu_read_unlock();
			return -EIDRM;
		}
		rcu_read_unlock();

		if(nsems > SEMMSL_FAST) {
			sem_io = ipc_alloc(sizeof(ushort)*nsems);
			if(sem_io == NULL) {
				ipc_rcu_putref(sma, ipc_rcu_free);
				return -ENOMEM;
			}
		}

		if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) {
			ipc_rcu_putref(sma, ipc_rcu_free);
			err = -EFAULT;
			goto out_free;
		}

		for (i = 0; i < nsems; i++) {
			if (sem_io[i] > SEMVMX) {
				ipc_rcu_putref(sma, ipc_rcu_free);
				err = -ERANGE;
				goto out_free;
			}
		}
		rcu_read_lock();
		sem_lock_and_putref(sma);
		if (sma->sem_perm.deleted) {
			sem_unlock(sma, -1);
			rcu_read_unlock();
			err = -EIDRM;
			goto out_free;
		}

		for (i = 0; i < nsems; i++)
			sma->sem_base[i].semval = sem_io[i];

		ipc_assert_locked_object(&sma->sem_perm);
		list_for_each_entry(un, &sma->list_id, list_id) {
			for (i = 0; i < nsems; i++)
				un->semadj[i] = 0;
		}
		sma->sem_ctime = get_seconds();
		/* maybe some queued-up processes were waiting for this */
		do_smart_update(sma, NULL, 0, 0, &tasks);
		err = 0;
		goto out_unlock;
	}
	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
	}
	err = -EINVAL;
	if (semnum < 0 || semnum >= nsems)
		goto out_rcu_wakeup;

	sem_lock(sma, NULL, -1);
	curr = &sma->sem_base[semnum];

	switch (cmd) {
	case GETVAL:
		err = curr->semval;
		goto out_unlock;
	case GETPID:
		err = curr->sempid;
		goto out_unlock;
	case GETNCNT:
		err = count_semncnt(sma, semnum);
		goto out_unlock;
	case GETZCNT:
		err = count_semzcnt(sma, semnum);
		goto out_unlock;
	}

out_unlock:
	sem_unlock(sma, -1);
out_rcu_wakeup:
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
out_free:
	if(sem_io != fast_sem_io)
		ipc_free(sem_io, sizeof(ushort)*nsems);
	return err;
}

static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
	switch(version) {
	case IPC_64:
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
		return 0;
	case IPC_OLD:
	    {
		struct semid_ds tbuf_old;

		if(copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;
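		/*
		 * (Descriptive note: the old ABI only carries the uid, gid
		 * and mode fields, so only those are converted below; the
		 * other fields of the semid64_ds are left untouched.)
		 */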

		out->sem_perm.uid = tbuf_old.sem_perm.uid;
		out->sem_perm.gid = tbuf_old.sem_perm.gid;
		out->sem_perm.mode = tbuf_old.sem_perm.mode;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}

/*
 * This function handles some semctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 */
static int semctl_down(struct ipc_namespace *ns, int semid,
		       int cmd, int version, void __user *p)
{
	struct sem_array *sma;
	int err;
	struct semid64_ds semid64;
	struct kern_ipc_perm *ipcp;

	if(cmd == IPC_SET) {
		if (copy_semid_from_user(&semid64, p, version))
			return -EFAULT;
	}

	down_write(&sem_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
				      &semid64.sem_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	sma = container_of(ipcp, struct sem_array, sem_perm);

	err = security_sem_semctl(sma, cmd);
	if (err)
		goto out_unlock1;

	switch (cmd) {
	case IPC_RMID:
		sem_lock(sma, NULL, -1);
		/* freeary unlocks the ipc object and rcu */
		freeary(ns, ipcp);
		goto out_up;
	case IPC_SET:
		sem_lock(sma, NULL, -1);
		err = ipc_update_perm(&semid64.sem_perm, ipcp);
		if (err)
			goto out_unlock0;
		sma->sem_ctime = get_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

out_unlock0:
	sem_unlock(sma, -1);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&sem_ids(ns).rwsem);
	return err;
}

SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
	int version;
	struct ipc_namespace *ns;
	void __user *p = (void __user *)arg;

	if (semid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	ns = current->nsproxy->ipc_ns;

	switch(cmd) {
	case IPC_INFO:
	case SEM_INFO:
	case IPC_STAT:
	case SEM_STAT:
		return semctl_nolock(ns, semid, cmd, version, p);
	case GETALL:
	case GETVAL:
	case GETPID:
	case GETNCNT:
	case GETZCNT:
	case SETALL:
		return semctl_main(ns, semid, semnum, cmd, p);
	case SETVAL:
		return semctl_setval(ns, semid, semnum, arg);
	case IPC_RMID:
	case IPC_SET:
		return semctl_down(ns, semid, cmd, version, p);
	default:
		return -EINVAL;
	}
}

/* If the task doesn't already have an undo_list, then allocate one
 * here.  We guarantee there is only one thread using this undo list,
 * and current is THE ONE
 *
 * If this allocation and assignment succeeds, but later
 * portions of this code fail, there is no need to free the sem_undo_list.
 * Just let it stay associated with the task, and it'll be freed later
 * at exit time.
 *
 * This can block, so callers must hold no locks.
 */
static inline int get_undo_list(struct sem_undo_list **undo_listp)
{
	struct sem_undo_list *undo_list;

	undo_list = current->sysvsem.undo_list;
	if (!undo_list) {
		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
		if (undo_list == NULL)
			return -ENOMEM;
		spin_lock_init(&undo_list->lock);
		atomic_set(&undo_list->refcnt, 1);
		INIT_LIST_HEAD(&undo_list->list_proc);

		current->sysvsem.undo_list = undo_list;
	}
	*undo_listp = undo_list;
	return 0;
}

static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
		if (un->semid == semid)
			return un;
	}
	return NULL;
}

static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	assert_spin_locked(&ulp->lock);

	un = __lookup_undo(ulp, semid);
	if (un) {
		list_del_rcu(&un->list_proc);
		list_add_rcu(&un->list_proc, &ulp->list_proc);
	}
	return un;
}

/**
 * find_alloc_undo - Lookup (and if not present create) undo array
 * @ns: namespace
 * @semid: semaphore array id
 *
 * The function looks up (and if not present creates) the undo structure.
 * The size of the undo structure depends on the size of the semaphore
 * array, thus the alloc path is not that straightforward.
 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
 * performs a rcu_read_lock().
 */
static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
{
	struct sem_array *sma;
	struct sem_undo_list *ulp;
	struct sem_undo *un, *new;
	int nsems, error;

	error = get_undo_list(&ulp);
	if (error)
		return ERR_PTR(error);

	rcu_read_lock();
	spin_lock(&ulp->lock);
	un = lookup_undo(ulp, semid);
	spin_unlock(&ulp->lock);
	if (likely(un!=NULL))
		goto out;

	/* no undo structure around - allocate one. */
	/* step 1: figure out the size of the semaphore array */
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return ERR_CAST(sma);
	}

	nsems = sma->sem_nsems;
	if (!ipc_rcu_getref(sma)) {
		rcu_read_unlock();
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	rcu_read_unlock();

	/* step 2: allocate new undo structure */
	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
	if (!new) {
		ipc_rcu_putref(sma, ipc_rcu_free);
		return ERR_PTR(-ENOMEM);
	}

	/* step 3: Acquire the lock on semaphore array */
	rcu_read_lock();
	sem_lock_and_putref(sma);
	if (sma->sem_perm.deleted) {
		sem_unlock(sma, -1);
		rcu_read_unlock();
		kfree(new);
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	spin_lock(&ulp->lock);

	/*
	 * step 4: check for races: did someone else allocate the undo struct?
	 */
	un = lookup_undo(ulp, semid);
	if (un) {
		kfree(new);
		goto success;
	}
	/* step 5: initialize & link new undo structure */
	new->semadj = (short *) &new[1];
	new->ulp = ulp;
	new->semid = semid;
	assert_spin_locked(&ulp->lock);
	list_add_rcu(&new->list_proc, &ulp->list_proc);
	ipc_assert_locked_object(&sma->sem_perm);
	list_add(&new->list_id, &sma->list_id);
	un = new;

success:
	spin_unlock(&ulp->lock);
	sem_unlock(sma, -1);
out:
	return un;
}


/**
 * get_queue_result - Retrieve the result code from sem_queue
 * @q: Pointer to queue structure
 *
 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
 * q->status, then we must loop until the value is replaced with the final
 * value: This may happen if a task is woken up by an unrelated event (e.g.
 * signal) and in parallel the task is woken up by another task because it got
 * the requested semaphores.
 *
 * The function can be called with or without holding the semaphore spinlock.
 */
static int get_queue_result(struct sem_queue *q)
{
	int error;

	error = q->status;
	while (unlikely(error == IN_WAKEUP)) {
		cpu_relax();
		error = q->status;
	}

	return error;
}

SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops, const struct timespec __user *, timeout)
{
	int error = -EINVAL;
	struct sem_array *sma;
	struct sembuf fast_sops[SEMOPM_FAST];
	struct sembuf* sops = fast_sops, *sop;
	struct sem_undo *un;
	int undos = 0, alter = 0, max, locknum;
	struct sem_queue queue;
	unsigned long jiffies_left = 0;
	struct ipc_namespace *ns;
	struct list_head tasks;

	ns = current->nsproxy->ipc_ns;

	if (nsops < 1 || semid < 0)
		return -EINVAL;
	if (nsops > ns->sc_semopm)
		return -E2BIG;
	if(nsops > SEMOPM_FAST) {
		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
		if(sops==NULL)
			return -ENOMEM;
	}
	if (copy_from_user (sops, tsops, nsops * sizeof(*tsops))) {
		error=-EFAULT;
		goto out_free;
	}
	if (timeout) {
		struct timespec _timeout;
		if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
			error = -EFAULT;
			goto out_free;
		}
		if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
			_timeout.tv_nsec >= 1000000000L) {
			error = -EINVAL;
			goto out_free;
		}
		jiffies_left = timespec_to_jiffies(&_timeout);
	}
	max = 0;
	for (sop = sops; sop < sops + nsops; sop++) {
		if (sop->sem_num >= max)
			max = sop->sem_num;
		if (sop->sem_flg & SEM_UNDO)
			undos = 1;
		if (sop->sem_op != 0)
			alter = 1;
	}

	INIT_LIST_HEAD(&tasks);

	if (undos) {
		/* On success, find_alloc_undo takes the rcu_read_lock */
		un = find_alloc_undo(ns, semid);
		if (IS_ERR(un)) {
			error = PTR_ERR(un);
			goto out_free;
		}
	} else {
		un = NULL;
		rcu_read_lock();
	}

	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		error = PTR_ERR(sma);
		goto out_free;
	}

	error = -EFBIG;
	if (max >= sma->sem_nsems)
		goto out_rcu_wakeup;

	error = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	error = security_sem_semop(sma, sops, nsops, alter);
	if (error)
		goto out_rcu_wakeup;

	/*
	 * semid identifiers are not unique - find_alloc_undo may have
	 * allocated an undo structure, it was invalidated by an RMID
	 * and now a new array received the same id. Check and fail.
	 * This case can be detected checking un->semid. The existence of
	 * "un" itself is guaranteed by rcu.
	 */
	error = -EIDRM;
	locknum = sem_lock(sma, sops, nsops);
	if (un && un->semid == -1)
		goto out_unlock_free;

	error = perform_atomic_semop(sma, sops, nsops, un,
					task_tgid_vnr(current));
	if (error <= 0) {
		if (alter && error == 0)
			do_smart_update(sma, sops, nsops, 1, &tasks);

		goto out_unlock_free;
	}

	/* We need to sleep on this operation, so we put the current
	 * task into the pending queue and go to sleep.
	 */

	queue.sops = sops;
	queue.nsops = nsops;
	queue.undo = un;
	queue.pid = task_tgid_vnr(current);
	queue.alter = alter;

	if (nsops == 1) {
		struct sem *curr;
		curr = &sma->sem_base[sops->sem_num];

		if (alter) {
			if (sma->complex_count) {
				list_add_tail(&queue.list,
						&sma->pending_alter);
			} else {

				list_add_tail(&queue.list,
						&curr->pending_alter);
			}
		} else {
			list_add_tail(&queue.list, &curr->pending_const);
		}
	} else {
		if (!sma->complex_count)
			merge_queues(sma);

		if (alter)
			list_add_tail(&queue.list, &sma->pending_alter);
		else
			list_add_tail(&queue.list, &sma->pending_const);

		sma->complex_count++;
	}

	queue.status = -EINTR;
	queue.sleeper = current;

sleep_again:
	current->state = TASK_INTERRUPTIBLE;
	sem_unlock(sma, locknum);
	rcu_read_unlock();

	if (timeout)
		jiffies_left = schedule_timeout(jiffies_left);
	else
		schedule();

	error = get_queue_result(&queue);

	if (error != -EINTR) {
		/* fast path: update_queue already obtained all requested
		 * resources.
		 * Perform a smp_mb(): User space could assume that semop()
		 * is a memory barrier: Without the mb(), the cpu could
		 * speculatively read in user space stale data that was
		 * overwritten by the previous owner of the semaphore.
		 */
		smp_mb();

		goto out_free;
	}

	rcu_read_lock();
	sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);

	/*
	 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
	 */
	error = get_queue_result(&queue);

	/*
	 * Array removed? If yes, leave without sem_unlock().
	 */
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		goto out_free;
	}


	/*
	 * If queue.status != -EINTR we are woken up by another process.
	 * Leave without unlink_queue(), but with sem_unlock().
	 */

	if (error != -EINTR) {
		goto out_unlock_free;
	}

	/*
	 * If an interrupt occurred we have to clean up the queue
	 */
	if (timeout && jiffies_left == 0)
		error = -EAGAIN;

	/*
	 * If the wakeup was spurious, just retry
	 */
	if (error == -EINTR && !signal_pending(current))
		goto sleep_again;

	unlink_queue(sma, &queue);

out_unlock_free:
	sem_unlock(sma, locknum);
out_rcu_wakeup:
	rcu_read_unlock();
	wake_up_sem_queue_do(&tasks);
out_free:
	if(sops != fast_sops)
		kfree(sops);
	return error;
}

SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops)
{
	return sys_semtimedop(semid, tsops, nsops, NULL);
}

/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
 * parent and child tasks.
 */

int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sem_undo_list *undo_list;
	int error;

	if (clone_flags & CLONE_SYSVSEM) {
		error = get_undo_list(&undo_list);
		if (error)
			return error;
		atomic_inc(&undo_list->refcnt);
		tsk->sysvsem.undo_list = undo_list;
	} else
		tsk->sysvsem.undo_list = NULL;

	return 0;
}

/*
 * add semadj values to semaphores, free undo structures.
 * undo structures are not freed when semaphore arrays are destroyed
 * so some of them may be out of date.
 * IMPLEMENTATION NOTE: There is some confusion over whether the
 * set of adjustments that needs to be done should be done in an atomic
 * manner or not. That is, if we are attempting to decrement the semval
 * should we queue up and wait until we can do so legally?
 * The original implementation attempted to do this (queue and wait).
 * The current implementation does not do so. The POSIX standard
 * and SVID should be consulted to determine what behavior is mandated.
 */
void exit_sem(struct task_struct *tsk)
{
	struct sem_undo_list *ulp;

	ulp = tsk->sysvsem.undo_list;
	if (!ulp)
		return;
	tsk->sysvsem.undo_list = NULL;

	if (!atomic_dec_and_test(&ulp->refcnt))
		return;

	for (;;) {
		struct sem_array *sma;
		struct sem_undo *un;
		struct list_head tasks;
		int semid, i;

		rcu_read_lock();
		un = list_entry_rcu(ulp->list_proc.next,
				    struct sem_undo, list_proc);
		if (&un->list_proc == &ulp->list_proc)
			semid = -1;
		else
			semid = un->semid;

		if (semid == -1) {
			rcu_read_unlock();
			break;
		}

		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid);
		/* exit_sem raced with IPC_RMID, nothing to do */
		if (IS_ERR(sma)) {
			rcu_read_unlock();
			continue;
		}

		sem_lock(sma, NULL, -1);
		un = __lookup_undo(ulp, semid);
		if (un == NULL) {
			/* exit_sem raced with IPC_RMID+semget() that created
			 * exactly the same semid. Nothing to do.
			 */
			sem_unlock(sma, -1);
			rcu_read_unlock();
			continue;
		}

		/* remove un from the linked lists */
		ipc_assert_locked_object(&sma->sem_perm);
		list_del(&un->list_id);

		spin_lock(&ulp->lock);
		list_del_rcu(&un->list_proc);
		spin_unlock(&ulp->lock);

		/* perform adjustments registered in un */
		for (i = 0; i < sma->sem_nsems; i++) {
			struct sem * semaphore = &sma->sem_base[i];
			if (un->semadj[i]) {
				semaphore->semval += un->semadj[i];
				/*
				 * Range checks of the new semaphore value,
				 * not defined by sus:
				 * - Some unices ignore the undo entirely
				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
				 * - some cap the value (e.g. FreeBSD caps
				 *   at 0, but doesn't enforce SEMVMX)
				 *
				 * Linux caps the semaphore value, both at 0
				 * and at SEMVMX.
				 *
				 *	Manfred <manfred@colorfullife.com>
				 */
				if (semaphore->semval < 0)
					semaphore->semval = 0;
				if (semaphore->semval > SEMVMX)
					semaphore->semval = SEMVMX;
				semaphore->sempid = task_tgid_vnr(current);
			}
		}
		/* maybe some queued-up processes were waiting for this */
		INIT_LIST_HEAD(&tasks);
		do_smart_update(sma, NULL, 0, 1, &tasks);
		sem_unlock(sma, -1);
		rcu_read_unlock();
		wake_up_sem_queue_do(&tasks);

		kfree_rcu(un, rcu);
	}
	kfree(ulp);
}

#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
	struct user_namespace *user_ns = seq_user_ns(s);
	struct sem_array *sma = it;
	time_t sem_otime;

	sem_otime = get_semotime(sma);

	return seq_printf(s,
			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
			  sma->sem_perm.key,
			  sma->sem_perm.id,
			  sma->sem_perm.mode,
			  sma->sem_nsems,
			  from_kuid_munged(user_ns, sma->sem_perm.uid),
			  from_kgid_munged(user_ns, sma->sem_perm.gid),
			  from_kuid_munged(user_ns, sma->sem_perm.cuid),
			  from_kgid_munged(user_ns, sma->sem_perm.cgid),
			  sem_otime,
			  sma->sem_ctime);
}
#endif