xref: /openbmc/linux/net/unix/af_unix.c (revision d0b73b48)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko EiBfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
 *					has been reached. This won't break
37  *					old apps and it will avoid huge amount
38  *					of socks hashed (this for unix_gc()
 *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  started by 0, so that this name space does not intersect
80  *		  with BSD names.
81  */
82 
83 #include <linux/module.h>
84 #include <linux/kernel.h>
85 #include <linux/signal.h>
86 #include <linux/sched.h>
87 #include <linux/errno.h>
88 #include <linux/string.h>
89 #include <linux/stat.h>
90 #include <linux/dcache.h>
91 #include <linux/namei.h>
92 #include <linux/socket.h>
93 #include <linux/un.h>
94 #include <linux/fcntl.h>
95 #include <linux/termios.h>
96 #include <linux/sockios.h>
97 #include <linux/net.h>
98 #include <linux/in.h>
99 #include <linux/fs.h>
100 #include <linux/slab.h>
101 #include <asm/uaccess.h>
102 #include <linux/skbuff.h>
103 #include <linux/netdevice.h>
104 #include <net/net_namespace.h>
105 #include <net/sock.h>
106 #include <net/tcp_states.h>
107 #include <net/af_unix.h>
108 #include <linux/proc_fs.h>
109 #include <linux/seq_file.h>
110 #include <net/scm.h>
111 #include <linux/init.h>
112 #include <linux/poll.h>
113 #include <linux/rtnetlink.h>
114 #include <linux/mount.h>
115 #include <net/checksum.h>
116 #include <linux/security.h>
117 
/*
 * Global hash table of AF_UNIX sockets.  The first UNIX_HASH_SIZE buckets
 * hold bound sockets; the upper half holds unbound ones (see
 * unix_sockets_unbound()).  Protected by unix_table_lock.
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/* Number of unix sockets in existence; capped in unix_create1(). */
static atomic_long_t unix_nr_socks;
123 
124 
125 static struct hlist_head *unix_sockets_unbound(void *addr)
126 {
127 	unsigned long hash = (unsigned long)addr;
128 
129 	hash ^= hash >> 16;
130 	hash ^= hash >> 8;
131 	hash %= UNIX_HASH_SIZE;
132 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
133 }
134 
135 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
136 
#ifdef CONFIG_SECURITY_NETWORK
/* Store the sender's security ID from @scm into @skb.  NOTE: despite the
 * name this writes INTO the skb; unix_set_secdata() reads it back out. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

/* Extract the security ID carried by @skb into the receiver's @scm. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
/* No-op stubs when security networking is compiled out. */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */
154 
155 /*
156  *  SMP locking strategy:
157  *    hash table is protected with spinlock unix_table_lock
158  *    each socket state is protected by separate spin lock.
159  */
160 
161 static inline unsigned int unix_hash_fold(__wsum n)
162 {
163 	unsigned int hash = (__force unsigned int)n;
164 
165 	hash ^= hash>>16;
166 	hash ^= hash>>8;
167 	return hash&(UNIX_HASH_SIZE-1);
168 }
169 
170 #define unix_peer(sk) (unix_sk(sk)->peer)
171 
/* Does @osk consider @sk to be its peer? */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *peer_of_osk = unix_peer(osk);

	return peer_of_osk == sk;
}
176 
/* @sk may send to @osk when @osk is unconnected or connected back to @sk. */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_our_peer(sk, osk);
}
181 
/* True when sk's receive queue length has exceeded its backlog limit. */
static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
186 
/*
 * Return @s's peer with an extra reference taken, or NULL if unconnected.
 * Holding the state lock makes the peer read and sock_hold() atomic with
 * respect to disconnect.  Caller must sock_put() a non-NULL result.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
199 
/* Drop one reference on @addr; frees it when the last reference goes away. */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}
205 
/*
 *	Check unix socket name:
 *		- should be not zero length.
 *	        - if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 *
 *	Returns the canonical address length on success, -EINVAL on bad input.
 *	*hashp is only written for abstract names; filesystem names are hashed
 *	later by inode (see unix_bind()/unix_find_other()).
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
235 
/* Unhash @sk.  Caller must hold unix_table_lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
240 
/* Add @sk to hash bucket @list.  Caller must hold unix_table_lock. */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}
246 
/* Locked wrapper around __unix_remove_socket(). */
static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}
253 
/* Locked wrapper around __unix_insert_socket(). */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
260 
261 static struct sock *__unix_find_socket_byname(struct net *net,
262 					      struct sockaddr_un *sunname,
263 					      int len, int type, unsigned int hash)
264 {
265 	struct sock *s;
266 	struct hlist_node *node;
267 
268 	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
269 		struct unix_sock *u = unix_sk(s);
270 
271 		if (!net_eq(sock_net(s), net))
272 			continue;
273 
274 		if (u->addr->len == len &&
275 		    !memcmp(u->addr->name, sunname, len))
276 			goto found;
277 	}
278 	s = NULL;
279 found:
280 	return s;
281 }
282 
/*
 * Locked lookup by name; takes a reference on the socket it returns.
 * Caller must sock_put() a non-NULL result.
 */
static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}
297 
/*
 * Find the socket bound to filesystem inode @i (FS-bound sockets are hashed
 * by inode number).  Takes a reference on the result; caller must sock_put().
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;
	struct hlist_node *node;

	spin_lock(&unix_table_lock);
	sk_for_each(s, node,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && dentry->d_inode == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
318 
/* Writable when outstanding write memory (x4) fits within sk_sndbuf. */
static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
323 
/*
 * sk_write_space callback: wake up poll()ers and async waiters once the
 * socket has become writable again.  sk_wq is accessed under RCU because
 * the socket may be detached concurrently.
 */
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}
338 
/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		/* Senders blocked on our full queue can now make progress. */
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
359 
/*
 * sk_destruct callback: final teardown once the last sock reference is
 * dropped.  Purges any remaining queued skbs, releases the bound address,
 * and updates the global socket accounting.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}
386 
/*
 * Core of close(): detach @sk from the hash table, shut down its peer,
 * flush queued data, and drop the final references.
 * @embrion: nonzero when tearing down a not-yet-accepted (embryonic)
 * connection found on a listener's queue — presumably forces ECONNRESET
 * on the peer (see recursive call below); confirm against unix_accept().
 */
static int unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queue holds embryo sockets, not data. */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */

	return 0;
}
459 
/* Set @sk's peer credentials to the current task, releasing any old ones. */
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}
468 
/* Copy @peersk's peer credentials onto @sk, releasing any old ones. */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}
477 
478 static int unix_listen(struct socket *sock, int backlog)
479 {
480 	int err;
481 	struct sock *sk = sock->sk;
482 	struct unix_sock *u = unix_sk(sk);
483 	struct pid *old_pid = NULL;
484 
485 	err = -EOPNOTSUPP;
486 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
487 		goto out;	/* Only stream/seqpacket sockets accept */
488 	err = -EINVAL;
489 	if (!u->addr)
490 		goto out;	/* No listens on an unbound socket */
491 	unix_state_lock(sk);
492 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
493 		goto out_unlock;
494 	if (backlog > sk->sk_max_ack_backlog)
495 		wake_up_interruptible_all(&u->peer_wait);
496 	sk->sk_max_ack_backlog	= backlog;
497 	sk->sk_state		= TCP_LISTEN;
498 	/* set credentials so connect can copy them */
499 	init_peercred(sk);
500 	err = 0;
501 
502 out_unlock:
503 	unix_state_unlock(sk);
504 	put_pid(old_pid);
505 out:
506 	return err;
507 }
508 
509 static int unix_release(struct socket *);
510 static int unix_bind(struct socket *, struct sockaddr *, int);
511 static int unix_stream_connect(struct socket *, struct sockaddr *,
512 			       int addr_len, int flags);
513 static int unix_socketpair(struct socket *, struct socket *);
514 static int unix_accept(struct socket *, struct socket *, int);
515 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
516 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
517 static unsigned int unix_dgram_poll(struct file *, struct socket *,
518 				    poll_table *);
519 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
520 static int unix_shutdown(struct socket *, int);
521 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
522 			       struct msghdr *, size_t);
523 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
524 			       struct msghdr *, size_t, int);
525 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
526 			      struct msghdr *, size_t);
527 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
528 			      struct msghdr *, size_t, int);
529 static int unix_dgram_connect(struct socket *, struct sockaddr *,
530 			      int, int);
531 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
532 				  struct msghdr *, size_t);
533 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
534 				  struct msghdr *, size_t, int);
535 
/* Update sk_peek_off under u->readlock, serializing with receivers that
 * hold the same mutex. */
static void unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	mutex_lock(&u->readlock);
	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);
}
544 
545 
/* proto_ops for connection-oriented SOCK_STREAM AF_UNIX sockets. */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
567 
/* proto_ops for SOCK_DGRAM AF_UNIX sockets: no accept/listen. */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
589 
/* proto_ops for SOCK_SEQPACKET: connects/accepts like stream, but uses
 * datagram-style poll and seqpacket send/recv handlers. */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
611 
/* Protocol descriptor; obj_size makes sk_alloc() allocate a unix_sock. */
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
617 
618 /*
619  * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
621  * bh-unsafe locking for their sk_receive_queue.lock. Split off
622  * this special lock-class by reinitializing the spinlock key:
623  */
624 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
625 
/*
 * Allocate and initialize one AF_UNIX sock, insert it into the unbound
 * half of the hash table, and bump the per-protocol accounting.
 * Returns NULL on failure (limit reached or allocation failure).
 */
static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	/* Cap total sockets at 2 * max open files.  NOTE(review): this is a
	 * check-then-act on an atomic counter, so concurrent creators can
	 * briefly overshoot the cap — appears tolerated by design. */
	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}
665 
/*
 * socket(2) entry point for PF_UNIX: pick the proto_ops matching the
 * socket type and allocate the underlying sock via unix_create1().
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through: SOCK_RAW is treated as SOCK_DGRAM */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}
696 
697 static int unix_release(struct socket *sock)
698 {
699 	struct sock *sk = sock->sk;
700 
701 	if (!sk)
702 		return 0;
703 
704 	sock->sk = NULL;
705 
706 	return unix_release_sock(sk, 0);
707 }
708 
/*
 * Bind an unbound socket to an autogenerated abstract name of the form
 * "\0XXXXX" (five hex digits).  Succeeds trivially if already bound.
 * u->readlock serializes against concurrent bind/autobind on the same sk.
 */
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	mutex_lock(&u->readlock);

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Room for sun_family + "\0XXXXX" (leading NUL marks it abstract). */
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();
		/* Give up if all names seems to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	/* Move sk from the unbound bucket to its new hash bucket. */
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
767 
/*
 * Resolve a target address to a socket: filesystem names go through a path
 * lookup plus inode hash, abstract names through the name hash.  Returns a
 * referenced sock, or NULL with *error set.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = path.dentry->d_inode;
		/* Sender needs write permission on the socket inode. */
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		/* Socket exists but is of the wrong type. */
		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
822 
/*
 * Create the filesystem node for a path-bound socket.  On success, *res
 * holds a reference to the new dentry/vfsmount (caller owns it).
 */
static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
851 
/*
 * bind(2) for AF_UNIX.  Three cases:
 *  - bare sun_family (addr_len == sizeof(short)): autobind an abstract name;
 *  - filesystem path: create the socket inode and hash by inode number;
 *  - abstract name: hash by name, failing with EADDRINUSE on collision.
 * u->readlock serializes against concurrent binds on the same socket.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	err = -EINVAL;
	if (u->addr)
		goto out_up;	/* already bound */

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		/* New inode mode honors the process umask. */
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		/* FS-bound sockets are marked with hash == UNIX_HASH_SIZE
		 * and bucketed by inode number instead of by name. */
		addr->hash = UNIX_HASH_SIZE;
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
934 
/*
 * Lock the state of both sockets, always in ascending address order so
 * concurrent double-lockers cannot deadlock.  @sk2 may be NULL or equal
 * to @sk1, in which case only @sk1 is locked.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
949 
/* Release locks taken by unix_state_double_lock() (unlock order is free). */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
959 
960 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
961 			      int alen, int flags)
962 {
963 	struct sock *sk = sock->sk;
964 	struct net *net = sock_net(sk);
965 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
966 	struct sock *other;
967 	unsigned int hash;
968 	int err;
969 
970 	if (addr->sa_family != AF_UNSPEC) {
971 		err = unix_mkname(sunaddr, alen, &hash);
972 		if (err < 0)
973 			goto out;
974 		alen = err;
975 
976 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
977 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
978 			goto out;
979 
980 restart:
981 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
982 		if (!other)
983 			goto out;
984 
985 		unix_state_double_lock(sk, other);
986 
987 		/* Apparently VFS overslept socket death. Retry. */
988 		if (sock_flag(other, SOCK_DEAD)) {
989 			unix_state_double_unlock(sk, other);
990 			sock_put(other);
991 			goto restart;
992 		}
993 
994 		err = -EPERM;
995 		if (!unix_may_send(sk, other))
996 			goto out_unlock;
997 
998 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
999 		if (err)
1000 			goto out_unlock;
1001 
1002 	} else {
1003 		/*
1004 		 *	1003.1g breaking connected state with AF_UNSPEC
1005 		 */
1006 		other = NULL;
1007 		unix_state_double_lock(sk, other);
1008 	}
1009 
1010 	/*
1011 	 * If it was connected, reconnect.
1012 	 */
1013 	if (unix_peer(sk)) {
1014 		struct sock *old_peer = unix_peer(sk);
1015 		unix_peer(sk) = other;
1016 		unix_state_double_unlock(sk, other);
1017 
1018 		if (other != old_peer)
1019 			unix_dgram_disconnected(sk, old_peer);
1020 		sock_put(old_peer);
1021 	} else {
1022 		unix_peer(sk) = other;
1023 		unix_state_double_unlock(sk, other);
1024 	}
1025 	return 0;
1026 
1027 out_unlock:
1028 	unix_state_double_unlock(sk, other);
1029 	sock_put(other);
1030 out:
1031 	return err;
1032 }
1033 
/*
 * Sleep on @other's peer_wait queue until there is room in its receive
 * queue (or the socket dies / shuts down / the timeout elapses).
 * Called with @other's state lock held; that lock is DROPPED here before
 * sleeping.  Returns the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* Decide whether to sleep while still holding other's state lock. */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
1054 
/* connect() for SOCK_STREAM/SOCK_SEQPACKET AF_UNIX sockets.
 *
 * Creates an "embryo" sock (newsk) plus a 1-byte skb, then queues the skb
 * (which carries newsk via skb->sk ownership from sock_wmalloc) on the
 * listener's receive queue; unix_accept() later dequeues it and grafts
 * newsk onto the accepting socket.
 *
 * Locking: the listener ("other") is locked first, then our own sk with
 * unix_state_lock_nested(); deadlock with a parallel connect is ruled out
 * because "other" must be TCP_LISTEN while sk must not be (checked below).
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* SOCK_PASSCRED requires our socket to have an address the peer can
	 * report; autobind an abstract one if we were never bound.
	 */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	/* Listener backlog full: block (unless non-blocking) and retry. */
	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops the state lock on other; we still hold a ref. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed while we were unlocked: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock*/
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* queue the embryo sock on the listener and wake it up */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1229 
1230 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1231 {
1232 	struct sock *ska = socka->sk, *skb = sockb->sk;
1233 
1234 	/* Join our sockets back to back */
1235 	sock_hold(ska);
1236 	sock_hold(skb);
1237 	unix_peer(ska) = skb;
1238 	unix_peer(skb) = ska;
1239 	init_peercred(ska);
1240 	init_peercred(skb);
1241 
1242 	if (ska->sk_type != SOCK_DGRAM) {
1243 		ska->sk_state = TCP_ESTABLISHED;
1244 		skb->sk_state = TCP_ESTABLISHED;
1245 		socka->state  = SS_CONNECTED;
1246 		sockb->state  = SS_CONNECTED;
1247 	}
1248 	return 0;
1249 }
1250 
/* accept(2) for AF_UNIX: dequeue one embryo connection.
 *
 * unix_stream_connect() queues a 1-byte skb whose skb->sk is the embryo
 * sock; we pull it off the listener's receive queue with the ordinary
 * datagram machinery, free the skb, and graft the embryo onto @newsock.
 */
static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	/* The embryo sock travels in skb->sk; grab it before freeing. */
	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* A backlog slot just opened: let blocked connectors retry. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}
1292 
1293 
/* getsockname(2) / getpeername(2) for AF_UNIX.
 *
 * When @peer is set, report the connected peer's address (-ENOTCONN if
 * there is none); otherwise report our own.  An unbound socket yields a
 * bare sun_family with zero-length path.  The target sock is pinned with
 * a reference and its state lock is held across the address copy.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		/* unix_peer_get() returns a referenced peer or NULL. */
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		/* Never bound: report AF_UNIX with an empty path. */
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}
1329 
1330 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1331 {
1332 	int i;
1333 
1334 	scm->fp = UNIXCB(skb).fp;
1335 	UNIXCB(skb).fp = NULL;
1336 
1337 	for (i = scm->fp->count-1; i >= 0; i--)
1338 		unix_notinflight(scm->fp->fp[i]);
1339 }
1340 
1341 static void unix_destruct_scm(struct sk_buff *skb)
1342 {
1343 	struct scm_cookie scm;
1344 	memset(&scm, 0, sizeof(scm));
1345 	scm.pid  = UNIXCB(skb).pid;
1346 	scm.cred = UNIXCB(skb).cred;
1347 	if (UNIXCB(skb).fp)
1348 		unix_detach_fds(&scm, skb);
1349 
1350 	/* Alas, it calls VFS */
1351 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1352 	scm_destroy(&scm);
1353 	sock_wfree(skb);
1354 }
1355 
1356 #define MAX_RECURSION_LEVEL 4
1357 
/* Attach the fds in @scm to @skb for SCM_RIGHTS passing.
 *
 * Returns the maximum recursion level seen among the passed unix sockets
 * (so the caller can cap sockets-passed-over-sockets chains), or a
 * negative errno.  -ETOOMANYREFS rejects chains deeper than
 * MAX_RECURSION_LEVEL; -ENOMEM if the fd array cannot be duplicated.
 */
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	/* Scan for AF_UNIX sockets among the fds and track the deepest
	 * recursion level already recorded on any of them.
	 */
	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	/* Mark every fd in flight so the unix GC accounts for it. */
	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}
1391 
1392 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1393 {
1394 	int err = 0;
1395 
1396 	UNIXCB(skb).pid  = get_pid(scm->pid);
1397 	if (scm->cred)
1398 		UNIXCB(skb).cred = get_cred(scm->cred);
1399 	UNIXCB(skb).fp = NULL;
1400 	if (scm->fp && send_fds)
1401 		err = unix_attach_fds(scm, skb);
1402 
1403 	skb->destructor = unix_destruct_scm;
1404 	return err;
1405 }
1406 
1407 /*
1408  * Some apps rely on write() giving SCM_CREDENTIALS
1409  * We include credentials if source or destination socket
1410  * asserted SOCK_PASSCRED.
1411  */
1412 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1413 			    const struct sock *other)
1414 {
1415 	if (UNIXCB(skb).cred)
1416 		return;
1417 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1418 	    !other->sk_socket ||
1419 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1420 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1421 		UNIXCB(skb).cred = get_current_cred();
1422 	}
1423 }
1424 
1425 /*
1426  *	Send AF_UNIX data.
1427  */
1428 
/* sendmsg() for SOCK_DGRAM (and, via unix_seqpacket_sendmsg, SEQPACKET).
 *
 * Sends one datagram either to the address in msg_name or to the
 * connected peer.  Resources (skb, scm state) are allocated up front;
 * the peer is then located and validated under its state lock, with a
 * restart loop to cope with the peer dying or its queue being full.
 * Returns @len on success or a negative errno.
 */
static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		/* No destination given: must be connected. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Oversized messages spill into paged data beyond SKB_MAX_ALLOC. */
	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		/* If the dead sock was our connected peer, break the
		 * connection and report ECONNREFUSED; otherwise re-resolve
		 * the name and retry.
		 */
		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* Receiver queue full (and not our own connected peer sending back
	 * to us): block until space frees up, then retry from scratch.
	 */
	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		/* Drops the state lock on other; ref is still held. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
1601 
1602 
/* sendmsg() for SOCK_STREAM AF_UNIX sockets.
 *
 * Splits @len bytes into skbs (at most half the send buffer each, capped
 * at SKB_MAX_ALLOC) and queues them on the connected peer's receive
 * queue.  Passed fds ride on the first skb only.  Returns the number of
 * bytes sent, or a negative errno only when nothing was sent; a partial
 * send followed by an error reports the partial count.
 */
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		/* Stream sockets reject explicit destinations. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		/*
		 *	Optimisation for the fact that under 0.01% of X
		 *	messages typically need breaking up.
		 */

		size = len-sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > ((sk->sk_sndbuf >> 1) - 64))
			size = (sk->sk_sndbuf >> 1) - 64;

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;

		/*
		 *	Grab a buffer
		 */

		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
					  &err);

		if (skb == NULL)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		size = min_t(int, size, skb_tailroom(skb));


		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		/* Re-check liveness under the peer's lock before queueing. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* SIGPIPE only if nothing was sent and the caller didn't opt out. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}
1722 
1723 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1724 				  struct msghdr *msg, size_t len)
1725 {
1726 	int err;
1727 	struct sock *sk = sock->sk;
1728 
1729 	err = sock_error(sk);
1730 	if (err)
1731 		return err;
1732 
1733 	if (sk->sk_state != TCP_ESTABLISHED)
1734 		return -ENOTCONN;
1735 
1736 	if (msg->msg_namelen)
1737 		msg->msg_namelen = 0;
1738 
1739 	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1740 }
1741 
1742 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1743 			      struct msghdr *msg, size_t size,
1744 			      int flags)
1745 {
1746 	struct sock *sk = sock->sk;
1747 
1748 	if (sk->sk_state != TCP_ESTABLISHED)
1749 		return -ENOTCONN;
1750 
1751 	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1752 }
1753 
1754 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1755 {
1756 	struct unix_sock *u = unix_sk(sk);
1757 
1758 	msg->msg_namelen = 0;
1759 	if (u->addr) {
1760 		msg->msg_namelen = u->addr->len;
1761 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1762 	}
1763 }
1764 
/* recvmsg() for SOCK_DGRAM/SEQPACKET AF_UNIX sockets.
 *
 * Dequeues (or peeks) one datagram, copies up to @size bytes, fills in
 * the sender's address and control messages (creds, fds, security), and
 * wakes senders blocked on a full queue.  u->readlock serialises
 * concurrent readers so peek offsets and fd detachment stay consistent.
 */
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* A queue slot was just consumed: wake writers blocked for space. */
	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Truncate to the caller's buffer; flag MSG_TRUNC if data is lost. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		/* Consuming read: take ownership of any passed fds and
		 * rewind the peek offset past this datagram.
		 */
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}
1863 
1864 /*
1865  *	Sleep until data has arrive. But check for races..
1866  */
1867 
/* Wait on @sk's own wait queue until data arrives, an error/shutdown is
 * posted, a signal is pending, or @timeo expires.
 *
 * The exit conditions are re-checked under the state lock each time
 * around, so a wakeup racing with the check cannot be lost.  Returns the
 * remaining timeout.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		/* Advertise that we are sleeping for data, drop the lock
		 * across the actual sleep, then re-take it for the recheck.
		 */
		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
1895 
1896 
1897 
/* recvmsg() for SOCK_STREAM AF_UNIX sockets.
 *
 * Copies queued skbs into the caller's iovec until @size bytes are
 * delivered, the rcvlowat target is met, or the stream ends.  Messages
 * from different writers (distinct pid/cred) are never glued together;
 * a read also stops at an skb carrying passed fds.  u->readlock keeps
 * concurrent readers from interleaving within the queue.
 */
static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	int copied = 0;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;
	int skip;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	msg->msg_namelen = 0;

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(timeo);
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	do {
		int chunk;
		struct sk_buff *skb;

		unix_state_lock(sk);
		skb = skb_peek(&sk->sk_receive_queue);
again:
		if (skb == NULL) {
			/* Queue drained: safe to reset the fd-passing
			 * recursion level.
			 */
			unix_sk(sk)->recursion_level = 0;
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			err = -EAGAIN;
			if (!timeo)
				break;
			/* Drop readlock while sleeping so writers (and other
			 * readers) are not blocked by us.
			 */
			mutex_unlock(&u->readlock);

			timeo = unix_stream_data_wait(sk, timeo);

			if (signal_pending(current)
			    ||  mutex_lock_interruptible(&u->readlock)) {
				err = sock_intr_errno(timeo);
				goto out;
			}

			continue;
 unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Honour a leftover MSG_PEEK offset spanning whole skbs. */
		if (skip >= skb->len) {
			skip -= skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
			    (UNIXCB(skb).cred != siocb->scm->cred))
				break;
		} else {
			/* Copy credentials */
			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr) {
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, skb->len - skip, size);
		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			skb_pull(skb, chunk);

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			/* Partially consumed skb stays at the queue head. */
			if (skb->len)
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop after delivering fds so they are not mixed
			 * with data from a following message.
			 */
			if (siocb->scm->fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}
2054 
/* shutdown(2) for AF_UNIX.
 *
 * Sets the shutdown bits on @sk and mirrors them (with directions
 * swapped) onto the connected peer of stream/seqpacket sockets, waking
 * both sides' state-change waiters.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	/* Pin the peer under our own lock before touching it unlocked. */
	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		/* Our send shutdown is the peer's receive shutdown and
		 * vice versa.
		 */
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
2100 
2101 long unix_inq_len(struct sock *sk)
2102 {
2103 	struct sk_buff *skb;
2104 	long amount = 0;
2105 
2106 	if (sk->sk_state == TCP_LISTEN)
2107 		return -EINVAL;
2108 
2109 	spin_lock(&sk->sk_receive_queue.lock);
2110 	if (sk->sk_type == SOCK_STREAM ||
2111 	    sk->sk_type == SOCK_SEQPACKET) {
2112 		skb_queue_walk(&sk->sk_receive_queue, skb)
2113 			amount += skb->len;
2114 	} else {
2115 		skb = skb_peek(&sk->sk_receive_queue);
2116 		if (skb)
2117 			amount = skb->len;
2118 	}
2119 	spin_unlock(&sk->sk_receive_queue.lock);
2120 
2121 	return amount;
2122 }
2123 EXPORT_SYMBOL_GPL(unix_inq_len);
2124 
/* SIOCOUTQ semantics: bytes of @sk's send buffer still in flight. */
long unix_outq_len(struct sock *sk)
{
	long pending = sk_wmem_alloc_get(sk);

	return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2130 
2131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2132 {
2133 	struct sock *sk = sock->sk;
2134 	long amount = 0;
2135 	int err;
2136 
2137 	switch (cmd) {
2138 	case SIOCOUTQ:
2139 		amount = unix_outq_len(sk);
2140 		err = put_user(amount, (int __user *)arg);
2141 		break;
2142 	case SIOCINQ:
2143 		amount = unix_inq_len(sk);
2144 		if (amount < 0)
2145 			err = amount;
2146 		else
2147 			err = put_user(amount, (int __user *)arg);
2148 		break;
2149 	default:
2150 		err = -ENOIOCTLCMD;
2151 		break;
2152 	}
2153 	return err;
2154 }
2155 
/* poll() for stream/seqpacket AF_UNIX sockets.
 *
 * Note the standard poll ordering: the waiter is registered with
 * sock_poll_wait() BEFORE any state is sampled, so events arriving
 * between the checks and the sleep still produce a wakeup.
 */
static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}
2190 
/* poll() for datagram (and seqpacket) AF_UNIX sockets.
 *
 * Like unix_poll(), but writability additionally depends on the peer's
 * receive queue, so we may also register on the peer's wait queue.
 */
static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int mask, writable;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
		return mask;

	writable = unix_writable(sk);
	other = unix_peer_get(sk);
	if (other) {
		/* Asymmetric connection (peer not connected back to us):
		 * a full peer queue blocks our sends, so wait on the
		 * peer's queue too.
		 */
		if (unix_peer(other) != sk) {
			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
			if (unix_recvq_full(other))
				writable = 0;
		}
		sock_put(other);
	}

	if (writable)
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}
2243 
2244 #ifdef CONFIG_PROC_FS
2245 
2246 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2247 
2248 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2249 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2250 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2251 
2252 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2253 {
2254 	unsigned long offset = get_offset(*pos);
2255 	unsigned long bucket = get_bucket(*pos);
2256 	struct sock *sk;
2257 	unsigned long count = 0;
2258 
2259 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2260 		if (sock_net(sk) != seq_file_net(seq))
2261 			continue;
2262 		if (++count == offset)
2263 			break;
2264 	}
2265 
2266 	return sk;
2267 }
2268 
/* Advance the /proc iterator: continue within @sk's hash chain, skipping
 * foreign-namespace sockets, and fall through to subsequent buckets when
 * the chain is exhausted.
 *
 * NOTE: the "goto next_bucket" deliberately jumps INTO the do-while below
 * to resume bucket scanning from the current position.
 */
static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	/* @sk may be NULL or SEQ_START_TOKEN; only walk a real socket. */
	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		/* Move the encoded cursor to entry 1 of the next bucket. */
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}
2295 
2296 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2297 	__acquires(unix_table_lock)
2298 {
2299 	spin_lock(&unix_table_lock);
2300 
2301 	if (!*pos)
2302 		return SEQ_START_TOKEN;
2303 
2304 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2305 		return NULL;
2306 
2307 	return unix_next_socket(seq, NULL, pos);
2308 }
2309 
2310 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2311 {
2312 	++*pos;
2313 	return unix_next_socket(seq, v, pos);
2314 }
2315 
/* seq_file ->stop(): drop the table lock taken in unix_seq_start(). */
static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
2321 
2322 static int unix_seq_show(struct seq_file *seq, void *v)
2323 {
2324 
2325 	if (v == SEQ_START_TOKEN)
2326 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2327 			 "Inode Path\n");
2328 	else {
2329 		struct sock *s = v;
2330 		struct unix_sock *u = unix_sk(s);
2331 		unix_state_lock(s);
2332 
2333 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2334 			s,
2335 			atomic_read(&s->sk_refcnt),
2336 			0,
2337 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2338 			s->sk_type,
2339 			s->sk_socket ?
2340 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2341 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2342 			sock_i_ino(s));
2343 
2344 		if (u->addr) {
2345 			int i, len;
2346 			seq_putc(seq, ' ');
2347 
2348 			i = 0;
2349 			len = u->addr->len - sizeof(short);
2350 			if (!UNIX_ABSTRACT(s))
2351 				len--;
2352 			else {
2353 				seq_putc(seq, '@');
2354 				i++;
2355 			}
2356 			for ( ; i < len; i++)
2357 				seq_putc(seq, u->addr->name->sun_path[i]);
2358 		}
2359 		unix_state_unlock(s);
2360 		seq_putc(seq, '\n');
2361 	}
2362 
2363 	return 0;
2364 }
2365 
/* seq_file callbacks implementing the /proc/net/unix iterator. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
2372 
2373 static int unix_seq_open(struct inode *inode, struct file *file)
2374 {
2375 	return seq_open_net(inode, file, &unix_seq_ops,
2376 			    sizeof(struct seq_net_private));
2377 }
2378 
/* file_operations for /proc/net/unix, backed by the seq_file machinery. */
static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
2386 
2387 #endif
2388 
/* Registered via sock_register(): handles socket creation for PF_UNIX. */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
2394 
2395 
2396 static int __net_init unix_net_init(struct net *net)
2397 {
2398 	int error = -ENOMEM;
2399 
2400 	net->unx.sysctl_max_dgram_qlen = 10;
2401 	if (unix_sysctl_register(net))
2402 		goto out;
2403 
2404 #ifdef CONFIG_PROC_FS
2405 	if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) {
2406 		unix_sysctl_unregister(net);
2407 		goto out;
2408 	}
2409 #endif
2410 	error = 0;
2411 out:
2412 	return error;
2413 }
2414 
/* Per-network-namespace teardown: undo what unix_net_init() set up. */
static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	proc_net_remove(net, "unix");
}
2420 
/* Hooks invoked for every network namespace as it comes and goes. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
2425 
2426 static int __init af_unix_init(void)
2427 {
2428 	int rc = -1;
2429 	struct sk_buff *dummy_skb;
2430 
2431 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
2432 
2433 	rc = proto_register(&unix_proto, 1);
2434 	if (rc != 0) {
2435 		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2436 		       __func__);
2437 		goto out;
2438 	}
2439 
2440 	sock_register(&unix_family_ops);
2441 	register_pernet_subsys(&unix_net_ops);
2442 out:
2443 	return rc;
2444 }
2445 
/* Module unload: unregister everything af_unix_init() registered. */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
2452 
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket.  But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
/* Lets modprobe resolve "net-pf-1" requests to this module. */
MODULE_ALIAS_NETPROTO(PF_UNIX);
2462