/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it avoids keeping a huge
 *					number of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina   :	Hash function optimizations
 *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie   :	Set peercred for socketpair
 *	     Michal Ostrowski   :       Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *	     				the core infrastructure is doing that
 *	     				for all net proto families now (2.5.69+)
 *
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD first socket
 *		fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this against the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed the server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
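
/*
 * For illustration, binding an abstract address from userspace might look
 * like the sketch below (the name "myname" is made up). The name starts
 * with a NUL byte, is not zero terminated, and its length is carried
 * explicitly in addrlen:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	memcpy(a.sun_path, "\0myname", 7);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 7);
 */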

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 *  SMP locking strategy:
 *    the hash table is protected with the spinlock unix_table_lock;
 *    each socket state is protected by a separate spinlock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)n;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- it should not be of zero length.
 *		- if it does not start with a zero byte, it should be
 *		  NUL terminated (an FS object).
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
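
/*
 * For example, binding to the filesystem path "/tmp/sock" gives
 * unix_mkname() a NUL terminated sun_path, so it returns
 * strlen("/tmp/sock") + 1 + sizeof(short) == 9 + 1 + 2 == 12, while an
 * abstract name keeps the caller-supplied length as-is and the whole
 * buffer, including the leading zero byte, feeds the hash.
 */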

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && dentry->d_inode == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}

static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
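
/*
 * That is, the socket counts as writable while queued write memory stays
 * at or below a quarter of sk_sndbuf; e.g. with a 64 KiB send buffer, up
 * to 16 KiB of skbs may be outstanding before poll() stops reporting
 * POLLOUT.
 */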

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from the previous peer. First, this allows flow
 * control based solely on wmem_alloc; second, an sk connected to a peer
 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}


static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
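		/* fall through: a SOCK_RAW socket is served as SOCK_DGRAM */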
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
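
/*
 * A socket autobound above ends up with an abstract name of the form
 * "\0" plus five hex digits taken from the global ordernum counter;
 * tools such as ss(8) typically display it with a leading '@' standing
 * in for the NUL byte (e.g. "@00fa3").
 */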

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = path.dentry->d_inode;
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
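
/* Both locks are taken in ascending address order, so two tasks locking
 * the same pair from opposite ends acquire them in the same order and
 * cannot ABBA-deadlock; the nested annotation only informs lockdep.
 */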

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we were to allocate after the state is locked,
	   we would have to recheck everything again anyway.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. This is dangerous because a deadlock is
	   possible. Connect-to-self and simultaneous connect
	   attempts are eliminated by checking the socket
	   state. other is TCP_LISTEN, and if sk is TCP_LISTEN we
	   check this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}
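
/* The recursion level tracks how deeply AF_UNIX sockets are nested via
 * SCM_RIGHTS (a socket passed over a socket that is itself in flight,
 * and so on). Refusing anything deeper than MAX_RECURSION_LEVEL with
 * -ETOOMANYREFS keeps the garbage collector's work bounded.
 */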

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
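
/*
 * A receiver opts in from userspace roughly like this (sketch; error
 * handling omitted):
 *
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *
 * recvmsg() then yields a control message with cmsg_level SOL_SOCKET,
 * cmsg_type SCM_CREDENTIALS and a struct ucred payload holding the
 * pid/uid/gid set up by maybe_add_creds() above.
 */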

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error be?
		 */
		unix_state_unlock(other);
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
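
/*
 * E.g. with 4 KiB pages, get_order(32768) == 3, so UNIX_SKB_FRAGS_SZ is
 * 4096 << 3 == 32768 bytes of paged data per skb; with 64 KiB pages
 * get_order(32768) == 0 and a single page is used.
 */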

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
						   sent, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
1906 
1907 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1908 			       struct msghdr *msg, size_t size,
1909 			       int flags)
1910 {
1911 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1912 	struct scm_cookie tmp_scm;
1913 	struct sock *sk = sock->sk;
1914 	struct unix_sock *u = unix_sk(sk);
1915 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1916 	int copied = 0;
1917 	int check_creds = 0;
1918 	int target;
1919 	int err = 0;
1920 	long timeo;
1921 	int skip;
1922 
1923 	err = -EINVAL;
1924 	if (sk->sk_state != TCP_ESTABLISHED)
1925 		goto out;
1926 
1927 	err = -EOPNOTSUPP;
1928 	if (flags&MSG_OOB)
1929 		goto out;
1930 
1931 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1932 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1933 
1934 	/* Lock the socket to prevent queue disordering
1935 	 * while sleeps in memcpy_tomsg
1936 	 */
1937 
1938 	if (!siocb->scm) {
1939 		siocb->scm = &tmp_scm;
1940 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1941 	}
1942 
1943 	err = mutex_lock_interruptible(&u->readlock);
1944 	if (err) {
1945 		err = sock_intr_errno(timeo);
1946 		goto out;
1947 	}
1948 
1949 	do {
1950 		int chunk;
1951 		struct sk_buff *skb, *last;
1952 
1953 		unix_state_lock(sk);
1954 		last = skb = skb_peek(&sk->sk_receive_queue);
1955 again:
1956 		if (skb == NULL) {
1957 			unix_sk(sk)->recursion_level = 0;
1958 			if (copied >= target)
1959 				goto unlock;
1960 
1961 			/*
1962 			 *	POSIX 1003.1g mandates this order.
1963 			 */
1964 
1965 			err = sock_error(sk);
1966 			if (err)
1967 				goto unlock;
1968 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1969 				goto unlock;
1970 
1971 			unix_state_unlock(sk);
1972 			err = -EAGAIN;
1973 			if (!timeo)
1974 				break;
1975 			mutex_unlock(&u->readlock);
1976 
1977 			timeo = unix_stream_data_wait(sk, timeo, last);
1978 
1979 			if (signal_pending(current)
1980 			    ||  mutex_lock_interruptible(&u->readlock)) {
1981 				err = sock_intr_errno(timeo);
1982 				goto out;
1983 			}
1984 
1985 			continue;
1986  unlock:
1987 			unix_state_unlock(sk);
1988 			break;
1989 		}
1990 
1991 		skip = sk_peek_offset(sk, flags);
1992 		while (skip >= unix_skb_len(skb)) {
1993 			skip -= unix_skb_len(skb);
1994 			last = skb;
1995 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1996 			if (!skb)
1997 				goto again;
1998 		}
1999 
2000 		unix_state_unlock(sk);
2001 
2002 		if (check_creds) {
2003 			/* Never glue messages from different writers */
2004 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
2005 			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
2006 			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
2007 				break;
2008 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2009 			/* Copy credentials */
2010 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2011 			check_creds = 1;
2012 		}
2013 
2014 		/* Copy address just once */
2015 		if (sunaddr) {
2016 			unix_copy_addr(msg, skb->sk);
2017 			sunaddr = NULL;
2018 		}
2019 
2020 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2021 		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2022 					    msg->msg_iov, chunk)) {
2023 			if (copied == 0)
2024 				copied = -EFAULT;
2025 			break;
2026 		}
2027 		copied += chunk;
2028 		size -= chunk;
2029 
2030 		/* Mark read part of skb as used */
2031 		if (!(flags & MSG_PEEK)) {
2032 			UNIXCB(skb).consumed += chunk;
2033 
2034 			sk_peek_offset_bwd(sk, chunk);
2035 
2036 			if (UNIXCB(skb).fp)
2037 				unix_detach_fds(siocb->scm, skb);
2038 
2039 			if (unix_skb_len(skb))
2040 				break;
2041 
2042 			skb_unlink(skb, &sk->sk_receive_queue);
2043 			consume_skb(skb);
2044 
2045 			if (siocb->scm->fp)
2046 				break;
2047 		} else {
2048 			/* This is questionable; see the note in unix_dgram_recvmsg().
2049 			 */
2050 			if (UNIXCB(skb).fp)
2051 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2052 
2053 			sk_peek_offset_fwd(sk, chunk);
2054 
2055 			break;
2056 		}
2057 	} while (size);
2058 
2059 	mutex_unlock(&u->readlock);
2060 	scm_recv(sock, msg, siocb->scm, flags);
2061 out:
2062 	return copied ? : err;
2063 }
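
/*
 * Userspace view of the MSG_PEEK handling above (illustrative only,
 * error handling omitted): with SO_PEEK_OFF enabled, successive peeks
 * advance through the queue via sk_peek_offset_fwd(), and an ordinary
 * read rewinds the offset via sk_peek_offset_bwd():
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 *	recv(fd, buf, 32, 0);		// consumes 0..31, offset rewinds
 */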
2064 
2065 static int unix_shutdown(struct socket *sock, int mode)
2066 {
2067 	struct sock *sk = sock->sk;
2068 	struct sock *other;
2069 
2070 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2071 		return -EINVAL;
2072 	/* This maps:
2073 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2074 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2075 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2076 	 */
2077 	++mode;
2078 
2079 	unix_state_lock(sk);
2080 	sk->sk_shutdown |= mode;
2081 	other = unix_peer(sk);
2082 	if (other)
2083 		sock_hold(other);
2084 	unix_state_unlock(sk);
2085 	sk->sk_state_change(sk);
2086 
2087 	if (other &&
2088 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2089 
2090 		int peer_mode = 0;
2091 
2092 		if (mode&RCV_SHUTDOWN)
2093 			peer_mode |= SEND_SHUTDOWN;
2094 		if (mode&SEND_SHUTDOWN)
2095 			peer_mode |= RCV_SHUTDOWN;
2096 		unix_state_lock(other);
2097 		other->sk_shutdown |= peer_mode;
2098 		unix_state_unlock(other);
2099 		other->sk_state_change(other);
2100 		if (peer_mode == SHUTDOWN_MASK)
2101 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2102 		else if (peer_mode & RCV_SHUTDOWN)
2103 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2104 	}
2105 	if (other)
2106 		sock_put(other);
2107 
2108 	return 0;
2109 }
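
/*
 * Example (userspace, illustrative only): shutdown(fd, SHUT_WR) sets
 * SEND_SHUTDOWN locally and, on stream/seqpacket sockets, RCV_SHUTDOWN
 * on the peer, so the peer sees EOF while this end can still receive:
 *
 *	shutdown(fd, SHUT_WR);		// no further writes from this end
 *	read(fd, buf, len);		// still valid on this end
 *	// the peer's read() returns 0 once its queue drains
 */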
2110 
2111 long unix_inq_len(struct sock *sk)
2112 {
2113 	struct sk_buff *skb;
2114 	long amount = 0;
2115 
2116 	if (sk->sk_state == TCP_LISTEN)
2117 		return -EINVAL;
2118 
2119 	spin_lock(&sk->sk_receive_queue.lock);
2120 	if (sk->sk_type == SOCK_STREAM ||
2121 	    sk->sk_type == SOCK_SEQPACKET) {
2122 		skb_queue_walk(&sk->sk_receive_queue, skb)
2123 			amount += unix_skb_len(skb);
2124 	} else {
2125 		skb = skb_peek(&sk->sk_receive_queue);
2126 		if (skb)
2127 			amount = skb->len;
2128 	}
2129 	spin_unlock(&sk->sk_receive_queue.lock);
2130 
2131 	return amount;
2132 }
2133 EXPORT_SYMBOL_GPL(unix_inq_len);
2134 
2135 long unix_outq_len(struct sock *sk)
2136 {
2137 	return sk_wmem_alloc_get(sk);
2138 }
2139 EXPORT_SYMBOL_GPL(unix_outq_len);
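
/*
 * Note that AF_UNIX has no transmit queue in the usual sense: an skb
 * remains charged to the sender's sk_wmem_alloc until the receiver
 * consumes it, so unix_outq_len() effectively reports bytes the peer
 * has not read yet.
 */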
2140 
2141 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2142 {
2143 	struct sock *sk = sock->sk;
2144 	long amount = 0;
2145 	int err;
2146 
2147 	switch (cmd) {
2148 	case SIOCOUTQ:
2149 		amount = unix_outq_len(sk);
2150 		err = put_user(amount, (int __user *)arg);
2151 		break;
2152 	case SIOCINQ:
2153 		amount = unix_inq_len(sk);
2154 		if (amount < 0)
2155 			err = amount;
2156 		else
2157 			err = put_user(amount, (int __user *)arg);
2158 		break;
2159 	default:
2160 		err = -ENOIOCTLCMD;
2161 		break;
2162 	}
2163 	return err;
2164 }
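
/*
 * Example (userspace, illustrative only): SIOCINQ/SIOCOUTQ map onto the
 * helpers above.  For SOCK_STREAM, SIOCINQ sums the unread part of every
 * queued skb; for SOCK_DGRAM it reports only the first datagram:
 *
 *	int inq, outq;
 *	ioctl(fd, SIOCINQ, &inq);	// bytes waiting to be read
 *	ioctl(fd, SIOCOUTQ, &outq);	// bytes the peer has not consumed
 */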
2165 
2166 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2167 {
2168 	struct sock *sk = sock->sk;
2169 	unsigned int mask;
2170 
2171 	sock_poll_wait(file, sk_sleep(sk), wait);
2172 	mask = 0;
2173 
2174 	/* exceptional events? */
2175 	if (sk->sk_err)
2176 		mask |= POLLERR;
2177 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2178 		mask |= POLLHUP;
2179 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2180 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2181 
2182 	/* readable? */
2183 	if (!skb_queue_empty(&sk->sk_receive_queue))
2184 		mask |= POLLIN | POLLRDNORM;
2185 
2186 	/* Connection-based sockets need to check for termination and startup */
2187 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2188 	    sk->sk_state == TCP_CLOSE)
2189 		mask |= POLLHUP;
2190 
2191 	/*
2192 	 * We also mark the socket writable when the other side has shut
2193 	 * down the connection. This prevents stuck sockets.
2194 	 */
2195 	if (unix_writable(sk))
2196 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2197 
2198 	return mask;
2199 }
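
/*
 * Example (userspace, illustrative only): the POLLRDHUP bit set above
 * lets a reader distinguish "peer half-closed" from mere readability:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLRDHUP)
 *		;	// peer shut down writing; read() drains, then 0
 */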
2200 
2201 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2202 				    poll_table *wait)
2203 {
2204 	struct sock *sk = sock->sk, *other;
2205 	unsigned int mask, writable;
2206 
2207 	sock_poll_wait(file, sk_sleep(sk), wait);
2208 	mask = 0;
2209 
2210 	/* exceptional events? */
2211 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2212 		mask |= POLLERR |
2213 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2214 
2215 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2216 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2217 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2218 		mask |= POLLHUP;
2219 
2220 	/* readable? */
2221 	if (!skb_queue_empty(&sk->sk_receive_queue))
2222 		mask |= POLLIN | POLLRDNORM;
2223 
2224 	/* Connection-based sockets need to check for termination and startup */
2225 	if (sk->sk_type == SOCK_SEQPACKET) {
2226 		if (sk->sk_state == TCP_CLOSE)
2227 			mask |= POLLHUP;
2228 		/* connection hasn't started yet? */
2229 		if (sk->sk_state == TCP_SYN_SENT)
2230 			return mask;
2231 	}
2232 
2233 	/* No write status requested, avoid expensive OUT tests. */
2234 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2235 		return mask;
2236 
2237 	writable = unix_writable(sk);
2238 	other = unix_peer_get(sk);
2239 	if (other) {
2240 		if (unix_peer(other) != sk) {
2241 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2242 			if (unix_recvq_full(other))
2243 				writable = 0;
2244 		}
2245 		sock_put(other);
2246 	}
2247 
2248 	if (writable)
2249 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2250 	else
2251 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2252 
2253 	return mask;
2254 }
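
/*
 * The unix_peer(other) != sk test above detects a non-symmetric (n:1,
 * /dev/log style) connection: only then is writability throttled on the
 * peer's receive queue, and the poll is also registered on the peer's
 * peer_wait queue so that a reader draining that queue wakes us up.
 */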
2255 
2256 #ifdef CONFIG_PROC_FS
2257 
2258 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2259 
2260 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2261 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2262 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
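
/*
 * The seq_file position encodes (bucket, offset-within-bucket) in one
 * loff_t: the hash bucket occupies the top bits and the offset the low
 * BUCKET_SPACE bits.  E.g. on 64-bit with UNIX_HASH_BITS == 8,
 * BUCKET_SPACE is 54, so set_bucket_offset(3, 2) yields (3 << 54) | 2
 * and get_bucket()/get_offset() recover 3 and 2 (values illustrative).
 */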
2263 
2264 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2265 {
2266 	unsigned long offset = get_offset(*pos);
2267 	unsigned long bucket = get_bucket(*pos);
2268 	struct sock *sk;
2269 	unsigned long count = 0;
2270 
2271 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2272 		if (sock_net(sk) != seq_file_net(seq))
2273 			continue;
2274 		if (++count == offset)
2275 			break;
2276 	}
2277 
2278 	return sk;
2279 }
2280 
2281 static struct sock *unix_next_socket(struct seq_file *seq,
2282 				     struct sock *sk,
2283 				     loff_t *pos)
2284 {
2285 	unsigned long bucket;
2286 
2287 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2288 		sk = sk_next(sk);
2289 		if (!sk)
2290 			goto next_bucket;
2291 		if (sock_net(sk) == seq_file_net(seq))
2292 			return sk;
2293 	}
2294 
2295 	do {
2296 		sk = unix_from_bucket(seq, pos);
2297 		if (sk)
2298 			return sk;
2299 
2300 next_bucket:
2301 		bucket = get_bucket(*pos) + 1;
2302 		*pos = set_bucket_offset(bucket, 1);
2303 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2304 
2305 	return NULL;
2306 }
2307 
2308 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2309 	__acquires(unix_table_lock)
2310 {
2311 	spin_lock(&unix_table_lock);
2312 
2313 	if (!*pos)
2314 		return SEQ_START_TOKEN;
2315 
2316 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2317 		return NULL;
2318 
2319 	return unix_next_socket(seq, NULL, pos);
2320 }
2321 
2322 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2323 {
2324 	++*pos;
2325 	return unix_next_socket(seq, v, pos);
2326 }
2327 
2328 static void unix_seq_stop(struct seq_file *seq, void *v)
2329 	__releases(unix_table_lock)
2330 {
2331 	spin_unlock(&unix_table_lock);
2332 }
2333 
2334 static int unix_seq_show(struct seq_file *seq, void *v)
2335 {
2336 
2337 	if (v == SEQ_START_TOKEN)
2338 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2339 			 "Inode Path\n");
2340 	else {
2341 		struct sock *s = v;
2342 		struct unix_sock *u = unix_sk(s);
2343 		unix_state_lock(s);
2344 
2345 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2346 			s,
2347 			atomic_read(&s->sk_refcnt),
2348 			0,
2349 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2350 			s->sk_type,
2351 			s->sk_socket ?
2352 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2353 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2354 			sock_i_ino(s));
2355 
2356 		if (u->addr) {
2357 			int i, len;
2358 			seq_putc(seq, ' ');
2359 
2360 			i = 0;
2361 			len = u->addr->len - sizeof(short);
2362 			if (!UNIX_ABSTRACT(s))
2363 				len--;
2364 			else {
2365 				seq_putc(seq, '@');
2366 				i++;
2367 			}
2368 			for ( ; i < len; i++)
2369 				seq_putc(seq, u->addr->name->sun_path[i]);
2370 		}
2371 		unix_state_unlock(s);
2372 		seq_putc(seq, '\n');
2373 	}
2374 
2375 	return 0;
2376 }
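
/*
 * A sample /proc/net/unix line as produced above (values illustrative);
 * abstract names get a leading '@' and listening sockets show
 * __SO_ACCEPTCON (0x00010000) in Flags:
 *
 * Num       RefCount Protocol Flags    Type St Inode Path
 * ffff8800b9b55000: 00000002 00000000 00010000 0001 01 17899 @/tmp/.X11-unix/X0
 */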
2377 
2378 static const struct seq_operations unix_seq_ops = {
2379 	.start  = unix_seq_start,
2380 	.next   = unix_seq_next,
2381 	.stop   = unix_seq_stop,
2382 	.show   = unix_seq_show,
2383 };
2384 
2385 static int unix_seq_open(struct inode *inode, struct file *file)
2386 {
2387 	return seq_open_net(inode, file, &unix_seq_ops,
2388 			    sizeof(struct seq_net_private));
2389 }
2390 
2391 static const struct file_operations unix_seq_fops = {
2392 	.owner		= THIS_MODULE,
2393 	.open		= unix_seq_open,
2394 	.read		= seq_read,
2395 	.llseek		= seq_lseek,
2396 	.release	= seq_release_net,
2397 };
2398 
2399 #endif
2400 
2401 static const struct net_proto_family unix_family_ops = {
2402 	.family = PF_UNIX,
2403 	.create = unix_create,
2404 	.owner	= THIS_MODULE,
2405 };
2406 
2407 
2408 static int __net_init unix_net_init(struct net *net)
2409 {
2410 	int error = -ENOMEM;
2411 
2412 	net->unx.sysctl_max_dgram_qlen = 10;
2413 	if (unix_sysctl_register(net))
2414 		goto out;
2415 
2416 #ifdef CONFIG_PROC_FS
2417 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2418 		unix_sysctl_unregister(net);
2419 		goto out;
2420 	}
2421 #endif
2422 	error = 0;
2423 out:
2424 	return error;
2425 }
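
/*
 * Each network namespace gets its own datagram backlog limit, exposed
 * via sysctl (illustrative shell usage):
 *
 *	$ sysctl net.unix.max_dgram_qlen
 *	net.unix.max_dgram_qlen = 10
 */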
2426 
2427 static void __net_exit unix_net_exit(struct net *net)
2428 {
2429 	unix_sysctl_unregister(net);
2430 	remove_proc_entry("unix", net->proc_net);
2431 }
2432 
2433 static struct pernet_operations unix_net_ops = {
2434 	.init = unix_net_init,
2435 	.exit = unix_net_exit,
2436 };
2437 
2438 static int __init af_unix_init(void)
2439 {
2440 	int rc = -1;
2441 	int rc;
2442 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2443 
2444 	rc = proto_register(&unix_proto, 1);
2445 	if (rc != 0) {
2446 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2447 		goto out;
2448 	}
2449 
2450 	sock_register(&unix_family_ops);
2451 	register_pernet_subsys(&unix_net_ops);
2452 out:
2453 	return rc;
2454 }
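
/*
 * The BUILD_BUG_ON() above enforces that struct unix_skb_parms fits in
 * the skb control buffer: UNIXCB(skb) casts skb->cb to that struct, so
 * growing unix_skb_parms past sizeof(skb->cb) must break the build
 * rather than silently corrupt neighbouring sk_buff fields.
 */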
2455 
2456 static void __exit af_unix_exit(void)
2457 {
2458 	sock_unregister(PF_UNIX);
2459 	proto_unregister(&unix_proto);
2460 	unregister_pernet_subsys(&unix_net_ops);
2461 }
2462 
2463 /* Earlier than device_initcall() so that other drivers invoking
2464    request_module() don't end up in a loop when modprobe tries
2465    to use a UNIX socket. But later than subsys_initcall() because
2466    we depend on infrastructure initialised there. */
2467 fs_initcall(af_unix_init);
2468 module_exit(af_unix_exit);
2469 
2470 MODULE_LICENSE("GPL");
2471 MODULE_ALIAS_NETPROTO(PF_UNIX);
2472