/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks being hashed (this is for
 *					unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina   :	Hash function optimizations.
 *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs introduced 8)
 *	      Malcolm Beattie   :	Set peercred for socketpair.
 *	     Michal Ostrowski   :       Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT;
 *	     				the core infrastructure is doing that
 *	     				for all net proto families now (2.5.69+).
 *
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns a 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this against the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed the server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
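
/*
 * For illustration only (kept out of the build): a minimal userspace
 * sketch of binding to an abstract name as described above. The name
 * "\0demo" is made up; the point is that abstract names begin with a
 * NUL byte and their length travels in addrlen, not via termination.
 */
#if 0
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

int bind_abstract(int fd)
{
	struct sockaddr_un sun;

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	sun.sun_path[0] = '\0';			/* abstract namespace marker */
	memcpy(sun.sun_path + 1, "demo", 4);	/* name is "\0demo" */

	/* addrlen covers sun_family plus the five name bytes */
	return bind(fd, (struct sockaddr *)&sun,
		    offsetof(struct sockaddr_un, sun_path) + 1 + 4);
}
#endif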

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 *  SMP locking strategy:
 *    the hash table is protected by the spinlock unix_table_lock;
 *    each socket's state is protected by a separate spin lock.
 */
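
/*
 * A minimal sketch of that discipline (mirroring what
 * unix_find_socket_byname() below actually does): hash-chain walks run
 * under unix_table_lock, a reference pins the socket before the table
 * lock is dropped, and the per-socket state lock is taken afterwards.
 */
#if 0
	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);	/* pin before dropping the table lock */
	spin_unlock(&unix_table_lock);
	if (s) {
		unix_state_lock(s);
		/* ... inspect or update per-socket state ... */
		unix_state_unlock(s);
	}
#endif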

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)n;

	hash ^= hash>>16;
	hash ^= hash>>8;
	return hash&(UNIX_HASH_SIZE-1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check a unix socket name:
 *		- it should not be zero length.
 *		- if it does not start with a zero byte, it should be NUL
 *		  terminated (an FS object).
 *		- if it starts with a zero byte, it is an abstract name.
 */
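
/*
 * Concrete examples of what unix_mkname() sees (sun_family occupies the
 * leading sizeof(short) bytes of struct sockaddr_un):
 *
 *	len == sizeof(short), no path  -> rejected here with -EINVAL
 *	                                  (unix_bind() turns that case into
 *	                                  an autobind before calling us)
 *	sun_path = "/tmp/sock"         -> FS name; a NUL is stored at
 *	                                  sunaddr[len], len recomputed
 *	                                  from strlen()
 *	sun_path = "\0demo", len == 7  -> abstract name; all len bytes are
 *	                                  significant and get hashed below
 */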

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && dentry->d_inode == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}

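/* Writable while no more than a quarter of sk_sndbuf is consumed by
 * queued data: the shift below multiplies sk_wmem_alloc by four.
 */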
static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is disconnected,
		 * we signal an error. Messages are lost. Do not do this
		 * when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}

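/*
 * Userspace view of the peek-offset machinery wired up through
 * .set_peek_off below -- a sketch, assuming the SO_PEEK_OFF socket
 * option from Linux 3.4+ headers. With an offset armed, successive
 * MSG_PEEKs walk forward through the queued data instead of re-reading
 * the same bytes:
 */
#if 0
#include <sys/socket.h>

void peek_twice(int fd, char *buf, size_t len)
{
	int off = 0;

	/* arm peek-offset tracking, starting at byte 0 */
	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));

	recv(fd, buf, len, MSG_PEEK);	/* peeks bytes [0, len) */
	recv(fd, buf, len, MSG_PEEK);	/* peeks bytes [len, 2*len) */
}
#endif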

static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW,
		 *	though nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	mutex_lock(&u->readlock);

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}

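/*
 * The userspace-visible effect of autobind, as a sketch (not kernel
 * code): binding with only sun_family supplied selects the five-hex-digit
 * abstract name generated above, which getsockname() then reports:
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>

void show_autobind(int fd)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	socklen_t len = sizeof(sa_family_t);	/* no name: request autobind */

	bind(fd, (struct sockaddr *)&sun, len);

	len = sizeof(sun);
	getsockname(fd, (struct sockaddr *)&sun, &len);
	/* sun.sun_path[0] == '\0', followed by five hex digits */
	printf("autobound to \\0%.5s\n", sun.sun_path + 1);
}
#endif
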
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = path.dentry->d_inode;
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory and calculate the hash for the last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}

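/* Take both state locks in a single global (address) order, so that two
 * tasks connecting a pair of sockets from opposite ends cannot
 * ABBA-deadlock on the two locks.
 */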
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all, allocate resources.
	   If we allocated them after the state is locked,
	   we would have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. This is dangerous because deadlock is
	   possible. Connect-to-self and simultaneous
	   connection attempts are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
	   check this before attempting to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take it and send info to the listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}

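/*
 * The canonical userspace counterpart, as a sketch: socketpair() arrives
 * in unix_socketpair() above with both ends already allocated, so the
 * result is a connected pair with mutual peer credentials:
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

int make_pair(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return -1;

	/* anything written on one end is readable on the other */
	write(sv[0], "ping", 4);
	/* read(sv[1], ...) would now return "ping" */
	return 0;
}
#endif
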
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() has been SMP-safe since last summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}

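/*
 * For reference, the userspace side that produces the scm->fp list
 * consumed above: a minimal SCM_RIGHTS sender (a sketch; error handling
 * omitted, names invented):
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

int send_fd(int sock, int fd_to_pass)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0);
}
#endif
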
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS.
 * We include credentials if the source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}

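/*
 * And the matching consumer: with SO_PASSCRED enabled, the credentials
 * attached by maybe_add_creds() above arrive as SCM_CREDENTIALS
 * ancillary data. A userspace sketch (struct ucred needs _GNU_SOURCE
 * under glibc):
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

void recv_creds(int sock)
{
	char data;
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(struct ucred))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;
	struct ucred cred;
	int on = 1;

	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
	recvmsg(sock, &msg, 0);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS)
			memcpy(&cred, CMSG_DATA(cmsg), sizeof(cred));
	/* cred.pid, cred.uid and cred.gid now describe the sender */
}
#endif
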
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error return?
		 */
		unix_state_unlock(other);
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, with a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
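
/* Worked out: with 4 KiB pages, get_order(32768) == 3, so the limit is
 * 4096 << 3 == 32768 bytes; with larger page sizes get_order() returns 0
 * and the limit rounds up to one full page.
 */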

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
						   sent, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (the old
		     strategy, apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly, however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}

static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	int copied = 0;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;
	int skip;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	/* Lock the socket to prevent queue disordering
	 * while we sleep in memcpy_tomsg
	 */
1931 
1932 	if (!siocb->scm) {
1933 		siocb->scm = &tmp_scm;
1934 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1935 	}
1936 
1937 	err = mutex_lock_interruptible(&u->readlock);
1938 	if (err) {
1939 		err = sock_intr_errno(timeo);
1940 		goto out;
1941 	}
1942 
1943 	do {
1944 		int chunk;
1945 		struct sk_buff *skb, *last;
1946 
1947 		unix_state_lock(sk);
1948 		last = skb = skb_peek(&sk->sk_receive_queue);
1949 again:
1950 		if (skb == NULL) {
1951 			unix_sk(sk)->recursion_level = 0;
1952 			if (copied >= target)
1953 				goto unlock;
1954 
1955 			/*
1956 			 *	POSIX 1003.1g mandates this order.
1957 			 */
1958 
1959 			err = sock_error(sk);
1960 			if (err)
1961 				goto unlock;
1962 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1963 				goto unlock;
1964 
1965 			unix_state_unlock(sk);
1966 			err = -EAGAIN;
1967 			if (!timeo)
1968 				break;
1969 			mutex_unlock(&u->readlock);
1970 
1971 			timeo = unix_stream_data_wait(sk, timeo, last);
1972 
1973 			if (signal_pending(current)
1974 			    ||  mutex_lock_interruptible(&u->readlock)) {
1975 				err = sock_intr_errno(timeo);
1976 				goto out;
1977 			}
1978 
1979 			continue;
1980  unlock:
1981 			unix_state_unlock(sk);
1982 			break;
1983 		}
1984 
1985 		skip = sk_peek_offset(sk, flags);
1986 		while (skip >= unix_skb_len(skb)) {
1987 			skip -= unix_skb_len(skb);
1988 			last = skb;
1989 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1990 			if (!skb)
1991 				goto again;
1992 		}
1993 
1994 		unix_state_unlock(sk);
1995 
1996 		if (check_creds) {
1997 			/* Never glue messages from different writers */
1998 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1999 			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
2000 			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
2001 				break;
2002 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2003 			/* Copy credentials */
2004 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2005 			check_creds = 1;
2006 		}
2007 
2008 		/* Copy address just once */
2009 		if (sunaddr) {
2010 			unix_copy_addr(msg, skb->sk);
2011 			sunaddr = NULL;
2012 		}
2013 
2014 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2015 		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2016 					    msg->msg_iov, chunk)) {
2017 			if (copied == 0)
2018 				copied = -EFAULT;
2019 			break;
2020 		}
2021 		copied += chunk;
2022 		size -= chunk;
2023 
2024 		/* Mark read part of skb as used */
2025 		if (!(flags & MSG_PEEK)) {
2026 			UNIXCB(skb).consumed += chunk;
2027 
2028 			sk_peek_offset_bwd(sk, chunk);
2029 
2030 			if (UNIXCB(skb).fp)
2031 				unix_detach_fds(siocb->scm, skb);
2032 
2033 			if (unix_skb_len(skb))
2034 				break;
2035 
2036 			skb_unlink(skb, &sk->sk_receive_queue);
2037 			consume_skb(skb);
2038 
2039 			if (siocb->scm->fp)
2040 				break;
2041 		} else {
2042 			/* This is questionable; see the note in unix_dgram_recvmsg. */
2044 			if (UNIXCB(skb).fp)
2045 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2046 
2047 			sk_peek_offset_fwd(sk, chunk);
2048 
2049 			break;
2050 		}
2051 	} while (size);
2052 
2053 	mutex_unlock(&u->readlock);
2054 	scm_recv(sock, msg, siocb->scm, flags);
2055 out:
2056 	return copied ? : err;
2057 }
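
/* Userspace sketch (hypothetical buffers): the sk_peek_offset_fwd/bwd
 * calls above implement SO_PEEK_OFF, so repeated MSG_PEEK reads walk
 * forward through the queued data instead of re-reading it:
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 *	recv(fd, buf, 16, 0);		// consumes 0..15, offset drops to 16
 */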
2058 
2059 static int unix_shutdown(struct socket *sock, int mode)
2060 {
2061 	struct sock *sk = sock->sk;
2062 	struct sock *other;
2063 
2064 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2065 		return -EINVAL;
2066 	/* This maps:
2067 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2068 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2069 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2070 	 */
2071 	++mode;
2072 
2073 	unix_state_lock(sk);
2074 	sk->sk_shutdown |= mode;
2075 	other = unix_peer(sk);
2076 	if (other)
2077 		sock_hold(other);
2078 	unix_state_unlock(sk);
2079 	sk->sk_state_change(sk);
2080 
2081 	if (other &&
2082 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2083 
2084 		int peer_mode = 0;
2085 
2086 		if (mode & RCV_SHUTDOWN)
2087 			peer_mode |= SEND_SHUTDOWN;
2088 		if (mode & SEND_SHUTDOWN)
2089 			peer_mode |= RCV_SHUTDOWN;
2090 		unix_state_lock(other);
2091 		other->sk_shutdown |= peer_mode;
2092 		unix_state_unlock(other);
2093 		other->sk_state_change(other);
2094 		if (peer_mode == SHUTDOWN_MASK)
2095 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2096 		else if (peer_mode & RCV_SHUTDOWN)
2097 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2098 	}
2099 	if (other)
2100 		sock_put(other);
2101 
2102 	return 0;
2103 }
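
/* Userspace sketch: because of the RCV/SEND cross-mapping above, shutting
 * down our send side shows up as a receive shutdown on the peer:
 *
 *	shutdown(fd, SHUT_WR);
 *	// peer: poll() now reports POLLIN (and POLLRDHUP), and read()
 *	// returns 0 once the queued data has been drained.
 */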
2104 
2105 long unix_inq_len(struct sock *sk)
2106 {
2107 	struct sk_buff *skb;
2108 	long amount = 0;
2109 
2110 	if (sk->sk_state == TCP_LISTEN)
2111 		return -EINVAL;
2112 
2113 	spin_lock(&sk->sk_receive_queue.lock);
2114 	if (sk->sk_type == SOCK_STREAM ||
2115 	    sk->sk_type == SOCK_SEQPACKET) {
2116 		skb_queue_walk(&sk->sk_receive_queue, skb)
2117 			amount += unix_skb_len(skb);
2118 	} else {
2119 		skb = skb_peek(&sk->sk_receive_queue);
2120 		if (skb)
2121 			amount = skb->len;
2122 	}
2123 	spin_unlock(&sk->sk_receive_queue.lock);
2124 
2125 	return amount;
2126 }
2127 EXPORT_SYMBOL_GPL(unix_inq_len);
2128 
2129 long unix_outq_len(struct sock *sk)
2130 {
2131 	return sk_wmem_alloc_get(sk);
2132 }
2133 EXPORT_SYMBOL_GPL(unix_outq_len);
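
/* Semantics sketch (hypothetical queue): with unread skbs of 100 and 50
 * bytes queued, unix_inq_len() reports 150 for SOCK_STREAM/SOCK_SEQPACKET
 * (total unread bytes) but only 100 for SOCK_DGRAM (the size of the next
 * datagram that recv() would return).
 */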
2134 
2135 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2136 {
2137 	struct sock *sk = sock->sk;
2138 	long amount = 0;
2139 	int err;
2140 
2141 	switch (cmd) {
2142 	case SIOCOUTQ:
2143 		amount = unix_outq_len(sk);
2144 		err = put_user(amount, (int __user *)arg);
2145 		break;
2146 	case SIOCINQ:
2147 		amount = unix_inq_len(sk);
2148 		if (amount < 0)
2149 			err = amount;
2150 		else
2151 			err = put_user(amount, (int __user *)arg);
2152 		break;
2153 	default:
2154 		err = -ENOIOCTLCMD;
2155 		break;
2156 	}
2157 	return err;
2158 }
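
/* Userspace sketch: SIOCINQ (== FIONREAD) and SIOCOUTQ are the only
 * ioctls handled here; anything else falls through to -ENOIOCTLCMD:
 *
 *	int inq, outq;
 *	ioctl(fd, SIOCINQ, &inq);	// unread bytes, see unix_inq_len()
 *	ioctl(fd, SIOCOUTQ, &outq);	// sent but not yet consumed bytes
 */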
2159 
2160 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2161 {
2162 	struct sock *sk = sock->sk;
2163 	unsigned int mask;
2164 
2165 	sock_poll_wait(file, sk_sleep(sk), wait);
2166 	mask = 0;
2167 
2168 	/* exceptional events? */
2169 	if (sk->sk_err)
2170 		mask |= POLLERR;
2171 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2172 		mask |= POLLHUP;
2173 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2174 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2175 
2176 	/* readable? */
2177 	if (!skb_queue_empty(&sk->sk_receive_queue))
2178 		mask |= POLLIN | POLLRDNORM;
2179 
2180 	/* Connection-based sockets need to check for termination and startup */
2181 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2182 	    sk->sk_state == TCP_CLOSE)
2183 		mask |= POLLHUP;
2184 
2185 	/*
2186 	 * We report the socket as writable even after the other side has
2187 	 * shut down the connection; this prevents writers from getting stuck.
2188 	 */
2189 	if (unix_writable(sk))
2190 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2191 
2192 	return mask;
2193 }
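
/* Userspace sketch: POLLRDHUP (glibc: needs _GNU_SOURCE) lets a caller
 * tell "peer shut down writing" apart from ordinary readable data:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLRDHUP)
 *		handle_peer_shutdown();	// hypothetical helper
 */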
2194 
2195 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2196 				    poll_table *wait)
2197 {
2198 	struct sock *sk = sock->sk, *other;
2199 	unsigned int mask, writable;
2200 
2201 	sock_poll_wait(file, sk_sleep(sk), wait);
2202 	mask = 0;
2203 
2204 	/* exceptional events? */
2205 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2206 		mask |= POLLERR |
2207 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2208 
2209 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2210 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2211 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2212 		mask |= POLLHUP;
2213 
2214 	/* readable? */
2215 	if (!skb_queue_empty(&sk->sk_receive_queue))
2216 		mask |= POLLIN | POLLRDNORM;
2217 
2218 	/* Connection-based sockets need to check for termination and startup */
2219 	if (sk->sk_type == SOCK_SEQPACKET) {
2220 		if (sk->sk_state == TCP_CLOSE)
2221 			mask |= POLLHUP;
2222 		/* connection hasn't started yet? */
2223 		if (sk->sk_state == TCP_SYN_SENT)
2224 			return mask;
2225 	}
2226 
2227 	/* No write status requested, avoid expensive OUT tests. */
2228 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2229 		return mask;
2230 
2231 	writable = unix_writable(sk);
2232 	other = unix_peer_get(sk);
2233 	if (other) {
2234 		if (unix_peer(other) != sk) {
2235 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2236 			if (unix_recvq_full(other))
2237 				writable = 0;
2238 		}
2239 		sock_put(other);
2240 	}
2241 
2242 	if (writable)
2243 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2244 	else
2245 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2246 
2247 	return mask;
2248 }
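
/* The peer check above means a connected SOCK_DGRAM writer stops being
 * writable while the peer's receive queue is full. Userspace sketch:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);	// wakes via the peer's peer_wait queue
 *				// once the receiver drains its backlog
 */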
2249 
2250 #ifdef CONFIG_PROC_FS
2251 
2252 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2253 
2254 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2255 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2256 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
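
/* Encoding sketch: assuming BITS_PER_LONG == 64 and UNIX_HASH_BITS == 8,
 * BUCKET_SPACE is 64 - 9 - 1 = 54, so set_bucket_offset(3, 2) yields
 * (3 << 54) | 2, which get_bucket()/get_offset() decode back to
 * bucket 3, offset 2.
 */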
2257 
2258 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2259 {
2260 	unsigned long offset = get_offset(*pos);
2261 	unsigned long bucket = get_bucket(*pos);
2262 	struct sock *sk;
2263 	unsigned long count = 0;
2264 
2265 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2266 		if (sock_net(sk) != seq_file_net(seq))
2267 			continue;
2268 		if (++count == offset)
2269 			break;
2270 	}
2271 
2272 	return sk;
2273 }
2274 
2275 static struct sock *unix_next_socket(struct seq_file *seq,
2276 				     struct sock *sk,
2277 				     loff_t *pos)
2278 {
2279 	unsigned long bucket;
2280 
2281 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2282 		sk = sk_next(sk);
2283 		if (!sk)
2284 			goto next_bucket;
2285 		if (sock_net(sk) == seq_file_net(seq))
2286 			return sk;
2287 	}
2288 
2289 	do {
2290 		sk = unix_from_bucket(seq, pos);
2291 		if (sk)
2292 			return sk;
2293 
2294 next_bucket:
2295 		bucket = get_bucket(*pos) + 1;
2296 		*pos = set_bucket_offset(bucket, 1);
2297 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2298 
2299 	return NULL;
2300 }
2301 
2302 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2303 	__acquires(unix_table_lock)
2304 {
2305 	spin_lock(&unix_table_lock);
2306 
2307 	if (!*pos)
2308 		return SEQ_START_TOKEN;
2309 
2310 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2311 		return NULL;
2312 
2313 	return unix_next_socket(seq, NULL, pos);
2314 }
2315 
2316 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2317 {
2318 	++*pos;
2319 	return unix_next_socket(seq, v, pos);
2320 }
2321 
2322 static void unix_seq_stop(struct seq_file *seq, void *v)
2323 	__releases(unix_table_lock)
2324 {
2325 	spin_unlock(&unix_table_lock);
2326 }
2327 
2328 static int unix_seq_show(struct seq_file *seq, void *v)
2329 {
2331 	if (v == SEQ_START_TOKEN)
2332 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2333 			 "Inode Path\n");
2334 	else {
2335 		struct sock *s = v;
2336 		struct unix_sock *u = unix_sk(s);
2337 		unix_state_lock(s);
2338 
2339 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2340 			s,
2341 			atomic_read(&s->sk_refcnt),
2342 			0,
2343 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2344 			s->sk_type,
2345 			s->sk_socket ?
2346 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2347 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2348 			sock_i_ino(s));
2349 
2350 		if (u->addr) {
2351 			int i, len;
2352 			seq_putc(seq, ' ');
2353 
2354 			i = 0;
2355 			len = u->addr->len - sizeof(short);
2356 			if (!UNIX_ABSTRACT(s))
2357 				len--;
2358 			else {
2359 				seq_putc(seq, '@');
2360 				i++;
2361 			}
2362 			for ( ; i < len; i++)
2363 				seq_putc(seq, u->addr->name->sun_path[i]);
2364 		}
2365 		unix_state_unlock(s);
2366 		seq_putc(seq, '\n');
2367 	}
2368 
2369 	return 0;
2370 }
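
/* Output sketch (hypothetical values): one line per socket in
 * /proc/net/unix, e.g.
 *
 *	ffff88003a3c8000: 00000002 00000000 00010000 0001 01 20851 @/tmp/.X11-unix/X0
 *
 * where the leading '@' marks an abstract (non-filesystem) address.
 */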
2371 
2372 static const struct seq_operations unix_seq_ops = {
2373 	.start  = unix_seq_start,
2374 	.next   = unix_seq_next,
2375 	.stop   = unix_seq_stop,
2376 	.show   = unix_seq_show,
2377 };
2378 
2379 static int unix_seq_open(struct inode *inode, struct file *file)
2380 {
2381 	return seq_open_net(inode, file, &unix_seq_ops,
2382 			    sizeof(struct seq_net_private));
2383 }
2384 
2385 static const struct file_operations unix_seq_fops = {
2386 	.owner		= THIS_MODULE,
2387 	.open		= unix_seq_open,
2388 	.read		= seq_read,
2389 	.llseek		= seq_lseek,
2390 	.release	= seq_release_net,
2391 };
2392 
2393 #endif
2394 
2395 static const struct net_proto_family unix_family_ops = {
2396 	.family = PF_UNIX,
2397 	.create = unix_create,
2398 	.owner	= THIS_MODULE,
2399 };
2400 
2402 static int __net_init unix_net_init(struct net *net)
2403 {
2404 	int error = -ENOMEM;
2405 
2406 	net->unx.sysctl_max_dgram_qlen = 10;
2407 	if (unix_sysctl_register(net))
2408 		goto out;
2409 
2410 #ifdef CONFIG_PROC_FS
2411 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2412 		unix_sysctl_unregister(net);
2413 		goto out;
2414 	}
2415 #endif
2416 	error = 0;
2417 out:
2418 	return error;
2419 }
2420 
2421 static void __net_exit unix_net_exit(struct net *net)
2422 {
2423 	unix_sysctl_unregister(net);
2424 	remove_proc_entry("unix", net->proc_net);
2425 }
2426 
2427 static struct pernet_operations unix_net_ops = {
2428 	.init = unix_net_init,
2429 	.exit = unix_net_exit,
2430 };
2431 
2432 static int __init af_unix_init(void)
2433 {
2434 	int rc;
2435 
2436 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2437 
2438 	rc = proto_register(&unix_proto, 1);
2439 	if (rc != 0) {
2440 		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2441 		       __func__);
2442 		goto out;
2443 	}
2444 
2445 	sock_register(&unix_family_ops);
2446 	register_pernet_subsys(&unix_net_ops);
2447 out:
2448 	return rc;
2449 }
2450 
2451 static void __exit af_unix_exit(void)
2452 {
2453 	sock_unregister(PF_UNIX);
2454 	proto_unregister(&unix_proto);
2455 	unregister_pernet_subsys(&unix_net_ops);
2456 }
2457 
2458 /* Earlier than device_initcall() so that other drivers invoking
2459  * request_module() don't end up in a loop when modprobe tries
2460  * to use a UNIX socket. But later than subsys_initcall() because
2461  * we depend on stuff initialised there. */
2462 fs_initcall(af_unix_init);
2463 module_exit(af_unix_exit);
2464 
2465 MODULE_LICENSE("GPL");
2466 MODULE_ALIAS_NETPROTO(PF_UNIX);
2467