1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it avoids a huge amount
38  *					of hashed socks (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, nor give the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix's connect apparently forgets to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a zero byte, so this name space does not
80  *		  intersect with BSD names.
81  */
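
/*
 * For illustration only -- a minimal userspace sketch (not part of this
 * file; includes and error handling omitted) of the two binding styles
 * described above. The abstract name is distinguished purely by the
 * leading zero byte in sun_path, and its length is delimited by the
 * addrlen argument since it is not zero terminated:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	// Filesystem binding: NUL-terminated path, visible in the VFS.
 *	strcpy(a.sun_path, "/tmp/sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// Abstract binding: sun_path[0] == 0, the name is the bytes after it.
 *	a.sun_path[0] = 0;
 *	memcpy(a.sun_path + 1, "name", 4);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */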
82 
83 #include <linux/module.h>
84 #include <linux/kernel.h>
85 #include <linux/signal.h>
86 #include <linux/sched.h>
87 #include <linux/errno.h>
88 #include <linux/string.h>
89 #include <linux/stat.h>
90 #include <linux/dcache.h>
91 #include <linux/namei.h>
92 #include <linux/socket.h>
93 #include <linux/un.h>
94 #include <linux/fcntl.h>
95 #include <linux/termios.h>
96 #include <linux/sockios.h>
97 #include <linux/net.h>
98 #include <linux/in.h>
99 #include <linux/fs.h>
100 #include <linux/slab.h>
101 #include <asm/uaccess.h>
102 #include <linux/skbuff.h>
103 #include <linux/netdevice.h>
104 #include <net/net_namespace.h>
105 #include <net/sock.h>
106 #include <net/tcp_states.h>
107 #include <net/af_unix.h>
108 #include <linux/proc_fs.h>
109 #include <linux/seq_file.h>
110 #include <net/scm.h>
111 #include <linux/init.h>
112 #include <linux/poll.h>
113 #include <linux/rtnetlink.h>
114 #include <linux/mount.h>
115 #include <net/checksum.h>
116 #include <linux/security.h>
117 #include <linux/freezer.h>
118 
119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120 EXPORT_SYMBOL_GPL(unix_socket_table);
121 DEFINE_SPINLOCK(unix_table_lock);
122 EXPORT_SYMBOL_GPL(unix_table_lock);
123 static atomic_long_t unix_nr_socks;
124 
125 
126 static struct hlist_head *unix_sockets_unbound(void *addr)
127 {
128 	unsigned long hash = (unsigned long)addr;
129 
130 	hash ^= hash >> 16;
131 	hash ^= hash >> 8;
132 	hash %= UNIX_HASH_SIZE;
133 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134 }
135 
136 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137 
138 #ifdef CONFIG_SECURITY_NETWORK
139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140 {
141 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
142 }
143 
144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145 {
146 	scm->secid = *UNIXSID(skb);
147 }
148 #else
149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
150 { }
151 
152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
153 { }
154 #endif /* CONFIG_SECURITY_NETWORK */
155 
156 /*
157  *  SMP locking strategy:
158  *    the hash table is protected by the unix_table_lock spinlock;
159  *    each socket's state is protected by its own spin lock.
160  */
161 
162 static inline unsigned int unix_hash_fold(__wsum n)
163 {
164 	unsigned int hash = (__force unsigned int)n;
165 
166 	hash ^= hash>>16;
167 	hash ^= hash>>8;
168 	return hash&(UNIX_HASH_SIZE-1);
169 }
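
/*
 * A worked example of the fold (a sketch, assuming UNIX_HASH_SIZE == 256
 * as defined in af_unix.h): for n = 0x12345678 all four bytes get mixed
 * into the low-order bits before masking:
 *	0x12345678 ^ (0x12345678 >> 16) = 0x1234444c
 *	0x1234444c ^ (0x1234444c >> 8)  = 0x12267008
 *	0x12267008 & (256 - 1)          = 0x08
 */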
170 
171 #define unix_peer(sk) (unix_sk(sk)->peer)
172 
173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
174 {
175 	return unix_peer(osk) == sk;
176 }
177 
178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
179 {
180 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
181 }
182 
183 static inline int unix_recvq_full(struct sock const *sk)
184 {
185 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
186 }
187 
188 struct sock *unix_peer_get(struct sock *s)
189 {
190 	struct sock *peer;
191 
192 	unix_state_lock(s);
193 	peer = unix_peer(s);
194 	if (peer)
195 		sock_hold(peer);
196 	unix_state_unlock(s);
197 	return peer;
198 }
199 EXPORT_SYMBOL_GPL(unix_peer_get);
200 
201 static inline void unix_release_addr(struct unix_address *addr)
202 {
203 	if (atomic_dec_and_test(&addr->refcnt))
204 		kfree(addr);
205 }
206 
207 /*
208  *	Check unix socket name:
209  *		- should not be zero length.
210  *	        - if it starts with a non-zero byte, it should be NUL terminated (FS object)
211  *		- if it starts with a zero byte, it is an abstract name.
212  */
213 
214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
215 {
216 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
217 		return -EINVAL;
218 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
219 		return -EINVAL;
220 	if (sunaddr->sun_path[0]) {
221 		/*
222 		 * This may look like an off by one error but it is a bit more
223 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
224 		 * sun_path[108] doesn't as such exist.  However in kernel space
225 		 * we are guaranteed that it is a valid memory location in our
226 		 * kernel address buffer.
227 		 */
228 		((char *)sunaddr)[len] = 0;
229 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
230 		return len;
231 	}
232 
233 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
234 	return len;
235 }
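
/*
 * A worked example of the length handling above (a sketch): binding to
 * "/tmp/x" returns strlen("/tmp/x") + 1 + sizeof(short) = 6 + 1 + 2 = 9,
 * regardless of any trailing padding the caller passed, while an abstract
 * name such as "\0foo" keeps the caller-supplied len and only fills *hashp.
 */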
236 
237 static void __unix_remove_socket(struct sock *sk)
238 {
239 	sk_del_node_init(sk);
240 }
241 
242 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
243 {
244 	WARN_ON(!sk_unhashed(sk));
245 	sk_add_node(sk, list);
246 }
247 
248 static inline void unix_remove_socket(struct sock *sk)
249 {
250 	spin_lock(&unix_table_lock);
251 	__unix_remove_socket(sk);
252 	spin_unlock(&unix_table_lock);
253 }
254 
255 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
256 {
257 	spin_lock(&unix_table_lock);
258 	__unix_insert_socket(list, sk);
259 	spin_unlock(&unix_table_lock);
260 }
261 
262 static struct sock *__unix_find_socket_byname(struct net *net,
263 					      struct sockaddr_un *sunname,
264 					      int len, int type, unsigned int hash)
265 {
266 	struct sock *s;
267 
268 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
269 		struct unix_sock *u = unix_sk(s);
270 
271 		if (!net_eq(sock_net(s), net))
272 			continue;
273 
274 		if (u->addr->len == len &&
275 		    !memcmp(u->addr->name, sunname, len))
276 			goto found;
277 	}
278 	s = NULL;
279 found:
280 	return s;
281 }
282 
283 static inline struct sock *unix_find_socket_byname(struct net *net,
284 						   struct sockaddr_un *sunname,
285 						   int len, int type,
286 						   unsigned int hash)
287 {
288 	struct sock *s;
289 
290 	spin_lock(&unix_table_lock);
291 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
292 	if (s)
293 		sock_hold(s);
294 	spin_unlock(&unix_table_lock);
295 	return s;
296 }
297 
298 static struct sock *unix_find_socket_byinode(struct inode *i)
299 {
300 	struct sock *s;
301 
302 	spin_lock(&unix_table_lock);
303 	sk_for_each(s,
304 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
305 		struct dentry *dentry = unix_sk(s)->path.dentry;
306 
307 		if (dentry && dentry->d_inode == i) {
308 			sock_hold(s);
309 			goto found;
310 		}
311 	}
312 	s = NULL;
313 found:
314 	spin_unlock(&unix_table_lock);
315 	return s;
316 }
317 
318 static inline int unix_writable(struct sock *sk)
319 {
320 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
321 }
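
/*
 * I.e. the socket counts as writable while no more than a quarter of
 * sk_sndbuf is consumed: (sk_wmem_alloc * 4) <= sk_sndbuf.
 */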
322 
323 static void unix_write_space(struct sock *sk)
324 {
325 	struct socket_wq *wq;
326 
327 	rcu_read_lock();
328 	if (unix_writable(sk)) {
329 		wq = rcu_dereference(sk->sk_wq);
330 		if (wq_has_sleeper(wq))
331 			wake_up_interruptible_sync_poll(&wq->wait,
332 				POLLOUT | POLLWRNORM | POLLWRBAND);
333 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
334 	}
335 	rcu_read_unlock();
336 }
337 
338 /* When a dgram socket disconnects (or changes its peer), we clear its
339  * receive queue of packets that arrived from the previous peer. First, this
340  * allows flow control based only on wmem_alloc; second, an sk connected
341  * to a peer may receive messages only from that peer. */
342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
343 {
344 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
345 		skb_queue_purge(&sk->sk_receive_queue);
346 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
347 
348 		/* If one link of a bidirectional dgram pipe is disconnected,
349 		 * we signal an error. Messages are lost. Do not do this
350 		 * when the peer was not connected to us.
351 		 */
352 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
353 			other->sk_err = ECONNRESET;
354 			other->sk_error_report(other);
355 		}
356 	}
357 }
358 
359 static void unix_sock_destructor(struct sock *sk)
360 {
361 	struct unix_sock *u = unix_sk(sk);
362 
363 	skb_queue_purge(&sk->sk_receive_queue);
364 
365 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
366 	WARN_ON(!sk_unhashed(sk));
367 	WARN_ON(sk->sk_socket);
368 	if (!sock_flag(sk, SOCK_DEAD)) {
369 		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
370 		return;
371 	}
372 
373 	if (u->addr)
374 		unix_release_addr(u->addr);
375 
376 	atomic_long_dec(&unix_nr_socks);
377 	local_bh_disable();
378 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
379 	local_bh_enable();
380 #ifdef UNIX_REFCNT_DEBUG
381 	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
382 		atomic_long_read(&unix_nr_socks));
383 #endif
384 }
385 
386 static void unix_release_sock(struct sock *sk, int embrion)
387 {
388 	struct unix_sock *u = unix_sk(sk);
389 	struct path path;
390 	struct sock *skpair;
391 	struct sk_buff *skb;
392 	int state;
393 
394 	unix_remove_socket(sk);
395 
396 	/* Clear state */
397 	unix_state_lock(sk);
398 	sock_orphan(sk);
399 	sk->sk_shutdown = SHUTDOWN_MASK;
400 	path	     = u->path;
401 	u->path.dentry = NULL;
402 	u->path.mnt = NULL;
403 	state = sk->sk_state;
404 	sk->sk_state = TCP_CLOSE;
405 	unix_state_unlock(sk);
406 
407 	wake_up_interruptible_all(&u->peer_wait);
408 
409 	skpair = unix_peer(sk);
410 
411 	if (skpair != NULL) {
412 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
413 			unix_state_lock(skpair);
414 			/* No more writes */
415 			skpair->sk_shutdown = SHUTDOWN_MASK;
416 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
417 				skpair->sk_err = ECONNRESET;
418 			unix_state_unlock(skpair);
419 			skpair->sk_state_change(skpair);
420 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
421 		}
422 		sock_put(skpair); /* It may now die */
423 		unix_peer(sk) = NULL;
424 	}
425 
426 	/* Try to flush out this socket. Throw out buffers at least */
427 
428 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
429 		if (state == TCP_LISTEN)
430 			unix_release_sock(skb->sk, 1);
431 		/* passed fds are erased in the kfree_skb hook	      */
432 		kfree_skb(skb);
433 	}
434 
435 	if (path.dentry)
436 		path_put(&path);
437 
438 	sock_put(sk);
439 
440 	/* ---- Socket is dead now and most probably destroyed ---- */
441 
442 	/*
443 	 * Fixme: BSD difference: In BSD all sockets connected to us get
444 	 *	  ECONNRESET and we die on the spot. In Linux we behave
445 	 *	  like files and pipes do and wait for the last
446 	 *	  dereference.
447 	 *
448 	 * Can't we simply set sock->err?
449 	 *
450 	 *	  What does the above comment talk about? --ANK(980817)
451 	 */
452 
453 	if (unix_tot_inflight)
454 		unix_gc();		/* Garbage collect fds */
455 }
456 
457 static void init_peercred(struct sock *sk)
458 {
459 	put_pid(sk->sk_peer_pid);
460 	if (sk->sk_peer_cred)
461 		put_cred(sk->sk_peer_cred);
462 	sk->sk_peer_pid  = get_pid(task_tgid(current));
463 	sk->sk_peer_cred = get_current_cred();
464 }
465 
466 static void copy_peercred(struct sock *sk, struct sock *peersk)
467 {
468 	put_pid(sk->sk_peer_pid);
469 	if (sk->sk_peer_cred)
470 		put_cred(sk->sk_peer_cred);
471 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
472 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
473 }
474 
475 static int unix_listen(struct socket *sock, int backlog)
476 {
477 	int err;
478 	struct sock *sk = sock->sk;
479 	struct unix_sock *u = unix_sk(sk);
480 	struct pid *old_pid = NULL;
481 
482 	err = -EOPNOTSUPP;
483 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
484 		goto out;	/* Only stream/seqpacket sockets accept */
485 	err = -EINVAL;
486 	if (!u->addr)
487 		goto out;	/* No listens on an unbound socket */
488 	unix_state_lock(sk);
489 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
490 		goto out_unlock;
491 	if (backlog > sk->sk_max_ack_backlog)
492 		wake_up_interruptible_all(&u->peer_wait);
493 	sk->sk_max_ack_backlog	= backlog;
494 	sk->sk_state		= TCP_LISTEN;
495 	/* set credentials so connect can copy them */
496 	init_peercred(sk);
497 	err = 0;
498 
499 out_unlock:
500 	unix_state_unlock(sk);
501 	put_pid(old_pid);
502 out:
503 	return err;
504 }
505 
506 static int unix_release(struct socket *);
507 static int unix_bind(struct socket *, struct sockaddr *, int);
508 static int unix_stream_connect(struct socket *, struct sockaddr *,
509 			       int addr_len, int flags);
510 static int unix_socketpair(struct socket *, struct socket *);
511 static int unix_accept(struct socket *, struct socket *, int);
512 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
513 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
514 static unsigned int unix_dgram_poll(struct file *, struct socket *,
515 				    poll_table *);
516 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
517 static int unix_shutdown(struct socket *, int);
518 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
519 			       struct msghdr *, size_t);
520 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
521 			       struct msghdr *, size_t, int);
522 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
523 			      struct msghdr *, size_t);
524 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
525 			      struct msghdr *, size_t, int);
526 static int unix_dgram_connect(struct socket *, struct sockaddr *,
527 			      int, int);
528 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
529 				  struct msghdr *, size_t);
530 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
531 				  struct msghdr *, size_t, int);
532 
533 static void unix_set_peek_off(struct sock *sk, int val)
534 {
535 	struct unix_sock *u = unix_sk(sk);
536 
537 	mutex_lock(&u->readlock);
538 	sk->sk_peek_off = val;
539 	mutex_unlock(&u->readlock);
540 }
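
/*
 * Userspace reaches this via SO_PEEK_OFF; a minimal sketch of the
 * resulting MSG_PEEK behaviour (values illustrative):
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15, offset -> 16
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31, offset -> 32
 */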
541 
542 
543 static const struct proto_ops unix_stream_ops = {
544 	.family =	PF_UNIX,
545 	.owner =	THIS_MODULE,
546 	.release =	unix_release,
547 	.bind =		unix_bind,
548 	.connect =	unix_stream_connect,
549 	.socketpair =	unix_socketpair,
550 	.accept =	unix_accept,
551 	.getname =	unix_getname,
552 	.poll =		unix_poll,
553 	.ioctl =	unix_ioctl,
554 	.listen =	unix_listen,
555 	.shutdown =	unix_shutdown,
556 	.setsockopt =	sock_no_setsockopt,
557 	.getsockopt =	sock_no_getsockopt,
558 	.sendmsg =	unix_stream_sendmsg,
559 	.recvmsg =	unix_stream_recvmsg,
560 	.mmap =		sock_no_mmap,
561 	.sendpage =	sock_no_sendpage,
562 	.set_peek_off =	unix_set_peek_off,
563 };
564 
565 static const struct proto_ops unix_dgram_ops = {
566 	.family =	PF_UNIX,
567 	.owner =	THIS_MODULE,
568 	.release =	unix_release,
569 	.bind =		unix_bind,
570 	.connect =	unix_dgram_connect,
571 	.socketpair =	unix_socketpair,
572 	.accept =	sock_no_accept,
573 	.getname =	unix_getname,
574 	.poll =		unix_dgram_poll,
575 	.ioctl =	unix_ioctl,
576 	.listen =	sock_no_listen,
577 	.shutdown =	unix_shutdown,
578 	.setsockopt =	sock_no_setsockopt,
579 	.getsockopt =	sock_no_getsockopt,
580 	.sendmsg =	unix_dgram_sendmsg,
581 	.recvmsg =	unix_dgram_recvmsg,
582 	.mmap =		sock_no_mmap,
583 	.sendpage =	sock_no_sendpage,
584 	.set_peek_off =	unix_set_peek_off,
585 };
586 
587 static const struct proto_ops unix_seqpacket_ops = {
588 	.family =	PF_UNIX,
589 	.owner =	THIS_MODULE,
590 	.release =	unix_release,
591 	.bind =		unix_bind,
592 	.connect =	unix_stream_connect,
593 	.socketpair =	unix_socketpair,
594 	.accept =	unix_accept,
595 	.getname =	unix_getname,
596 	.poll =		unix_dgram_poll,
597 	.ioctl =	unix_ioctl,
598 	.listen =	unix_listen,
599 	.shutdown =	unix_shutdown,
600 	.setsockopt =	sock_no_setsockopt,
601 	.getsockopt =	sock_no_getsockopt,
602 	.sendmsg =	unix_seqpacket_sendmsg,
603 	.recvmsg =	unix_seqpacket_recvmsg,
604 	.mmap =		sock_no_mmap,
605 	.sendpage =	sock_no_sendpage,
606 	.set_peek_off =	unix_set_peek_off,
607 };
608 
609 static struct proto unix_proto = {
610 	.name			= "UNIX",
611 	.owner			= THIS_MODULE,
612 	.obj_size		= sizeof(struct unix_sock),
613 };
614 
615 /*
616  * AF_UNIX sockets do not interact with hardware, hence they
617  * don't trigger interrupts - so it's safe for them to have
618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
619  * this special lock-class by reinitializing the spinlock key:
620  */
621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622 
623 static struct sock *unix_create1(struct net *net, struct socket *sock)
624 {
625 	struct sock *sk = NULL;
626 	struct unix_sock *u;
627 
628 	atomic_long_inc(&unix_nr_socks);
629 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630 		goto out;
631 
632 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633 	if (!sk)
634 		goto out;
635 
636 	sock_init_data(sock, sk);
637 	lockdep_set_class(&sk->sk_receive_queue.lock,
638 				&af_unix_sk_receive_queue_lock_key);
639 
640 	sk->sk_write_space	= unix_write_space;
641 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642 	sk->sk_destruct		= unix_sock_destructor;
643 	u	  = unix_sk(sk);
644 	u->path.dentry = NULL;
645 	u->path.mnt = NULL;
646 	spin_lock_init(&u->lock);
647 	atomic_long_set(&u->inflight, 0);
648 	INIT_LIST_HEAD(&u->link);
649 	mutex_init(&u->readlock); /* single task reading lock */
650 	init_waitqueue_head(&u->peer_wait);
651 	unix_insert_socket(unix_sockets_unbound(sk), sk);
652 out:
653 	if (sk == NULL)
654 		atomic_long_dec(&unix_nr_socks);
655 	else {
656 		local_bh_disable();
657 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658 		local_bh_enable();
659 	}
660 	return sk;
661 }
662 
663 static int unix_create(struct net *net, struct socket *sock, int protocol,
664 		       int kern)
665 {
666 	if (protocol && protocol != PF_UNIX)
667 		return -EPROTONOSUPPORT;
668 
669 	sock->state = SS_UNCONNECTED;
670 
671 	switch (sock->type) {
672 	case SOCK_STREAM:
673 		sock->ops = &unix_stream_ops;
674 		break;
675 		/*
676 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW though
677 		 *	nothing uses it.
678 		 */
679 	case SOCK_RAW:
680 		sock->type = SOCK_DGRAM;
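		/* fall through */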
681 	case SOCK_DGRAM:
682 		sock->ops = &unix_dgram_ops;
683 		break;
684 	case SOCK_SEQPACKET:
685 		sock->ops = &unix_seqpacket_ops;
686 		break;
687 	default:
688 		return -ESOCKTNOSUPPORT;
689 	}
690 
691 	return unix_create1(net, sock) ? 0 : -ENOMEM;
692 }
693 
694 static int unix_release(struct socket *sock)
695 {
696 	struct sock *sk = sock->sk;
697 
698 	if (!sk)
699 		return 0;
700 
701 	unix_release_sock(sk, 0);
702 	sock->sk = NULL;
703 
704 	return 0;
705 }
706 
707 static int unix_autobind(struct socket *sock)
708 {
709 	struct sock *sk = sock->sk;
710 	struct net *net = sock_net(sk);
711 	struct unix_sock *u = unix_sk(sk);
712 	static u32 ordernum = 1;
713 	struct unix_address *addr;
714 	int err;
715 	unsigned int retries = 0;
716 
717 	mutex_lock(&u->readlock);
718 
719 	err = 0;
720 	if (u->addr)
721 		goto out;
722 
723 	err = -ENOMEM;
724 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
725 	if (!addr)
726 		goto out;
727 
728 	addr->name->sun_family = AF_UNIX;
729 	atomic_set(&addr->refcnt, 1);
730 
731 retry:
732 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
733 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
734 
735 	spin_lock(&unix_table_lock);
736 	ordernum = (ordernum+1)&0xFFFFF;
737 
738 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
739 				      addr->hash)) {
740 		spin_unlock(&unix_table_lock);
741 		/*
742 		 * __unix_find_socket_byname() may take a long time if many names
743 		 * are already in use.
744 		 */
745 		cond_resched();
746 		/* Give up if all names seem to be in use. */
747 		if (retries++ == 0xFFFFF) {
748 			err = -ENOSPC;
749 			kfree(addr);
750 			goto out;
751 		}
752 		goto retry;
753 	}
754 	addr->hash ^= sk->sk_type;
755 
756 	__unix_remove_socket(sk);
757 	u->addr = addr;
758 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
759 	spin_unlock(&unix_table_lock);
760 	err = 0;
761 
762 out:	mutex_unlock(&u->readlock);
763 	return err;
764 }
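
/*
 * How autobind looks from userspace (a sketch): sending or connecting on
 * an unbound socket with SOCK_PASSCRED set, or calling bind() with only
 * the address family, assigns an abstract name of five hex digits, so a
 * subsequent getsockname() reports an address of length 8: the two family
 * bytes, the leading zero byte, and a name such as "00a1b".
 */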
765 
766 static struct sock *unix_find_other(struct net *net,
767 				    struct sockaddr_un *sunname, int len,
768 				    int type, unsigned int hash, int *error)
769 {
770 	struct sock *u;
771 	struct path path;
772 	int err = 0;
773 
774 	if (sunname->sun_path[0]) {
775 		struct inode *inode;
776 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
777 		if (err)
778 			goto fail;
779 		inode = path.dentry->d_inode;
780 		err = inode_permission(inode, MAY_WRITE);
781 		if (err)
782 			goto put_fail;
783 
784 		err = -ECONNREFUSED;
785 		if (!S_ISSOCK(inode->i_mode))
786 			goto put_fail;
787 		u = unix_find_socket_byinode(inode);
788 		if (!u)
789 			goto put_fail;
790 
791 		if (u->sk_type == type)
792 			touch_atime(&path);
793 
794 		path_put(&path);
795 
796 		err = -EPROTOTYPE;
797 		if (u->sk_type != type) {
798 			sock_put(u);
799 			goto fail;
800 		}
801 	} else {
802 		err = -ECONNREFUSED;
803 		u = unix_find_socket_byname(net, sunname, len, type, hash);
804 		if (u) {
805 			struct dentry *dentry;
806 			dentry = unix_sk(u)->path.dentry;
807 			if (dentry)
808 				touch_atime(&unix_sk(u)->path);
809 		} else
810 			goto fail;
811 	}
812 	return u;
813 
814 put_fail:
815 	path_put(&path);
816 fail:
817 	*error = err;
818 	return NULL;
819 }
820 
821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
822 {
823 	struct dentry *dentry;
824 	struct path path;
825 	int err = 0;
826 	/*
827 	 * Get the parent directory, calculate the hash for last
828 	 * component.
829 	 */
830 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
831 	err = PTR_ERR(dentry);
832 	if (IS_ERR(dentry))
833 		return err;
834 
835 	/*
836 	 * All right, let's create it.
837 	 */
838 	err = security_path_mknod(&path, dentry, mode, 0);
839 	if (!err) {
840 		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
841 		if (!err) {
842 			res->mnt = mntget(path.mnt);
843 			res->dentry = dget(dentry);
844 		}
845 	}
846 	done_path_create(&path, dentry);
847 	return err;
848 }
849 
850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
851 {
852 	struct sock *sk = sock->sk;
853 	struct net *net = sock_net(sk);
854 	struct unix_sock *u = unix_sk(sk);
855 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
856 	char *sun_path = sunaddr->sun_path;
857 	int err;
858 	unsigned int hash;
859 	struct unix_address *addr;
860 	struct hlist_head *list;
861 
862 	err = -EINVAL;
863 	if (sunaddr->sun_family != AF_UNIX)
864 		goto out;
865 
866 	if (addr_len == sizeof(short)) {
867 		err = unix_autobind(sock);
868 		goto out;
869 	}
870 
871 	err = unix_mkname(sunaddr, addr_len, &hash);
872 	if (err < 0)
873 		goto out;
874 	addr_len = err;
875 
876 	mutex_lock(&u->readlock);
877 
878 	err = -EINVAL;
879 	if (u->addr)
880 		goto out_up;
881 
882 	err = -ENOMEM;
883 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
884 	if (!addr)
885 		goto out_up;
886 
887 	memcpy(addr->name, sunaddr, addr_len);
888 	addr->len = addr_len;
889 	addr->hash = hash ^ sk->sk_type;
890 	atomic_set(&addr->refcnt, 1);
891 
892 	if (sun_path[0]) {
893 		struct path path;
894 		umode_t mode = S_IFSOCK |
895 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
896 		err = unix_mknod(sun_path, mode, &path);
897 		if (err) {
898 			if (err == -EEXIST)
899 				err = -EADDRINUSE;
900 			unix_release_addr(addr);
901 			goto out_up;
902 		}
903 		addr->hash = UNIX_HASH_SIZE;
904 		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
905 		spin_lock(&unix_table_lock);
906 		u->path = path;
907 		list = &unix_socket_table[hash];
908 	} else {
909 		spin_lock(&unix_table_lock);
910 		err = -EADDRINUSE;
911 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
912 					      sk->sk_type, hash)) {
913 			unix_release_addr(addr);
914 			goto out_unlock;
915 		}
916 
917 		list = &unix_socket_table[addr->hash];
918 	}
919 
920 	err = 0;
921 	__unix_remove_socket(sk);
922 	u->addr = addr;
923 	__unix_insert_socket(list, sk);
924 
925 out_unlock:
926 	spin_unlock(&unix_table_lock);
927 out_up:
928 	mutex_unlock(&u->readlock);
929 out:
930 	return err;
931 }
932 
933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
934 {
935 	if (unlikely(sk1 == sk2) || !sk2) {
936 		unix_state_lock(sk1);
937 		return;
938 	}
939 	if (sk1 < sk2) {
940 		unix_state_lock(sk1);
941 		unix_state_lock_nested(sk2);
942 	} else {
943 		unix_state_lock(sk2);
944 		unix_state_lock_nested(sk1);
945 	}
946 }
947 
948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
949 {
950 	if (unlikely(sk1 == sk2) || !sk2) {
951 		unix_state_unlock(sk1);
952 		return;
953 	}
954 	unix_state_unlock(sk1);
955 	unix_state_unlock(sk2);
956 }
957 
958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
959 			      int alen, int flags)
960 {
961 	struct sock *sk = sock->sk;
962 	struct net *net = sock_net(sk);
963 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
964 	struct sock *other;
965 	unsigned int hash;
966 	int err;
967 
968 	if (addr->sa_family != AF_UNSPEC) {
969 		err = unix_mkname(sunaddr, alen, &hash);
970 		if (err < 0)
971 			goto out;
972 		alen = err;
973 
974 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
975 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
976 			goto out;
977 
978 restart:
979 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
980 		if (!other)
981 			goto out;
982 
983 		unix_state_double_lock(sk, other);
984 
985 		/* Apparently VFS overslept socket death. Retry. */
986 		if (sock_flag(other, SOCK_DEAD)) {
987 			unix_state_double_unlock(sk, other);
988 			sock_put(other);
989 			goto restart;
990 		}
991 
992 		err = -EPERM;
993 		if (!unix_may_send(sk, other))
994 			goto out_unlock;
995 
996 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
997 		if (err)
998 			goto out_unlock;
999 
1000 	} else {
1001 		/*
1002 		 *	1003.1g breaking connected state with AF_UNSPEC
1003 		 */
1004 		other = NULL;
1005 		unix_state_double_lock(sk, other);
1006 	}
1007 
1008 	/*
1009 	 * If it was connected, reconnect.
1010 	 */
1011 	if (unix_peer(sk)) {
1012 		struct sock *old_peer = unix_peer(sk);
1013 		unix_peer(sk) = other;
1014 		unix_state_double_unlock(sk, other);
1015 
1016 		if (other != old_peer)
1017 			unix_dgram_disconnected(sk, old_peer);
1018 		sock_put(old_peer);
1019 	} else {
1020 		unix_peer(sk) = other;
1021 		unix_state_double_unlock(sk, other);
1022 	}
1023 	return 0;
1024 
1025 out_unlock:
1026 	unix_state_double_unlock(sk, other);
1027 	sock_put(other);
1028 out:
1029 	return err;
1030 }
1031 
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034 	struct unix_sock *u = unix_sk(other);
1035 	int sched;
1036 	DEFINE_WAIT(wait);
1037 
1038 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039 
1040 	sched = !sock_flag(other, SOCK_DEAD) &&
1041 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1042 		unix_recvq_full(other);
1043 
1044 	unix_state_unlock(other);
1045 
1046 	if (sched)
1047 		timeo = schedule_timeout(timeo);
1048 
1049 	finish_wait(&u->peer_wait, &wait);
1050 	return timeo;
1051 }
1052 
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054 			       int addr_len, int flags)
1055 {
1056 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057 	struct sock *sk = sock->sk;
1058 	struct net *net = sock_net(sk);
1059 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060 	struct sock *newsk = NULL;
1061 	struct sock *other = NULL;
1062 	struct sk_buff *skb = NULL;
1063 	unsigned int hash;
1064 	int st;
1065 	int err;
1066 	long timeo;
1067 
1068 	err = unix_mkname(sunaddr, addr_len, &hash);
1069 	if (err < 0)
1070 		goto out;
1071 	addr_len = err;
1072 
1073 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074 	    (err = unix_autobind(sock)) != 0)
1075 		goto out;
1076 
1077 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078 
1079 	/* First of all, allocate resources.
1080 	   If we did this after the state was locked,
1081 	   we would have to recheck everything again in any case.
1082 	 */
1083 
1084 	err = -ENOMEM;
1085 
1086 	/* create new sock for complete connection */
1087 	newsk = unix_create1(sock_net(sk), NULL);
1088 	if (newsk == NULL)
1089 		goto out;
1090 
1091 	/* Allocate skb for sending to listening sock */
1092 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093 	if (skb == NULL)
1094 		goto out;
1095 
1096 restart:
1097 	/*  Find listening sock. */
1098 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099 	if (!other)
1100 		goto out;
1101 
1102 	/* Latch state of peer */
1103 	unix_state_lock(other);
1104 
1105 	/* Apparently VFS overslept socket death. Retry. */
1106 	if (sock_flag(other, SOCK_DEAD)) {
1107 		unix_state_unlock(other);
1108 		sock_put(other);
1109 		goto restart;
1110 	}
1111 
1112 	err = -ECONNREFUSED;
1113 	if (other->sk_state != TCP_LISTEN)
1114 		goto out_unlock;
1115 	if (other->sk_shutdown & RCV_SHUTDOWN)
1116 		goto out_unlock;
1117 
1118 	if (unix_recvq_full(other)) {
1119 		err = -EAGAIN;
1120 		if (!timeo)
1121 			goto out_unlock;
1122 
1123 		timeo = unix_wait_for_peer(other, timeo);
1124 
1125 		err = sock_intr_errno(timeo);
1126 		if (signal_pending(current))
1127 			goto out;
1128 		sock_put(other);
1129 		goto restart;
1130 	}
1131 
1132 	/* Latch our state.
1133 
1134 	   This is a tricky place. We need to grab our state lock and cannot
1135 	   drop the lock on the peer. It is dangerous because a deadlock is
1136 	   possible. The connect-to-self case and a simultaneous
1137 	   attempt to connect are eliminated by checking the socket
1138 	   state: other is TCP_LISTEN, and if sk were TCP_LISTEN we
1139 	   would have caught that before attempting to grab the lock.
1140 
1141 	   Well, and we have to recheck the state after the socket is locked.
1142 	 */
1143 	st = sk->sk_state;
1144 
1145 	switch (st) {
1146 	case TCP_CLOSE:
1147 		/* This is ok... continue with connect */
1148 		break;
1149 	case TCP_ESTABLISHED:
1150 		/* Socket is already connected */
1151 		err = -EISCONN;
1152 		goto out_unlock;
1153 	default:
1154 		err = -EINVAL;
1155 		goto out_unlock;
1156 	}
1157 
1158 	unix_state_lock_nested(sk);
1159 
1160 	if (sk->sk_state != st) {
1161 		unix_state_unlock(sk);
1162 		unix_state_unlock(other);
1163 		sock_put(other);
1164 		goto restart;
1165 	}
1166 
1167 	err = security_unix_stream_connect(sk, other, newsk);
1168 	if (err) {
1169 		unix_state_unlock(sk);
1170 		goto out_unlock;
1171 	}
1172 
1173 	/* The way is open! Quickly set all the necessary fields... */
1174 
1175 	sock_hold(sk);
1176 	unix_peer(newsk)	= sk;
1177 	newsk->sk_state		= TCP_ESTABLISHED;
1178 	newsk->sk_type		= sk->sk_type;
1179 	init_peercred(newsk);
1180 	newu = unix_sk(newsk);
1181 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182 	otheru = unix_sk(other);
1183 
1184 	/* copy address information from listening to new sock */
1185 	if (otheru->addr) {
1186 		atomic_inc(&otheru->addr->refcnt);
1187 		newu->addr = otheru->addr;
1188 	}
1189 	if (otheru->path.dentry) {
1190 		path_get(&otheru->path);
1191 		newu->path = otheru->path;
1192 	}
1193 
1194 	/* Set credentials */
1195 	copy_peercred(sk, other);
1196 
1197 	sock->state	= SS_CONNECTED;
1198 	sk->sk_state	= TCP_ESTABLISHED;
1199 	sock_hold(newsk);
1200 
1201 	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1202 	unix_peer(sk)	= newsk;
1203 
1204 	unix_state_unlock(sk);
1205 
1206 	/* take the skb and send info to the listening sock */
1207 	spin_lock(&other->sk_receive_queue.lock);
1208 	__skb_queue_tail(&other->sk_receive_queue, skb);
1209 	spin_unlock(&other->sk_receive_queue.lock);
1210 	unix_state_unlock(other);
1211 	other->sk_data_ready(other, 0);
1212 	sock_put(other);
1213 	return 0;
1214 
1215 out_unlock:
1216 	if (other)
1217 		unix_state_unlock(other);
1218 
1219 out:
1220 	kfree_skb(skb);
1221 	if (newsk)
1222 		unix_release_sock(newsk, 0);
1223 	if (other)
1224 		sock_put(other);
1225 	return err;
1226 }
1227 
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230 	struct sock *ska = socka->sk, *skb = sockb->sk;
1231 
1232 	/* Join our sockets back to back */
1233 	sock_hold(ska);
1234 	sock_hold(skb);
1235 	unix_peer(ska) = skb;
1236 	unix_peer(skb) = ska;
1237 	init_peercred(ska);
1238 	init_peercred(skb);
1239 
1240 	if (ska->sk_type != SOCK_DGRAM) {
1241 		ska->sk_state = TCP_ESTABLISHED;
1242 		skb->sk_state = TCP_ESTABLISHED;
1243 		socka->state  = SS_CONNECTED;
1244 		sockb->state  = SS_CONNECTED;
1245 	}
1246 	return 0;
1247 }
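
/*
 * The classic userspace counterpart (a sketch):
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "ping", 4);
 *	read(sv[1], buf, 4);	// receives "ping"; SO_PEERCRED works on
 *				// both ends thanks to init_peercred() above
 */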
1248 
1249 static void unix_sock_inherit_flags(const struct socket *old,
1250 				    struct socket *new)
1251 {
1252 	if (test_bit(SOCK_PASSCRED, &old->flags))
1253 		set_bit(SOCK_PASSCRED, &new->flags);
1254 	if (test_bit(SOCK_PASSSEC, &old->flags))
1255 		set_bit(SOCK_PASSSEC, &new->flags);
1256 }
1257 
1258 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1259 {
1260 	struct sock *sk = sock->sk;
1261 	struct sock *tsk;
1262 	struct sk_buff *skb;
1263 	int err;
1264 
1265 	err = -EOPNOTSUPP;
1266 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1267 		goto out;
1268 
1269 	err = -EINVAL;
1270 	if (sk->sk_state != TCP_LISTEN)
1271 		goto out;
1272 
1273 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1274 	 * so that no locks are necessary.
1275 	 */
1276 
1277 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1278 	if (!skb) {
1279 		/* This means receive shutdown. */
1280 		if (err == 0)
1281 			err = -EINVAL;
1282 		goto out;
1283 	}
1284 
1285 	tsk = skb->sk;
1286 	skb_free_datagram(sk, skb);
1287 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1288 
1289 	/* attach accepted sock to socket */
1290 	unix_state_lock(tsk);
1291 	newsock->state = SS_CONNECTED;
1292 	unix_sock_inherit_flags(sock, newsock);
1293 	sock_graft(tsk, newsock);
1294 	unix_state_unlock(tsk);
1295 	return 0;
1296 
1297 out:
1298 	return err;
1299 }
1300 
1301 
1302 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1303 {
1304 	struct sock *sk = sock->sk;
1305 	struct unix_sock *u;
1306 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1307 	int err = 0;
1308 
1309 	if (peer) {
1310 		sk = unix_peer_get(sk);
1311 
1312 		err = -ENOTCONN;
1313 		if (!sk)
1314 			goto out;
1315 		err = 0;
1316 	} else {
1317 		sock_hold(sk);
1318 	}
1319 
1320 	u = unix_sk(sk);
1321 	unix_state_lock(sk);
1322 	if (!u->addr) {
1323 		sunaddr->sun_family = AF_UNIX;
1324 		sunaddr->sun_path[0] = 0;
1325 		*uaddr_len = sizeof(short);
1326 	} else {
1327 		struct unix_address *addr = u->addr;
1328 
1329 		*uaddr_len = addr->len;
1330 		memcpy(sunaddr, addr->name, *uaddr_len);
1331 	}
1332 	unix_state_unlock(sk);
1333 	sock_put(sk);
1334 out:
1335 	return err;
1336 }
1337 
1338 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1339 {
1340 	int i;
1341 
1342 	scm->fp = UNIXCB(skb).fp;
1343 	UNIXCB(skb).fp = NULL;
1344 
1345 	for (i = scm->fp->count-1; i >= 0; i--)
1346 		unix_notinflight(scm->fp->fp[i]);
1347 }
1348 
1349 static void unix_destruct_scm(struct sk_buff *skb)
1350 {
1351 	struct scm_cookie scm;
1352 	memset(&scm, 0, sizeof(scm));
1353 	scm.pid  = UNIXCB(skb).pid;
1354 	if (UNIXCB(skb).fp)
1355 		unix_detach_fds(&scm, skb);
1356 
1357 	/* Alas, it calls VFS */
1358 	/* So fscking what? fput() has been SMP-safe since last summer */
1359 	scm_destroy(&scm);
1360 	sock_wfree(skb);
1361 }
1362 
1363 #define MAX_RECURSION_LEVEL 4
1364 
1365 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1366 {
1367 	int i;
1368 	unsigned char max_level = 0;
1369 	int unix_sock_count = 0;
1370 
1371 	for (i = scm->fp->count - 1; i >= 0; i--) {
1372 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1373 
1374 		if (sk) {
1375 			unix_sock_count++;
1376 			max_level = max(max_level,
1377 					unix_sk(sk)->recursion_level);
1378 		}
1379 	}
1380 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1381 		return -ETOOMANYREFS;
1382 
1383 	/*
1384 	 * Need to duplicate file references for the sake of garbage
1385 	 * collection.  Otherwise a socket in the fps might become a
1386 	 * candidate for GC while the skb is not yet queued.
1387 	 */
1388 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1389 	if (!UNIXCB(skb).fp)
1390 		return -ENOMEM;
1391 
1392 	if (unix_sock_count) {
1393 		for (i = scm->fp->count - 1; i >= 0; i--)
1394 			unix_inflight(scm->fp->fp[i]);
1395 	}
1396 	return max_level;
1397 }
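
/*
 * For reference, the userspace side that lands here is SCM_RIGHTS fd
 * passing; a minimal sketch (fd_to_pass and the one-byte iov are
 * illustrative -- at least one byte of data must accompany the fds):
 *
 *	char dummy = 0;
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&mh);
 *
 *	c->cmsg_level = SOL_SOCKET;
 *	c->cmsg_type  = SCM_RIGHTS;
 *	c->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(c), &fd_to_pass, sizeof(int));
 *	sendmsg(sock, &mh, 0);
 */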
1398 
1399 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1400 {
1401 	int err = 0;
1402 
1403 	UNIXCB(skb).pid  = get_pid(scm->pid);
1404 	UNIXCB(skb).uid = scm->creds.uid;
1405 	UNIXCB(skb).gid = scm->creds.gid;
1406 	UNIXCB(skb).fp = NULL;
1407 	if (scm->fp && send_fds)
1408 		err = unix_attach_fds(scm, skb);
1409 
1410 	skb->destructor = unix_destruct_scm;
1411 	return err;
1412 }
1413 
1414 /*
1415  * Some apps rely on write() giving SCM_CREDENTIALS.
1416  * We include credentials if the source or destination socket
1417  * asserted SOCK_PASSCRED.
1418  */
1419 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1420 			    const struct sock *other)
1421 {
1422 	if (UNIXCB(skb).pid)
1423 		return;
1424 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1425 	    !other->sk_socket ||
1426 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1427 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1428 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1429 	}
1430 }
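
/*
 * The credentials attached here surface in userspace roughly as follows
 * (a sketch, assuming SO_PASSCRED was enabled on the receiving socket):
 *
 *	struct cmsghdr *c;
 *
 *	recvmsg(fd, &mh, 0);
 *	for (c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c))
 *		if (c->cmsg_level == SOL_SOCKET &&
 *		    c->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(c);
 *			// uc->pid, uc->uid, uc->gid identify the writer
 *		}
 */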
1431 
1432 /*
1433  *	Send AF_UNIX data.
1434  */
1435 
1436 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1437 			      struct msghdr *msg, size_t len)
1438 {
1439 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1440 	struct sock *sk = sock->sk;
1441 	struct net *net = sock_net(sk);
1442 	struct unix_sock *u = unix_sk(sk);
1443 	struct sockaddr_un *sunaddr = msg->msg_name;
1444 	struct sock *other = NULL;
1445 	int namelen = 0; /* quiet an uninitialized-use warning from GCC */
1446 	int err;
1447 	unsigned int hash;
1448 	struct sk_buff *skb;
1449 	long timeo;
1450 	struct scm_cookie tmp_scm;
1451 	int max_level;
1452 	int data_len = 0;
1453 
1454 	if (NULL == siocb->scm)
1455 		siocb->scm = &tmp_scm;
1456 	wait_for_unix_gc();
1457 	err = scm_send(sock, msg, siocb->scm, false);
1458 	if (err < 0)
1459 		return err;
1460 
1461 	err = -EOPNOTSUPP;
1462 	if (msg->msg_flags&MSG_OOB)
1463 		goto out;
1464 
1465 	if (msg->msg_namelen) {
1466 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1467 		if (err < 0)
1468 			goto out;
1469 		namelen = err;
1470 	} else {
1471 		sunaddr = NULL;
1472 		err = -ENOTCONN;
1473 		other = unix_peer_get(sk);
1474 		if (!other)
1475 			goto out;
1476 	}
1477 
1478 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1479 	    && (err = unix_autobind(sock)) != 0)
1480 		goto out;
1481 
1482 	err = -EMSGSIZE;
1483 	if (len > sk->sk_sndbuf - 32)
1484 		goto out;
1485 
1486 	if (len > SKB_MAX_ALLOC)
1487 		data_len = min_t(size_t,
1488 				 len - SKB_MAX_ALLOC,
1489 				 MAX_SKB_FRAGS * PAGE_SIZE);
1490 
1491 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1492 				   msg->msg_flags & MSG_DONTWAIT, &err,
1493 				   PAGE_ALLOC_COSTLY_ORDER);
1494 	if (skb == NULL)
1495 		goto out;
1496 
1497 	err = unix_scm_to_skb(siocb->scm, skb, true);
1498 	if (err < 0)
1499 		goto out_free;
1500 	max_level = err + 1;
1501 	unix_get_secdata(siocb->scm, skb);
1502 
1503 	skb_put(skb, len - data_len);
1504 	skb->data_len = data_len;
1505 	skb->len = len;
1506 	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1507 	if (err)
1508 		goto out_free;
1509 
1510 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1511 
1512 restart:
1513 	if (!other) {
1514 		err = -ECONNRESET;
1515 		if (sunaddr == NULL)
1516 			goto out_free;
1517 
1518 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1519 					hash, &err);
1520 		if (other == NULL)
1521 			goto out_free;
1522 	}
1523 
1524 	if (sk_filter(other, skb) < 0) {
1525 		/* Toss the packet but do not return any error to the sender */
1526 		err = len;
1527 		goto out_free;
1528 	}
1529 
1530 	unix_state_lock(other);
1531 	err = -EPERM;
1532 	if (!unix_may_send(sk, other))
1533 		goto out_unlock;
1534 
1535 	if (sock_flag(other, SOCK_DEAD)) {
1536 		/*
1537 		 *	Check with 1003.1g - what should
1538 		 *	a datagram error return here?
1539 		 */
1540 		unix_state_unlock(other);
1541 		sock_put(other);
1542 
1543 		err = 0;
1544 		unix_state_lock(sk);
1545 		if (unix_peer(sk) == other) {
1546 			unix_peer(sk) = NULL;
1547 			unix_state_unlock(sk);
1548 
1549 			unix_dgram_disconnected(sk, other);
1550 			sock_put(other);
1551 			err = -ECONNREFUSED;
1552 		} else {
1553 			unix_state_unlock(sk);
1554 		}
1555 
1556 		other = NULL;
1557 		if (err)
1558 			goto out_free;
1559 		goto restart;
1560 	}
1561 
1562 	err = -EPIPE;
1563 	if (other->sk_shutdown & RCV_SHUTDOWN)
1564 		goto out_unlock;
1565 
1566 	if (sk->sk_type != SOCK_SEQPACKET) {
1567 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1568 		if (err)
1569 			goto out_unlock;
1570 	}
1571 
1572 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1573 		if (!timeo) {
1574 			err = -EAGAIN;
1575 			goto out_unlock;
1576 		}
1577 
1578 		timeo = unix_wait_for_peer(other, timeo);
1579 
1580 		err = sock_intr_errno(timeo);
1581 		if (signal_pending(current))
1582 			goto out_free;
1583 
1584 		goto restart;
1585 	}
1586 
1587 	if (sock_flag(other, SOCK_RCVTSTAMP))
1588 		__net_timestamp(skb);
1589 	maybe_add_creds(skb, sock, other);
1590 	skb_queue_tail(&other->sk_receive_queue, skb);
1591 	if (max_level > unix_sk(other)->recursion_level)
1592 		unix_sk(other)->recursion_level = max_level;
1593 	unix_state_unlock(other);
1594 	other->sk_data_ready(other, len);
1595 	sock_put(other);
1596 	scm_destroy(siocb->scm);
1597 	return len;
1598 
1599 out_unlock:
1600 	unix_state_unlock(other);
1601 out_free:
1602 	kfree_skb(skb);
1603 out:
1604 	if (other)
1605 		sock_put(other);
1606 	scm_destroy(siocb->scm);
1607 	return err;
1608 }
1609 
1610 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1611  * bytes, and a minimum of a full page.
1612  */
1613 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
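
/*
 * For example, assuming 4 KiB pages: get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 bytes of paged data per skb.
 */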
1614 
1615 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1616 			       struct msghdr *msg, size_t len)
1617 {
1618 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1619 	struct sock *sk = sock->sk;
1620 	struct sock *other = NULL;
1621 	int err, size;
1622 	struct sk_buff *skb;
1623 	int sent = 0;
1624 	struct scm_cookie tmp_scm;
1625 	bool fds_sent = false;
1626 	int max_level;
1627 	int data_len;
1628 
1629 	if (NULL == siocb->scm)
1630 		siocb->scm = &tmp_scm;
1631 	wait_for_unix_gc();
1632 	err = scm_send(sock, msg, siocb->scm, false);
1633 	if (err < 0)
1634 		return err;
1635 
1636 	err = -EOPNOTSUPP;
1637 	if (msg->msg_flags&MSG_OOB)
1638 		goto out_err;
1639 
1640 	if (msg->msg_namelen) {
1641 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1642 		goto out_err;
1643 	} else {
1644 		err = -ENOTCONN;
1645 		other = unix_peer(sk);
1646 		if (!other)
1647 			goto out_err;
1648 	}
1649 
1650 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1651 		goto pipe_err;
1652 
1653 	while (sent < len) {
1654 		size = len - sent;
1655 
1656 		/* Keep two messages in the pipe so it schedules better */
1657 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1658 
1659 		/* allow fallback to order-0 allocations */
1660 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1661 
1662 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1663 
1664 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1665 					   msg->msg_flags & MSG_DONTWAIT, &err,
1666 					   get_order(UNIX_SKB_FRAGS_SZ));
1667 		if (!skb)
1668 			goto out_err;
1669 
1670 		/* Only send the fds in the first buffer */
1671 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1672 		if (err < 0) {
1673 			kfree_skb(skb);
1674 			goto out_err;
1675 		}
1676 		max_level = err + 1;
1677 		fds_sent = true;
1678 
1679 		skb_put(skb, size - data_len);
1680 		skb->data_len = data_len;
1681 		skb->len = size;
1682 		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1683 						   sent, size);
1684 		if (err) {
1685 			kfree_skb(skb);
1686 			goto out_err;
1687 		}
1688 
1689 		unix_state_lock(other);
1690 
1691 		if (sock_flag(other, SOCK_DEAD) ||
1692 		    (other->sk_shutdown & RCV_SHUTDOWN))
1693 			goto pipe_err_free;
1694 
1695 		maybe_add_creds(skb, sock, other);
1696 		skb_queue_tail(&other->sk_receive_queue, skb);
1697 		if (max_level > unix_sk(other)->recursion_level)
1698 			unix_sk(other)->recursion_level = max_level;
1699 		unix_state_unlock(other);
1700 		other->sk_data_ready(other, size);
1701 		sent += size;
1702 	}
1703 
1704 	scm_destroy(siocb->scm);
1705 	siocb->scm = NULL;
1706 
1707 	return sent;
1708 
1709 pipe_err_free:
1710 	unix_state_unlock(other);
1711 	kfree_skb(skb);
1712 pipe_err:
1713 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1714 		send_sig(SIGPIPE, current, 0);
1715 	err = -EPIPE;
1716 out_err:
1717 	scm_destroy(siocb->scm);
1718 	siocb->scm = NULL;
1719 	return sent ? : err;
1720 }
1721 
1722 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1723 				  struct msghdr *msg, size_t len)
1724 {
1725 	int err;
1726 	struct sock *sk = sock->sk;
1727 
1728 	err = sock_error(sk);
1729 	if (err)
1730 		return err;
1731 
1732 	if (sk->sk_state != TCP_ESTABLISHED)
1733 		return -ENOTCONN;
1734 
1735 	if (msg->msg_namelen)
1736 		msg->msg_namelen = 0;
1737 
1738 	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1739 }
1740 
1741 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1742 			      struct msghdr *msg, size_t size,
1743 			      int flags)
1744 {
1745 	struct sock *sk = sock->sk;
1746 
1747 	if (sk->sk_state != TCP_ESTABLISHED)
1748 		return -ENOTCONN;
1749 
1750 	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1751 }
1752 
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755 	struct unix_sock *u = unix_sk(sk);
1756 
1757 	if (u->addr) {
1758 		msg->msg_namelen = u->addr->len;
1759 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760 	}
1761 }
1762 
1763 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1764 			      struct msghdr *msg, size_t size,
1765 			      int flags)
1766 {
1767 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1768 	struct scm_cookie tmp_scm;
1769 	struct sock *sk = sock->sk;
1770 	struct unix_sock *u = unix_sk(sk);
1771 	int noblock = flags & MSG_DONTWAIT;
1772 	struct sk_buff *skb;
1773 	int err;
1774 	int peeked, skip;
1775 
1776 	err = -EOPNOTSUPP;
1777 	if (flags&MSG_OOB)
1778 		goto out;
1779 
1780 	err = mutex_lock_interruptible(&u->readlock);
1781 	if (err) {
1782 		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1783 		goto out;
1784 	}
1785 
1786 	skip = sk_peek_offset(sk, flags);
1787 
1788 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1789 	if (!skb) {
1790 		unix_state_lock(sk);
1791 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1792 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1793 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1794 			err = 0;
1795 		unix_state_unlock(sk);
1796 		goto out_unlock;
1797 	}
1798 
1799 	wake_up_interruptible_sync_poll(&u->peer_wait,
1800 					POLLOUT | POLLWRNORM | POLLWRBAND);
1801 
1802 	if (msg->msg_name)
1803 		unix_copy_addr(msg, skb->sk);
1804 
1805 	if (size > skb->len - skip)
1806 		size = skb->len - skip;
1807 	else if (size < skb->len - skip)
1808 		msg->msg_flags |= MSG_TRUNC;
1809 
1810 	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1811 	if (err)
1812 		goto out_free;
1813 
1814 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1815 		__sock_recv_timestamp(msg, sk, skb);
1816 
1817 	if (!siocb->scm) {
1818 		siocb->scm = &tmp_scm;
1819 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1820 	}
1821 	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1822 	unix_set_secdata(siocb->scm, skb);
1823 
1824 	if (!(flags & MSG_PEEK)) {
1825 		if (UNIXCB(skb).fp)
1826 			unix_detach_fds(siocb->scm, skb);
1827 
1828 		sk_peek_offset_bwd(sk, skb->len);
1829 	} else {
1830 		/* It is questionable: on PEEK we could:
1831 		   - not return fds - good, but too simple 8)
1832 		   - return fds, and not return them on read (old strategy,
1833 		     apparently wrong)
1834 		   - clone the fds (I chose this for now; it is the most
1835 		     universal solution)
1836 
1837 		   POSIX 1003.1g does not actually define this clearly
1838 		   at all. POSIX 1003.1g doesn't define a lot of things
1839 		   clearly however!
1840 
1841 		*/
1842 
1843 		sk_peek_offset_fwd(sk, size);
1844 
1845 		if (UNIXCB(skb).fp)
1846 			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1847 	}
1848 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1849 
1850 	scm_recv(sock, msg, siocb->scm, flags);
1851 
1852 out_free:
1853 	skb_free_datagram(sk, skb);
1854 out_unlock:
1855 	mutex_unlock(&u->readlock);
1856 out:
1857 	return err;
1858 }
1859 
1860 /*
1861  *	Sleep until more data has arrived. But check for races..
1862  */
1863 static long unix_stream_data_wait(struct sock *sk, long timeo,
1864 				  struct sk_buff *last)
1865 {
1866 	DEFINE_WAIT(wait);
1867 
1868 	unix_state_lock(sk);
1869 
1870 	for (;;) {
1871 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1872 
1873 		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1874 		    sk->sk_err ||
1875 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1876 		    signal_pending(current) ||
1877 		    !timeo)
1878 			break;
1879 
1880 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1881 		unix_state_unlock(sk);
1882 		timeo = freezable_schedule_timeout(timeo);
1883 		unix_state_lock(sk);
1884 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1885 	}
1886 
1887 	finish_wait(sk_sleep(sk), &wait);
1888 	unix_state_unlock(sk);
1889 	return timeo;
1890 }
1891 
1892 static unsigned int unix_skb_len(const struct sk_buff *skb)
1893 {
1894 	return skb->len - UNIXCB(skb).consumed;
1895 }
1896 
1897 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1898 			       struct msghdr *msg, size_t size,
1899 			       int flags)
1900 {
1901 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1902 	struct scm_cookie tmp_scm;
1903 	struct sock *sk = sock->sk;
1904 	struct unix_sock *u = unix_sk(sk);
1905 	struct sockaddr_un *sunaddr = msg->msg_name;
1906 	int copied = 0;
1907 	int check_creds = 0;
1908 	int target;
1909 	int err = 0;
1910 	long timeo;
1911 	int skip;
1912 
1913 	err = -EINVAL;
1914 	if (sk->sk_state != TCP_ESTABLISHED)
1915 		goto out;
1916 
1917 	err = -EOPNOTSUPP;
1918 	if (flags&MSG_OOB)
1919 		goto out;
1920 
1921 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1922 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1923 
1924 	/* Lock the socket to prevent queue disordering
1925 	 * while we sleep copying into the msg
1926 	 */
1927 
1928 	if (!siocb->scm) {
1929 		siocb->scm = &tmp_scm;
1930 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1931 	}
1932 
1933 	err = mutex_lock_interruptible(&u->readlock);
1934 	if (err) {
1935 		err = sock_intr_errno(timeo);
1936 		goto out;
1937 	}
1938 
1939 	do {
1940 		int chunk;
1941 		struct sk_buff *skb, *last;
1942 
1943 		unix_state_lock(sk);
1944 		last = skb = skb_peek(&sk->sk_receive_queue);
1945 again:
1946 		if (skb == NULL) {
1947 			unix_sk(sk)->recursion_level = 0;
1948 			if (copied >= target)
1949 				goto unlock;
1950 
1951 			/*
1952 			 *	POSIX 1003.1g mandates this order.
1953 			 */
1954 
1955 			err = sock_error(sk);
1956 			if (err)
1957 				goto unlock;
1958 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1959 				goto unlock;
1960 
1961 			unix_state_unlock(sk);
1962 			err = -EAGAIN;
1963 			if (!timeo)
1964 				break;
1965 			mutex_unlock(&u->readlock);
1966 
1967 			timeo = unix_stream_data_wait(sk, timeo, last);
1968 
1969 			if (signal_pending(current)
1970 			    ||  mutex_lock_interruptible(&u->readlock)) {
1971 				err = sock_intr_errno(timeo);
1972 				goto out;
1973 			}
1974 
1975 			continue;
1976  unlock:
1977 			unix_state_unlock(sk);
1978 			break;
1979 		}
1980 
1981 		skip = sk_peek_offset(sk, flags);
1982 		while (skip >= unix_skb_len(skb)) {
1983 			skip -= unix_skb_len(skb);
1984 			last = skb;
1985 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1986 			if (!skb)
1987 				goto again;
1988 		}
1989 
1990 		unix_state_unlock(sk);
1991 
1992 		if (check_creds) {
1993 			/* Never glue messages from different writers */
1994 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1995 			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1996 			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1997 				break;
1998 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1999 			/* Copy credentials */
2000 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2001 			check_creds = 1;
2002 		}
2003 
2004 		/* Copy address just once */
2005 		if (sunaddr) {
2006 			unix_copy_addr(msg, skb->sk);
2007 			sunaddr = NULL;
2008 		}
2009 
2010 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2011 		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2012 					    msg->msg_iov, chunk)) {
2013 			if (copied == 0)
2014 				copied = -EFAULT;
2015 			break;
2016 		}
2017 		copied += chunk;
2018 		size -= chunk;
2019 
2020 		/* Mark read part of skb as used */
2021 		if (!(flags & MSG_PEEK)) {
2022 			UNIXCB(skb).consumed += chunk;
2023 
2024 			sk_peek_offset_bwd(sk, chunk);
2025 
2026 			if (UNIXCB(skb).fp)
2027 				unix_detach_fds(siocb->scm, skb);
2028 
2029 			if (unix_skb_len(skb))
2030 				break;
2031 
2032 			skb_unlink(skb, &sk->sk_receive_queue);
2033 			consume_skb(skb);
2034 
2035 			if (siocb->scm->fp)
2036 				break;
2037 		} else {
2038 			/* This is questionable; see the note in unix_dgram_recvmsg().
2039 			 */
2040 			if (UNIXCB(skb).fp)
2041 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2042 
2043 			sk_peek_offset_fwd(sk, chunk);
2044 
2045 			break;
2046 		}
2047 	} while (size);
2048 
2049 	mutex_unlock(&u->readlock);
2050 	scm_recv(sock, msg, siocb->scm, flags);
2051 out:
2052 	return copied ? : err;
2053 }
2054 
2055 static int unix_shutdown(struct socket *sock, int mode)
2056 {
2057 	struct sock *sk = sock->sk;
2058 	struct sock *other;
2059 
2060 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2061 		return -EINVAL;
2062 	/* This maps:
2063 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2064 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2065 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2066 	 */
2067 	++mode;
2068 
2069 	unix_state_lock(sk);
2070 	sk->sk_shutdown |= mode;
2071 	other = unix_peer(sk);
2072 	if (other)
2073 		sock_hold(other);
2074 	unix_state_unlock(sk);
2075 	sk->sk_state_change(sk);
2076 
2077 	if (other &&
2078 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2079 
2080 		int peer_mode = 0;
2081 
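		/* Mirror the shutdown on the peer: closing our receive
		 * side closes the peer's send side, and vice versa.
		 */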
2082 		if (mode&RCV_SHUTDOWN)
2083 			peer_mode |= SEND_SHUTDOWN;
2084 		if (mode&SEND_SHUTDOWN)
2085 			peer_mode |= RCV_SHUTDOWN;
2086 		unix_state_lock(other);
2087 		other->sk_shutdown |= peer_mode;
2088 		unix_state_unlock(other);
2089 		other->sk_state_change(other);
2090 		if (peer_mode == SHUTDOWN_MASK)
2091 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2092 		else if (peer_mode & RCV_SHUTDOWN)
2093 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2094 	}
2095 	if (other)
2096 		sock_put(other);
2097 
2098 	return 0;
2099 }
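/* Userspace view of the above (illustrative sketch, not part of this
 * file): on a connected SOCK_STREAM pair,
 *
 *	shutdown(fd, SHUT_WR);
 *
 * sets SEND_SHUTDOWN here and RCV_SHUTDOWN on the peer, so the peer's
 * read() returns 0 (EOF) once its queue drains, while further write()s
 * on fd fail with EPIPE (or raise SIGPIPE).
 */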
2100 
2101 long unix_inq_len(struct sock *sk)
2102 {
2103 	struct sk_buff *skb;
2104 	long amount = 0;
2105 
2106 	if (sk->sk_state == TCP_LISTEN)
2107 		return -EINVAL;
2108 
2109 	spin_lock(&sk->sk_receive_queue.lock);
2110 	if (sk->sk_type == SOCK_STREAM ||
2111 	    sk->sk_type == SOCK_SEQPACKET) {
2112 		skb_queue_walk(&sk->sk_receive_queue, skb)
2113 			amount += unix_skb_len(skb);
2114 	} else {
2115 		skb = skb_peek(&sk->sk_receive_queue);
2116 		if (skb)
2117 			amount = skb->len;
2118 	}
2119 	spin_unlock(&sk->sk_receive_queue.lock);
2120 
2121 	return amount;
2122 }
2123 EXPORT_SYMBOL_GPL(unix_inq_len);
2124 
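/* Bytes we have queued to the peer but that it has not yet consumed;
 * for AF_UNIX such skbs sit on the receiver's queue while remaining
 * charged to the sender's sk_wmem_alloc until they are freed.
 */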
2125 long unix_outq_len(struct sock *sk)
2126 {
2127 	return sk_wmem_alloc_get(sk);
2128 }
2129 EXPORT_SYMBOL_GPL(unix_outq_len);
2130 
2131 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2132 {
2133 	struct sock *sk = sock->sk;
2134 	long amount = 0;
2135 	int err;
2136 
2137 	switch (cmd) {
2138 	case SIOCOUTQ:
2139 		amount = unix_outq_len(sk);
2140 		err = put_user(amount, (int __user *)arg);
2141 		break;
2142 	case SIOCINQ:
2143 		amount = unix_inq_len(sk);
2144 		if (amount < 0)
2145 			err = amount;
2146 		else
2147 			err = put_user(amount, (int __user *)arg);
2148 		break;
2149 	default:
2150 		err = -ENOIOCTLCMD;
2151 		break;
2152 	}
2153 	return err;
2154 }
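/* Illustrative userspace usage of the ioctls above (not part of this
 * file):
 *
 *	int pending;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes unread\n", pending);
 *
 * For SOCK_STREAM/SOCK_SEQPACKET the value sums the unread bytes of
 * every queued skb; for datagram sockets it is the size of the next
 * datagram only.
 */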
2155 
2156 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2157 {
2158 	struct sock *sk = sock->sk;
2159 	unsigned int mask;
2160 
2161 	sock_poll_wait(file, sk_sleep(sk), wait);
2162 	mask = 0;
2163 
2164 	/* exceptional events? */
2165 	if (sk->sk_err)
2166 		mask |= POLLERR;
2167 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2168 		mask |= POLLHUP;
2169 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2170 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2171 
2172 	/* readable? */
2173 	if (!skb_queue_empty(&sk->sk_receive_queue))
2174 		mask |= POLLIN | POLLRDNORM;
2175 
2176 	/* Connection-based sockets need to check for termination and startup */
2177 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2178 	    sk->sk_state == TCP_CLOSE)
2179 		mask |= POLLHUP;
2180 
2181 	/*
2182 	 * We also report the socket as writable when the other side has
2183 	 * shut down the connection; this prevents sockets from getting stuck.
2184 	 */
2185 	if (unix_writable(sk))
2186 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2187 
2188 	return mask;
2189 }
2190 
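/* Datagram (and SEQPACKET) poll.  Unlike unix_poll() above, writability
 * also depends on the peer: when the peer is not connected back to us,
 * we register on its peer_wait queue and report not-writable while its
 * receive queue is full, so POLLOUT fires once the peer drains it.
 */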
2191 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2192 				    poll_table *wait)
2193 {
2194 	struct sock *sk = sock->sk, *other;
2195 	unsigned int mask, writable;
2196 
2197 	sock_poll_wait(file, sk_sleep(sk), wait);
2198 	mask = 0;
2199 
2200 	/* exceptional events? */
2201 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2202 		mask |= POLLERR |
2203 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2204 
2205 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2206 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2207 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2208 		mask |= POLLHUP;
2209 
2210 	/* readable? */
2211 	if (!skb_queue_empty(&sk->sk_receive_queue))
2212 		mask |= POLLIN | POLLRDNORM;
2213 
2214 	/* Connection-based sockets need to check for termination and startup */
2215 	if (sk->sk_type == SOCK_SEQPACKET) {
2216 		if (sk->sk_state == TCP_CLOSE)
2217 			mask |= POLLHUP;
2218 		/* connection hasn't started yet? */
2219 		if (sk->sk_state == TCP_SYN_SENT)
2220 			return mask;
2221 	}
2222 
2223 	/* No write status requested, avoid expensive OUT tests. */
2224 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2225 		return mask;
2226 
2227 	writable = unix_writable(sk);
2228 	other = unix_peer_get(sk);
2229 	if (other) {
2230 		if (unix_peer(other) != sk) {
2231 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2232 			if (unix_recvq_full(other))
2233 				writable = 0;
2234 		}
2235 		sock_put(other);
2236 	}
2237 
2238 	if (writable)
2239 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2240 	else
2241 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2242 
2243 	return mask;
2244 }
2245 
2246 #ifdef CONFIG_PROC_FS
2247 
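/* The seq_file position encodes a (hash bucket, offset) pair: the high
 * bits select the bucket in unix_socket_table, the low BUCKET_SPACE
 * bits hold the 1-based position of the socket within that bucket.
 */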
2248 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2249 
2250 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2251 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2252 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2253 
2254 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2255 {
2256 	unsigned long offset = get_offset(*pos);
2257 	unsigned long bucket = get_bucket(*pos);
2258 	struct sock *sk;
2259 	unsigned long count = 0;
2260 
2261 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2262 		if (sock_net(sk) != seq_file_net(seq))
2263 			continue;
2264 		if (++count == offset)
2265 			break;
2266 	}
2267 
2268 	return sk;
2269 }
2270 
2271 static struct sock *unix_next_socket(struct seq_file *seq,
2272 				     struct sock *sk,
2273 				     loff_t *pos)
2274 {
2275 	unsigned long bucket;
2276 
2277 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2278 		sk = sk_next(sk);
2279 		if (!sk)
2280 			goto next_bucket;
2281 		if (sock_net(sk) == seq_file_net(seq))
2282 			return sk;
2283 	}
2284 
2285 	do {
2286 		sk = unix_from_bucket(seq, pos);
2287 		if (sk)
2288 			return sk;
2289 
2290 next_bucket:
2291 		bucket = get_bucket(*pos) + 1;
2292 		*pos = set_bucket_offset(bucket, 1);
2293 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2294 
2295 	return NULL;
2296 }
2297 
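/* Returning SEQ_START_TOKEN for position 0 makes unix_seq_show() emit
 * the column header before the first socket entry.
 */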
2298 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2299 	__acquires(unix_table_lock)
2300 {
2301 	spin_lock(&unix_table_lock);
2302 
2303 	if (!*pos)
2304 		return SEQ_START_TOKEN;
2305 
2306 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2307 		return NULL;
2308 
2309 	return unix_next_socket(seq, NULL, pos);
2310 }
2311 
2312 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313 {
2314 	++*pos;
2315 	return unix_next_socket(seq, v, pos);
2316 }
2317 
2318 static void unix_seq_stop(struct seq_file *seq, void *v)
2319 	__releases(unix_table_lock)
2320 {
2321 	spin_unlock(&unix_table_lock);
2322 }
2323 
2324 static int unix_seq_show(struct seq_file *seq, void *v)
2325 {
2326 
2327 	if (v == SEQ_START_TOKEN)
2328 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2329 			 "Inode Path\n");
2330 	else {
2331 		struct sock *s = v;
2332 		struct unix_sock *u = unix_sk(s);
2333 		unix_state_lock(s);
2334 
2335 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2336 			s,
2337 			atomic_read(&s->sk_refcnt),
2338 			0,
2339 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2340 			s->sk_type,
2341 			s->sk_socket ?
2342 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2343 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2344 			sock_i_ino(s));
2345 
2346 		if (u->addr) {
2347 			int i, len;
2348 			seq_putc(seq, ' ');
2349 
2350 			i = 0;
2351 			len = u->addr->len - sizeof(short);
2352 			if (!UNIX_ABSTRACT(s))
2353 				len--;
2354 			else {
2355 				seq_putc(seq, '@');
2356 				i++;
2357 			}
2358 			for ( ; i < len; i++)
2359 				seq_putc(seq, u->addr->name->sun_path[i]);
2360 		}
2361 		unix_state_unlock(s);
2362 		seq_putc(seq, '\n');
2363 	}
2364 
2365 	return 0;
2366 }
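/* A typical /proc/net/unix line produced above (illustrative, field
 * values made up):
 *
 *	ffff880036a07400: 00000002 00000000 00010000 0001 01 16163 @/tmp/.X11-unix/X0
 *
 * i.e. socket address, refcount, protocol (always 0), __SO_ACCEPTCON
 * for listeners, type, state and inode, then the bound path, with '@'
 * marking an abstract address.
 */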
2367 
2368 static const struct seq_operations unix_seq_ops = {
2369 	.start  = unix_seq_start,
2370 	.next   = unix_seq_next,
2371 	.stop   = unix_seq_stop,
2372 	.show   = unix_seq_show,
2373 };
2374 
2375 static int unix_seq_open(struct inode *inode, struct file *file)
2376 {
2377 	return seq_open_net(inode, file, &unix_seq_ops,
2378 			    sizeof(struct seq_net_private));
2379 }
2380 
2381 static const struct file_operations unix_seq_fops = {
2382 	.owner		= THIS_MODULE,
2383 	.open		= unix_seq_open,
2384 	.read		= seq_read,
2385 	.llseek		= seq_lseek,
2386 	.release	= seq_release_net,
2387 };
2388 
2389 #endif
2390 
2391 static const struct net_proto_family unix_family_ops = {
2392 	.family = PF_UNIX,
2393 	.create = unix_create,
2394 	.owner	= THIS_MODULE,
2395 };
2396 
2397 
2398 static int __net_init unix_net_init(struct net *net)
2399 {
2400 	int error = -ENOMEM;
2401 
2402 	net->unx.sysctl_max_dgram_qlen = 10;
2403 	if (unix_sysctl_register(net))
2404 		goto out;
2405 
2406 #ifdef CONFIG_PROC_FS
2407 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2408 		unix_sysctl_unregister(net);
2409 		goto out;
2410 	}
2411 #endif
2412 	error = 0;
2413 out:
2414 	return error;
2415 }
2416 
2417 static void __net_exit unix_net_exit(struct net *net)
2418 {
2419 	unix_sysctl_unregister(net);
2420 	remove_proc_entry("unix", net->proc_net);
2421 }
2422 
2423 static struct pernet_operations unix_net_ops = {
2424 	.init = unix_net_init,
2425 	.exit = unix_net_exit,
2426 };
2427 
2428 static int __init af_unix_init(void)
2429 {
2430 	int rc = -1;
2431 
2432 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2433 
2434 	rc = proto_register(&unix_proto, 1);
2435 	if (rc != 0) {
2436 		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2437 		       __func__);
2438 		goto out;
2439 	}
2440 
2441 	sock_register(&unix_family_ops);
2442 	register_pernet_subsys(&unix_net_ops);
2443 out:
2444 	return rc;
2445 }
2446 
2447 static void __exit af_unix_exit(void)
2448 {
2449 	sock_unregister(PF_UNIX);
2450 	proto_unregister(&unix_proto);
2451 	unregister_pernet_subsys(&unix_net_ops);
2452 }
2453 
2454 /* Earlier than device_initcall() so that other drivers invoking
2455  * request_module() don't end up in a loop when modprobe tries
2456  * to use a UNIX socket.  But later than subsys_initcall() because
2457  * we depend on stuff initialised there. */
2458 fs_initcall(af_unix_init);
2459 module_exit(af_unix_exit);
2460 
2461 MODULE_LICENSE("GPL");
2462 MODULE_ALIAS_NETPROTO(PF_UNIX);
2463