/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko EiBfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina   :	Hash function optimizations
 *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie   :	Set peercred for socketpair
 *	     Michal Ostrowski   :       Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *	     				the core infrastructure is doing that
 *	     				for all net proto families now (2.5.69+)
 *
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  starting with 0, so that this name space does not intersect
 *		  with BSD names.
 */
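
/*
 * Illustrative userspace sketch (not part of this file): binding a
 * socket in the abstract namespace described above.  The name
 * "example" is arbitrary and error handling is omitted.  Abstract
 * names are counted, not NUL-terminated, so the address length must
 * cover exactly the leading zero byte plus the name bytes.
 *
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un sun;
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */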

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;


static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}

#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 *  SMP locking strategy:
 *    the hash table is protected with the spinlock unix_table_lock
 *    each socket state is protected by a separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)n;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check a unix socket name:
 *		- it should not be zero length.
 *		- if it starts with a non-zero byte, it should be NUL
 *		  terminated (FS object)
 *		- if it starts with zero, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't exist as such.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
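
/*
 * Worked examples (hypothetical values): a filesystem address
 * { AF_UNIX, "/tmp/x" } comes back as sizeof(short) + strlen("/tmp/x")
 * + 1, i.e. the family plus the NUL-terminated path, and *hashp is
 * left alone; an abstract address { AF_UNIX, "\0foo" } keeps its
 * original length and only *hashp is filled in from the checksum fold.
 */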

static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && dentry->d_inode == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}

static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
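
/*
 * In other words, the socket counts as writable while outstanding
 * write memory is at most a quarter of sk_sndbuf; for example (values
 * illustrative), with an sk_sndbuf of 65536 a socket stops polling
 * writable once more than 16384 bytes sit unconsumed by the peer.
 */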

static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its
 * receive queue of packets that arrived from the previous peer. First,
 * this allows flow control based only on wmem_alloc; second, an sk
 * connected to a peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is
		 * disconnected, we signal an error. Messages are lost.
		 * Do not do this when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}

static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}

static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
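
/*
 * Hedged userspace sketch of the sequence this function serves (path
 * and backlog are arbitrary example values; error handling omitted):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	strcpy(sun.sun_path, "/tmp/listener");
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *	listen(fd, 16);
 *
 * The backlog argument becomes sk_max_ack_backlog above.
 */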

static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static void unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	mutex_lock(&u->readlock);
	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);
}


static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};

/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
		/* fall through */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}

static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	mutex_lock(&u->readlock);

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
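
/*
 * Illustration (hypothetical ordernum value): an autobound socket ends
 * up with an abstract address of a leading zero byte followed by five
 * hex digits, i.e. a 2-byte family plus a 6-byte name such as
 * "\0" "00042", which tools like `ss -x` display as @00042.
 */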

static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = path.dentry->d_inode;
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}

static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
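
/*
 * Usage sketch (illustrative only): binding to a filesystem path goes
 * through unix_mknod() above, so a stale socket inode left by an
 * earlier process makes bind(2) fail with EADDRINUSE until the caller
 * removes it:
 *
 *	if (bind(fd, (struct sockaddr *)&sun, len) < 0 &&
 *	    errno == EADDRINUSE) {
 *		unlink(sun.sun_path);
 *		bind(fd, (struct sockaddr *)&sun, len);
 *	}
 *
 * (Assumes the caller knows the path is safe to unlink.)
 */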

static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
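
/*
 * Note on the ordering above: locking the lower-addressed sock first
 * gives every caller the same total order, so two tasks connecting a
 * pair of sockets in opposite directions cannot deadlock; the
 * sk1 == sk2 and !sk2 cases collapse to a single lock.
 */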

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}

static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we do it after the state is locked,
	   we will have to recheck everything again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our state lock and cannot
	   drop the lock on the peer. It is dangerous because deadlock is
	   possible. The connect-to-self case and a simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
	   check this before the attempt to grab the lock.

	   Well, and we have to recheck the state after the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}

static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
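
/*
 * Userspace view (hedged sketch): this is what backs socketpair(2);
 * both ends are connected before the call returns, so either fd may
 * send first.  buf is assumed to be a local char array.
 *
 *	int sv[2];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		write(sv[0], "ping", 4);
 *		read(sv[1], buf, 4);
 *	}
 */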

static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid  = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

#define MAX_RECURSION_LEVEL 4
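
/*
 * The recursion level bounds how deeply AF_UNIX sockets may be nested
 * via SCM_RIGHTS: sending a socket that itself has sockets queued on
 * it raises the level, and unix_attach_fds() below refuses anything
 * past MAX_RECURSION_LEVEL with -ETOOMANYREFS.
 */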

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection.  Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	if (unix_sock_count) {
		for (i = scm->fp->count - 1; i >= 0; i--)
			unix_inflight(scm->fp->fp[i]);
	}
	return max_level;
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid  = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid  = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
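
/*
 * Receiver-side sketch (illustrative; error handling omitted): with
 * SO_PASSCRED set, the credentials attached above arrive as an
 * SCM_CREDENTIALS control message.
 *
 *	char data[64], ctl[CMSG_SPACE(sizeof(struct ucred))];
 *	struct iovec iov = { data, sizeof(data) };
 *	struct msghdr m = { .msg_iov = &iov, .msg_iovlen = 1,
 *			    .msg_control = ctl,
 *			    .msg_controllen = sizeof(ctl) };
 *	struct cmsghdr *c;
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(fd, &m, 0);
 *	for (c = CMSG_FIRSTHDR(&m); c; c = CMSG_NXTHDR(&m, c))
 *		if (c->cmsg_type == SCM_CREDENTIALS)
 *			break;
 */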

/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should a
		 *	datagram error do?
		 */
		unix_state_unlock(other);
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
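
/*
 * Worked out for a common 4 KiB page (illustrative): get_order(32768)
 * is 3, so UNIX_SKB_FRAGS_SZ is 4096 << 3 = 32768 bytes; with 64 KiB
 * pages get_order(32768) is 0 and the limit becomes one full page.
 */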

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
						   sent, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}

static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	msg->msg_namelen = 0;
	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}

static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}

/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
1899 
1900 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1901 			       struct msghdr *msg, size_t size,
1902 			       int flags)
1903 {
1904 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1905 	struct scm_cookie tmp_scm;
1906 	struct sock *sk = sock->sk;
1907 	struct unix_sock *u = unix_sk(sk);
1908 	struct sockaddr_un *sunaddr = msg->msg_name;
1909 	int copied = 0;
1910 	int check_creds = 0;
1911 	int target;
1912 	int err = 0;
1913 	long timeo;
1914 	int skip;
1915 
1916 	err = -EINVAL;
1917 	if (sk->sk_state != TCP_ESTABLISHED)
1918 		goto out;
1919 
1920 	err = -EOPNOTSUPP;
1921 	if (flags&MSG_OOB)
1922 		goto out;
1923 
1924 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1925 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1926 
1927 	msg->msg_namelen = 0;
1928 
1929 	/* Lock the socket to prevent queue disordering
1930 	 * while sleeps in memcpy_tomsg
1931 	 */
1932 
1933 	if (!siocb->scm) {
1934 		siocb->scm = &tmp_scm;
1935 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1936 	}
1937 
1938 	err = mutex_lock_interruptible(&u->readlock);
1939 	if (err) {
1940 		err = sock_intr_errno(timeo);
1941 		goto out;
1942 	}
1943 
	do {
		int chunk;
		struct sk_buff *skb, *last;

		unix_state_lock(sk);
		last = skb = skb_peek(&sk->sk_receive_queue);
again:
		if (skb == NULL) {
			unix_sk(sk)->recursion_level = 0;
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			err = -EAGAIN;
			if (!timeo)
				break;
			mutex_unlock(&u->readlock);

			timeo = unix_stream_data_wait(sk, timeo, last);

			if (signal_pending(current) ||
			    mutex_lock_interruptible(&u->readlock)) {
				err = sock_intr_errno(timeo);
				goto out;
			}

			continue;
 unlock:
			unix_state_unlock(sk);
			break;
		}

		skip = sk_peek_offset(sk, flags);
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(siocb->scm, UNIXCB(skb).pid,
				     UNIXCB(skb).uid, UNIXCB(skb).gid);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr) {
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
					    msg->msg_iov, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (siocb->scm->fp)
				break;
		} else {
			/* This is questionable; see the note in
			 * unix_dgram_recvmsg().
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}

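/* shutdown(2) handler.  Marks this socket's shutdown state and, for
 * connection-oriented types, propagates the complementary flags to the
 * peer so that its readers and writers wake up too.
 */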
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

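/* Bytes available for reading (SIOCINQ): the sum of unconsumed bytes
 * across the whole receive queue for stream and seqpacket sockets, or
 * the size of the first queued message for datagram sockets.
 */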
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

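/* ioctl(2) handler for SIOCOUTQ/SIOCINQ.  A minimal userspace usage
 * sketch (not part of this file; assumes <sys/ioctl.h> and
 * <linux/sockios.h> for the command constants):
 *
 *	int pending;
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes queued for reading\n", pending);
 */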
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

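/* poll() used for stream sockets.  The mask is computed from local
 * state only; unlike unix_dgram_poll() below, the peer's receive
 * queue is not consulted.
 */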
static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * We set writable also when the other side has shut down the
	 * connection, so that writers do not get stuck.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}

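/* poll() for datagram and seqpacket sockets.  Beyond the local checks,
 * a connected sender is reported unwritable while the peer's receive
 * queue is full; if the peer is not connected back to us, we also wait
 * on its peer_wait queue so a wakeup arrives when space frees up.
 */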
static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int mask, writable;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
		return mask;

	writable = unix_writable(sk);
	other = unix_peer_get(sk);
	if (other) {
		if (unix_peer(other) != sk) {
			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
			if (unix_recvq_full(other))
				writable = 0;
		}
		sock_put(other);
	}

	if (writable)
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}

#ifdef CONFIG_PROC_FS

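/* A seq_file position encodes a (bucket, offset) pair: the high bits
 * select a hash bucket in unix_socket_table, the low BUCKET_SPACE bits
 * hold a 1-based offset within that bucket.
 */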
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

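/* Return the socket at the current offset within the current bucket,
 * counting only sockets belonging to this namespace, or NULL when the
 * bucket is exhausted.
 */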
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

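/* Advance to the next in-namespace socket, moving to the following
 * hash bucket (and resetting the offset to 1) whenever the current
 * chain runs out.
 */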
static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

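/* Emit one line of /proc/net/unix.  An entry might look like (layout
 * only; the values here are illustrative, not from a real system):
 *
 *	ffff88003764fc00: 00000002 00000000 00010000 0001 01 17052 @/tmp/.X11-unix/X0
 *
 * An '@' prefix marks an abstract (non-filesystem) address.
 */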
static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			atomic_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i]);
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

static int unix_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &unix_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};

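/* Per-namespace setup: a default datagram backlog of 10 skbs, the
 * sysctl table, and the /proc/net/unix seq_file entry.
 */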
static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

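/* Module initialisation: check that unix_skb_parms fits in skb->cb,
 * register the protocol (which creates the unix_sock slab cache), then
 * the PF_UNIX socket family and the per-namespace operations.
 */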
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
		       __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket.  But later than subsys_initcall() because
   we depend on infrastructure initialised there. */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);