xref: /openbmc/linux/net/unix/af_unix.c (revision e6c81cce)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it avoids a huge number
38  *					of hashed socks (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
59  *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a 0 byte, so this name space does not intersect
80  *		  with BSD names.
81  */
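
/*
 * Example: a minimal userspace sketch of binding to an abstract
 * address as described above.  The name starts with a 0 byte and its
 * length travels explicitly in addrlen, so it never touches the
 * filesystem namespace.  (Sketch only; assumes the usual sys/socket.h,
 * sys/un.h, stddef.h and string.h includes, error handling omitted.)
 *
 *	struct sockaddr_un a;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	a.sun_path[0] = 0;
 *	memcpy(a.sun_path + 1, "name", 4);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */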
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 
121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122 EXPORT_SYMBOL_GPL(unix_socket_table);
123 DEFINE_SPINLOCK(unix_table_lock);
124 EXPORT_SYMBOL_GPL(unix_table_lock);
125 static atomic_long_t unix_nr_socks;
126 
127 
128 static struct hlist_head *unix_sockets_unbound(void *addr)
129 {
130 	unsigned long hash = (unsigned long)addr;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash %= UNIX_HASH_SIZE;
135 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136 }
137 
138 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139 
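/* Note on naming: unix_get_secdata() takes the security ID from the
 * sender's scm cookie and stores it in the skb at send time, while
 * unix_set_secdata() sets the receiver's scm cookie from the skb at
 * receive time.
 */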
140 #ifdef CONFIG_SECURITY_NETWORK
141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 {
143 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
144 }
145 
146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 {
148 	scm->secid = *UNIXSID(skb);
149 }
150 #else
151 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
152 { }
153 
154 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
155 { }
156 #endif /* CONFIG_SECURITY_NETWORK */
157 
158 /*
159  *  SMP locking strategy:
160  *    the hash table is protected by the spinlock unix_table_lock;
161  *    each socket's state is protected by a separate spin lock.
162  */
163 
164 static inline unsigned int unix_hash_fold(__wsum n)
165 {
166 	unsigned int hash = (__force unsigned int)csum_fold(n);
167 
168 	hash ^= hash>>8;
169 	return hash&(UNIX_HASH_SIZE-1);
170 }
171 
172 #define unix_peer(sk) (unix_sk(sk)->peer)
173 
174 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
175 {
176 	return unix_peer(osk) == sk;
177 }
178 
179 static inline int unix_may_send(struct sock *sk, struct sock *osk)
180 {
181 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
182 }
183 
184 static inline int unix_recvq_full(struct sock const *sk)
185 {
186 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
187 }
188 
189 struct sock *unix_peer_get(struct sock *s)
190 {
191 	struct sock *peer;
192 
193 	unix_state_lock(s);
194 	peer = unix_peer(s);
195 	if (peer)
196 		sock_hold(peer);
197 	unix_state_unlock(s);
198 	return peer;
199 }
200 EXPORT_SYMBOL_GPL(unix_peer_get);
201 
202 static inline void unix_release_addr(struct unix_address *addr)
203 {
204 	if (atomic_dec_and_test(&addr->refcnt))
205 		kfree(addr);
206 }
207 
208 /*
209  *	Check unix socket name:
210  *		- should not be zero length.
211  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
212  *		- if it starts with a zero byte, it is an abstract name.
213  */
214 
215 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
216 {
217 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
218 		return -EINVAL;
219 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
220 		return -EINVAL;
221 	if (sunaddr->sun_path[0]) {
222 		/*
223 		 * This may look like an off-by-one error but it is a bit more
224 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
225 		 * sun_path[108] does not as such exist.  However, in kernel
226 		 * space we are guaranteed that it is a valid memory location
227 		 * in our kernel address buffer.
228 		 */
229 		((char *)sunaddr)[len] = 0;
230 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
231 		return len;
232 	}
233 
234 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
235 	return len;
236 }
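
/*
 * Example: for a filesystem binding to "/tmp/sock", unix_mkname()
 * NUL-terminates the path and returns
 * strlen("/tmp/sock") + 1 + sizeof(short) == 9 + 1 + 2 == 12.
 * For an abstract name (sun_path[0] == 0) the caller-supplied length
 * is kept as-is and *hashp is filled from a checksum over the whole
 * sockaddr, since abstract names may contain embedded 0 bytes.
 */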
237 
238 static void __unix_remove_socket(struct sock *sk)
239 {
240 	sk_del_node_init(sk);
241 }
242 
243 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
244 {
245 	WARN_ON(!sk_unhashed(sk));
246 	sk_add_node(sk, list);
247 }
248 
249 static inline void unix_remove_socket(struct sock *sk)
250 {
251 	spin_lock(&unix_table_lock);
252 	__unix_remove_socket(sk);
253 	spin_unlock(&unix_table_lock);
254 }
255 
256 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
257 {
258 	spin_lock(&unix_table_lock);
259 	__unix_insert_socket(list, sk);
260 	spin_unlock(&unix_table_lock);
261 }
262 
263 static struct sock *__unix_find_socket_byname(struct net *net,
264 					      struct sockaddr_un *sunname,
265 					      int len, int type, unsigned int hash)
266 {
267 	struct sock *s;
268 
269 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
270 		struct unix_sock *u = unix_sk(s);
271 
272 		if (!net_eq(sock_net(s), net))
273 			continue;
274 
275 		if (u->addr->len == len &&
276 		    !memcmp(u->addr->name, sunname, len))
277 			goto found;
278 	}
279 	s = NULL;
280 found:
281 	return s;
282 }
283 
284 static inline struct sock *unix_find_socket_byname(struct net *net,
285 						   struct sockaddr_un *sunname,
286 						   int len, int type,
287 						   unsigned int hash)
288 {
289 	struct sock *s;
290 
291 	spin_lock(&unix_table_lock);
292 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
293 	if (s)
294 		sock_hold(s);
295 	spin_unlock(&unix_table_lock);
296 	return s;
297 }
298 
299 static struct sock *unix_find_socket_byinode(struct inode *i)
300 {
301 	struct sock *s;
302 
303 	spin_lock(&unix_table_lock);
304 	sk_for_each(s,
305 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
306 		struct dentry *dentry = unix_sk(s)->path.dentry;
307 
308 		if (dentry && dentry->d_inode == i) {
309 			sock_hold(s);
310 			goto found;
311 		}
312 	}
313 	s = NULL;
314 found:
315 	spin_unlock(&unix_table_lock);
316 	return s;
317 }
318 
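/* A socket counts as writable while the memory charged to in-flight
 * writes stays at or below a quarter of sk_sndbuf
 * (wmem_alloc * 4 <= sndbuf).
 */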
319 static inline int unix_writable(struct sock *sk)
320 {
321 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
322 }
323 
324 static void unix_write_space(struct sock *sk)
325 {
326 	struct socket_wq *wq;
327 
328 	rcu_read_lock();
329 	if (unix_writable(sk)) {
330 		wq = rcu_dereference(sk->sk_wq);
331 		if (wq_has_sleeper(wq))
332 			wake_up_interruptible_sync_poll(&wq->wait,
333 				POLLOUT | POLLWRNORM | POLLWRBAND);
334 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
335 	}
336 	rcu_read_unlock();
337 }
338 
339 /* When a dgram socket disconnects (or changes its peer), we clear its
340  * receive queue of packets that arrived from the previous peer. First, this
341  * allows flow control based only on wmem_alloc; second, an sk connected to a
342  * peer may receive messages only from that peer. */
343 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
344 {
345 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
346 		skb_queue_purge(&sk->sk_receive_queue);
347 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
348 
349 		/* If one link of a bidirectional dgram pipe is disconnected,
350 		 * we signal an error. Messages are lost. Do not do this
351 		 * when the peer was not connected to us.
352 		 */
353 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
354 			other->sk_err = ECONNRESET;
355 			other->sk_error_report(other);
356 		}
357 	}
358 }
359 
360 static void unix_sock_destructor(struct sock *sk)
361 {
362 	struct unix_sock *u = unix_sk(sk);
363 
364 	skb_queue_purge(&sk->sk_receive_queue);
365 
366 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
367 	WARN_ON(!sk_unhashed(sk));
368 	WARN_ON(sk->sk_socket);
369 	if (!sock_flag(sk, SOCK_DEAD)) {
370 		pr_info("Attempt to release alive unix socket: %p\n", sk);
371 		return;
372 	}
373 
374 	if (u->addr)
375 		unix_release_addr(u->addr);
376 
377 	atomic_long_dec(&unix_nr_socks);
378 	local_bh_disable();
379 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
380 	local_bh_enable();
381 #ifdef UNIX_REFCNT_DEBUG
382 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
383 		atomic_long_read(&unix_nr_socks));
384 #endif
385 }
386 
387 static void unix_release_sock(struct sock *sk, int embrion)
388 {
389 	struct unix_sock *u = unix_sk(sk);
390 	struct path path;
391 	struct sock *skpair;
392 	struct sk_buff *skb;
393 	int state;
394 
395 	unix_remove_socket(sk);
396 
397 	/* Clear state */
398 	unix_state_lock(sk);
399 	sock_orphan(sk);
400 	sk->sk_shutdown = SHUTDOWN_MASK;
401 	path	     = u->path;
402 	u->path.dentry = NULL;
403 	u->path.mnt = NULL;
404 	state = sk->sk_state;
405 	sk->sk_state = TCP_CLOSE;
406 	unix_state_unlock(sk);
407 
408 	wake_up_interruptible_all(&u->peer_wait);
409 
410 	skpair = unix_peer(sk);
411 
412 	if (skpair != NULL) {
413 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
414 			unix_state_lock(skpair);
415 			/* No more writes */
416 			skpair->sk_shutdown = SHUTDOWN_MASK;
417 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
418 				skpair->sk_err = ECONNRESET;
419 			unix_state_unlock(skpair);
420 			skpair->sk_state_change(skpair);
421 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
422 		}
423 		sock_put(skpair); /* It may now die */
424 		unix_peer(sk) = NULL;
425 	}
426 
427 	/* Try to flush out this socket. Throw out buffers at least */
428 
429 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
430 		if (state == TCP_LISTEN)
431 			unix_release_sock(skb->sk, 1);
432 		/* passed fds are erased in the kfree_skb hook	      */
433 		kfree_skb(skb);
434 	}
435 
436 	if (path.dentry)
437 		path_put(&path);
438 
439 	sock_put(sk);
440 
441 	/* ---- Socket is dead now and most probably destroyed ---- */
442 
443 	/*
444 	 * Fixme: BSD difference: In BSD all sockets connected to us get
445 	 *	  ECONNRESET and we die on the spot. In Linux we behave
446 	 *	  like files and pipes do and wait for the last
447 	 *	  dereference.
448 	 *
449 	 * Can't we simply set sock->err?
450 	 *
451 	 *	  What does the above comment talk about? --ANK(980817)
452 	 */
453 
454 	if (unix_tot_inflight)
455 		unix_gc();		/* Garbage collect fds */
456 }
457 
458 static void init_peercred(struct sock *sk)
459 {
460 	put_pid(sk->sk_peer_pid);
461 	if (sk->sk_peer_cred)
462 		put_cred(sk->sk_peer_cred);
463 	sk->sk_peer_pid  = get_pid(task_tgid(current));
464 	sk->sk_peer_cred = get_current_cred();
465 }
466 
467 static void copy_peercred(struct sock *sk, struct sock *peersk)
468 {
469 	put_pid(sk->sk_peer_pid);
470 	if (sk->sk_peer_cred)
471 		put_cred(sk->sk_peer_cred);
472 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
473 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
474 }
475 
476 static int unix_listen(struct socket *sock, int backlog)
477 {
478 	int err;
479 	struct sock *sk = sock->sk;
480 	struct unix_sock *u = unix_sk(sk);
481 	struct pid *old_pid = NULL;
482 
483 	err = -EOPNOTSUPP;
484 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
485 		goto out;	/* Only stream/seqpacket sockets accept */
486 	err = -EINVAL;
487 	if (!u->addr)
488 		goto out;	/* No listens on an unbound socket */
489 	unix_state_lock(sk);
490 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
491 		goto out_unlock;
492 	if (backlog > sk->sk_max_ack_backlog)
493 		wake_up_interruptible_all(&u->peer_wait);
494 	sk->sk_max_ack_backlog	= backlog;
495 	sk->sk_state		= TCP_LISTEN;
496 	/* set credentials so connect can copy them */
497 	init_peercred(sk);
498 	err = 0;
499 
500 out_unlock:
501 	unix_state_unlock(sk);
502 	put_pid(old_pid);
503 out:
504 	return err;
505 }
506 
507 static int unix_release(struct socket *);
508 static int unix_bind(struct socket *, struct sockaddr *, int);
509 static int unix_stream_connect(struct socket *, struct sockaddr *,
510 			       int addr_len, int flags);
511 static int unix_socketpair(struct socket *, struct socket *);
512 static int unix_accept(struct socket *, struct socket *, int);
513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
516 				    poll_table *);
517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
518 static int unix_shutdown(struct socket *, int);
519 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
520 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
521 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
522 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
523 static int unix_dgram_connect(struct socket *, struct sockaddr *,
524 			      int, int);
525 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
526 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
527 				  int);
528 
529 static int unix_set_peek_off(struct sock *sk, int val)
530 {
531 	struct unix_sock *u = unix_sk(sk);
532 
533 	if (mutex_lock_interruptible(&u->readlock))
534 		return -EINTR;
535 
536 	sk->sk_peek_off = val;
537 	mutex_unlock(&u->readlock);
538 
539 	return 0;
540 }
541 
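/*
 * Example: userspace reaches unix_set_peek_off() through the generic
 * SO_PEEK_OFF socket option.  After the call below, MSG_PEEK reads
 * start at the stored offset and advance it, instead of always peeking
 * at the head of the queue (sketch, error handling omitted):
 *
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);
 */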
542 
543 static const struct proto_ops unix_stream_ops = {
544 	.family =	PF_UNIX,
545 	.owner =	THIS_MODULE,
546 	.release =	unix_release,
547 	.bind =		unix_bind,
548 	.connect =	unix_stream_connect,
549 	.socketpair =	unix_socketpair,
550 	.accept =	unix_accept,
551 	.getname =	unix_getname,
552 	.poll =		unix_poll,
553 	.ioctl =	unix_ioctl,
554 	.listen =	unix_listen,
555 	.shutdown =	unix_shutdown,
556 	.setsockopt =	sock_no_setsockopt,
557 	.getsockopt =	sock_no_getsockopt,
558 	.sendmsg =	unix_stream_sendmsg,
559 	.recvmsg =	unix_stream_recvmsg,
560 	.mmap =		sock_no_mmap,
561 	.sendpage =	sock_no_sendpage,
562 	.set_peek_off =	unix_set_peek_off,
563 };
564 
565 static const struct proto_ops unix_dgram_ops = {
566 	.family =	PF_UNIX,
567 	.owner =	THIS_MODULE,
568 	.release =	unix_release,
569 	.bind =		unix_bind,
570 	.connect =	unix_dgram_connect,
571 	.socketpair =	unix_socketpair,
572 	.accept =	sock_no_accept,
573 	.getname =	unix_getname,
574 	.poll =		unix_dgram_poll,
575 	.ioctl =	unix_ioctl,
576 	.listen =	sock_no_listen,
577 	.shutdown =	unix_shutdown,
578 	.setsockopt =	sock_no_setsockopt,
579 	.getsockopt =	sock_no_getsockopt,
580 	.sendmsg =	unix_dgram_sendmsg,
581 	.recvmsg =	unix_dgram_recvmsg,
582 	.mmap =		sock_no_mmap,
583 	.sendpage =	sock_no_sendpage,
584 	.set_peek_off =	unix_set_peek_off,
585 };
586 
587 static const struct proto_ops unix_seqpacket_ops = {
588 	.family =	PF_UNIX,
589 	.owner =	THIS_MODULE,
590 	.release =	unix_release,
591 	.bind =		unix_bind,
592 	.connect =	unix_stream_connect,
593 	.socketpair =	unix_socketpair,
594 	.accept =	unix_accept,
595 	.getname =	unix_getname,
596 	.poll =		unix_dgram_poll,
597 	.ioctl =	unix_ioctl,
598 	.listen =	unix_listen,
599 	.shutdown =	unix_shutdown,
600 	.setsockopt =	sock_no_setsockopt,
601 	.getsockopt =	sock_no_getsockopt,
602 	.sendmsg =	unix_seqpacket_sendmsg,
603 	.recvmsg =	unix_seqpacket_recvmsg,
604 	.mmap =		sock_no_mmap,
605 	.sendpage =	sock_no_sendpage,
606 	.set_peek_off =	unix_set_peek_off,
607 };
608 
609 static struct proto unix_proto = {
610 	.name			= "UNIX",
611 	.owner			= THIS_MODULE,
612 	.obj_size		= sizeof(struct unix_sock),
613 };
614 
615 /*
616  * AF_UNIX sockets do not interact with hardware, hence they
617  * don't trigger interrupts - so it's safe for them to have
618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
619  * this special lock-class by reinitializing the spinlock key:
620  */
621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622 
623 static struct sock *unix_create1(struct net *net, struct socket *sock)
624 {
625 	struct sock *sk = NULL;
626 	struct unix_sock *u;
627 
628 	atomic_long_inc(&unix_nr_socks);
629 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630 		goto out;
631 
632 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633 	if (!sk)
634 		goto out;
635 
636 	sock_init_data(sock, sk);
637 	lockdep_set_class(&sk->sk_receive_queue.lock,
638 				&af_unix_sk_receive_queue_lock_key);
639 
640 	sk->sk_write_space	= unix_write_space;
641 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642 	sk->sk_destruct		= unix_sock_destructor;
643 	u	  = unix_sk(sk);
644 	u->path.dentry = NULL;
645 	u->path.mnt = NULL;
646 	spin_lock_init(&u->lock);
647 	atomic_long_set(&u->inflight, 0);
648 	INIT_LIST_HEAD(&u->link);
649 	mutex_init(&u->readlock); /* single task reading lock */
650 	init_waitqueue_head(&u->peer_wait);
651 	unix_insert_socket(unix_sockets_unbound(sk), sk);
652 out:
653 	if (sk == NULL)
654 		atomic_long_dec(&unix_nr_socks);
655 	else {
656 		local_bh_disable();
657 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658 		local_bh_enable();
659 	}
660 	return sk;
661 }
662 
663 static int unix_create(struct net *net, struct socket *sock, int protocol,
664 		       int kern)
665 {
666 	if (protocol && protocol != PF_UNIX)
667 		return -EPROTONOSUPPORT;
668 
669 	sock->state = SS_UNCONNECTED;
670 
671 	switch (sock->type) {
672 	case SOCK_STREAM:
673 		sock->ops = &unix_stream_ops;
674 		break;
675 		/*
676 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
677 		 *	nothing uses it.
678 		 */
679 	case SOCK_RAW:
680 		sock->type = SOCK_DGRAM;
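		/* fall through: serve SOCK_RAW with the datagram ops */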
681 	case SOCK_DGRAM:
682 		sock->ops = &unix_dgram_ops;
683 		break;
684 	case SOCK_SEQPACKET:
685 		sock->ops = &unix_seqpacket_ops;
686 		break;
687 	default:
688 		return -ESOCKTNOSUPPORT;
689 	}
690 
691 	return unix_create1(net, sock) ? 0 : -ENOMEM;
692 }
693 
694 static int unix_release(struct socket *sock)
695 {
696 	struct sock *sk = sock->sk;
697 
698 	if (!sk)
699 		return 0;
700 
701 	unix_release_sock(sk, 0);
702 	sock->sk = NULL;
703 
704 	return 0;
705 }
706 
707 static int unix_autobind(struct socket *sock)
708 {
709 	struct sock *sk = sock->sk;
710 	struct net *net = sock_net(sk);
711 	struct unix_sock *u = unix_sk(sk);
712 	static u32 ordernum = 1;
713 	struct unix_address *addr;
714 	int err;
715 	unsigned int retries = 0;
716 
717 	err = mutex_lock_interruptible(&u->readlock);
718 	if (err)
719 		return err;
720 
721 	err = 0;
722 	if (u->addr)
723 		goto out;
724 
725 	err = -ENOMEM;
726 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
727 	if (!addr)
728 		goto out;
729 
730 	addr->name->sun_family = AF_UNIX;
731 	atomic_set(&addr->refcnt, 1);
732 
733 retry:
734 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
735 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
736 
737 	spin_lock(&unix_table_lock);
738 	ordernum = (ordernum+1)&0xFFFFF;
739 
740 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
741 				      addr->hash)) {
742 		spin_unlock(&unix_table_lock);
743 		/*
744 		 * __unix_find_socket_byname() may take a long time if many names
745 		 * are already in use.
746 		 */
747 		cond_resched();
748 		/* Give up if all names seem to be in use. */
749 		if (retries++ == 0xFFFFF) {
750 			err = -ENOSPC;
751 			kfree(addr);
752 			goto out;
753 		}
754 		goto retry;
755 	}
756 	addr->hash ^= sk->sk_type;
757 
758 	__unix_remove_socket(sk);
759 	u->addr = addr;
760 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
761 	spin_unlock(&unix_table_lock);
762 	err = 0;
763 
764 out:	mutex_unlock(&u->readlock);
765 	return err;
766 }
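
/*
 * Example: userspace triggers autobind by binding with only the
 * address family present (addr_len == sizeof(short)); the kernel then
 * assigns a free abstract name of the form "\0" + five hex digits,
 * visible afterwards via getsockname() (sketch):
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	bind(fd, (struct sockaddr *)&a, sizeof(sa_family_t));
 *
 * The same path is taken implicitly when an unbound SOCK_PASSCRED
 * socket connects or sends.
 */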
767 
768 static struct sock *unix_find_other(struct net *net,
769 				    struct sockaddr_un *sunname, int len,
770 				    int type, unsigned int hash, int *error)
771 {
772 	struct sock *u;
773 	struct path path;
774 	int err = 0;
775 
776 	if (sunname->sun_path[0]) {
777 		struct inode *inode;
778 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
779 		if (err)
780 			goto fail;
781 		inode = path.dentry->d_inode;
782 		err = inode_permission(inode, MAY_WRITE);
783 		if (err)
784 			goto put_fail;
785 
786 		err = -ECONNREFUSED;
787 		if (!S_ISSOCK(inode->i_mode))
788 			goto put_fail;
789 		u = unix_find_socket_byinode(inode);
790 		if (!u)
791 			goto put_fail;
792 
793 		if (u->sk_type == type)
794 			touch_atime(&path);
795 
796 		path_put(&path);
797 
798 		err = -EPROTOTYPE;
799 		if (u->sk_type != type) {
800 			sock_put(u);
801 			goto fail;
802 		}
803 	} else {
804 		err = -ECONNREFUSED;
805 		u = unix_find_socket_byname(net, sunname, len, type, hash);
806 		if (u) {
807 			struct dentry *dentry;
808 			dentry = unix_sk(u)->path.dentry;
809 			if (dentry)
810 				touch_atime(&unix_sk(u)->path);
811 		} else
812 			goto fail;
813 	}
814 	return u;
815 
816 put_fail:
817 	path_put(&path);
818 fail:
819 	*error = err;
820 	return NULL;
821 }
822 
823 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
824 {
825 	struct dentry *dentry;
826 	struct path path;
827 	int err = 0;
828 	/*
829 	 * Get the parent directory, calculate the hash for last
830 	 * component.
831 	 */
832 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
833 	err = PTR_ERR(dentry);
834 	if (IS_ERR(dentry))
835 		return err;
836 
837 	/*
838 	 * All right, let's create it.
839 	 */
840 	err = security_path_mknod(&path, dentry, mode, 0);
841 	if (!err) {
842 		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
843 		if (!err) {
844 			res->mnt = mntget(path.mnt);
845 			res->dentry = dget(dentry);
846 		}
847 	}
848 	done_path_create(&path, dentry);
849 	return err;
850 }
851 
852 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
853 {
854 	struct sock *sk = sock->sk;
855 	struct net *net = sock_net(sk);
856 	struct unix_sock *u = unix_sk(sk);
857 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
858 	char *sun_path = sunaddr->sun_path;
859 	int err;
860 	unsigned int hash;
861 	struct unix_address *addr;
862 	struct hlist_head *list;
863 
864 	err = -EINVAL;
865 	if (sunaddr->sun_family != AF_UNIX)
866 		goto out;
867 
868 	if (addr_len == sizeof(short)) {
869 		err = unix_autobind(sock);
870 		goto out;
871 	}
872 
873 	err = unix_mkname(sunaddr, addr_len, &hash);
874 	if (err < 0)
875 		goto out;
876 	addr_len = err;
877 
878 	err = mutex_lock_interruptible(&u->readlock);
879 	if (err)
880 		goto out;
881 
882 	err = -EINVAL;
883 	if (u->addr)
884 		goto out_up;
885 
886 	err = -ENOMEM;
887 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
888 	if (!addr)
889 		goto out_up;
890 
891 	memcpy(addr->name, sunaddr, addr_len);
892 	addr->len = addr_len;
893 	addr->hash = hash ^ sk->sk_type;
894 	atomic_set(&addr->refcnt, 1);
895 
896 	if (sun_path[0]) {
897 		struct path path;
898 		umode_t mode = S_IFSOCK |
899 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
900 		err = unix_mknod(sun_path, mode, &path);
901 		if (err) {
902 			if (err == -EEXIST)
903 				err = -EADDRINUSE;
904 			unix_release_addr(addr);
905 			goto out_up;
906 		}
907 		addr->hash = UNIX_HASH_SIZE;
908 		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
909 		spin_lock(&unix_table_lock);
910 		u->path = path;
911 		list = &unix_socket_table[hash];
912 	} else {
913 		spin_lock(&unix_table_lock);
914 		err = -EADDRINUSE;
915 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
916 					      sk->sk_type, hash)) {
917 			unix_release_addr(addr);
918 			goto out_unlock;
919 		}
920 
921 		list = &unix_socket_table[addr->hash];
922 	}
923 
924 	err = 0;
925 	__unix_remove_socket(sk);
926 	u->addr = addr;
927 	__unix_insert_socket(list, sk);
928 
929 out_unlock:
930 	spin_unlock(&unix_table_lock);
931 out_up:
932 	mutex_unlock(&u->readlock);
933 out:
934 	return err;
935 }
936 
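/* Take the two state locks in pointer order, so that any two tasks
 * locking the same pair agree on the order and cannot deadlock;
 * the _nested variant tells lockdep the second acquisition is a
 * distinct level.
 */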
937 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
938 {
939 	if (unlikely(sk1 == sk2) || !sk2) {
940 		unix_state_lock(sk1);
941 		return;
942 	}
943 	if (sk1 < sk2) {
944 		unix_state_lock(sk1);
945 		unix_state_lock_nested(sk2);
946 	} else {
947 		unix_state_lock(sk2);
948 		unix_state_lock_nested(sk1);
949 	}
950 }
951 
952 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
953 {
954 	if (unlikely(sk1 == sk2) || !sk2) {
955 		unix_state_unlock(sk1);
956 		return;
957 	}
958 	unix_state_unlock(sk1);
959 	unix_state_unlock(sk2);
960 }
961 
962 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
963 			      int alen, int flags)
964 {
965 	struct sock *sk = sock->sk;
966 	struct net *net = sock_net(sk);
967 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
968 	struct sock *other;
969 	unsigned int hash;
970 	int err;
971 
972 	if (addr->sa_family != AF_UNSPEC) {
973 		err = unix_mkname(sunaddr, alen, &hash);
974 		if (err < 0)
975 			goto out;
976 		alen = err;
977 
978 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
979 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
980 			goto out;
981 
982 restart:
983 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
984 		if (!other)
985 			goto out;
986 
987 		unix_state_double_lock(sk, other);
988 
989 		/* Apparently VFS overslept socket death. Retry. */
990 		if (sock_flag(other, SOCK_DEAD)) {
991 			unix_state_double_unlock(sk, other);
992 			sock_put(other);
993 			goto restart;
994 		}
995 
996 		err = -EPERM;
997 		if (!unix_may_send(sk, other))
998 			goto out_unlock;
999 
1000 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1001 		if (err)
1002 			goto out_unlock;
1003 
1004 	} else {
1005 		/*
1006 		 *	1003.1g breaking connected state with AF_UNSPEC
1007 		 */
1008 		other = NULL;
1009 		unix_state_double_lock(sk, other);
1010 	}
1011 
1012 	/*
1013 	 * If it was connected, reconnect.
1014 	 */
1015 	if (unix_peer(sk)) {
1016 		struct sock *old_peer = unix_peer(sk);
1017 		unix_peer(sk) = other;
1018 		unix_state_double_unlock(sk, other);
1019 
1020 		if (other != old_peer)
1021 			unix_dgram_disconnected(sk, old_peer);
1022 		sock_put(old_peer);
1023 	} else {
1024 		unix_peer(sk) = other;
1025 		unix_state_double_unlock(sk, other);
1026 	}
1027 	return 0;
1028 
1029 out_unlock:
1030 	unix_state_double_unlock(sk, other);
1031 	sock_put(other);
1032 out:
1033 	return err;
1034 }
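
/*
 * Example: per the 1003.1g note above, userspace dissolves a datagram
 * association by connecting to AF_UNSPEC (sketch):
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNSPEC };
 *
 *	connect(fd, (struct sockaddr *)&a, sizeof(a));
 */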
1035 
1036 static long unix_wait_for_peer(struct sock *other, long timeo)
1037 {
1038 	struct unix_sock *u = unix_sk(other);
1039 	int sched;
1040 	DEFINE_WAIT(wait);
1041 
1042 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1043 
1044 	sched = !sock_flag(other, SOCK_DEAD) &&
1045 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1046 		unix_recvq_full(other);
1047 
1048 	unix_state_unlock(other);
1049 
1050 	if (sched)
1051 		timeo = schedule_timeout(timeo);
1052 
1053 	finish_wait(&u->peer_wait, &wait);
1054 	return timeo;
1055 }
1056 
1057 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1058 			       int addr_len, int flags)
1059 {
1060 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1061 	struct sock *sk = sock->sk;
1062 	struct net *net = sock_net(sk);
1063 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1064 	struct sock *newsk = NULL;
1065 	struct sock *other = NULL;
1066 	struct sk_buff *skb = NULL;
1067 	unsigned int hash;
1068 	int st;
1069 	int err;
1070 	long timeo;
1071 
1072 	err = unix_mkname(sunaddr, addr_len, &hash);
1073 	if (err < 0)
1074 		goto out;
1075 	addr_len = err;
1076 
1077 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1078 	    (err = unix_autobind(sock)) != 0)
1079 		goto out;
1080 
1081 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1082 
1083 	/* First of all allocate resources.
1084 	   If we do it after the state is locked,
1085 	   we will have to recheck everything again in any case.
1086 	 */
1087 
1088 	err = -ENOMEM;
1089 
1090 	/* create new sock for complete connection */
1091 	newsk = unix_create1(sock_net(sk), NULL);
1092 	if (newsk == NULL)
1093 		goto out;
1094 
1095 	/* Allocate skb for sending to listening sock */
1096 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1097 	if (skb == NULL)
1098 		goto out;
1099 
1100 restart:
1101 	/*  Find listening sock. */
1102 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1103 	if (!other)
1104 		goto out;
1105 
1106 	/* Latch state of peer */
1107 	unix_state_lock(other);
1108 
1109 	/* Apparently VFS overslept socket death. Retry. */
1110 	if (sock_flag(other, SOCK_DEAD)) {
1111 		unix_state_unlock(other);
1112 		sock_put(other);
1113 		goto restart;
1114 	}
1115 
1116 	err = -ECONNREFUSED;
1117 	if (other->sk_state != TCP_LISTEN)
1118 		goto out_unlock;
1119 	if (other->sk_shutdown & RCV_SHUTDOWN)
1120 		goto out_unlock;
1121 
1122 	if (unix_recvq_full(other)) {
1123 		err = -EAGAIN;
1124 		if (!timeo)
1125 			goto out_unlock;
1126 
1127 		timeo = unix_wait_for_peer(other, timeo);
1128 
1129 		err = sock_intr_errno(timeo);
1130 		if (signal_pending(current))
1131 			goto out;
1132 		sock_put(other);
1133 		goto restart;
1134 	}
1135 
1136 	/* Latch our state.
1137 
1138 	   It is a tricky place. We need to grab our state lock and cannot
1139 	   drop the lock on the peer. It is dangerous because deadlock is
1140 	   possible. The connect-to-self case and a simultaneous
1141 	   attempt to connect are eliminated by checking the socket
1142 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we check
1143 	   this before attempting to grab the lock.
1144 
1145 	   Well, and we have to recheck the state after the socket is locked.
1146 	 */
1147 	st = sk->sk_state;
1148 
1149 	switch (st) {
1150 	case TCP_CLOSE:
1151 		/* This is ok... continue with connect */
1152 		break;
1153 	case TCP_ESTABLISHED:
1154 		/* Socket is already connected */
1155 		err = -EISCONN;
1156 		goto out_unlock;
1157 	default:
1158 		err = -EINVAL;
1159 		goto out_unlock;
1160 	}
1161 
1162 	unix_state_lock_nested(sk);
1163 
1164 	if (sk->sk_state != st) {
1165 		unix_state_unlock(sk);
1166 		unix_state_unlock(other);
1167 		sock_put(other);
1168 		goto restart;
1169 	}
1170 
1171 	err = security_unix_stream_connect(sk, other, newsk);
1172 	if (err) {
1173 		unix_state_unlock(sk);
1174 		goto out_unlock;
1175 	}
1176 
1177 	/* The way is open! Quickly set all the necessary fields... */
1178 
1179 	sock_hold(sk);
1180 	unix_peer(newsk)	= sk;
1181 	newsk->sk_state		= TCP_ESTABLISHED;
1182 	newsk->sk_type		= sk->sk_type;
1183 	init_peercred(newsk);
1184 	newu = unix_sk(newsk);
1185 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1186 	otheru = unix_sk(other);
1187 
1188 	/* copy address information from listening to new sock*/
1189 	if (otheru->addr) {
1190 		atomic_inc(&otheru->addr->refcnt);
1191 		newu->addr = otheru->addr;
1192 	}
1193 	if (otheru->path.dentry) {
1194 		path_get(&otheru->path);
1195 		newu->path = otheru->path;
1196 	}
1197 
1198 	/* Set credentials */
1199 	copy_peercred(sk, other);
1200 
1201 	sock->state	= SS_CONNECTED;
1202 	sk->sk_state	= TCP_ESTABLISHED;
1203 	sock_hold(newsk);
1204 
1205 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1206 	unix_peer(sk)	= newsk;
1207 
1208 	unix_state_unlock(sk);
1209 
1210 	/* take ten and send info to listening sock */
1211 	spin_lock(&other->sk_receive_queue.lock);
1212 	__skb_queue_tail(&other->sk_receive_queue, skb);
1213 	spin_unlock(&other->sk_receive_queue.lock);
1214 	unix_state_unlock(other);
1215 	other->sk_data_ready(other);
1216 	sock_put(other);
1217 	return 0;
1218 
1219 out_unlock:
1220 	if (other)
1221 		unix_state_unlock(other);
1222 
1223 out:
1224 	kfree_skb(skb);
1225 	if (newsk)
1226 		unix_release_sock(newsk, 0);
1227 	if (other)
1228 		sock_put(other);
1229 	return err;
1230 }
1231 
1232 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1233 {
1234 	struct sock *ska = socka->sk, *skb = sockb->sk;
1235 
1236 	/* Join our sockets back to back */
1237 	sock_hold(ska);
1238 	sock_hold(skb);
1239 	unix_peer(ska) = skb;
1240 	unix_peer(skb) = ska;
1241 	init_peercred(ska);
1242 	init_peercred(skb);
1243 
1244 	if (ska->sk_type != SOCK_DGRAM) {
1245 		ska->sk_state = TCP_ESTABLISHED;
1246 		skb->sk_state = TCP_ESTABLISHED;
1247 		socka->state  = SS_CONNECTED;
1248 		sockb->state  = SS_CONNECTED;
1249 	}
1250 	return 0;
1251 }
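
/*
 * Example: the classic use of unix_socketpair() from userspace, two
 * connected anonymous sockets, often handed across fork() to a child
 * (sketch, error handling omitted):
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "ping", 4);
 *	read(sv[1], buf, 4);
 */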
1252 
1253 static void unix_sock_inherit_flags(const struct socket *old,
1254 				    struct socket *new)
1255 {
1256 	if (test_bit(SOCK_PASSCRED, &old->flags))
1257 		set_bit(SOCK_PASSCRED, &new->flags);
1258 	if (test_bit(SOCK_PASSSEC, &old->flags))
1259 		set_bit(SOCK_PASSSEC, &new->flags);
1260 }
1261 
1262 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1263 {
1264 	struct sock *sk = sock->sk;
1265 	struct sock *tsk;
1266 	struct sk_buff *skb;
1267 	int err;
1268 
1269 	err = -EOPNOTSUPP;
1270 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1271 		goto out;
1272 
1273 	err = -EINVAL;
1274 	if (sk->sk_state != TCP_LISTEN)
1275 		goto out;
1276 
1277 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1278 	 * so that no locks are necessary.
1279 	 */
1280 
1281 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1282 	if (!skb) {
1283 		/* This means receive shutdown. */
1284 		if (err == 0)
1285 			err = -EINVAL;
1286 		goto out;
1287 	}
1288 
1289 	tsk = skb->sk;
1290 	skb_free_datagram(sk, skb);
1291 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1292 
1293 	/* attach accepted sock to socket */
1294 	unix_state_lock(tsk);
1295 	newsock->state = SS_CONNECTED;
1296 	unix_sock_inherit_flags(sock, newsock);
1297 	sock_graft(tsk, newsock);
1298 	unix_state_unlock(tsk);
1299 	return 0;
1300 
1301 out:
1302 	return err;
1303 }
1304 
1305 
1306 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1307 {
1308 	struct sock *sk = sock->sk;
1309 	struct unix_sock *u;
1310 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1311 	int err = 0;
1312 
1313 	if (peer) {
1314 		sk = unix_peer_get(sk);
1315 
1316 		err = -ENOTCONN;
1317 		if (!sk)
1318 			goto out;
1319 		err = 0;
1320 	} else {
1321 		sock_hold(sk);
1322 	}
1323 
1324 	u = unix_sk(sk);
1325 	unix_state_lock(sk);
1326 	if (!u->addr) {
1327 		sunaddr->sun_family = AF_UNIX;
1328 		sunaddr->sun_path[0] = 0;
1329 		*uaddr_len = sizeof(short);
1330 	} else {
1331 		struct unix_address *addr = u->addr;
1332 
1333 		*uaddr_len = addr->len;
1334 		memcpy(sunaddr, addr->name, *uaddr_len);
1335 	}
1336 	unix_state_unlock(sk);
1337 	sock_put(sk);
1338 out:
1339 	return err;
1340 }
1341 
1342 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1343 {
1344 	int i;
1345 
1346 	scm->fp = UNIXCB(skb).fp;
1347 	UNIXCB(skb).fp = NULL;
1348 
1349 	for (i = scm->fp->count-1; i >= 0; i--)
1350 		unix_notinflight(scm->fp->fp[i]);
1351 }
1352 
1353 static void unix_destruct_scm(struct sk_buff *skb)
1354 {
1355 	struct scm_cookie scm;
1356 	memset(&scm, 0, sizeof(scm));
1357 	scm.pid  = UNIXCB(skb).pid;
1358 	if (UNIXCB(skb).fp)
1359 		unix_detach_fds(&scm, skb);
1360 
1361 	/* Alas, it calls VFS */
1362 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1363 	scm_destroy(&scm);
1364 	sock_wfree(skb);
1365 }
1366 
1367 #define MAX_RECURSION_LEVEL 4
1368 
1369 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1370 {
1371 	int i;
1372 	unsigned char max_level = 0;
1373 	int unix_sock_count = 0;
1374 
1375 	for (i = scm->fp->count - 1; i >= 0; i--) {
1376 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1377 
1378 		if (sk) {
1379 			unix_sock_count++;
1380 			max_level = max(max_level,
1381 					unix_sk(sk)->recursion_level);
1382 		}
1383 	}
1384 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1385 		return -ETOOMANYREFS;
1386 
1387 	/*
1388 	 * Need to duplicate file references for the sake of garbage
1389 	 * collection.  Otherwise a socket in the fps might become a
1390 	 * candidate for GC while the skb is not yet queued.
1391 	 */
1392 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1393 	if (!UNIXCB(skb).fp)
1394 		return -ENOMEM;
1395 
1396 	if (unix_sock_count) {
1397 		for (i = scm->fp->count - 1; i >= 0; i--)
1398 			unix_inflight(scm->fp->fp[i]);
1399 	}
1400 	return max_level;
1401 }
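
/*
 * Example: the sender-side counterpart of unix_attach_fds() is an
 * SCM_RIGHTS control message built in userspace (sketch, error
 * handling omitted; fd_to_pass stands for any open descriptor):
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	char dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *c = CMSG_FIRSTHDR(&mh);
 *
 *	c->cmsg_level = SOL_SOCKET;
 *	c->cmsg_type  = SCM_RIGHTS;
 *	c->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(c), &fd_to_pass, sizeof(int));
 *	sendmsg(sock, &mh, 0);
 */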
1402 
1403 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1404 {
1405 	int err = 0;
1406 
1407 	UNIXCB(skb).pid  = get_pid(scm->pid);
1408 	UNIXCB(skb).uid = scm->creds.uid;
1409 	UNIXCB(skb).gid = scm->creds.gid;
1410 	UNIXCB(skb).fp = NULL;
1411 	if (scm->fp && send_fds)
1412 		err = unix_attach_fds(scm, skb);
1413 
1414 	skb->destructor = unix_destruct_scm;
1415 	return err;
1416 }
1417 
1418 /*
1419  * Some apps rely on write() giving SCM_CREDENTIALS.
1420  * We include credentials if source or destination socket
1421  * asserted SOCK_PASSCRED.
1422  */
1423 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1424 			    const struct sock *other)
1425 {
1426 	if (UNIXCB(skb).pid)
1427 		return;
1428 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1429 	    !other->sk_socket ||
1430 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1431 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1432 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1433 	}
1434 }
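
/*
 * Example: the credentials added by maybe_add_creds() surface in
 * userspace once SO_PASSCRED is enabled; recvmsg() then yields an
 * SCM_CREDENTIALS control message carrying the sender's struct ucred
 * { pid, uid, gid } (sketch):
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 */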
1435 
1436 /*
1437  *	Send AF_UNIX data.
1438  */
1439 
1440 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1441 			      size_t len)
1442 {
1443 	struct sock *sk = sock->sk;
1444 	struct net *net = sock_net(sk);
1445 	struct unix_sock *u = unix_sk(sk);
1446 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1447 	struct sock *other = NULL;
1448 	int namelen = 0; /* fake GCC */
1449 	int err;
1450 	unsigned int hash;
1451 	struct sk_buff *skb;
1452 	long timeo;
1453 	struct scm_cookie scm;
1454 	int max_level;
1455 	int data_len = 0;
1456 
1457 	wait_for_unix_gc();
1458 	err = scm_send(sock, msg, &scm, false);
1459 	if (err < 0)
1460 		return err;
1461 
1462 	err = -EOPNOTSUPP;
1463 	if (msg->msg_flags&MSG_OOB)
1464 		goto out;
1465 
1466 	if (msg->msg_namelen) {
1467 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1468 		if (err < 0)
1469 			goto out;
1470 		namelen = err;
1471 	} else {
1472 		sunaddr = NULL;
1473 		err = -ENOTCONN;
1474 		other = unix_peer_get(sk);
1475 		if (!other)
1476 			goto out;
1477 	}
1478 
1479 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1480 	    && (err = unix_autobind(sock)) != 0)
1481 		goto out;
1482 
1483 	err = -EMSGSIZE;
1484 	if (len > sk->sk_sndbuf - 32)
1485 		goto out;
1486 
1487 	if (len > SKB_MAX_ALLOC) {
1488 		data_len = min_t(size_t,
1489 				 len - SKB_MAX_ALLOC,
1490 				 MAX_SKB_FRAGS * PAGE_SIZE);
1491 		data_len = PAGE_ALIGN(data_len);
1492 
1493 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1494 	}
1495 
1496 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1497 				   msg->msg_flags & MSG_DONTWAIT, &err,
1498 				   PAGE_ALLOC_COSTLY_ORDER);
1499 	if (skb == NULL)
1500 		goto out;
1501 
1502 	err = unix_scm_to_skb(&scm, skb, true);
1503 	if (err < 0)
1504 		goto out_free;
1505 	max_level = err + 1;
1506 	unix_get_secdata(&scm, skb);
1507 
1508 	skb_put(skb, len - data_len);
1509 	skb->data_len = data_len;
1510 	skb->len = len;
1511 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1512 	if (err)
1513 		goto out_free;
1514 
1515 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1516 
1517 restart:
1518 	if (!other) {
1519 		err = -ECONNRESET;
1520 		if (sunaddr == NULL)
1521 			goto out_free;
1522 
1523 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1524 					hash, &err);
1525 		if (other == NULL)
1526 			goto out_free;
1527 	}
1528 
1529 	if (sk_filter(other, skb) < 0) {
1530 		/* Toss the packet but do not return any error to the sender */
1531 		err = len;
1532 		goto out_free;
1533 	}
1534 
1535 	unix_state_lock(other);
1536 	err = -EPERM;
1537 	if (!unix_may_send(sk, other))
1538 		goto out_unlock;
1539 
1540 	if (sock_flag(other, SOCK_DEAD)) {
1541 		/*
1542 		 *	Check with 1003.1g - what error should
1543 		 *	a datagram sender get here?
1544 		 */
1545 		unix_state_unlock(other);
1546 		sock_put(other);
1547 
1548 		err = 0;
1549 		unix_state_lock(sk);
1550 		if (unix_peer(sk) == other) {
1551 			unix_peer(sk) = NULL;
1552 			unix_state_unlock(sk);
1553 
1554 			unix_dgram_disconnected(sk, other);
1555 			sock_put(other);
1556 			err = -ECONNREFUSED;
1557 		} else {
1558 			unix_state_unlock(sk);
1559 		}
1560 
1561 		other = NULL;
1562 		if (err)
1563 			goto out_free;
1564 		goto restart;
1565 	}
1566 
1567 	err = -EPIPE;
1568 	if (other->sk_shutdown & RCV_SHUTDOWN)
1569 		goto out_unlock;
1570 
1571 	if (sk->sk_type != SOCK_SEQPACKET) {
1572 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1573 		if (err)
1574 			goto out_unlock;
1575 	}
1576 
1577 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1578 		if (!timeo) {
1579 			err = -EAGAIN;
1580 			goto out_unlock;
1581 		}
1582 
1583 		timeo = unix_wait_for_peer(other, timeo);
1584 
1585 		err = sock_intr_errno(timeo);
1586 		if (signal_pending(current))
1587 			goto out_free;
1588 
1589 		goto restart;
1590 	}
1591 
1592 	if (sock_flag(other, SOCK_RCVTSTAMP))
1593 		__net_timestamp(skb);
1594 	maybe_add_creds(skb, sock, other);
1595 	skb_queue_tail(&other->sk_receive_queue, skb);
1596 	if (max_level > unix_sk(other)->recursion_level)
1597 		unix_sk(other)->recursion_level = max_level;
1598 	unix_state_unlock(other);
1599 	other->sk_data_ready(other);
1600 	sock_put(other);
1601 	scm_destroy(&scm);
1602 	return len;
1603 
1604 out_unlock:
1605 	unix_state_unlock(other);
1606 out_free:
1607 	kfree_skb(skb);
1608 out:
1609 	if (other)
1610 		sock_put(other);
1611 	scm_destroy(&scm);
1612 	return err;
1613 }
1614 
1615 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1616  * bytes, and a minimum of a full page.
1617  */
1618 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1619 
1620 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1621 			       size_t len)
1622 {
1623 	struct sock *sk = sock->sk;
1624 	struct sock *other = NULL;
1625 	int err, size;
1626 	struct sk_buff *skb;
1627 	int sent = 0;
1628 	struct scm_cookie scm;
1629 	bool fds_sent = false;
1630 	int max_level;
1631 	int data_len;
1632 
1633 	wait_for_unix_gc();
1634 	err = scm_send(sock, msg, &scm, false);
1635 	if (err < 0)
1636 		return err;
1637 
1638 	err = -EOPNOTSUPP;
1639 	if (msg->msg_flags&MSG_OOB)
1640 		goto out_err;
1641 
1642 	if (msg->msg_namelen) {
1643 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1644 		goto out_err;
1645 	} else {
1646 		err = -ENOTCONN;
1647 		other = unix_peer(sk);
1648 		if (!other)
1649 			goto out_err;
1650 	}
1651 
1652 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1653 		goto pipe_err;
1654 
1655 	while (sent < len) {
1656 		size = len - sent;
1657 
1658 		/* Keep two messages in the pipe so it schedules better */
1659 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1660 
1661 		/* allow fallback to order-0 allocations */
1662 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1663 
1664 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1665 
1666 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1667 
1668 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1669 					   msg->msg_flags & MSG_DONTWAIT, &err,
1670 					   get_order(UNIX_SKB_FRAGS_SZ));
1671 		if (!skb)
1672 			goto out_err;
1673 
1674 		/* Only send the fds in the first buffer */
1675 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1676 		if (err < 0) {
1677 			kfree_skb(skb);
1678 			goto out_err;
1679 		}
1680 		max_level = err + 1;
1681 		fds_sent = true;
1682 
1683 		skb_put(skb, size - data_len);
1684 		skb->data_len = data_len;
1685 		skb->len = size;
1686 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1687 		if (err) {
1688 			kfree_skb(skb);
1689 			goto out_err;
1690 		}
1691 
1692 		unix_state_lock(other);
1693 
1694 		if (sock_flag(other, SOCK_DEAD) ||
1695 		    (other->sk_shutdown & RCV_SHUTDOWN))
1696 			goto pipe_err_free;
1697 
1698 		maybe_add_creds(skb, sock, other);
1699 		skb_queue_tail(&other->sk_receive_queue, skb);
1700 		if (max_level > unix_sk(other)->recursion_level)
1701 			unix_sk(other)->recursion_level = max_level;
1702 		unix_state_unlock(other);
1703 		other->sk_data_ready(other);
1704 		sent += size;
1705 	}
1706 
1707 	scm_destroy(&scm);
1708 
1709 	return sent;
1710 
1711 pipe_err_free:
1712 	unix_state_unlock(other);
1713 	kfree_skb(skb);
1714 pipe_err:
1715 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1716 		send_sig(SIGPIPE, current, 0);
1717 	err = -EPIPE;
1718 out_err:
1719 	scm_destroy(&scm);
1720 	return sent ? : err;
1721 }
1722 
1723 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1724 				  size_t len)
1725 {
1726 	int err;
1727 	struct sock *sk = sock->sk;
1728 
1729 	err = sock_error(sk);
1730 	if (err)
1731 		return err;
1732 
1733 	if (sk->sk_state != TCP_ESTABLISHED)
1734 		return -ENOTCONN;
1735 
1736 	if (msg->msg_namelen)
1737 		msg->msg_namelen = 0;
1738 
1739 	return unix_dgram_sendmsg(sock, msg, len);
1740 }
1741 
1742 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1743 				  size_t size, int flags)
1744 {
1745 	struct sock *sk = sock->sk;
1746 
1747 	if (sk->sk_state != TCP_ESTABLISHED)
1748 		return -ENOTCONN;
1749 
1750 	return unix_dgram_recvmsg(sock, msg, size, flags);
1751 }
1752 
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755 	struct unix_sock *u = unix_sk(sk);
1756 
1757 	if (u->addr) {
1758 		msg->msg_namelen = u->addr->len;
1759 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760 	}
1761 }
1762 
1763 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1764 			      size_t size, int flags)
1765 {
1766 	struct scm_cookie scm;
1767 	struct sock *sk = sock->sk;
1768 	struct unix_sock *u = unix_sk(sk);
1769 	int noblock = flags & MSG_DONTWAIT;
1770 	struct sk_buff *skb;
1771 	int err;
1772 	int peeked, skip;
1773 
1774 	err = -EOPNOTSUPP;
1775 	if (flags&MSG_OOB)
1776 		goto out;
1777 
1778 	err = mutex_lock_interruptible(&u->readlock);
1779 	if (unlikely(err)) {
1780 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1781 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1782 		 */
1783 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1784 		goto out;
1785 	}
1786 
1787 	skip = sk_peek_offset(sk, flags);
1788 
1789 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1790 	if (!skb) {
1791 		unix_state_lock(sk);
1792 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1793 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1794 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1795 			err = 0;
1796 		unix_state_unlock(sk);
1797 		goto out_unlock;
1798 	}
1799 
1800 	wake_up_interruptible_sync_poll(&u->peer_wait,
1801 					POLLOUT | POLLWRNORM | POLLWRBAND);
1802 
1803 	if (msg->msg_name)
1804 		unix_copy_addr(msg, skb->sk);
1805 
1806 	if (size > skb->len - skip)
1807 		size = skb->len - skip;
1808 	else if (size < skb->len - skip)
1809 		msg->msg_flags |= MSG_TRUNC;
1810 
1811 	err = skb_copy_datagram_msg(skb, skip, msg, size);
1812 	if (err)
1813 		goto out_free;
1814 
1815 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1816 		__sock_recv_timestamp(msg, sk, skb);
1817 
1818 	memset(&scm, 0, sizeof(scm));
1819 
1820 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1821 	unix_set_secdata(&scm, skb);
1822 
1823 	if (!(flags & MSG_PEEK)) {
1824 		if (UNIXCB(skb).fp)
1825 			unix_detach_fds(&scm, skb);
1826 
1827 		sk_peek_offset_bwd(sk, skb->len);
1828 	} else {
1829 		/* It is questionable: on PEEK we could:
1830 		   - do not return fds - good, but too simple 8)
1831 		   - return fds, and do not return them on read (old strategy,
1832 		     apparently wrong)
1833 		   - clone fds (I chose it for now, it is the most universal
1834 		     solution)
1835 
1836 		   POSIX 1003.1g does not actually define this clearly
1837 		   at all. POSIX 1003.1g doesn't define a lot of things
1838 		   clearly however!
1839 
1840 		*/
1841 
1842 		sk_peek_offset_fwd(sk, size);
1843 
1844 		if (UNIXCB(skb).fp)
1845 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1846 	}
1847 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1848 
1849 	scm_recv(sock, msg, &scm, flags);
1850 
1851 out_free:
1852 	skb_free_datagram(sk, skb);
1853 out_unlock:
1854 	mutex_unlock(&u->readlock);
1855 out:
1856 	return err;
1857 }
1858 
1859 /*
1860  *	Sleep until more data has arrived. But check for races.
1861  */
1862 static long unix_stream_data_wait(struct sock *sk, long timeo,
1863 				  struct sk_buff *last)
1864 {
1865 	DEFINE_WAIT(wait);
1866 
1867 	unix_state_lock(sk);
1868 
1869 	for (;;) {
1870 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1871 
1872 		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1873 		    sk->sk_err ||
1874 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1875 		    signal_pending(current) ||
1876 		    !timeo)
1877 			break;
1878 
1879 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1880 		unix_state_unlock(sk);
1881 		timeo = freezable_schedule_timeout(timeo);
1882 		unix_state_lock(sk);
1883 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1884 	}
1885 
1886 	finish_wait(sk_sleep(sk), &wait);
1887 	unix_state_unlock(sk);
1888 	return timeo;
1889 }
1890 
1891 static unsigned int unix_skb_len(const struct sk_buff *skb)
1892 {
1893 	return skb->len - UNIXCB(skb).consumed;
1894 }
1895 
1896 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
1897 			       size_t size, int flags)
1898 {
1899 	struct scm_cookie scm;
1900 	struct sock *sk = sock->sk;
1901 	struct unix_sock *u = unix_sk(sk);
1902 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1903 	int copied = 0;
1904 	int noblock = flags & MSG_DONTWAIT;
1905 	int check_creds = 0;
1906 	int target;
1907 	int err = 0;
1908 	long timeo;
1909 	int skip;
1910 
1911 	err = -EINVAL;
1912 	if (sk->sk_state != TCP_ESTABLISHED)
1913 		goto out;
1914 
1915 	err = -EOPNOTSUPP;
1916 	if (flags&MSG_OOB)
1917 		goto out;
1918 
1919 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1920 	timeo = sock_rcvtimeo(sk, noblock);
1921 
1922 	/* Lock the socket to prevent queue disordering
1923 	 * while we sleep in memcpy_tomsg
1924 	 */
1925 
1926 	memset(&scm, 0, sizeof(scm));
1927 
1928 	err = mutex_lock_interruptible(&u->readlock);
1929 	if (unlikely(err)) {
1930 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1931 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1932 		 */
1933 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1934 		goto out;
1935 	}
1936 
1937 	do {
1938 		int chunk;
1939 		struct sk_buff *skb, *last;
1940 
1941 		unix_state_lock(sk);
1942 		last = skb = skb_peek(&sk->sk_receive_queue);
1943 again:
1944 		if (skb == NULL) {
1945 			unix_sk(sk)->recursion_level = 0;
1946 			if (copied >= target)
1947 				goto unlock;
1948 
1949 			/*
1950 			 *	POSIX 1003.1g mandates this order.
1951 			 */
1952 
1953 			err = sock_error(sk);
1954 			if (err)
1955 				goto unlock;
1956 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1957 				goto unlock;
1958 
1959 			unix_state_unlock(sk);
1960 			err = -EAGAIN;
1961 			if (!timeo)
1962 				break;
1963 			mutex_unlock(&u->readlock);
1964 
1965 			timeo = unix_stream_data_wait(sk, timeo, last);
1966 
1967 			if (signal_pending(current)
1968 			    ||  mutex_lock_interruptible(&u->readlock)) {
1969 				err = sock_intr_errno(timeo);
1970 				goto out;
1971 			}
1972 
1973 			continue;
1974  unlock:
1975 			unix_state_unlock(sk);
1976 			break;
1977 		}
1978 
1979 		skip = sk_peek_offset(sk, flags);
1980 		while (skip >= unix_skb_len(skb)) {
1981 			skip -= unix_skb_len(skb);
1982 			last = skb;
1983 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1984 			if (!skb)
1985 				goto again;
1986 		}
1987 
1988 		unix_state_unlock(sk);
1989 
1990 		if (check_creds) {
1991 			/* Never glue messages from different writers */
1992 			if ((UNIXCB(skb).pid  != scm.pid) ||
1993 			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
1994 			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
1995 				break;
1996 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1997 			/* Copy credentials */
1998 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1999 			check_creds = 1;
2000 		}
2001 
2002 		/* Copy address just once */
2003 		if (sunaddr) {
2004 			unix_copy_addr(msg, skb->sk);
2005 			sunaddr = NULL;
2006 		}
2007 
2008 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2009 		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2010 					  msg, chunk)) {
2011 			if (copied == 0)
2012 				copied = -EFAULT;
2013 			break;
2014 		}
2015 		copied += chunk;
2016 		size -= chunk;
2017 
2018 		/* Mark read part of skb as used */
2019 		if (!(flags & MSG_PEEK)) {
2020 			UNIXCB(skb).consumed += chunk;
2021 
2022 			sk_peek_offset_bwd(sk, chunk);
2023 
2024 			if (UNIXCB(skb).fp)
2025 				unix_detach_fds(&scm, skb);
2026 
2027 			if (unix_skb_len(skb))
2028 				break;
2029 
2030 			skb_unlink(skb, &sk->sk_receive_queue);
2031 			consume_skb(skb);
2032 
2033 			if (scm.fp)
2034 				break;
2035 		} else {
2036 			/* This is questionable; see the note in unix_dgram_recvmsg().
2037 			 */
2038 			if (UNIXCB(skb).fp)
2039 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2040 
2041 			sk_peek_offset_fwd(sk, chunk);
2042 
2043 			break;
2044 		}
2045 	} while (size);
2046 
2047 	mutex_unlock(&u->readlock);
2048 	scm_recv(sock, msg, &scm, flags);
2049 out:
2050 	return copied ? : err;
2051 }
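
/*
 * Minimal userspace sketch (an illustration, not part of this file) of the
 * peek-offset handling above (sk_peek_offset and friends): once SO_PEEK_OFF
 * is enabled, successive MSG_PEEK reads walk forward through the queued
 * stream instead of re-reading the same bytes, and a consuming read moves
 * the offset back by the amount consumed.  Assuming 128+ bytes were queued
 * by a single write:
 *
 *	char buf[64];
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	// peeks bytes 0..63
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	// peeks bytes 64..127
 *	recv(fd, buf, sizeof(buf), 0);		// consumes 0..63, offset -= 64
 */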
2052 
2053 static int unix_shutdown(struct socket *sock, int mode)
2054 {
2055 	struct sock *sk = sock->sk;
2056 	struct sock *other;
2057 
2058 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2059 		return -EINVAL;
2060 	/* This maps:
2061 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2062 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2063 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2064 	 */
2065 	++mode;
2066 
2067 	unix_state_lock(sk);
2068 	sk->sk_shutdown |= mode;
2069 	other = unix_peer(sk);
2070 	if (other)
2071 		sock_hold(other);
2072 	unix_state_unlock(sk);
2073 	sk->sk_state_change(sk);
2074 
2075 	if (other &&
2076 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2077 
2078 		int peer_mode = 0;
2079 
2080 		if (mode&RCV_SHUTDOWN)
2081 			peer_mode |= SEND_SHUTDOWN;
2082 		if (mode&SEND_SHUTDOWN)
2083 			peer_mode |= RCV_SHUTDOWN;
2084 		unix_state_lock(other);
2085 		other->sk_shutdown |= peer_mode;
2086 		unix_state_unlock(other);
2087 		other->sk_state_change(other);
2088 		if (peer_mode == SHUTDOWN_MASK)
2089 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2090 		else if (peer_mode & RCV_SHUTDOWN)
2091 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2092 	}
2093 	if (other)
2094 		sock_put(other);
2095 
2096 	return 0;
2097 }
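
/*
 * Illustrative userspace sketch (not part of this file): because
 * unix_shutdown() mirrors the shutdown onto the peer, SHUT_WR on one end
 * of a stream pair is observed as end-of-file on the other end.
 *
 *	char buf[16];
 *	int sv[2];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	// SEND_SHUTDOWN on sv[0],
 *					// RCV_SHUTDOWN mirrored to sv[1]
 *	read(sv[1], buf, sizeof(buf));	// returns 0 (EOF) without blocking
 */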
2098 
2099 long unix_inq_len(struct sock *sk)
2100 {
2101 	struct sk_buff *skb;
2102 	long amount = 0;
2103 
2104 	if (sk->sk_state == TCP_LISTEN)
2105 		return -EINVAL;
2106 
2107 	spin_lock(&sk->sk_receive_queue.lock);
2108 	if (sk->sk_type == SOCK_STREAM ||
2109 	    sk->sk_type == SOCK_SEQPACKET) {
2110 		skb_queue_walk(&sk->sk_receive_queue, skb)
2111 			amount += unix_skb_len(skb);
2112 	} else {
2113 		skb = skb_peek(&sk->sk_receive_queue);
2114 		if (skb)
2115 			amount = skb->len;
2116 	}
2117 	spin_unlock(&sk->sk_receive_queue.lock);
2118 
2119 	return amount;
2120 }
2121 EXPORT_SYMBOL_GPL(unix_inq_len);
2122 
2123 long unix_outq_len(struct sock *sk)
2124 {
2125 	return sk_wmem_alloc_get(sk);
2126 }
2127 EXPORT_SYMBOL_GPL(unix_outq_len);
2128 
2129 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2130 {
2131 	struct sock *sk = sock->sk;
2132 	long amount = 0;
2133 	int err;
2134 
2135 	switch (cmd) {
2136 	case SIOCOUTQ:
2137 		amount = unix_outq_len(sk);
2138 		err = put_user(amount, (int __user *)arg);
2139 		break;
2140 	case SIOCINQ:
2141 		amount = unix_inq_len(sk);
2142 		if (amount < 0)
2143 			err = amount;
2144 		else
2145 			err = put_user(amount, (int __user *)arg);
2146 		break;
2147 	default:
2148 		err = -ENOIOCTLCMD;
2149 		break;
2150 	}
2151 	return err;
2152 }
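
/*
 * Illustrative userspace sketch (not part of this file): SIOCINQ reports
 * the unread byte count computed by unix_inq_len() above, and SIOCOUTQ
 * the pending send-buffer bytes from unix_outq_len().
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int unread, unsent;
 *
 *	ioctl(fd, SIOCINQ, &unread);	// bytes queued for reading
 *	ioctl(fd, SIOCOUTQ, &unsent);	// bytes we sent, still unread by peer
 */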
2153 
2154 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2155 {
2156 	struct sock *sk = sock->sk;
2157 	unsigned int mask;
2158 
2159 	sock_poll_wait(file, sk_sleep(sk), wait);
2160 	mask = 0;
2161 
2162 	/* exceptional events? */
2163 	if (sk->sk_err)
2164 		mask |= POLLERR;
2165 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2166 		mask |= POLLHUP;
2167 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2168 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2169 
2170 	/* readable? */
2171 	if (!skb_queue_empty(&sk->sk_receive_queue))
2172 		mask |= POLLIN | POLLRDNORM;
2173 
2174 	/* Connection-based sockets need to check for termination and startup */
2175 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2176 	    sk->sk_state == TCP_CLOSE)
2177 		mask |= POLLHUP;
2178 
2179 	/*
2180 	 * We also flag the socket as writable when the other side has shut
2181 	 * down the connection; this prevents writers from blocking forever.
2182 	 */
2183 	if (unix_writable(sk))
2184 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2185 
2186 	return mask;
2187 }
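
/*
 * Illustrative note (not part of this file): once the remote end of a
 * stream socket has fully closed, both shutdown bits are set on the
 * survivor, so poll() reports POLLHUP together with the readable bits,
 * and the socket is kept writable so blocked writers can fail out:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, -1);	// revents: POLLIN|POLLRDNORM|POLLRDHUP|
 *				//          POLLHUP|POLLOUT|...
 */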
2188 
2189 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2190 				    poll_table *wait)
2191 {
2192 	struct sock *sk = sock->sk, *other;
2193 	unsigned int mask, writable;
2194 
2195 	sock_poll_wait(file, sk_sleep(sk), wait);
2196 	mask = 0;
2197 
2198 	/* exceptional events? */
2199 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2200 		mask |= POLLERR |
2201 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2202 
2203 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2204 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2205 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2206 		mask |= POLLHUP;
2207 
2208 	/* readable? */
2209 	if (!skb_queue_empty(&sk->sk_receive_queue))
2210 		mask |= POLLIN | POLLRDNORM;
2211 
2212 	/* Connection-based sockets need to check for termination and startup */
2213 	if (sk->sk_type == SOCK_SEQPACKET) {
2214 		if (sk->sk_state == TCP_CLOSE)
2215 			mask |= POLLHUP;
2216 		/* connection hasn't started yet? */
2217 		if (sk->sk_state == TCP_SYN_SENT)
2218 			return mask;
2219 	}
2220 
2221 	/* No write status requested, avoid expensive OUT tests. */
2222 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2223 		return mask;
2224 
2225 	writable = unix_writable(sk);
2226 	other = unix_peer_get(sk);
2227 	if (other) {
2228 		if (unix_peer(other) != sk) {
2229 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2230 			if (unix_recvq_full(other))
2231 				writable = 0;
2232 		}
2233 		sock_put(other);
2234 	}
2235 
2236 	if (writable)
2237 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2238 	else
2239 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2240 
2241 	return mask;
2242 }
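
/*
 * Illustrative userspace sketch (not part of this file): for a connected
 * datagram socket, writability depends on the peer's receive queue
 * (unix_recvq_full() above), so a non-blocking sender can poll() for
 * POLLOUT and is woken through the peer's peer_wait queue once the
 * receiver drains:
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	char buf[32];
 *
 *	poll(&pfd, 1, -1);		// sleeps while the peer queue is full
 *	send(fd, buf, sizeof(buf), MSG_DONTWAIT);
 */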
2243 
2244 #ifdef CONFIG_PROC_FS
2245 
2246 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2247 
2248 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2249 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2250 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
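
/*
 * Worked example of the encoding above (an editorial illustration): the
 * seq_file position packs a hash bucket and an offset within that bucket
 * into one loff_t.  On 64-bit, with UNIX_HASH_BITS == 8, BUCKET_SPACE is
 * 64 - 9 - 1 = 54, so for bucket 3 at offset 5:
 *
 *	loff_t pos = set_bucket_offset(3, 5);	// (3 << 54) | 5
 *
 *	get_bucket(pos);			// 3
 *	get_offset(pos);			// 5
 */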
2251 
2252 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2253 {
2254 	unsigned long offset = get_offset(*pos);
2255 	unsigned long bucket = get_bucket(*pos);
2256 	struct sock *sk;
2257 	unsigned long count = 0;
2258 
2259 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2260 		if (sock_net(sk) != seq_file_net(seq))
2261 			continue;
2262 		if (++count == offset)
2263 			break;
2264 	}
2265 
2266 	return sk;
2267 }
2268 
2269 static struct sock *unix_next_socket(struct seq_file *seq,
2270 				     struct sock *sk,
2271 				     loff_t *pos)
2272 {
2273 	unsigned long bucket;
2274 
2275 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2276 		sk = sk_next(sk);
2277 		if (!sk)
2278 			goto next_bucket;
2279 		if (sock_net(sk) == seq_file_net(seq))
2280 			return sk;
2281 	}
2282 
2283 	do {
2284 		sk = unix_from_bucket(seq, pos);
2285 		if (sk)
2286 			return sk;
2287 
2288 next_bucket:
2289 		bucket = get_bucket(*pos) + 1;
2290 		*pos = set_bucket_offset(bucket, 1);
2291 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2292 
2293 	return NULL;
2294 }
2295 
2296 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2297 	__acquires(unix_table_lock)
2298 {
2299 	spin_lock(&unix_table_lock);
2300 
2301 	if (!*pos)
2302 		return SEQ_START_TOKEN;
2303 
2304 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2305 		return NULL;
2306 
2307 	return unix_next_socket(seq, NULL, pos);
2308 }
2309 
2310 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2311 {
2312 	++*pos;
2313 	return unix_next_socket(seq, v, pos);
2314 }
2315 
2316 static void unix_seq_stop(struct seq_file *seq, void *v)
2317 	__releases(unix_table_lock)
2318 {
2319 	spin_unlock(&unix_table_lock);
2320 }
2321 
2322 static int unix_seq_show(struct seq_file *seq, void *v)
2323 {
2324 
2325 	if (v == SEQ_START_TOKEN)
2326 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2327 			 "Inode Path\n");
2328 	else {
2329 		struct sock *s = v;
2330 		struct unix_sock *u = unix_sk(s);
2331 		unix_state_lock(s);
2332 
2333 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2334 			s,
2335 			atomic_read(&s->sk_refcnt),
2336 			0,
2337 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2338 			s->sk_type,
2339 			s->sk_socket ?
2340 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2341 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2342 			sock_i_ino(s));
2343 
2344 		if (u->addr) {
2345 			int i, len;
2346 			seq_putc(seq, ' ');
2347 
2348 			i = 0;
2349 			len = u->addr->len - sizeof(short);
2350 			if (!UNIX_ABSTRACT(s))
2351 				len--;
2352 			else {
2353 				seq_putc(seq, '@');
2354 				i++;
2355 			}
2356 			for ( ; i < len; i++)
2357 				seq_putc(seq, u->addr->name->sun_path[i]);
2358 		}
2359 		unix_state_unlock(s);
2360 		seq_putc(seq, '\n');
2361 	}
2362 
2363 	return 0;
2364 }
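
/*
 * Illustrative /proc/net/unix output matching the format strings above
 * (values are examples only; %pK may print as zeros for unprivileged
 * readers depending on kptr_restrict, Flags carries __SO_ACCEPTCON for
 * listeners, and abstract names are prefixed with '@'):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	ffff88003742fa00: 00000002 00000000 00010000 0001 01 28271 /run/systemd/notify
 *	ffff880037430000: 00000003 00000000 00000000 0001 03 28274 @/tmp/.X11-unix/X0
 */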
2365 
2366 static const struct seq_operations unix_seq_ops = {
2367 	.start  = unix_seq_start,
2368 	.next   = unix_seq_next,
2369 	.stop   = unix_seq_stop,
2370 	.show   = unix_seq_show,
2371 };
2372 
2373 static int unix_seq_open(struct inode *inode, struct file *file)
2374 {
2375 	return seq_open_net(inode, file, &unix_seq_ops,
2376 			    sizeof(struct seq_net_private));
2377 }
2378 
2379 static const struct file_operations unix_seq_fops = {
2380 	.owner		= THIS_MODULE,
2381 	.open		= unix_seq_open,
2382 	.read		= seq_read,
2383 	.llseek		= seq_lseek,
2384 	.release	= seq_release_net,
2385 };
2386 
2387 #endif
2388 
2389 static const struct net_proto_family unix_family_ops = {
2390 	.family = PF_UNIX,
2391 	.create = unix_create,
2392 	.owner	= THIS_MODULE,
2393 };
2394 
2395 
2396 static int __net_init unix_net_init(struct net *net)
2397 {
2398 	int error = -ENOMEM;
2399 
2400 	net->unx.sysctl_max_dgram_qlen = 10;
2401 	if (unix_sysctl_register(net))
2402 		goto out;
2403 
2404 #ifdef CONFIG_PROC_FS
2405 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2406 		unix_sysctl_unregister(net);
2407 		goto out;
2408 	}
2409 #endif
2410 	error = 0;
2411 out:
2412 	return error;
2413 }
2414 
2415 static void __net_exit unix_net_exit(struct net *net)
2416 {
2417 	unix_sysctl_unregister(net);
2418 	remove_proc_entry("unix", net->proc_net);
2419 }
2420 
2421 static struct pernet_operations unix_net_ops = {
2422 	.init = unix_net_init,
2423 	.exit = unix_net_exit,
2424 };
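
/*
 * Illustrative note (not part of this file): the per-namespace default set
 * in unix_net_init() is exposed through sysctl as net.unix.max_dgram_qlen
 * and bounds how many datagrams may sit in a receiving socket's queue.  A
 * userspace sketch of reading it:
 *
 *	char buf[16];
 *	int fd = open("/proc/sys/net/unix/max_dgram_qlen", O_RDONLY);
 *
 *	read(fd, buf, sizeof(buf));	// "10\n" by default
 */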
2425 
2426 static int __init af_unix_init(void)
2427 {
2428 	int rc = -1;
2429 
2430 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2431 
2432 	rc = proto_register(&unix_proto, 1);
2433 	if (rc != 0) {
2434 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2435 		goto out;
2436 	}
2437 
2438 	sock_register(&unix_family_ops);
2439 	register_pernet_subsys(&unix_net_ops);
2440 out:
2441 	return rc;
2442 }
2443 
2444 static void __exit af_unix_exit(void)
2445 {
2446 	sock_unregister(PF_UNIX);
2447 	proto_unregister(&unix_proto);
2448 	unregister_pernet_subsys(&unix_net_ops);
2449 }
2450 
2451 /* Earlier than device_initcall() so that other drivers invoking
2452    request_module() don't end up in a loop when modprobe tries
2453    to use a UNIX socket. But later than subsys_initcall() because
2454    we depend on infrastructure initialised there. */
2455 fs_initcall(af_unix_init);
2456 module_exit(af_unix_exit);
2457 
2458 MODULE_LICENSE("GPL");
2459 MODULE_ALIAS_NETPROTO(PF_UNIX);
2460