xref: /openbmc/linux/net/unix/af_unix.c (revision 9cfc5c90)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it will avoid huge amounts
38  *					of sockets hashed (this is for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skbs queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lots of bugs were introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connect()ed socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD "first socket fstat twice" bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has a connect() that forgets to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a 0 byte, so that this name space does not
80  *		  intersect with BSD filesystem names.
81  */
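/*
 * Illustrative userspace sketch (not part of this file, hence compiled
 * out): binding a filesystem name versus an abstract name as described
 * above.  The socket path and the abstract name "demo" are arbitrary
 * examples.  An abstract address is marked by a leading NUL byte in
 * sun_path, and its length is exactly the number of bytes used.
 */
#if 0
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int bind_fs_name(int fd)
{
	struct sockaddr_un a = { .sun_family = AF_UNIX };

	strcpy(a.sun_path, "/tmp/demo.sock");	/* NUL-terminated FS name */
	return bind(fd, (struct sockaddr *)&a, sizeof(a));
}

static int bind_abstract_name(int fd)
{
	struct sockaddr_un a = { .sun_family = AF_UNIX };

	a.sun_path[0] = '\0';			/* leading 0 => abstract */
	memcpy(a.sun_path + 1, "demo", 4);	/* raw bytes, no NUL needed */
	return bind(fd, (struct sockaddr *)&a,
		    offsetof(struct sockaddr_un, sun_path) + 1 + 4);
}
#endif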
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 
121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122 EXPORT_SYMBOL_GPL(unix_socket_table);
123 DEFINE_SPINLOCK(unix_table_lock);
124 EXPORT_SYMBOL_GPL(unix_table_lock);
125 static atomic_long_t unix_nr_socks;
126 
127 
128 static struct hlist_head *unix_sockets_unbound(void *addr)
129 {
130 	unsigned long hash = (unsigned long)addr;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash %= UNIX_HASH_SIZE;
135 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136 }
137 
138 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139 
140 #ifdef CONFIG_SECURITY_NETWORK
141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 {
143 	UNIXCB(skb).secid = scm->secid;
144 }
145 
146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 {
148 	scm->secid = UNIXCB(skb).secid;
149 }
150 
151 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
152 {
153 	return (scm->secid == UNIXCB(skb).secid);
154 }
155 #else
156 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
157 { }
158 
159 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
160 { }
161 
162 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
163 {
164 	return true;
165 }
166 #endif /* CONFIG_SECURITY_NETWORK */
167 
168 /*
169  *  SMP locking strategy:
170  *    hash table is protected with spinlock unix_table_lock
171  *    each socket state is protected by separate spin lock.
172  */
173 
174 static inline unsigned int unix_hash_fold(__wsum n)
175 {
176 	unsigned int hash = (__force unsigned int)csum_fold(n);
177 
178 	hash ^= hash>>8;
179 	return hash&(UNIX_HASH_SIZE-1);
180 }
181 
182 #define unix_peer(sk) (unix_sk(sk)->peer)
183 
184 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
185 {
186 	return unix_peer(osk) == sk;
187 }
188 
189 static inline int unix_may_send(struct sock *sk, struct sock *osk)
190 {
191 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
192 }
193 
194 static inline int unix_recvq_full(struct sock const *sk)
195 {
196 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
197 }
198 
199 struct sock *unix_peer_get(struct sock *s)
200 {
201 	struct sock *peer;
202 
203 	unix_state_lock(s);
204 	peer = unix_peer(s);
205 	if (peer)
206 		sock_hold(peer);
207 	unix_state_unlock(s);
208 	return peer;
209 }
210 EXPORT_SYMBOL_GPL(unix_peer_get);
211 
212 static inline void unix_release_addr(struct unix_address *addr)
213 {
214 	if (atomic_dec_and_test(&addr->refcnt))
215 		kfree(addr);
216 }
217 
218 /*
219  *	Check unix socket name:
220  *		- should not be zero length.
221  *	        - if it does not start with zero, it should be NUL-terminated (FS object)
222  *		- if it starts with zero, it is an abstract name.
223  */
224 
225 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
226 {
227 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
228 		return -EINVAL;
229 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
230 		return -EINVAL;
231 	if (sunaddr->sun_path[0]) {
232 		/*
233 		 * This may look like an off by one error but it is a bit more
234 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
235 		 * sun_path[108] doesn't as such exist.  However in kernel space
236 		 * we are guaranteed that it is a valid memory location in our
237 		 * kernel address buffer.
238 		 */
239 		((char *)sunaddr)[len] = 0;
240 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
241 		return len;
242 	}
243 
244 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
245 	return len;
246 }
247 
248 static void __unix_remove_socket(struct sock *sk)
249 {
250 	sk_del_node_init(sk);
251 }
252 
253 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
254 {
255 	WARN_ON(!sk_unhashed(sk));
256 	sk_add_node(sk, list);
257 }
258 
259 static inline void unix_remove_socket(struct sock *sk)
260 {
261 	spin_lock(&unix_table_lock);
262 	__unix_remove_socket(sk);
263 	spin_unlock(&unix_table_lock);
264 }
265 
266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
267 {
268 	spin_lock(&unix_table_lock);
269 	__unix_insert_socket(list, sk);
270 	spin_unlock(&unix_table_lock);
271 }
272 
273 static struct sock *__unix_find_socket_byname(struct net *net,
274 					      struct sockaddr_un *sunname,
275 					      int len, int type, unsigned int hash)
276 {
277 	struct sock *s;
278 
279 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
280 		struct unix_sock *u = unix_sk(s);
281 
282 		if (!net_eq(sock_net(s), net))
283 			continue;
284 
285 		if (u->addr->len == len &&
286 		    !memcmp(u->addr->name, sunname, len))
287 			goto found;
288 	}
289 	s = NULL;
290 found:
291 	return s;
292 }
293 
294 static inline struct sock *unix_find_socket_byname(struct net *net,
295 						   struct sockaddr_un *sunname,
296 						   int len, int type,
297 						   unsigned int hash)
298 {
299 	struct sock *s;
300 
301 	spin_lock(&unix_table_lock);
302 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
303 	if (s)
304 		sock_hold(s);
305 	spin_unlock(&unix_table_lock);
306 	return s;
307 }
308 
309 static struct sock *unix_find_socket_byinode(struct inode *i)
310 {
311 	struct sock *s;
312 
313 	spin_lock(&unix_table_lock);
314 	sk_for_each(s,
315 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
316 		struct dentry *dentry = unix_sk(s)->path.dentry;
317 
318 		if (dentry && d_backing_inode(dentry) == i) {
319 			sock_hold(s);
320 			goto found;
321 		}
322 	}
323 	s = NULL;
324 found:
325 	spin_unlock(&unix_table_lock);
326 	return s;
327 }
328 
329 static int unix_writable(const struct sock *sk)
330 {
331 	return sk->sk_state != TCP_LISTEN &&
332 	       (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
333 }
334 
335 static void unix_write_space(struct sock *sk)
336 {
337 	struct socket_wq *wq;
338 
339 	rcu_read_lock();
340 	if (unix_writable(sk)) {
341 		wq = rcu_dereference(sk->sk_wq);
342 		if (wq_has_sleeper(wq))
343 			wake_up_interruptible_sync_poll(&wq->wait,
344 				POLLOUT | POLLWRNORM | POLLWRBAND);
345 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
346 	}
347 	rcu_read_unlock();
348 }
349 
350 /* When a dgram socket disconnects (or changes its peer), we clear its receive
351  * queue of packets that arrived from the previous peer. First, this allows us
352  * to do flow control based only on wmem_alloc; second, an sk connected to a
353  * peer may receive messages only from that peer. */
354 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
355 {
356 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
357 		skb_queue_purge(&sk->sk_receive_queue);
358 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
359 
360 		/* If one link of a bidirectional dgram pipe is disconnected,
361 		 * we signal an error. Messages are lost. Do not do this
362 		 * when the peer was not connected to us.
363 		 */
364 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
365 			other->sk_err = ECONNRESET;
366 			other->sk_error_report(other);
367 		}
368 	}
369 }
370 
371 static void unix_sock_destructor(struct sock *sk)
372 {
373 	struct unix_sock *u = unix_sk(sk);
374 
375 	skb_queue_purge(&sk->sk_receive_queue);
376 
377 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
378 	WARN_ON(!sk_unhashed(sk));
379 	WARN_ON(sk->sk_socket);
380 	if (!sock_flag(sk, SOCK_DEAD)) {
381 		pr_info("Attempt to release alive unix socket: %p\n", sk);
382 		return;
383 	}
384 
385 	if (u->addr)
386 		unix_release_addr(u->addr);
387 
388 	atomic_long_dec(&unix_nr_socks);
389 	local_bh_disable();
390 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
391 	local_bh_enable();
392 #ifdef UNIX_REFCNT_DEBUG
393 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
394 		atomic_long_read(&unix_nr_socks));
395 #endif
396 }
397 
398 static void unix_release_sock(struct sock *sk, int embrion)
399 {
400 	struct unix_sock *u = unix_sk(sk);
401 	struct path path;
402 	struct sock *skpair;
403 	struct sk_buff *skb;
404 	int state;
405 
406 	unix_remove_socket(sk);
407 
408 	/* Clear state */
409 	unix_state_lock(sk);
410 	sock_orphan(sk);
411 	sk->sk_shutdown = SHUTDOWN_MASK;
412 	path	     = u->path;
413 	u->path.dentry = NULL;
414 	u->path.mnt = NULL;
415 	state = sk->sk_state;
416 	sk->sk_state = TCP_CLOSE;
417 	unix_state_unlock(sk);
418 
419 	wake_up_interruptible_all(&u->peer_wait);
420 
421 	skpair = unix_peer(sk);
422 
423 	if (skpair != NULL) {
424 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
425 			unix_state_lock(skpair);
426 			/* No more writes */
427 			skpair->sk_shutdown = SHUTDOWN_MASK;
428 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
429 				skpair->sk_err = ECONNRESET;
430 			unix_state_unlock(skpair);
431 			skpair->sk_state_change(skpair);
432 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
433 		}
434 		sock_put(skpair); /* It may now die */
435 		unix_peer(sk) = NULL;
436 	}
437 
438 	/* Try to flush out this socket. Throw out buffers at least */
439 
440 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
441 		if (state == TCP_LISTEN)
442 			unix_release_sock(skb->sk, 1);
443 		/* passed fds are erased in the kfree_skb hook	      */
444 		kfree_skb(skb);
445 	}
446 
447 	if (path.dentry)
448 		path_put(&path);
449 
450 	sock_put(sk);
451 
452 	/* ---- Socket is dead now and most probably destroyed ---- */
453 
454 	/*
455 	 * Fixme: BSD difference: In BSD all sockets connected to us get
456 	 *	  ECONNRESET and we die on the spot. In Linux we behave
457 	 *	  like files and pipes do and wait for the last
458 	 *	  dereference.
459 	 *
460 	 * Can't we simply set sock->err?
461 	 *
462 	 *	  What does the above comment talk about? --ANK(980817)
463 	 */
464 
465 	if (unix_tot_inflight)
466 		unix_gc();		/* Garbage collect fds */
467 }
468 
469 static void init_peercred(struct sock *sk)
470 {
471 	put_pid(sk->sk_peer_pid);
472 	if (sk->sk_peer_cred)
473 		put_cred(sk->sk_peer_cred);
474 	sk->sk_peer_pid  = get_pid(task_tgid(current));
475 	sk->sk_peer_cred = get_current_cred();
476 }
477 
478 static void copy_peercred(struct sock *sk, struct sock *peersk)
479 {
480 	put_pid(sk->sk_peer_pid);
481 	if (sk->sk_peer_cred)
482 		put_cred(sk->sk_peer_cred);
483 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
484 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
485 }
486 
487 static int unix_listen(struct socket *sock, int backlog)
488 {
489 	int err;
490 	struct sock *sk = sock->sk;
491 	struct unix_sock *u = unix_sk(sk);
492 	struct pid *old_pid = NULL;
493 
494 	err = -EOPNOTSUPP;
495 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
496 		goto out;	/* Only stream/seqpacket sockets accept */
497 	err = -EINVAL;
498 	if (!u->addr)
499 		goto out;	/* No listens on an unbound socket */
500 	unix_state_lock(sk);
501 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
502 		goto out_unlock;
503 	if (backlog > sk->sk_max_ack_backlog)
504 		wake_up_interruptible_all(&u->peer_wait);
505 	sk->sk_max_ack_backlog	= backlog;
506 	sk->sk_state		= TCP_LISTEN;
507 	/* set credentials so connect can copy them */
508 	init_peercred(sk);
509 	err = 0;
510 
511 out_unlock:
512 	unix_state_unlock(sk);
513 	put_pid(old_pid);
514 out:
515 	return err;
516 }
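/*
 * Illustrative userspace sketch (compiled out): the listen/accept flow
 * handled by unix_listen() above and unix_accept() below for a
 * SOCK_STREAM socket.  The path is an arbitrary example and error
 * handling is omitted for brevity.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static int serve_one_client(void)
{
	struct sockaddr_un a = { .sun_family = AF_UNIX };
	int lfd = socket(AF_UNIX, SOCK_STREAM, 0);

	strcpy(a.sun_path, "/tmp/server.sock");
	bind(lfd, (struct sockaddr *)&a, sizeof(a));
	listen(lfd, 16);		/* becomes sk_max_ack_backlog */
	return accept(lfd, NULL, NULL);	/* dequeues one embryo socket */
}
#endif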
517 
518 static int unix_release(struct socket *);
519 static int unix_bind(struct socket *, struct sockaddr *, int);
520 static int unix_stream_connect(struct socket *, struct sockaddr *,
521 			       int addr_len, int flags);
522 static int unix_socketpair(struct socket *, struct socket *);
523 static int unix_accept(struct socket *, struct socket *, int);
524 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
525 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
526 static unsigned int unix_dgram_poll(struct file *, struct socket *,
527 				    poll_table *);
528 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
529 static int unix_shutdown(struct socket *, int);
530 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
531 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
532 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
533 				    size_t size, int flags);
534 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
535 				       struct pipe_inode_info *, size_t size,
536 				       unsigned int flags);
537 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
538 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
539 static int unix_dgram_connect(struct socket *, struct sockaddr *,
540 			      int, int);
541 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
542 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
543 				  int);
544 
545 static int unix_set_peek_off(struct sock *sk, int val)
546 {
547 	struct unix_sock *u = unix_sk(sk);
548 
549 	if (mutex_lock_interruptible(&u->readlock))
550 		return -EINTR;
551 
552 	sk->sk_peek_off = val;
553 	mutex_unlock(&u->readlock);
554 
555 	return 0;
556 }
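/*
 * Illustrative userspace sketch (compiled out): SO_PEEK_OFF, which is
 * serviced by unix_set_peek_off() above.  Once a non-negative offset is
 * set, successive MSG_PEEK reads advance through the queued data
 * instead of re-reading from the start.
 */
#if 0
#include <sys/socket.h>

static void peek_twice(int fd, char *buf, int len)
{
	int off = 0;

	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
	recv(fd, buf, len, MSG_PEEK);	/* peeks bytes 0..len-1 */
	recv(fd, buf, len, MSG_PEEK);	/* peeks bytes len..2*len-1 */
}
#endif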
557 
558 
559 static const struct proto_ops unix_stream_ops = {
560 	.family =	PF_UNIX,
561 	.owner =	THIS_MODULE,
562 	.release =	unix_release,
563 	.bind =		unix_bind,
564 	.connect =	unix_stream_connect,
565 	.socketpair =	unix_socketpair,
566 	.accept =	unix_accept,
567 	.getname =	unix_getname,
568 	.poll =		unix_poll,
569 	.ioctl =	unix_ioctl,
570 	.listen =	unix_listen,
571 	.shutdown =	unix_shutdown,
572 	.setsockopt =	sock_no_setsockopt,
573 	.getsockopt =	sock_no_getsockopt,
574 	.sendmsg =	unix_stream_sendmsg,
575 	.recvmsg =	unix_stream_recvmsg,
576 	.mmap =		sock_no_mmap,
577 	.sendpage =	unix_stream_sendpage,
578 	.splice_read =	unix_stream_splice_read,
579 	.set_peek_off =	unix_set_peek_off,
580 };
581 
582 static const struct proto_ops unix_dgram_ops = {
583 	.family =	PF_UNIX,
584 	.owner =	THIS_MODULE,
585 	.release =	unix_release,
586 	.bind =		unix_bind,
587 	.connect =	unix_dgram_connect,
588 	.socketpair =	unix_socketpair,
589 	.accept =	sock_no_accept,
590 	.getname =	unix_getname,
591 	.poll =		unix_dgram_poll,
592 	.ioctl =	unix_ioctl,
593 	.listen =	sock_no_listen,
594 	.shutdown =	unix_shutdown,
595 	.setsockopt =	sock_no_setsockopt,
596 	.getsockopt =	sock_no_getsockopt,
597 	.sendmsg =	unix_dgram_sendmsg,
598 	.recvmsg =	unix_dgram_recvmsg,
599 	.mmap =		sock_no_mmap,
600 	.sendpage =	sock_no_sendpage,
601 	.set_peek_off =	unix_set_peek_off,
602 };
603 
604 static const struct proto_ops unix_seqpacket_ops = {
605 	.family =	PF_UNIX,
606 	.owner =	THIS_MODULE,
607 	.release =	unix_release,
608 	.bind =		unix_bind,
609 	.connect =	unix_stream_connect,
610 	.socketpair =	unix_socketpair,
611 	.accept =	unix_accept,
612 	.getname =	unix_getname,
613 	.poll =		unix_dgram_poll,
614 	.ioctl =	unix_ioctl,
615 	.listen =	unix_listen,
616 	.shutdown =	unix_shutdown,
617 	.setsockopt =	sock_no_setsockopt,
618 	.getsockopt =	sock_no_getsockopt,
619 	.sendmsg =	unix_seqpacket_sendmsg,
620 	.recvmsg =	unix_seqpacket_recvmsg,
621 	.mmap =		sock_no_mmap,
622 	.sendpage =	sock_no_sendpage,
623 	.set_peek_off =	unix_set_peek_off,
624 };
625 
626 static struct proto unix_proto = {
627 	.name			= "UNIX",
628 	.owner			= THIS_MODULE,
629 	.obj_size		= sizeof(struct unix_sock),
630 };
631 
632 /*
633  * AF_UNIX sockets do not interact with hardware, hence they
634  * don't trigger interrupts - so it's safe for them to have
635  * bh-unsafe locking for their sk_receive_queue.lock. Split off
636  * this special lock-class by reinitializing the spinlock key:
637  */
638 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
639 
640 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
641 {
642 	struct sock *sk = NULL;
643 	struct unix_sock *u;
644 
645 	atomic_long_inc(&unix_nr_socks);
646 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
647 		goto out;
648 
649 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
650 	if (!sk)
651 		goto out;
652 
653 	sock_init_data(sock, sk);
654 	lockdep_set_class(&sk->sk_receive_queue.lock,
655 				&af_unix_sk_receive_queue_lock_key);
656 
657 	sk->sk_write_space	= unix_write_space;
658 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
659 	sk->sk_destruct		= unix_sock_destructor;
660 	u	  = unix_sk(sk);
661 	u->path.dentry = NULL;
662 	u->path.mnt = NULL;
663 	spin_lock_init(&u->lock);
664 	atomic_long_set(&u->inflight, 0);
665 	INIT_LIST_HEAD(&u->link);
666 	mutex_init(&u->readlock); /* single task reading lock */
667 	init_waitqueue_head(&u->peer_wait);
668 	unix_insert_socket(unix_sockets_unbound(sk), sk);
669 out:
670 	if (sk == NULL)
671 		atomic_long_dec(&unix_nr_socks);
672 	else {
673 		local_bh_disable();
674 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
675 		local_bh_enable();
676 	}
677 	return sk;
678 }
679 
680 static int unix_create(struct net *net, struct socket *sock, int protocol,
681 		       int kern)
682 {
683 	if (protocol && protocol != PF_UNIX)
684 		return -EPROTONOSUPPORT;
685 
686 	sock->state = SS_UNCONNECTED;
687 
688 	switch (sock->type) {
689 	case SOCK_STREAM:
690 		sock->ops = &unix_stream_ops;
691 		break;
692 		/*
693 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
694 		 *	nothing uses it.
695 		 */
696 	case SOCK_RAW:
697 		sock->type = SOCK_DGRAM;
698 	case SOCK_DGRAM:
699 		sock->ops = &unix_dgram_ops;
700 		break;
701 	case SOCK_SEQPACKET:
702 		sock->ops = &unix_seqpacket_ops;
703 		break;
704 	default:
705 		return -ESOCKTNOSUPPORT;
706 	}
707 
708 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
709 }
710 
711 static int unix_release(struct socket *sock)
712 {
713 	struct sock *sk = sock->sk;
714 
715 	if (!sk)
716 		return 0;
717 
718 	unix_release_sock(sk, 0);
719 	sock->sk = NULL;
720 
721 	return 0;
722 }
723 
724 static int unix_autobind(struct socket *sock)
725 {
726 	struct sock *sk = sock->sk;
727 	struct net *net = sock_net(sk);
728 	struct unix_sock *u = unix_sk(sk);
729 	static u32 ordernum = 1;
730 	struct unix_address *addr;
731 	int err;
732 	unsigned int retries = 0;
733 
734 	err = mutex_lock_interruptible(&u->readlock);
735 	if (err)
736 		return err;
737 
738 	err = 0;
739 	if (u->addr)
740 		goto out;
741 
742 	err = -ENOMEM;
743 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
744 	if (!addr)
745 		goto out;
746 
747 	addr->name->sun_family = AF_UNIX;
748 	atomic_set(&addr->refcnt, 1);
749 
750 retry:
751 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
752 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
753 
754 	spin_lock(&unix_table_lock);
755 	ordernum = (ordernum+1)&0xFFFFF;
756 
757 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
758 				      addr->hash)) {
759 		spin_unlock(&unix_table_lock);
760 		/*
761 		 * __unix_find_socket_byname() may take a long time if many names
762 		 * are already in use.
763 		 */
764 		cond_resched();
765 		/* Give up if all names seem to be in use. */
766 		if (retries++ == 0xFFFFF) {
767 			err = -ENOSPC;
768 			kfree(addr);
769 			goto out;
770 		}
771 		goto retry;
772 	}
773 	addr->hash ^= sk->sk_type;
774 
775 	__unix_remove_socket(sk);
776 	u->addr = addr;
777 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
778 	spin_unlock(&unix_table_lock);
779 	err = 0;
780 
781 out:	mutex_unlock(&u->readlock);
782 	return err;
783 }
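/*
 * Illustrative userspace sketch (compiled out): autobind.  Binding with
 * only the address family and no name at all makes unix_bind() below
 * fall into unix_autobind() above, which assigns a unique 5-hex-digit
 * abstract name that getsockname() then reports.
 */
#if 0
#include <sys/socket.h>
#include <sys/un.h>

static void autobind_demo(int fd)
{
	struct sockaddr_un a = { .sun_family = AF_UNIX };
	socklen_t alen = sizeof(a);

	bind(fd, (struct sockaddr *)&a, sizeof(sa_family_t));
	getsockname(fd, (struct sockaddr *)&a, &alen);
	/* now a.sun_path[0] == '\0' followed by e.g. "00001" */
}
#endif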
784 
785 static struct sock *unix_find_other(struct net *net,
786 				    struct sockaddr_un *sunname, int len,
787 				    int type, unsigned int hash, int *error)
788 {
789 	struct sock *u;
790 	struct path path;
791 	int err = 0;
792 
793 	if (sunname->sun_path[0]) {
794 		struct inode *inode;
795 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
796 		if (err)
797 			goto fail;
798 		inode = d_backing_inode(path.dentry);
799 		err = inode_permission(inode, MAY_WRITE);
800 		if (err)
801 			goto put_fail;
802 
803 		err = -ECONNREFUSED;
804 		if (!S_ISSOCK(inode->i_mode))
805 			goto put_fail;
806 		u = unix_find_socket_byinode(inode);
807 		if (!u)
808 			goto put_fail;
809 
810 		if (u->sk_type == type)
811 			touch_atime(&path);
812 
813 		path_put(&path);
814 
815 		err = -EPROTOTYPE;
816 		if (u->sk_type != type) {
817 			sock_put(u);
818 			goto fail;
819 		}
820 	} else {
821 		err = -ECONNREFUSED;
822 		u = unix_find_socket_byname(net, sunname, len, type, hash);
823 		if (u) {
824 			struct dentry *dentry;
825 			dentry = unix_sk(u)->path.dentry;
826 			if (dentry)
827 				touch_atime(&unix_sk(u)->path);
828 		} else
829 			goto fail;
830 	}
831 	return u;
832 
833 put_fail:
834 	path_put(&path);
835 fail:
836 	*error = err;
837 	return NULL;
838 }
839 
840 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
841 {
842 	struct dentry *dentry;
843 	struct path path;
844 	int err = 0;
845 	/*
846 	 * Get the parent directory, calculate the hash for last
847 	 * component.
848 	 */
849 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
850 	err = PTR_ERR(dentry);
851 	if (IS_ERR(dentry))
852 		return err;
853 
854 	/*
855 	 * All right, let's create it.
856 	 */
857 	err = security_path_mknod(&path, dentry, mode, 0);
858 	if (!err) {
859 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
860 		if (!err) {
861 			res->mnt = mntget(path.mnt);
862 			res->dentry = dget(dentry);
863 		}
864 	}
865 	done_path_create(&path, dentry);
866 	return err;
867 }
868 
869 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
870 {
871 	struct sock *sk = sock->sk;
872 	struct net *net = sock_net(sk);
873 	struct unix_sock *u = unix_sk(sk);
874 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
875 	char *sun_path = sunaddr->sun_path;
876 	int err;
877 	unsigned int hash;
878 	struct unix_address *addr;
879 	struct hlist_head *list;
880 
881 	err = -EINVAL;
882 	if (sunaddr->sun_family != AF_UNIX)
883 		goto out;
884 
885 	if (addr_len == sizeof(short)) {
886 		err = unix_autobind(sock);
887 		goto out;
888 	}
889 
890 	err = unix_mkname(sunaddr, addr_len, &hash);
891 	if (err < 0)
892 		goto out;
893 	addr_len = err;
894 
895 	err = mutex_lock_interruptible(&u->readlock);
896 	if (err)
897 		goto out;
898 
899 	err = -EINVAL;
900 	if (u->addr)
901 		goto out_up;
902 
903 	err = -ENOMEM;
904 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
905 	if (!addr)
906 		goto out_up;
907 
908 	memcpy(addr->name, sunaddr, addr_len);
909 	addr->len = addr_len;
910 	addr->hash = hash ^ sk->sk_type;
911 	atomic_set(&addr->refcnt, 1);
912 
913 	if (sun_path[0]) {
914 		struct path path;
915 		umode_t mode = S_IFSOCK |
916 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
917 		err = unix_mknod(sun_path, mode, &path);
918 		if (err) {
919 			if (err == -EEXIST)
920 				err = -EADDRINUSE;
921 			unix_release_addr(addr);
922 			goto out_up;
923 		}
924 		addr->hash = UNIX_HASH_SIZE;
925 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
926 		spin_lock(&unix_table_lock);
927 		u->path = path;
928 		list = &unix_socket_table[hash];
929 	} else {
930 		spin_lock(&unix_table_lock);
931 		err = -EADDRINUSE;
932 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
933 					      sk->sk_type, hash)) {
934 			unix_release_addr(addr);
935 			goto out_unlock;
936 		}
937 
938 		list = &unix_socket_table[addr->hash];
939 	}
940 
941 	err = 0;
942 	__unix_remove_socket(sk);
943 	u->addr = addr;
944 	__unix_insert_socket(list, sk);
945 
946 out_unlock:
947 	spin_unlock(&unix_table_lock);
948 out_up:
949 	mutex_unlock(&u->readlock);
950 out:
951 	return err;
952 }
953 
954 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
955 {
956 	if (unlikely(sk1 == sk2) || !sk2) {
957 		unix_state_lock(sk1);
958 		return;
959 	}
960 	if (sk1 < sk2) {
961 		unix_state_lock(sk1);
962 		unix_state_lock_nested(sk2);
963 	} else {
964 		unix_state_lock(sk2);
965 		unix_state_lock_nested(sk1);
966 	}
967 }
968 
969 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
970 {
971 	if (unlikely(sk1 == sk2) || !sk2) {
972 		unix_state_unlock(sk1);
973 		return;
974 	}
975 	unix_state_unlock(sk1);
976 	unix_state_unlock(sk2);
977 }
978 
979 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
980 			      int alen, int flags)
981 {
982 	struct sock *sk = sock->sk;
983 	struct net *net = sock_net(sk);
984 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
985 	struct sock *other;
986 	unsigned int hash;
987 	int err;
988 
989 	if (addr->sa_family != AF_UNSPEC) {
990 		err = unix_mkname(sunaddr, alen, &hash);
991 		if (err < 0)
992 			goto out;
993 		alen = err;
994 
995 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
996 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
997 			goto out;
998 
999 restart:
1000 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1001 		if (!other)
1002 			goto out;
1003 
1004 		unix_state_double_lock(sk, other);
1005 
1006 		/* Apparently VFS overslept socket death. Retry. */
1007 		if (sock_flag(other, SOCK_DEAD)) {
1008 			unix_state_double_unlock(sk, other);
1009 			sock_put(other);
1010 			goto restart;
1011 		}
1012 
1013 		err = -EPERM;
1014 		if (!unix_may_send(sk, other))
1015 			goto out_unlock;
1016 
1017 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1018 		if (err)
1019 			goto out_unlock;
1020 
1021 	} else {
1022 		/*
1023 		 *	1003.1g breaking connected state with AF_UNSPEC
1024 		 */
1025 		other = NULL;
1026 		unix_state_double_lock(sk, other);
1027 	}
1028 
1029 	/*
1030 	 * If it was connected, reconnect.
1031 	 */
1032 	if (unix_peer(sk)) {
1033 		struct sock *old_peer = unix_peer(sk);
1034 		unix_peer(sk) = other;
1035 		unix_state_double_unlock(sk, other);
1036 
1037 		if (other != old_peer)
1038 			unix_dgram_disconnected(sk, old_peer);
1039 		sock_put(old_peer);
1040 	} else {
1041 		unix_peer(sk) = other;
1042 		unix_state_double_unlock(sk, other);
1043 	}
1044 	return 0;
1045 
1046 out_unlock:
1047 	unix_state_double_unlock(sk, other);
1048 	sock_put(other);
1049 out:
1050 	return err;
1051 }
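/*
 * Illustrative userspace sketch (compiled out): per 1003.1g, a
 * connected datagram socket is disconnected by connect()ing to an
 * AF_UNSPEC address, which takes the "other = NULL" branch above.
 */
#if 0
#include <sys/socket.h>

static void dgram_disconnect(int fd)
{
	struct sockaddr sa = { .sa_family = AF_UNSPEC };

	connect(fd, &sa, sizeof(sa));	/* clears unix_peer(sk) */
}
#endif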
1052 
1053 static long unix_wait_for_peer(struct sock *other, long timeo)
1054 {
1055 	struct unix_sock *u = unix_sk(other);
1056 	int sched;
1057 	DEFINE_WAIT(wait);
1058 
1059 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1060 
1061 	sched = !sock_flag(other, SOCK_DEAD) &&
1062 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1063 		unix_recvq_full(other);
1064 
1065 	unix_state_unlock(other);
1066 
1067 	if (sched)
1068 		timeo = schedule_timeout(timeo);
1069 
1070 	finish_wait(&u->peer_wait, &wait);
1071 	return timeo;
1072 }
1073 
1074 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1075 			       int addr_len, int flags)
1076 {
1077 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1078 	struct sock *sk = sock->sk;
1079 	struct net *net = sock_net(sk);
1080 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1081 	struct sock *newsk = NULL;
1082 	struct sock *other = NULL;
1083 	struct sk_buff *skb = NULL;
1084 	unsigned int hash;
1085 	int st;
1086 	int err;
1087 	long timeo;
1088 
1089 	err = unix_mkname(sunaddr, addr_len, &hash);
1090 	if (err < 0)
1091 		goto out;
1092 	addr_len = err;
1093 
1094 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1095 	    (err = unix_autobind(sock)) != 0)
1096 		goto out;
1097 
1098 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1099 
1100 	/* First of all, allocate resources.
1101 	   If we do this after the state is locked,
1102 	   we will have to recheck everything again in any case.
1103 	 */
1104 
1105 	err = -ENOMEM;
1106 
1107 	/* create new sock for complete connection */
1108 	newsk = unix_create1(sock_net(sk), NULL, 0);
1109 	if (newsk == NULL)
1110 		goto out;
1111 
1112 	/* Allocate skb for sending to listening sock */
1113 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1114 	if (skb == NULL)
1115 		goto out;
1116 
1117 restart:
1118 	/*  Find listening sock. */
1119 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1120 	if (!other)
1121 		goto out;
1122 
1123 	/* Latch state of peer */
1124 	unix_state_lock(other);
1125 
1126 	/* Apparently VFS overslept socket death. Retry. */
1127 	if (sock_flag(other, SOCK_DEAD)) {
1128 		unix_state_unlock(other);
1129 		sock_put(other);
1130 		goto restart;
1131 	}
1132 
1133 	err = -ECONNREFUSED;
1134 	if (other->sk_state != TCP_LISTEN)
1135 		goto out_unlock;
1136 	if (other->sk_shutdown & RCV_SHUTDOWN)
1137 		goto out_unlock;
1138 
1139 	if (unix_recvq_full(other)) {
1140 		err = -EAGAIN;
1141 		if (!timeo)
1142 			goto out_unlock;
1143 
1144 		timeo = unix_wait_for_peer(other, timeo);
1145 
1146 		err = sock_intr_errno(timeo);
1147 		if (signal_pending(current))
1148 			goto out;
1149 		sock_put(other);
1150 		goto restart;
1151 	}
1152 
1153 	/* Latch our state.
1154 
1155 	   This is a tricky place. We need to grab our state lock and cannot
1156 	   drop the lock on the peer. It is dangerous because a deadlock is
1157 	   possible. The connect-to-self case and simultaneous
1158 	   connect attempts are eliminated by checking the socket
1159 	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1160 	   check this before attempting to grab the lock.
1161 
1162 	   Well, and we have to recheck the state after the socket is locked.
1163 	 */
1164 	st = sk->sk_state;
1165 
1166 	switch (st) {
1167 	case TCP_CLOSE:
1168 		/* This is ok... continue with connect */
1169 		break;
1170 	case TCP_ESTABLISHED:
1171 		/* Socket is already connected */
1172 		err = -EISCONN;
1173 		goto out_unlock;
1174 	default:
1175 		err = -EINVAL;
1176 		goto out_unlock;
1177 	}
1178 
1179 	unix_state_lock_nested(sk);
1180 
1181 	if (sk->sk_state != st) {
1182 		unix_state_unlock(sk);
1183 		unix_state_unlock(other);
1184 		sock_put(other);
1185 		goto restart;
1186 	}
1187 
1188 	err = security_unix_stream_connect(sk, other, newsk);
1189 	if (err) {
1190 		unix_state_unlock(sk);
1191 		goto out_unlock;
1192 	}
1193 
1194 	/* The way is open! Quickly set all the necessary fields... */
1195 
1196 	sock_hold(sk);
1197 	unix_peer(newsk)	= sk;
1198 	newsk->sk_state		= TCP_ESTABLISHED;
1199 	newsk->sk_type		= sk->sk_type;
1200 	init_peercred(newsk);
1201 	newu = unix_sk(newsk);
1202 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1203 	otheru = unix_sk(other);
1204 
1205 	/* copy address information from listening to new sock*/
1206 	if (otheru->addr) {
1207 		atomic_inc(&otheru->addr->refcnt);
1208 		newu->addr = otheru->addr;
1209 	}
1210 	if (otheru->path.dentry) {
1211 		path_get(&otheru->path);
1212 		newu->path = otheru->path;
1213 	}
1214 
1215 	/* Set credentials */
1216 	copy_peercred(sk, other);
1217 
1218 	sock->state	= SS_CONNECTED;
1219 	sk->sk_state	= TCP_ESTABLISHED;
1220 	sock_hold(newsk);
1221 
1222 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1223 	unix_peer(sk)	= newsk;
1224 
1225 	unix_state_unlock(sk);
1226 
1227 	/* take ten and send info to the listening sock */
1228 	spin_lock(&other->sk_receive_queue.lock);
1229 	__skb_queue_tail(&other->sk_receive_queue, skb);
1230 	spin_unlock(&other->sk_receive_queue.lock);
1231 	unix_state_unlock(other);
1232 	other->sk_data_ready(other);
1233 	sock_put(other);
1234 	return 0;
1235 
1236 out_unlock:
1237 	if (other)
1238 		unix_state_unlock(other);
1239 
1240 out:
1241 	kfree_skb(skb);
1242 	if (newsk)
1243 		unix_release_sock(newsk, 0);
1244 	if (other)
1245 		sock_put(other);
1246 	return err;
1247 }
1248 
1249 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1250 {
1251 	struct sock *ska = socka->sk, *skb = sockb->sk;
1252 
1253 	/* Join our sockets back to back */
1254 	sock_hold(ska);
1255 	sock_hold(skb);
1256 	unix_peer(ska) = skb;
1257 	unix_peer(skb) = ska;
1258 	init_peercred(ska);
1259 	init_peercred(skb);
1260 
1261 	if (ska->sk_type != SOCK_DGRAM) {
1262 		ska->sk_state = TCP_ESTABLISHED;
1263 		skb->sk_state = TCP_ESTABLISHED;
1264 		socka->state  = SS_CONNECTED;
1265 		sockb->state  = SS_CONNECTED;
1266 	}
1267 	return 0;
1268 }
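/*
 * Illustrative userspace sketch (compiled out): socketpair() is the
 * caller of the back-to-back join performed above; the two descriptors
 * come back connected to each other with peer credentials already set.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void socketpair_demo(void)
{
	int sv[2];
	char c;

	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
	write(sv[0], "x", 1);
	read(sv[1], &c, 1);	/* c == 'x' */
}
#endif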
1269 
1270 static void unix_sock_inherit_flags(const struct socket *old,
1271 				    struct socket *new)
1272 {
1273 	if (test_bit(SOCK_PASSCRED, &old->flags))
1274 		set_bit(SOCK_PASSCRED, &new->flags);
1275 	if (test_bit(SOCK_PASSSEC, &old->flags))
1276 		set_bit(SOCK_PASSSEC, &new->flags);
1277 }
1278 
1279 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1280 {
1281 	struct sock *sk = sock->sk;
1282 	struct sock *tsk;
1283 	struct sk_buff *skb;
1284 	int err;
1285 
1286 	err = -EOPNOTSUPP;
1287 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1288 		goto out;
1289 
1290 	err = -EINVAL;
1291 	if (sk->sk_state != TCP_LISTEN)
1292 		goto out;
1293 
1294 	/* If the socket state is TCP_LISTEN it cannot change (for now...),
1295 	 * so no locks are necessary.
1296 	 */
1297 
1298 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1299 	if (!skb) {
1300 		/* This means receive shutdown. */
1301 		if (err == 0)
1302 			err = -EINVAL;
1303 		goto out;
1304 	}
1305 
1306 	tsk = skb->sk;
1307 	skb_free_datagram(sk, skb);
1308 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1309 
1310 	/* attach accepted sock to socket */
1311 	unix_state_lock(tsk);
1312 	newsock->state = SS_CONNECTED;
1313 	unix_sock_inherit_flags(sock, newsock);
1314 	sock_graft(tsk, newsock);
1315 	unix_state_unlock(tsk);
1316 	return 0;
1317 
1318 out:
1319 	return err;
1320 }
1321 
1322 
1323 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1324 {
1325 	struct sock *sk = sock->sk;
1326 	struct unix_sock *u;
1327 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1328 	int err = 0;
1329 
1330 	if (peer) {
1331 		sk = unix_peer_get(sk);
1332 
1333 		err = -ENOTCONN;
1334 		if (!sk)
1335 			goto out;
1336 		err = 0;
1337 	} else {
1338 		sock_hold(sk);
1339 	}
1340 
1341 	u = unix_sk(sk);
1342 	unix_state_lock(sk);
1343 	if (!u->addr) {
1344 		sunaddr->sun_family = AF_UNIX;
1345 		sunaddr->sun_path[0] = 0;
1346 		*uaddr_len = sizeof(short);
1347 	} else {
1348 		struct unix_address *addr = u->addr;
1349 
1350 		*uaddr_len = addr->len;
1351 		memcpy(sunaddr, addr->name, *uaddr_len);
1352 	}
1353 	unix_state_unlock(sk);
1354 	sock_put(sk);
1355 out:
1356 	return err;
1357 }
1358 
1359 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1360 {
1361 	int i;
1362 
1363 	scm->fp = UNIXCB(skb).fp;
1364 	UNIXCB(skb).fp = NULL;
1365 
1366 	for (i = scm->fp->count-1; i >= 0; i--)
1367 		unix_notinflight(scm->fp->fp[i]);
1368 }
1369 
1370 static void unix_destruct_scm(struct sk_buff *skb)
1371 {
1372 	struct scm_cookie scm;
1373 	memset(&scm, 0, sizeof(scm));
1374 	scm.pid  = UNIXCB(skb).pid;
1375 	if (UNIXCB(skb).fp)
1376 		unix_detach_fds(&scm, skb);
1377 
1378 	/* Alas, it calls VFS */
1379 	/* So fscking what? fput() has been SMP-safe since last summer */
1380 	scm_destroy(&scm);
1381 	sock_wfree(skb);
1382 }
1383 
1384 #define MAX_RECURSION_LEVEL 4
1385 
1386 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1387 {
1388 	int i;
1389 	unsigned char max_level = 0;
1390 	int unix_sock_count = 0;
1391 
1392 	for (i = scm->fp->count - 1; i >= 0; i--) {
1393 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1394 
1395 		if (sk) {
1396 			unix_sock_count++;
1397 			max_level = max(max_level,
1398 					unix_sk(sk)->recursion_level);
1399 		}
1400 	}
1401 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1402 		return -ETOOMANYREFS;
1403 
1404 	/*
1405 	 * Need to duplicate file references for the sake of garbage
1406 	 * collection.  Otherwise a socket in the fps might become a
1407 	 * candidate for GC while the skb is not yet queued.
1408 	 */
1409 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1410 	if (!UNIXCB(skb).fp)
1411 		return -ENOMEM;
1412 
1413 	if (unix_sock_count) {
1414 		for (i = scm->fp->count - 1; i >= 0; i--)
1415 			unix_inflight(scm->fp->fp[i]);
1416 	}
1417 	return max_level;
1418 }
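/*
 * Illustrative userspace sketch (compiled out): the SCM_RIGHTS control
 * message that lands in unix_attach_fds() above.  Because the kernel
 * duplicates the file references when the skb is queued, the sender may
 * close the descriptor right after sendmsg().
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int send_fd(int sock, int fd)
{
	char data = 'F';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	return sendmsg(sock, &msg, 0);
}
#endif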
1419 
1420 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1421 {
1422 	int err = 0;
1423 
1424 	UNIXCB(skb).pid  = get_pid(scm->pid);
1425 	UNIXCB(skb).uid = scm->creds.uid;
1426 	UNIXCB(skb).gid = scm->creds.gid;
1427 	UNIXCB(skb).fp = NULL;
1428 	unix_get_secdata(scm, skb);
1429 	if (scm->fp && send_fds)
1430 		err = unix_attach_fds(scm, skb);
1431 
1432 	skb->destructor = unix_destruct_scm;
1433 	return err;
1434 }
1435 
1436 /*
1437  * Some apps rely on write() giving SCM_CREDENTIALS.
1438  * We include credentials if the source or destination socket
1439  * asserted SOCK_PASSCRED.
1440  */
1441 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1442 			    const struct sock *other)
1443 {
1444 	if (UNIXCB(skb).pid)
1445 		return;
1446 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1447 	    !other->sk_socket ||
1448 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1449 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1450 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1451 	}
1452 }
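/*
 * Illustrative userspace sketch (compiled out): a receiver asserting
 * SOCK_PASSCRED via SO_PASSCRED, which makes maybe_add_creds() above
 * attach pid/uid/gid; they arrive as an SCM_CREDENTIALS cmsg carrying
 * a struct ucred (glibc exposes it under _GNU_SOURCE).
 */
#if 0
#define _GNU_SOURCE
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void recv_with_creds(int fd, char *buf, int len, struct ucred *uc)
{
	union {
		char cbuf[CMSG_SPACE(sizeof(struct ucred))];
		struct cmsghdr align;
	} u;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.cbuf, .msg_controllen = sizeof(u.cbuf),
	};
	struct cmsghdr *cmsg;
	int on = 1;

	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
	recvmsg(fd, &msg, 0);
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_CREDENTIALS)
			memcpy(uc, CMSG_DATA(cmsg), sizeof(*uc));
}
#endif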
1453 
1454 /*
1455  *	Send AF_UNIX data.
1456  */
1457 
1458 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1459 			      size_t len)
1460 {
1461 	struct sock *sk = sock->sk;
1462 	struct net *net = sock_net(sk);
1463 	struct unix_sock *u = unix_sk(sk);
1464 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1465 	struct sock *other = NULL;
1466 	int namelen = 0; /* fake GCC */
1467 	int err;
1468 	unsigned int hash;
1469 	struct sk_buff *skb;
1470 	long timeo;
1471 	struct scm_cookie scm;
1472 	int max_level;
1473 	int data_len = 0;
1474 
1475 	wait_for_unix_gc();
1476 	err = scm_send(sock, msg, &scm, false);
1477 	if (err < 0)
1478 		return err;
1479 
1480 	err = -EOPNOTSUPP;
1481 	if (msg->msg_flags&MSG_OOB)
1482 		goto out;
1483 
1484 	if (msg->msg_namelen) {
1485 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1486 		if (err < 0)
1487 			goto out;
1488 		namelen = err;
1489 	} else {
1490 		sunaddr = NULL;
1491 		err = -ENOTCONN;
1492 		other = unix_peer_get(sk);
1493 		if (!other)
1494 			goto out;
1495 	}
1496 
1497 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1498 	    && (err = unix_autobind(sock)) != 0)
1499 		goto out;
1500 
1501 	err = -EMSGSIZE;
1502 	if (len > sk->sk_sndbuf - 32)
1503 		goto out;
1504 
1505 	if (len > SKB_MAX_ALLOC) {
1506 		data_len = min_t(size_t,
1507 				 len - SKB_MAX_ALLOC,
1508 				 MAX_SKB_FRAGS * PAGE_SIZE);
1509 		data_len = PAGE_ALIGN(data_len);
1510 
1511 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1512 	}
1513 
1514 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1515 				   msg->msg_flags & MSG_DONTWAIT, &err,
1516 				   PAGE_ALLOC_COSTLY_ORDER);
1517 	if (skb == NULL)
1518 		goto out;
1519 
1520 	err = unix_scm_to_skb(&scm, skb, true);
1521 	if (err < 0)
1522 		goto out_free;
1523 	max_level = err + 1;
1524 
1525 	skb_put(skb, len - data_len);
1526 	skb->data_len = data_len;
1527 	skb->len = len;
1528 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1529 	if (err)
1530 		goto out_free;
1531 
1532 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1533 
1534 restart:
1535 	if (!other) {
1536 		err = -ECONNRESET;
1537 		if (sunaddr == NULL)
1538 			goto out_free;
1539 
1540 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1541 					hash, &err);
1542 		if (other == NULL)
1543 			goto out_free;
1544 	}
1545 
1546 	if (sk_filter(other, skb) < 0) {
1547 		/* Toss the packet but do not return any error to the sender */
1548 		err = len;
1549 		goto out_free;
1550 	}
1551 
1552 	unix_state_lock(other);
1553 	err = -EPERM;
1554 	if (!unix_may_send(sk, other))
1555 		goto out_unlock;
1556 
1557 	if (sock_flag(other, SOCK_DEAD)) {
1558 		/*
1559 		 *	Check with 1003.1g - what should a
1560 		 *	datagram error return?
1561 		 */
1562 		unix_state_unlock(other);
1563 		sock_put(other);
1564 
1565 		err = 0;
1566 		unix_state_lock(sk);
1567 		if (unix_peer(sk) == other) {
1568 			unix_peer(sk) = NULL;
1569 			unix_state_unlock(sk);
1570 
1571 			unix_dgram_disconnected(sk, other);
1572 			sock_put(other);
1573 			err = -ECONNREFUSED;
1574 		} else {
1575 			unix_state_unlock(sk);
1576 		}
1577 
1578 		other = NULL;
1579 		if (err)
1580 			goto out_free;
1581 		goto restart;
1582 	}
1583 
1584 	err = -EPIPE;
1585 	if (other->sk_shutdown & RCV_SHUTDOWN)
1586 		goto out_unlock;
1587 
1588 	if (sk->sk_type != SOCK_SEQPACKET) {
1589 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1590 		if (err)
1591 			goto out_unlock;
1592 	}
1593 
1594 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1595 		if (!timeo) {
1596 			err = -EAGAIN;
1597 			goto out_unlock;
1598 		}
1599 
1600 		timeo = unix_wait_for_peer(other, timeo);
1601 
1602 		err = sock_intr_errno(timeo);
1603 		if (signal_pending(current))
1604 			goto out_free;
1605 
1606 		goto restart;
1607 	}
1608 
1609 	if (sock_flag(other, SOCK_RCVTSTAMP))
1610 		__net_timestamp(skb);
1611 	maybe_add_creds(skb, sock, other);
1612 	skb_queue_tail(&other->sk_receive_queue, skb);
1613 	if (max_level > unix_sk(other)->recursion_level)
1614 		unix_sk(other)->recursion_level = max_level;
1615 	unix_state_unlock(other);
1616 	other->sk_data_ready(other);
1617 	sock_put(other);
1618 	scm_destroy(&scm);
1619 	return len;
1620 
1621 out_unlock:
1622 	unix_state_unlock(other);
1623 out_free:
1624 	kfree_skb(skb);
1625 out:
1626 	if (other)
1627 		sock_put(other);
1628 	scm_destroy(&scm);
1629 	return err;
1630 }
1631 
1632 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1633  * bytes, and a minimum of a full page.
1634  */
1635 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1636 
1637 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1638 			       size_t len)
1639 {
1640 	struct sock *sk = sock->sk;
1641 	struct sock *other = NULL;
1642 	int err, size;
1643 	struct sk_buff *skb;
1644 	int sent = 0;
1645 	struct scm_cookie scm;
1646 	bool fds_sent = false;
1647 	int max_level;
1648 	int data_len;
1649 
1650 	wait_for_unix_gc();
1651 	err = scm_send(sock, msg, &scm, false);
1652 	if (err < 0)
1653 		return err;
1654 
1655 	err = -EOPNOTSUPP;
1656 	if (msg->msg_flags&MSG_OOB)
1657 		goto out_err;
1658 
1659 	if (msg->msg_namelen) {
1660 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1661 		goto out_err;
1662 	} else {
1663 		err = -ENOTCONN;
1664 		other = unix_peer(sk);
1665 		if (!other)
1666 			goto out_err;
1667 	}
1668 
1669 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1670 		goto pipe_err;
1671 
1672 	while (sent < len) {
1673 		size = len - sent;
1674 
1675 		/* Keep two messages in the pipe so it schedules better */
1676 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1677 
1678 		/* allow fallback to order-0 allocations */
1679 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1680 
1681 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1682 
1683 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1684 
1685 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1686 					   msg->msg_flags & MSG_DONTWAIT, &err,
1687 					   get_order(UNIX_SKB_FRAGS_SZ));
1688 		if (!skb)
1689 			goto out_err;
1690 
1691 		/* Only send the fds in the first buffer */
1692 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1693 		if (err < 0) {
1694 			kfree_skb(skb);
1695 			goto out_err;
1696 		}
1697 		max_level = err + 1;
1698 		fds_sent = true;
1699 
1700 		skb_put(skb, size - data_len);
1701 		skb->data_len = data_len;
1702 		skb->len = size;
1703 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1704 		if (err) {
1705 			kfree_skb(skb);
1706 			goto out_err;
1707 		}
1708 
1709 		unix_state_lock(other);
1710 
1711 		if (sock_flag(other, SOCK_DEAD) ||
1712 		    (other->sk_shutdown & RCV_SHUTDOWN))
1713 			goto pipe_err_free;
1714 
1715 		maybe_add_creds(skb, sock, other);
1716 		skb_queue_tail(&other->sk_receive_queue, skb);
1717 		if (max_level > unix_sk(other)->recursion_level)
1718 			unix_sk(other)->recursion_level = max_level;
1719 		unix_state_unlock(other);
1720 		other->sk_data_ready(other);
1721 		sent += size;
1722 	}
1723 
1724 	scm_destroy(&scm);
1725 
1726 	return sent;
1727 
1728 pipe_err_free:
1729 	unix_state_unlock(other);
1730 	kfree_skb(skb);
1731 pipe_err:
1732 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1733 		send_sig(SIGPIPE, current, 0);
1734 	err = -EPIPE;
1735 out_err:
1736 	scm_destroy(&scm);
1737 	return sent ? : err;
1738 }
1739 
1740 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1741 				    int offset, size_t size, int flags)
1742 {
1743 	int err = 0;
1744 	bool send_sigpipe = true;
1745 	struct sock *other, *sk = socket->sk;
1746 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1747 
1748 	if (flags & MSG_OOB)
1749 		return -EOPNOTSUPP;
1750 
1751 	other = unix_peer(sk);
1752 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1753 		return -ENOTCONN;
1754 
1755 	if (false) {
1756 alloc_skb:
1757 		unix_state_unlock(other);
1758 		mutex_unlock(&unix_sk(other)->readlock);
1759 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1760 					      &err, 0);
1761 		if (!newskb)
1762 			return err;
1763 	}
1764 
1765 	/* we must acquire the readlock as we modify already present
1766 	 * skbs in the sk_receive_queue and mess with skb->len
1767 	 */
1768 	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1769 	if (err) {
1770 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1771 		send_sigpipe = false;
1772 		goto err;
1773 	}
1774 
1775 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1776 		err = -EPIPE;
1777 		goto err_unlock;
1778 	}
1779 
1780 	unix_state_lock(other);
1781 
1782 	if (sock_flag(other, SOCK_DEAD) ||
1783 	    other->sk_shutdown & RCV_SHUTDOWN) {
1784 		err = -EPIPE;
1785 		goto err_state_unlock;
1786 	}
1787 
1788 	skb = skb_peek_tail(&other->sk_receive_queue);
1789 	if (tail && tail == skb) {
1790 		skb = newskb;
1791 	} else if (!skb) {
1792 		if (newskb)
1793 			skb = newskb;
1794 		else
1795 			goto alloc_skb;
1796 	} else if (newskb) {
1797 		/* this is the fast path; we don't necessarily need to
1798 		 * call kfree_skb even though with newskb == NULL
1799 		 * this does no harm
1800 		 */
1801 		consume_skb(newskb);
1802 	}
1803 
1804 	if (skb_append_pagefrags(skb, page, offset, size)) {
1805 		tail = skb;
1806 		goto alloc_skb;
1807 	}
1808 
1809 	skb->len += size;
1810 	skb->data_len += size;
1811 	skb->truesize += size;
1812 	atomic_add(size, &sk->sk_wmem_alloc);
1813 
1814 	if (newskb)
1815 		__skb_queue_tail(&other->sk_receive_queue, newskb);
1816 
1817 	unix_state_unlock(other);
1818 	mutex_unlock(&unix_sk(other)->readlock);
1819 
1820 	other->sk_data_ready(other);
1821 
1822 	return size;
1823 
1824 err_state_unlock:
1825 	unix_state_unlock(other);
1826 err_unlock:
1827 	mutex_unlock(&unix_sk(other)->readlock);
1828 err:
1829 	kfree_skb(newskb);
1830 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1831 		send_sig(SIGPIPE, current, 0);
1832 	return err;
1833 }
1834 
1835 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1836 				  size_t len)
1837 {
1838 	int err;
1839 	struct sock *sk = sock->sk;
1840 
1841 	err = sock_error(sk);
1842 	if (err)
1843 		return err;
1844 
1845 	if (sk->sk_state != TCP_ESTABLISHED)
1846 		return -ENOTCONN;
1847 
1848 	if (msg->msg_namelen)
1849 		msg->msg_namelen = 0;
1850 
1851 	return unix_dgram_sendmsg(sock, msg, len);
1852 }
1853 
1854 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1855 				  size_t size, int flags)
1856 {
1857 	struct sock *sk = sock->sk;
1858 
1859 	if (sk->sk_state != TCP_ESTABLISHED)
1860 		return -ENOTCONN;
1861 
1862 	return unix_dgram_recvmsg(sock, msg, size, flags);
1863 }
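/*
 * Illustrative userspace sketch (compiled out): SOCK_SEQPACKET is
 * connection-oriented but preserves record boundaries, which is why the
 * wrappers above can reuse the dgram send/receive paths once connected.
 */
#if 0
#include <sys/socket.h>

static void seqpacket_demo(void)
{
	int sv[2];
	char buf[16];

	socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv);
	send(sv[0], "one", 3, 0);
	send(sv[0], "two", 3, 0);
	recv(sv[1], buf, sizeof(buf), 0);	/* returns exactly "one" */
	recv(sv[1], buf, sizeof(buf), 0);	/* returns exactly "two" */
}
#endif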
1864 
1865 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1866 {
1867 	struct unix_sock *u = unix_sk(sk);
1868 
1869 	if (u->addr) {
1870 		msg->msg_namelen = u->addr->len;
1871 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1872 	}
1873 }
1874 
1875 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1876 			      size_t size, int flags)
1877 {
1878 	struct scm_cookie scm;
1879 	struct sock *sk = sock->sk;
1880 	struct unix_sock *u = unix_sk(sk);
1881 	int noblock = flags & MSG_DONTWAIT;
1882 	struct sk_buff *skb;
1883 	int err;
1884 	int peeked, skip;
1885 
1886 	err = -EOPNOTSUPP;
1887 	if (flags&MSG_OOB)
1888 		goto out;
1889 
1890 	err = mutex_lock_interruptible(&u->readlock);
1891 	if (unlikely(err)) {
1892 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1893 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible()
1894 		 */
1895 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1896 		goto out;
1897 	}
1898 
1899 	skip = sk_peek_offset(sk, flags);
1900 
1901 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1902 	if (!skb) {
1903 		unix_state_lock(sk);
1904 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1905 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1906 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1907 			err = 0;
1908 		unix_state_unlock(sk);
1909 		goto out_unlock;
1910 	}
1911 
1912 	wake_up_interruptible_sync_poll(&u->peer_wait,
1913 					POLLOUT | POLLWRNORM | POLLWRBAND);
1914 
1915 	if (msg->msg_name)
1916 		unix_copy_addr(msg, skb->sk);
1917 
1918 	if (size > skb->len - skip)
1919 		size = skb->len - skip;
1920 	else if (size < skb->len - skip)
1921 		msg->msg_flags |= MSG_TRUNC;
1922 
1923 	err = skb_copy_datagram_msg(skb, skip, msg, size);
1924 	if (err)
1925 		goto out_free;
1926 
1927 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1928 		__sock_recv_timestamp(msg, sk, skb);
1929 
1930 	memset(&scm, 0, sizeof(scm));
1931 
1932 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1933 	unix_set_secdata(&scm, skb);
1934 
1935 	if (!(flags & MSG_PEEK)) {
1936 		if (UNIXCB(skb).fp)
1937 			unix_detach_fds(&scm, skb);
1938 
1939 		sk_peek_offset_bwd(sk, skb->len);
1940 	} else {
1941 		/* It is questionable: on PEEK we could:
1942 		   - not return fds - good, but too simple 8)
1943 		   - return fds, and do not return them on read (old strategy,
1944 		     apparently wrong)
1945 		   - clone fds (I chose it for now, it is the most universal
1946 		     solution)
1947 
1948 		   POSIX 1003.1g does not actually define this clearly
1949 		   at all. POSIX 1003.1g doesn't define a lot of things
1950 		   clearly however!
1951 
1952 		*/
1953 
1954 		sk_peek_offset_fwd(sk, size);
1955 
1956 		if (UNIXCB(skb).fp)
1957 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1958 	}
1959 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1960 
1961 	scm_recv(sock, msg, &scm, flags);
1962 
1963 out_free:
1964 	skb_free_datagram(sk, skb);
1965 out_unlock:
1966 	mutex_unlock(&u->readlock);
1967 out:
1968 	return err;
1969 }
1970 
1971 /*
1972  *	Sleep until more data has arrived. But check for races..
1973  */
1974 static long unix_stream_data_wait(struct sock *sk, long timeo,
1975 				  struct sk_buff *last, unsigned int last_len)
1976 {
1977 	struct sk_buff *tail;
1978 	DEFINE_WAIT(wait);
1979 
1980 	unix_state_lock(sk);
1981 
1982 	for (;;) {
1983 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1984 
1985 		tail = skb_peek_tail(&sk->sk_receive_queue);
1986 		if (tail != last ||
1987 		    (tail && tail->len != last_len) ||
1988 		    sk->sk_err ||
1989 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1990 		    signal_pending(current) ||
1991 		    !timeo)
1992 			break;
1993 
1994 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1995 		unix_state_unlock(sk);
1996 		timeo = freezable_schedule_timeout(timeo);
1997 		unix_state_lock(sk);
1998 
1999 		if (sock_flag(sk, SOCK_DEAD))
2000 			break;
2001 
2002 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
2003 	}
2004 
2005 	finish_wait(sk_sleep(sk), &wait);
2006 	unix_state_unlock(sk);
2007 	return timeo;
2008 }
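
/*
 * The loop above is the usual prepare_to_wait() idiom: publish the
 * waiter, re-check every wake-up condition under the state lock, and
 * only then sleep.  A minimal generic sketch of the same pattern
 * (the "condition" placeholder stands in for the checks above):
 *
 *	DEFINE_WAIT(wait);
 *
 *	for (;;) {
 *		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 *		if (condition || signal_pending(current) || !timeo)
 *			break;
 *		timeo = schedule_timeout(timeo);
 *	}
 *	finish_wait(sk_sleep(sk), &wait);
 */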
2009 
2010 static unsigned int unix_skb_len(const struct sk_buff *skb)
2011 {
2012 	return skb->len - UNIXCB(skb).consumed;
2013 }
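
/*
 * E.g. a queued skb with skb->len == 100 and UNIXCB(skb).consumed == 40
 * has 60 readable bytes left; the stream receive path below unlinks an
 * skb only once unix_skb_len() reaches zero.
 */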
2014 
2015 struct unix_stream_read_state {
2016 	int (*recv_actor)(struct sk_buff *, int, int,
2017 			  struct unix_stream_read_state *);
2018 	struct socket *socket;
2019 	struct msghdr *msg;
2020 	struct pipe_inode_info *pipe;
2021 	size_t size;
2022 	int flags;
2023 	unsigned int splice_flags;
2024 };
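
/*
 * unix_stream_read_generic() walks the receive queue and hands each
 * chunk of data to recv_actor; recvmsg(2) and splice(2) differ only in
 * the actor they install (unix_stream_read_actor() and
 * unix_stream_splice_actor() below).  A hypothetical actor that simply
 * discards data would look like this (sketch only, used nowhere in
 * this file):
 *
 *	static int unix_stream_drop_actor(struct sk_buff *skb,
 *					  int skip, int chunk,
 *					  struct unix_stream_read_state *state)
 *	{
 *		// Report the whole chunk as consumed without copying it.
 *		return chunk;
 *	}
 *
 * A negative return instead makes the generic loop stop and report
 * -EFAULT when nothing has been copied yet.
 */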
2025 
2026 static int unix_stream_read_generic(struct unix_stream_read_state *state)
2027 {
2028 	struct scm_cookie scm;
2029 	struct socket *sock = state->socket;
2030 	struct sock *sk = sock->sk;
2031 	struct unix_sock *u = unix_sk(sk);
2032 	int copied = 0;
2033 	int flags = state->flags;
2034 	int noblock = flags & MSG_DONTWAIT;
2035 	bool check_creds = false;
2036 	int target;
2037 	int err = 0;
2038 	long timeo;
2039 	int skip;
2040 	size_t size = state->size;
2041 	unsigned int last_len;
2042 
2043 	err = -EINVAL;
2044 	if (sk->sk_state != TCP_ESTABLISHED)
2045 		goto out;
2046 
2047 	err = -EOPNOTSUPP;
2048 	if (flags & MSG_OOB)
2049 		goto out;
2050 
2051 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2052 	timeo = sock_rcvtimeo(sk, noblock);
2053 
2054 	memset(&scm, 0, sizeof(scm));
2055 
2056 	/* Lock the socket to prevent queue disordering
2057 	 * while we sleep in memcpy_to_msg().
2058 	 */
2059 	err = mutex_lock_interruptible(&u->readlock);
2060 	if (unlikely(err)) {
2061 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
2062 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible().
2063 		 */
2064 		err = noblock ? -EAGAIN : -ERESTARTSYS;
2065 		goto out;
2066 	}
2067 
2068 	if (flags & MSG_PEEK)
2069 		skip = sk_peek_offset(sk, flags);
2070 	else
2071 		skip = 0;
2072 
2073 	do {
2074 		int chunk;
2075 		struct sk_buff *skb, *last;
2076 
2077 		unix_state_lock(sk);
2078 		if (sock_flag(sk, SOCK_DEAD)) {
2079 			err = -ECONNRESET;
2080 			goto unlock;
2081 		}
2082 		last = skb = skb_peek(&sk->sk_receive_queue);
2083 		last_len = last ? last->len : 0;
2084 again:
2085 		if (skb == NULL) {
2086 			unix_sk(sk)->recursion_level = 0;
2087 			if (copied >= target)
2088 				goto unlock;
2089 
2090 			/*
2091 			 *	POSIX 1003.1g mandates this order.
2092 			 */
2093 
2094 			err = sock_error(sk);
2095 			if (err)
2096 				goto unlock;
2097 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2098 				goto unlock;
2099 
2100 			unix_state_unlock(sk);
2101 			err = -EAGAIN;
2102 			if (!timeo)
2103 				break;
2104 			mutex_unlock(&u->readlock);
2105 
2106 			timeo = unix_stream_data_wait(sk, timeo, last,
2107 						      last_len);
2108 
2109 			if (signal_pending(current) ||
2110 			    mutex_lock_interruptible(&u->readlock)) {
2111 				err = sock_intr_errno(timeo);
2112 				goto out;
2113 			}
2114 
2115 			continue;
2116 unlock:
2117 			unix_state_unlock(sk);
2118 			break;
2119 		}
2120 
2121 		while (skip >= unix_skb_len(skb)) {
2122 			skip -= unix_skb_len(skb);
2123 			last = skb;
2124 			last_len = skb->len;
2125 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2126 			if (!skb)
2127 				goto again;
2128 		}
2129 
2130 		unix_state_unlock(sk);
2131 
2132 		if (check_creds) {
2133 			/* Never glue messages from different writers */
2134 			if ((UNIXCB(skb).pid != scm.pid) ||
2135 			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2136 			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid) ||
2137 			    !unix_secdata_eq(&scm, skb))
2138 				break;
2139 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2140 			/* Copy credentials */
2141 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2142 			unix_set_secdata(&scm, skb);
2143 			check_creds = true;
2144 		}
2145 
2146 		/* Copy address just once */
2147 		if (state->msg && state->msg->msg_name) {
2148 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2149 					 state->msg->msg_name);
2150 			unix_copy_addr(state->msg, skb->sk);
2151 			sunaddr = NULL;
2152 		}
2153 
2154 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2155 		chunk = state->recv_actor(skb, skip, chunk, state);
2156 		if (chunk < 0) {
2157 			if (copied == 0)
2158 				copied = -EFAULT;
2159 			break;
2160 		}
2161 		copied += chunk;
2162 		size -= chunk;
2163 
2164 		/* Mark read part of skb as used */
2165 		if (!(flags & MSG_PEEK)) {
2166 			UNIXCB(skb).consumed += chunk;
2167 
2168 			sk_peek_offset_bwd(sk, chunk);
2169 
2170 			if (UNIXCB(skb).fp)
2171 				unix_detach_fds(&scm, skb);
2172 
2173 			if (unix_skb_len(skb))
2174 				break;
2175 
2176 			skb_unlink(skb, &sk->sk_receive_queue);
2177 			consume_skb(skb);
2178 
2179 			if (scm.fp)
2180 				break;
2181 		} else {
2182 			/* It is questionable; see the note in unix_dgram_recvmsg().
2183 			 */
2184 			if (UNIXCB(skb).fp)
2185 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2186 
2187 			sk_peek_offset_fwd(sk, chunk);
2188 
2189 			if (UNIXCB(skb).fp)
2190 				break;
2191 
2192 			skip = 0;
2193 			last = skb;
2194 			last_len = skb->len;
2195 			unix_state_lock(sk);
2196 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2197 			if (skb)
2198 				goto again;
2199 			unix_state_unlock(sk);
2200 			break;
2201 		}
2202 	} while (size);
2203 
2204 	mutex_unlock(&u->readlock);
2205 	if (state->msg)
2206 		scm_recv(sock, state->msg, &scm, flags);
2207 	else
2208 		scm_destroy(&scm);
2209 out:
2210 	return copied ? : err;
2211 }
2212 
2213 static int unix_stream_read_actor(struct sk_buff *skb,
2214 				  int skip, int chunk,
2215 				  struct unix_stream_read_state *state)
2216 {
2217 	int ret;
2218 
2219 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2220 				    state->msg, chunk);
2221 	return ret ?: chunk;
2222 }
2223 
2224 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2225 			       size_t size, int flags)
2226 {
2227 	struct unix_stream_read_state state = {
2228 		.recv_actor = unix_stream_read_actor,
2229 		.socket = sock,
2230 		.msg = msg,
2231 		.size = size,
2232 		.flags = flags
2233 	};
2234 
2235 	return unix_stream_read_generic(&state);
2236 }
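
/*
 * Illustrative userspace sketch (not part of this file, name
 * hypothetical): the "target" computed from sock_rcvlowat() above means
 * a blocking stream read may return as soon as SO_RCVLOWAT bytes have
 * arrived; MSG_WAITALL raises the target to the full request.
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t read_exact(int fd, void *buf, size_t len)
 *	{
 *		// Blocks until len bytes, EOF, a signal, or an error.
 *		return recv(fd, buf, len, MSG_WAITALL);
 *	}
 */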
2237 
2238 static ssize_t skb_unix_socket_splice(struct sock *sk,
2239 				      struct pipe_inode_info *pipe,
2240 				      struct splice_pipe_desc *spd)
2241 {
2242 	int ret;
2243 	struct unix_sock *u = unix_sk(sk);
2244 
2245 	mutex_unlock(&u->readlock);
2246 	ret = splice_to_pipe(pipe, spd);
2247 	mutex_lock(&u->readlock);
2248 
2249 	return ret;
2250 }
2251 
2252 static int unix_stream_splice_actor(struct sk_buff *skb,
2253 				    int skip, int chunk,
2254 				    struct unix_stream_read_state *state)
2255 {
2256 	return skb_splice_bits(skb, state->socket->sk,
2257 			       UNIXCB(skb).consumed + skip,
2258 			       state->pipe, chunk, state->splice_flags,
2259 			       skb_unix_socket_splice);
2260 }
2261 
2262 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2263 				       struct pipe_inode_info *pipe,
2264 				       size_t size, unsigned int flags)
2265 {
2266 	struct unix_stream_read_state state = {
2267 		.recv_actor = unix_stream_splice_actor,
2268 		.socket = sock,
2269 		.pipe = pipe,
2270 		.size = size,
2271 		.splice_flags = flags,
2272 	};
2273 
2274 	if (unlikely(*ppos))
2275 		return -ESPIPE;
2276 
2277 	if (sock->file->f_flags & O_NONBLOCK ||
2278 	    flags & SPLICE_F_NONBLOCK)
2279 		state.flags = MSG_DONTWAIT;
2280 
2281 	return unix_stream_read_generic(&state);
2282 }
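
/*
 * Illustrative userspace sketch (not part of this file, name
 * hypothetical): this entry point backs splice(2) when the source is a
 * unix stream socket, e.g. moving bytes into a pipe without an extra
 * userspace copy.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	static ssize_t sock_to_pipe(int sock_fd, int pipe_wr, size_t len)
 *	{
 *		// SPLICE_F_NONBLOCK maps to MSG_DONTWAIT above.
 *		return splice(sock_fd, NULL, pipe_wr, NULL, len,
 *			      SPLICE_F_NONBLOCK);
 *	}
 */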
2283 
2284 static int unix_shutdown(struct socket *sock, int mode)
2285 {
2286 	struct sock *sk = sock->sk;
2287 	struct sock *other;
2288 
2289 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2290 		return -EINVAL;
2291 	/* This maps:
2292 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2293 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2294 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2295 	 */
2296 	++mode;
2297 
2298 	unix_state_lock(sk);
2299 	sk->sk_shutdown |= mode;
2300 	other = unix_peer(sk);
2301 	if (other)
2302 		sock_hold(other);
2303 	unix_state_unlock(sk);
2304 	sk->sk_state_change(sk);
2305 
2306 	if (other &&
2307 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2308 
2309 		int peer_mode = 0;
2310 
2311 		if (mode & RCV_SHUTDOWN)
2312 			peer_mode |= SEND_SHUTDOWN;
2313 		if (mode & SEND_SHUTDOWN)
2314 			peer_mode |= RCV_SHUTDOWN;
2315 		unix_state_lock(other);
2316 		other->sk_shutdown |= peer_mode;
2317 		unix_state_unlock(other);
2318 		other->sk_state_change(other);
2319 		if (peer_mode == SHUTDOWN_MASK)
2320 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2321 		else if (peer_mode & RCV_SHUTDOWN)
2322 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2323 	}
2324 	if (other)
2325 		sock_put(other);
2326 
2327 	return 0;
2328 }
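
/*
 * Illustrative userspace sketch (not part of this file, name
 * hypothetical): shutting down the write side of one end sets
 * RCV_SHUTDOWN on the peer as done above, so the peer's reads return
 * EOF while its own writes (and our reads) remain usable.
 *
 *	#include <sys/socket.h>
 *
 *	static void half_close(int fd)
 *	{
 *		// Peer's read()/recv() now return 0 once its queue drains.
 *		shutdown(fd, SHUT_WR);
 *	}
 */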
2329 
2330 long unix_inq_len(struct sock *sk)
2331 {
2332 	struct sk_buff *skb;
2333 	long amount = 0;
2334 
2335 	if (sk->sk_state == TCP_LISTEN)
2336 		return -EINVAL;
2337 
2338 	spin_lock(&sk->sk_receive_queue.lock);
2339 	if (sk->sk_type == SOCK_STREAM ||
2340 	    sk->sk_type == SOCK_SEQPACKET) {
2341 		skb_queue_walk(&sk->sk_receive_queue, skb)
2342 			amount += unix_skb_len(skb);
2343 	} else {
2344 		skb = skb_peek(&sk->sk_receive_queue);
2345 		if (skb)
2346 			amount = skb->len;
2347 	}
2348 	spin_unlock(&sk->sk_receive_queue.lock);
2349 
2350 	return amount;
2351 }
2352 EXPORT_SYMBOL_GPL(unix_inq_len);
2353 
2354 long unix_outq_len(struct sock *sk)
2355 {
2356 	return sk_wmem_alloc_get(sk);
2357 }
2358 EXPORT_SYMBOL_GPL(unix_outq_len);
2359 
2360 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2361 {
2362 	struct sock *sk = sock->sk;
2363 	long amount = 0;
2364 	int err;
2365 
2366 	switch (cmd) {
2367 	case SIOCOUTQ:
2368 		amount = unix_outq_len(sk);
2369 		err = put_user(amount, (int __user *)arg);
2370 		break;
2371 	case SIOCINQ:
2372 		amount = unix_inq_len(sk);
2373 		if (amount < 0)
2374 			err = amount;
2375 		else
2376 			err = put_user(amount, (int __user *)arg);
2377 		break;
2378 	default:
2379 		err = -ENOIOCTLCMD;
2380 		break;
2381 	}
2382 	return err;
2383 }
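
/*
 * Illustrative userspace sketch (not part of this file, name
 * hypothetical): SIOCINQ/SIOCOUTQ expose unix_inq_len() and
 * unix_outq_len() above.  Note that for stream sockets SIOCINQ counts
 * all unread bytes, while for datagram sockets it reports only the
 * length of the first queued datagram.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	static int bytes_readable(int fd)
 *	{
 *		int n = 0;
 *
 *		// Fails with EINVAL on a listening socket, matching
 *		// the TCP_LISTEN check in unix_inq_len().
 *		if (ioctl(fd, SIOCINQ, &n) < 0)
 *			return -1;
 *		return n;
 *	}
 */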
2384 
2385 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2386 {
2387 	struct sock *sk = sock->sk;
2388 	unsigned int mask;
2389 
2390 	sock_poll_wait(file, sk_sleep(sk), wait);
2391 	mask = 0;
2392 
2393 	/* exceptional events? */
2394 	if (sk->sk_err)
2395 		mask |= POLLERR;
2396 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2397 		mask |= POLLHUP;
2398 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2399 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2400 
2401 	/* readable? */
2402 	if (!skb_queue_empty(&sk->sk_receive_queue))
2403 		mask |= POLLIN | POLLRDNORM;
2404 
2405 	/* Connection-based sockets need to check for termination and startup */
2406 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2407 	    sk->sk_state == TCP_CLOSE)
2408 		mask |= POLLHUP;
2409 
2410 	/*
2411 	 * We also set the socket writable when the other side has shut
2412 	 * down the connection, to prevent stuck sockets.
2413 	 */
2414 	if (unix_writable(sk))
2415 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2416 
2417 	return mask;
2418 }
2419 
2420 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2421 				    poll_table *wait)
2422 {
2423 	struct sock *sk = sock->sk, *other;
2424 	unsigned int mask, writable;
2425 
2426 	sock_poll_wait(file, sk_sleep(sk), wait);
2427 	mask = 0;
2428 
2429 	/* exceptional events? */
2430 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2431 		mask |= POLLERR |
2432 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2433 
2434 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2435 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2436 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2437 		mask |= POLLHUP;
2438 
2439 	/* readable? */
2440 	if (!skb_queue_empty(&sk->sk_receive_queue))
2441 		mask |= POLLIN | POLLRDNORM;
2442 
2443 	/* Connection-based sockets need to check for termination and startup */
2444 	if (sk->sk_type == SOCK_SEQPACKET) {
2445 		if (sk->sk_state == TCP_CLOSE)
2446 			mask |= POLLHUP;
2447 		/* connection hasn't started yet? */
2448 		if (sk->sk_state == TCP_SYN_SENT)
2449 			return mask;
2450 	}
2451 
2452 	/* No write status requested, avoid expensive OUT tests. */
2453 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2454 		return mask;
2455 
2456 	writable = unix_writable(sk);
2457 	other = unix_peer_get(sk);
2458 	if (other) {
2459 		if (unix_peer(other) != sk) {
2460 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2461 			if (unix_recvq_full(other))
2462 				writable = 0;
2463 		}
2464 		sock_put(other);
2465 	}
2466 
2467 	if (writable)
2468 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2469 	else
2470 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2471 
2472 	return mask;
2473 }
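
/*
 * Illustrative userspace sketch (not part of this file, name
 * hypothetical): for a connected datagram socket, writability depends
 * on the peer's receive queue (the unix_recvq_full() test above), so a
 * sender can poll for POLLOUT instead of spinning on -EAGAIN.
 *
 *	#include <poll.h>
 *
 *	static int wait_writable(int fd, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		// > 0 when writable, 0 on timeout, -1 on error.
 *		return poll(&pfd, 1, timeout_ms);
 *	}
 */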
2474 
2475 #ifdef CONFIG_PROC_FS
2476 
2477 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2478 
2479 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2480 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2481 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
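
/*
 * Worked example (assuming BITS_PER_LONG == 64 and UNIX_HASH_BITS == 8):
 * BUCKET_SPACE is 64 - 9 - 1 = 54, so set_bucket_offset(3, 5) packs
 * bucket 3 into the high bits, (3UL << 54) | 5, and get_bucket() and
 * get_offset() recover 3 and 5 again.  Offsets are 1-based: the
 * iterator below always resumes with set_bucket_offset(bucket, 1).
 */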
2482 
2483 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2484 {
2485 	unsigned long offset = get_offset(*pos);
2486 	unsigned long bucket = get_bucket(*pos);
2487 	struct sock *sk;
2488 	unsigned long count = 0;
2489 
2490 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2491 		if (sock_net(sk) != seq_file_net(seq))
2492 			continue;
2493 		if (++count == offset)
2494 			break;
2495 	}
2496 
2497 	return sk;
2498 }
2499 
2500 static struct sock *unix_next_socket(struct seq_file *seq,
2501 				     struct sock *sk,
2502 				     loff_t *pos)
2503 {
2504 	unsigned long bucket;
2505 
2506 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2507 		sk = sk_next(sk);
2508 		if (!sk)
2509 			goto next_bucket;
2510 		if (sock_net(sk) == seq_file_net(seq))
2511 			return sk;
2512 	}
2513 
2514 	do {
2515 		sk = unix_from_bucket(seq, pos);
2516 		if (sk)
2517 			return sk;
2518 
2519 next_bucket:
2520 		bucket = get_bucket(*pos) + 1;
2521 		*pos = set_bucket_offset(bucket, 1);
2522 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2523 
2524 	return NULL;
2525 }
2526 
2527 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2528 	__acquires(unix_table_lock)
2529 {
2530 	spin_lock(&unix_table_lock);
2531 
2532 	if (!*pos)
2533 		return SEQ_START_TOKEN;
2534 
2535 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2536 		return NULL;
2537 
2538 	return unix_next_socket(seq, NULL, pos);
2539 }
2540 
2541 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2542 {
2543 	++*pos;
2544 	return unix_next_socket(seq, v, pos);
2545 }
2546 
2547 static void unix_seq_stop(struct seq_file *seq, void *v)
2548 	__releases(unix_table_lock)
2549 {
2550 	spin_unlock(&unix_table_lock);
2551 }
2552 
2553 static int unix_seq_show(struct seq_file *seq, void *v)
2554 {
2555 
2556 	if (v == SEQ_START_TOKEN)
2557 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2558 			 "Inode Path\n");
2559 	else {
2560 		struct sock *s = v;
2561 		struct unix_sock *u = unix_sk(s);
2562 		unix_state_lock(s);
2563 
2564 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2565 			s,
2566 			atomic_read(&s->sk_refcnt),
2567 			0,
2568 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2569 			s->sk_type,
2570 			s->sk_socket ?
2571 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2572 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2573 			sock_i_ino(s));
2574 
2575 		if (u->addr) {
2576 			int i, len;
2577 			seq_putc(seq, ' ');
2578 
2579 			i = 0;
2580 			len = u->addr->len - sizeof(short);
2581 			if (!UNIX_ABSTRACT(s))
2582 				len--;
2583 			else {
2584 				seq_putc(seq, '@');
2585 				i++;
2586 			}
2587 			for ( ; i < len; i++)
2588 				seq_putc(seq, u->addr->name->sun_path[i]);
2589 		}
2590 		unix_state_unlock(s);
2591 		seq_putc(seq, '\n');
2592 	}
2593 
2594 	return 0;
2595 }
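
/*
 * The resulting /proc/net/unix lines look roughly as follows (the
 * values here are made up; abstract names get a leading '@'):
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	ffff88003c1a2000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 */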
2596 
2597 static const struct seq_operations unix_seq_ops = {
2598 	.start  = unix_seq_start,
2599 	.next   = unix_seq_next,
2600 	.stop   = unix_seq_stop,
2601 	.show   = unix_seq_show,
2602 };
2603 
2604 static int unix_seq_open(struct inode *inode, struct file *file)
2605 {
2606 	return seq_open_net(inode, file, &unix_seq_ops,
2607 			    sizeof(struct seq_net_private));
2608 }
2609 
2610 static const struct file_operations unix_seq_fops = {
2611 	.owner		= THIS_MODULE,
2612 	.open		= unix_seq_open,
2613 	.read		= seq_read,
2614 	.llseek		= seq_lseek,
2615 	.release	= seq_release_net,
2616 };
2617 
2618 #endif
2619 
2620 static const struct net_proto_family unix_family_ops = {
2621 	.family = PF_UNIX,
2622 	.create = unix_create,
2623 	.owner	= THIS_MODULE,
2624 };
2625 
2626 
2627 static int __net_init unix_net_init(struct net *net)
2628 {
2629 	int error = -ENOMEM;
2630 
2631 	net->unx.sysctl_max_dgram_qlen = 10;
2632 	if (unix_sysctl_register(net))
2633 		goto out;
2634 
2635 #ifdef CONFIG_PROC_FS
2636 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2637 		unix_sysctl_unregister(net);
2638 		goto out;
2639 	}
2640 #endif
2641 	error = 0;
2642 out:
2643 	return error;
2644 }
2645 
2646 static void __net_exit unix_net_exit(struct net *net)
2647 {
2648 	unix_sysctl_unregister(net);
2649 	remove_proc_entry("unix", net->proc_net);
2650 }
2651 
2652 static struct pernet_operations unix_net_ops = {
2653 	.init = unix_net_init,
2654 	.exit = unix_net_exit,
2655 };
2656 
2657 static int __init af_unix_init(void)
2658 {
2659 	int rc = -1;
2660 
2661 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2662 
2663 	rc = proto_register(&unix_proto, 1);
2664 	if (rc != 0) {
2665 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2666 		goto out;
2667 	}
2668 
2669 	sock_register(&unix_family_ops);
2670 	register_pernet_subsys(&unix_net_ops);
2671 out:
2672 	return rc;
2673 }
2674 
2675 static void __exit af_unix_exit(void)
2676 {
2677 	sock_unregister(PF_UNIX);
2678 	proto_unregister(&unix_proto);
2679 	unregister_pernet_subsys(&unix_net_ops);
2680 }
2681 
2682 /* Earlier than device_initcall() so that other drivers invoking
2683    request_module() don't end up in a loop when modprobe tries
2684    to use a UNIX socket. But later than subsys_initcall() because
2685    we depend on stuff initialised there. */
2686 fs_initcall(af_unix_init);
2687 module_exit(af_unix_exit);
2688 
2689 MODULE_LICENSE("GPL");
2690 MODULE_ALIAS_NETPROTO(PF_UNIX);
2691