xref: /openbmc/linux/net/unix/af_unix.c (revision 293d5b43)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko EiBfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it will avoid a huge number
38  *					of socks hashed (this for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  started by 0, so that this name space does not intersect
80  *		  with BSD names.
81  */
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 
/*
 * Global table of all AF_UNIX sockets, 2 * UNIX_HASH_SIZE chains.
 * Name and inode lookups in this file index the lower UNIX_HASH_SIZE
 * chains; unix_sockets_unbound() selects a chain in the upper half for
 * sockets that have not been given a name yet.
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
/* Spinlock held around every walk or modification of unix_socket_table. */
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/*
 * Count of unix sockets: incremented in unix_create1(), decremented in
 * unix_sock_destructor().
 */
static atomic_long_t unix_nr_socks;
126 
127 
128 static struct hlist_head *unix_sockets_unbound(void *addr)
129 {
130 	unsigned long hash = (unsigned long)addr;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash %= UNIX_HASH_SIZE;
135 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136 }
137 
/*
 * True when the address hash of @sk is below UNIX_HASH_SIZE. unix_bind()
 * stores exactly UNIX_HASH_SIZE for filesystem-bound sockets, so this
 * selects abstract names. Dereferences ->addr unconditionally.
 */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139 
140 #ifdef CONFIG_SECURITY_NETWORK
141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 {
143 	UNIXCB(skb).secid = scm->secid;
144 }
145 
146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 {
148 	scm->secid = UNIXCB(skb).secid;
149 }
150 
151 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
152 {
153 	return (scm->secid == UNIXCB(skb).secid);
154 }
155 #else
156 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
157 { }
158 
159 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
160 { }
161 
162 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
163 {
164 	return true;
165 }
166 #endif /* CONFIG_SECURITY_NETWORK */
167 
168 /*
169  *  SMP locking strategy:
170  *    hash table is protected with spinlock unix_table_lock
171  *    each socket state is protected by separate spin lock.
172  */
173 
174 static inline unsigned int unix_hash_fold(__wsum n)
175 {
176 	unsigned int hash = (__force unsigned int)csum_fold(n);
177 
178 	hash ^= hash>>8;
179 	return hash&(UNIX_HASH_SIZE-1);
180 }
181 
/* Peer pointer of @sk as an lvalue; no locking is done by the macro itself. */
#define unix_peer(sk) (unix_sk(sk)->peer)
183 
/* Non-zero when the peer recorded on @osk is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
188 
189 static inline int unix_may_send(struct sock *sk, struct sock *osk)
190 {
191 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
192 }
193 
194 static inline int unix_recvq_full(struct sock const *sk)
195 {
196 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
197 }
198 
199 struct sock *unix_peer_get(struct sock *s)
200 {
201 	struct sock *peer;
202 
203 	unix_state_lock(s);
204 	peer = unix_peer(s);
205 	if (peer)
206 		sock_hold(peer);
207 	unix_state_unlock(s);
208 	return peer;
209 }
210 EXPORT_SYMBOL_GPL(unix_peer_get);
211 
212 static inline void unix_release_addr(struct unix_address *addr)
213 {
214 	if (atomic_dec_and_test(&addr->refcnt))
215 		kfree(addr);
216 }
217 
218 /*
219  *	Check unix socket name:
220  *		- should be not zero length.
221  *	        - if started by not zero, should be NULL terminated (FS object)
222  *		- if started by zero, it is abstract name.
223  */
224 
225 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
226 {
227 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
228 		return -EINVAL;
229 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
230 		return -EINVAL;
231 	if (sunaddr->sun_path[0]) {
232 		/*
233 		 * This may look like an off by one error but it is a bit more
234 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
235 		 * sun_path[108] doesn't as such exist.  However in kernel space
236 		 * we are guaranteed that it is a valid memory location in our
237 		 * kernel address buffer.
238 		 */
239 		((char *)sunaddr)[len] = 0;
240 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
241 		return len;
242 	}
243 
244 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
245 	return len;
246 }
247 
/*
 * Unlink @sk from its hash chain. No lock is taken here; the locked
 * wrapper is unix_remove_socket().
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
252 
/*
 * Link @sk onto @list, warning if it is still hashed somewhere. No lock
 * is taken here; the locked wrapper is unix_insert_socket().
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));

	sk_add_node(sk, list);
}
258 
259 static inline void unix_remove_socket(struct sock *sk)
260 {
261 	spin_lock(&unix_table_lock);
262 	__unix_remove_socket(sk);
263 	spin_unlock(&unix_table_lock);
264 }
265 
266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
267 {
268 	spin_lock(&unix_table_lock);
269 	__unix_insert_socket(list, sk);
270 	spin_unlock(&unix_table_lock);
271 }
272 
273 static struct sock *__unix_find_socket_byname(struct net *net,
274 					      struct sockaddr_un *sunname,
275 					      int len, int type, unsigned int hash)
276 {
277 	struct sock *s;
278 
279 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
280 		struct unix_sock *u = unix_sk(s);
281 
282 		if (!net_eq(sock_net(s), net))
283 			continue;
284 
285 		if (u->addr->len == len &&
286 		    !memcmp(u->addr->name, sunname, len))
287 			goto found;
288 	}
289 	s = NULL;
290 found:
291 	return s;
292 }
293 
294 static inline struct sock *unix_find_socket_byname(struct net *net,
295 						   struct sockaddr_un *sunname,
296 						   int len, int type,
297 						   unsigned int hash)
298 {
299 	struct sock *s;
300 
301 	spin_lock(&unix_table_lock);
302 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
303 	if (s)
304 		sock_hold(s);
305 	spin_unlock(&unix_table_lock);
306 	return s;
307 }
308 
309 static struct sock *unix_find_socket_byinode(struct inode *i)
310 {
311 	struct sock *s;
312 
313 	spin_lock(&unix_table_lock);
314 	sk_for_each(s,
315 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
316 		struct dentry *dentry = unix_sk(s)->path.dentry;
317 
318 		if (dentry && d_real_inode(dentry) == i) {
319 			sock_hold(s);
320 			goto found;
321 		}
322 	}
323 	s = NULL;
324 found:
325 	spin_unlock(&unix_table_lock);
326 	return s;
327 }
328 
329 /* Support code for asymmetrically connected dgram sockets
330  *
331  * If a datagram socket is connected to a socket not itself connected
332  * to the first socket (eg, /dev/log), clients may only enqueue more
333  * messages if the present receive queue of the server socket is not
334  * "too large". This means there's a second writeability condition
335  * poll and sendmsg need to test. The dgram recv code will do a wake
336  * up on the peer_wait wait queue of a socket upon reception of a
337  * datagram which needs to be propagated to sleeping would-be writers
338  * since these might not have sent anything so far. This can't be
339  * accomplished via poll_wait because the lifetime of the server
340  * socket might be less than that of its clients if these break their
341  * association with it or if the server socket is closed while clients
342  * are still connected to it and there's no way to inform "a polling
343  * implementation" that it should let go of a certain wait queue
344  *
345  * In order to propagate a wake up, a wait_queue_t of the client
346  * socket is enqueued on the peer_wait queue of the server socket
347  * whose wake function does a wake_up on the ordinary client socket
348  * wait queue. This connection is established whenever a write (or
349  * poll for write) hit the flow control condition and broken when the
350  * association to the server socket is dissolved or after a wake up
351  * was relayed.
352  */
353 
354 static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
355 				      void *key)
356 {
357 	struct unix_sock *u;
358 	wait_queue_head_t *u_sleep;
359 
360 	u = container_of(q, struct unix_sock, peer_wake);
361 
362 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
363 			    q);
364 	u->peer_wake.private = NULL;
365 
366 	/* relaying can only happen while the wq still exists */
367 	u_sleep = sk_sleep(&u->sk);
368 	if (u_sleep)
369 		wake_up_interruptible_poll(u_sleep, key);
370 
371 	return 0;
372 }
373 
374 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
375 {
376 	struct unix_sock *u, *u_other;
377 	int rc;
378 
379 	u = unix_sk(sk);
380 	u_other = unix_sk(other);
381 	rc = 0;
382 	spin_lock(&u_other->peer_wait.lock);
383 
384 	if (!u->peer_wake.private) {
385 		u->peer_wake.private = other;
386 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
387 
388 		rc = 1;
389 	}
390 
391 	spin_unlock(&u_other->peer_wait.lock);
392 	return rc;
393 }
394 
395 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
396 					    struct sock *other)
397 {
398 	struct unix_sock *u, *u_other;
399 
400 	u = unix_sk(sk);
401 	u_other = unix_sk(other);
402 	spin_lock(&u_other->peer_wait.lock);
403 
404 	if (u->peer_wake.private == other) {
405 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
406 		u->peer_wake.private = NULL;
407 	}
408 
409 	spin_unlock(&u_other->peer_wait.lock);
410 }
411 
412 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
413 						   struct sock *other)
414 {
415 	unix_dgram_peer_wake_disconnect(sk, other);
416 	wake_up_interruptible_poll(sk_sleep(sk),
417 				   POLLOUT |
418 				   POLLWRNORM |
419 				   POLLWRBAND);
420 }
421 
/* Arrange to be woken by @other if its receive queue is full.
 *
 * preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 *
 * The wake-up entry is enqueued first and the queue state checked
 * afterwards. Returns 1 when other's receive queue is full (the entry
 * stays enqueued). Returns 0 otherwise; in that case an entry enqueued
 * by this call is removed again.
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	const int enqueued = unix_dgram_peer_wake_connect(sk, other);

	if (!unix_recvq_full(other)) {
		if (enqueued)
			unix_dgram_peer_wake_disconnect(sk, other);
		return 0;
	}

	return 1;
}
440 
441 static int unix_writable(const struct sock *sk)
442 {
443 	return sk->sk_state != TCP_LISTEN &&
444 	       (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
445 }
446 
447 static void unix_write_space(struct sock *sk)
448 {
449 	struct socket_wq *wq;
450 
451 	rcu_read_lock();
452 	if (unix_writable(sk)) {
453 		wq = rcu_dereference(sk->sk_wq);
454 		if (skwq_has_sleeper(wq))
455 			wake_up_interruptible_sync_poll(&wq->wait,
456 				POLLOUT | POLLWRNORM | POLLWRBAND);
457 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
458 	}
459 	rcu_read_unlock();
460 }
461 
462 /* When dgram socket disconnects (or changes its peer), we clear its receive
463  * queue of packets arrived from previous peer. First, it allows to do
464  * flow control based only on wmem_alloc; second, sk connected to peer
465  * may receive messages only from that peer. */
466 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
467 {
468 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
469 		skb_queue_purge(&sk->sk_receive_queue);
470 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
471 
472 		/* If one link of bidirectional dgram pipe is disconnected,
473 		 * we signal error. Messages are lost. Do not make this,
474 		 * when peer was not connected to us.
475 		 */
476 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
477 			other->sk_err = ECONNRESET;
478 			other->sk_error_report(other);
479 		}
480 	}
481 }
482 
483 static void unix_sock_destructor(struct sock *sk)
484 {
485 	struct unix_sock *u = unix_sk(sk);
486 
487 	skb_queue_purge(&sk->sk_receive_queue);
488 
489 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
490 	WARN_ON(!sk_unhashed(sk));
491 	WARN_ON(sk->sk_socket);
492 	if (!sock_flag(sk, SOCK_DEAD)) {
493 		pr_info("Attempt to release alive unix socket: %p\n", sk);
494 		return;
495 	}
496 
497 	if (u->addr)
498 		unix_release_addr(u->addr);
499 
500 	atomic_long_dec(&unix_nr_socks);
501 	local_bh_disable();
502 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
503 	local_bh_enable();
504 #ifdef UNIX_REFCNT_DEBUG
505 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
506 		atomic_long_read(&unix_nr_socks));
507 #endif
508 }
509 
/*
 * Tear down a unix socket.
 *
 * @sk:      socket being released
 * @embrion: non-zero when called recursively for a socket still queued
 *           on a listener (see the receive queue loop below); it makes
 *           the peer see ECONNRESET even when nothing was queued.
 *
 * Steps, in order: unhash the socket, orphan it and mark it closed,
 * wake peer_wait sleepers, notify and detach the peer, drain the
 * receive queue, drop the filesystem path and the socket reference,
 * and finally run the fd garbage collector when fds are in flight.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state; path and old state are saved for use after unlock */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* unread data (or an embryo) means the peer lost data */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* on a listener each queued skb stands for a pending socket */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		UNIXCB(skb).consumed = skb->len;
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
583 
584 static void init_peercred(struct sock *sk)
585 {
586 	put_pid(sk->sk_peer_pid);
587 	if (sk->sk_peer_cred)
588 		put_cred(sk->sk_peer_cred);
589 	sk->sk_peer_pid  = get_pid(task_tgid(current));
590 	sk->sk_peer_cred = get_current_cred();
591 }
592 
593 static void copy_peercred(struct sock *sk, struct sock *peersk)
594 {
595 	put_pid(sk->sk_peer_pid);
596 	if (sk->sk_peer_cred)
597 		put_cred(sk->sk_peer_cred);
598 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
599 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
600 }
601 
602 static int unix_listen(struct socket *sock, int backlog)
603 {
604 	int err;
605 	struct sock *sk = sock->sk;
606 	struct unix_sock *u = unix_sk(sk);
607 	struct pid *old_pid = NULL;
608 
609 	err = -EOPNOTSUPP;
610 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
611 		goto out;	/* Only stream/seqpacket sockets accept */
612 	err = -EINVAL;
613 	if (!u->addr)
614 		goto out;	/* No listens on an unbound socket */
615 	unix_state_lock(sk);
616 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
617 		goto out_unlock;
618 	if (backlog > sk->sk_max_ack_backlog)
619 		wake_up_interruptible_all(&u->peer_wait);
620 	sk->sk_max_ack_backlog	= backlog;
621 	sk->sk_state		= TCP_LISTEN;
622 	/* set credentials so connect can copy them */
623 	init_peercred(sk);
624 	err = 0;
625 
626 out_unlock:
627 	unix_state_unlock(sk);
628 	put_pid(old_pid);
629 out:
630 	return err;
631 }
632 
/*
 * Forward declarations of the socket operation handlers referenced by
 * the proto_ops tables below.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
				    size_t size, int flags);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
				       struct pipe_inode_info *, size_t size,
				       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
				  int);
659 
660 static int unix_set_peek_off(struct sock *sk, int val)
661 {
662 	struct unix_sock *u = unix_sk(sk);
663 
664 	if (mutex_lock_interruptible(&u->readlock))
665 		return -EINTR;
666 
667 	sk->sk_peek_off = val;
668 	mutex_unlock(&u->readlock);
669 
670 	return 0;
671 }
672 
673 
/*
 * Socket operations for SOCK_STREAM sockets (selected in unix_create()).
 * Socket options and mmap use the generic sock_no_* stubs.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	unix_stream_sendpage,
	.splice_read =	unix_stream_splice_read,
	.set_peek_off =	unix_set_peek_off,
};
696 
/*
 * Socket operations for SOCK_DGRAM sockets; unix_create() also maps
 * SOCK_RAW onto this table. accept, listen, sendpage, socket options
 * and mmap use the generic sock_no_* stubs.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
718 
/*
 * Socket operations for SOCK_SEQPACKET sockets (selected in
 * unix_create()). Connection setup reuses the stream handlers
 * (connect, accept, listen) while poll reuses the dgram handler.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
740 
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size
 * is the size of struct unix_sock.
 */
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
746 
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (applied with lockdep_set_class() in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
754 
755 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
756 {
757 	struct sock *sk = NULL;
758 	struct unix_sock *u;
759 
760 	atomic_long_inc(&unix_nr_socks);
761 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
762 		goto out;
763 
764 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
765 	if (!sk)
766 		goto out;
767 
768 	sock_init_data(sock, sk);
769 	lockdep_set_class(&sk->sk_receive_queue.lock,
770 				&af_unix_sk_receive_queue_lock_key);
771 
772 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
773 	sk->sk_write_space	= unix_write_space;
774 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
775 	sk->sk_destruct		= unix_sock_destructor;
776 	u	  = unix_sk(sk);
777 	u->path.dentry = NULL;
778 	u->path.mnt = NULL;
779 	spin_lock_init(&u->lock);
780 	atomic_long_set(&u->inflight, 0);
781 	INIT_LIST_HEAD(&u->link);
782 	mutex_init(&u->readlock); /* single task reading lock */
783 	init_waitqueue_head(&u->peer_wait);
784 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
785 	unix_insert_socket(unix_sockets_unbound(sk), sk);
786 out:
787 	if (sk == NULL)
788 		atomic_long_dec(&unix_nr_socks);
789 	else {
790 		local_bh_disable();
791 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
792 		local_bh_enable();
793 	}
794 	return sk;
795 }
796 
797 static int unix_create(struct net *net, struct socket *sock, int protocol,
798 		       int kern)
799 {
800 	if (protocol && protocol != PF_UNIX)
801 		return -EPROTONOSUPPORT;
802 
803 	sock->state = SS_UNCONNECTED;
804 
805 	switch (sock->type) {
806 	case SOCK_STREAM:
807 		sock->ops = &unix_stream_ops;
808 		break;
809 		/*
810 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
811 		 *	nothing uses it.
812 		 */
813 	case SOCK_RAW:
814 		sock->type = SOCK_DGRAM;
815 	case SOCK_DGRAM:
816 		sock->ops = &unix_dgram_ops;
817 		break;
818 	case SOCK_SEQPACKET:
819 		sock->ops = &unix_seqpacket_ops;
820 		break;
821 	default:
822 		return -ESOCKTNOSUPPORT;
823 	}
824 
825 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
826 }
827 
828 static int unix_release(struct socket *sock)
829 {
830 	struct sock *sk = sock->sk;
831 
832 	if (!sk)
833 		return 0;
834 
835 	unix_release_sock(sk, 0);
836 	sock->sk = NULL;
837 
838 	return 0;
839 }
840 
841 static int unix_autobind(struct socket *sock)
842 {
843 	struct sock *sk = sock->sk;
844 	struct net *net = sock_net(sk);
845 	struct unix_sock *u = unix_sk(sk);
846 	static u32 ordernum = 1;
847 	struct unix_address *addr;
848 	int err;
849 	unsigned int retries = 0;
850 
851 	err = mutex_lock_interruptible(&u->readlock);
852 	if (err)
853 		return err;
854 
855 	err = 0;
856 	if (u->addr)
857 		goto out;
858 
859 	err = -ENOMEM;
860 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
861 	if (!addr)
862 		goto out;
863 
864 	addr->name->sun_family = AF_UNIX;
865 	atomic_set(&addr->refcnt, 1);
866 
867 retry:
868 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
869 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
870 
871 	spin_lock(&unix_table_lock);
872 	ordernum = (ordernum+1)&0xFFFFF;
873 
874 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
875 				      addr->hash)) {
876 		spin_unlock(&unix_table_lock);
877 		/*
878 		 * __unix_find_socket_byname() may take long time if many names
879 		 * are already in use.
880 		 */
881 		cond_resched();
882 		/* Give up if all names seems to be in use. */
883 		if (retries++ == 0xFFFFF) {
884 			err = -ENOSPC;
885 			kfree(addr);
886 			goto out;
887 		}
888 		goto retry;
889 	}
890 	addr->hash ^= sk->sk_type;
891 
892 	__unix_remove_socket(sk);
893 	u->addr = addr;
894 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
895 	spin_unlock(&unix_table_lock);
896 	err = 0;
897 
898 out:	mutex_unlock(&u->readlock);
899 	return err;
900 }
901 
/*
 * Resolve a unix address to the socket bound to it.
 *
 * Filesystem names are resolved with kern_path(), need write permission
 * on the inode and must refer to a socket inode. Abstract names are
 * looked up in the hash table. The access time of the backing path, if
 * any, is touched on success.
 *
 * Returns the socket with a reference held. On failure returns NULL and
 * stores the reason in *error: the path lookup or permission error,
 * -ECONNREFUSED when nothing is bound there, or -EPROTOTYPE when the
 * socket found is of a different type than @type.
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *found;
	struct inode *inode;
	struct path path;
	int rc;

	if (!sunname->sun_path[0]) {
		/* abstract name */
		found = unix_find_socket_byname(net, sunname, len, type, hash);
		if (!found) {
			rc = -ECONNREFUSED;
			goto fail;
		}
		if (unix_sk(found)->path.dentry)
			touch_atime(&unix_sk(found)->path);
		return found;
	}

	/* filesystem name */
	rc = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
	if (rc)
		goto fail;

	inode = d_real_inode(path.dentry);
	rc = inode_permission(inode, MAY_WRITE);
	if (rc)
		goto put_fail;

	rc = -ECONNREFUSED;
	if (!S_ISSOCK(inode->i_mode))
		goto put_fail;

	found = unix_find_socket_byinode(inode);
	if (!found)
		goto put_fail;

	if (found->sk_type != type) {
		path_put(&path);
		sock_put(found);
		rc = -EPROTOTYPE;
		goto fail;
	}

	touch_atime(&path);
	path_put(&path);
	return found;

put_fail:
	path_put(&path);
fail:
	*error = rc;
	return NULL;
}
956 
957 static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode,
958 		      struct path *res)
959 {
960 	int err;
961 
962 	err = security_path_mknod(path, dentry, mode, 0);
963 	if (!err) {
964 		err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
965 		if (!err) {
966 			res->mnt = mntget(path->mnt);
967 			res->dentry = dget(dentry);
968 		}
969 	}
970 
971 	return err;
972 }
973 
/*
 * bind() handler.
 *
 * An address consisting of just the family triggers autobind. Otherwise
 * the name is validated with unix_mkname(); a filesystem name gets a
 * socket node created with unix_mknod() and is hashed by inode number,
 * while an abstract name must be unused and is hashed by its checksum.
 *
 * Returns 0 on success, -EINVAL for a bad family/name or an already
 * bound socket, -EADDRINUSE when the name is taken, -ENOMEM on
 * allocation failure, or the error from path creation, mknod or an
 * interrupted mutex wait.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err, name_err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;
	struct path path;
	struct dentry *dentry;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	name_err = 0;
	dentry = NULL;
	if (sun_path[0]) {
		/* Get the parent directory, calculate the hash for last
		 * component.
		 */
		dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);

		if (IS_ERR(dentry)) {
			/* delay report until after 'already bound' check */
			name_err = PTR_ERR(dentry);
			dentry = NULL;
		}
	}

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out_path;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	if (name_err) {
		err = name_err == -EEXIST ? -EADDRINUSE : name_err;
		goto out_up;
	}

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	/* for filesystem names this is a placeholder, replaced below */
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (dentry) {
		struct path u_path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(dentry, &path, mode, &u_path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		/* filesystem sockets: fixed hash marker, chain from inode number */
		addr->hash = UNIX_HASH_SIZE;
		hash = d_real_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
		spin_lock(&unix_table_lock);
		u->path = u_path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	/* both branches arrive here holding unix_table_lock */
	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out_path:
	/* dentry is non-NULL only when kern_path_create() succeeded */
	if (dentry)
		done_path_create(&path, dentry);

out:
	return err;
}
1084 
/*
 * Lock the state of @sk1 and, if it is a distinct non-NULL socket, of
 * @sk2 as well.  When both are locked, the one with the lower address is
 * taken first and the other one with unix_state_lock_nested().
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
1099 
1100 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1101 {
1102 	if (unlikely(sk1 == sk2) || !sk2) {
1103 		unix_state_unlock(sk1);
1104 		return;
1105 	}
1106 	unix_state_unlock(sk1);
1107 	unix_state_unlock(sk2);
1108 }
1109 
1110 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1111 			      int alen, int flags)
1112 {
1113 	struct sock *sk = sock->sk;
1114 	struct net *net = sock_net(sk);
1115 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1116 	struct sock *other;
1117 	unsigned int hash;
1118 	int err;
1119 
1120 	if (addr->sa_family != AF_UNSPEC) {
1121 		err = unix_mkname(sunaddr, alen, &hash);
1122 		if (err < 0)
1123 			goto out;
1124 		alen = err;
1125 
1126 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1127 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1128 			goto out;
1129 
1130 restart:
1131 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1132 		if (!other)
1133 			goto out;
1134 
1135 		unix_state_double_lock(sk, other);
1136 
1137 		/* Apparently VFS overslept socket death. Retry. */
1138 		if (sock_flag(other, SOCK_DEAD)) {
1139 			unix_state_double_unlock(sk, other);
1140 			sock_put(other);
1141 			goto restart;
1142 		}
1143 
1144 		err = -EPERM;
1145 		if (!unix_may_send(sk, other))
1146 			goto out_unlock;
1147 
1148 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1149 		if (err)
1150 			goto out_unlock;
1151 
1152 	} else {
1153 		/*
1154 		 *	1003.1g breaking connected state with AF_UNSPEC
1155 		 */
1156 		other = NULL;
1157 		unix_state_double_lock(sk, other);
1158 	}
1159 
1160 	/*
1161 	 * If it was connected, reconnect.
1162 	 */
1163 	if (unix_peer(sk)) {
1164 		struct sock *old_peer = unix_peer(sk);
1165 		unix_peer(sk) = other;
1166 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1167 
1168 		unix_state_double_unlock(sk, other);
1169 
1170 		if (other != old_peer)
1171 			unix_dgram_disconnected(sk, old_peer);
1172 		sock_put(old_peer);
1173 	} else {
1174 		unix_peer(sk) = other;
1175 		unix_state_double_unlock(sk, other);
1176 	}
1177 	return 0;
1178 
1179 out_unlock:
1180 	unix_state_double_unlock(sk, other);
1181 	sock_put(other);
1182 out:
1183 	return err;
1184 }
1185 
/*
 * Sleep on @other's peer_wait queue until woken or until @timeo runs out.
 *
 * The function unlocks @other's state lock without taking it, so the
 * caller must enter with that lock held; it is released on return.
 * Returns the value handed back by schedule_timeout(), or @timeo
 * unchanged when no sleep was needed.
 */
1186 static long unix_wait_for_peer(struct sock *other, long timeo)
1187 {
1188 	struct unix_sock *u = unix_sk(other);
1189 	int sched;
1190 	DEFINE_WAIT(wait);
1191 
	/* Queue ourselves before testing the condition and dropping the lock. */
1192 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1193 
	/* Only sleep if the peer is alive, still receiving, and its queue is full. */
1194 	sched = !sock_flag(other, SOCK_DEAD) &&
1195 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1196 		unix_recvq_full(other);
1197 
1198 	unix_state_unlock(other);
1199 
1200 	if (sched)
1201 		timeo = schedule_timeout(timeo);
1202 
1203 	finish_wait(&u->peer_wait, &wait);
1204 	return timeo;
1205 }
1206 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) and a small skb charged to it are allocated before
 * any lock is taken.  On success newsk becomes the peer of @sock and the
 * skb is queued on the listener's receive queue, where unix_accept()
 * dequeues it and reads the new sock from skb->sk.
 *
 * Returns 0 on success or a negative errno, e.g. -ENOMEM, -ECONNREFUSED
 * (target is not listening or has shut down receiving), -EAGAIN (backlog
 * full and no timeout left), -EISCONN, -EINVAL, or an error from
 * unix_mkname(), unix_autobind(), unix_find_other() or the security hook.
 */
1207 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1208 			       int addr_len, int flags)
1209 {
1210 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1211 	struct sock *sk = sock->sk;
1212 	struct net *net = sock_net(sk);
1213 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1214 	struct sock *newsk = NULL;
1215 	struct sock *other = NULL;
1216 	struct sk_buff *skb = NULL;
1217 	unsigned int hash;
1218 	int st;
1219 	int err;
1220 	long timeo;
1221 
1222 	err = unix_mkname(sunaddr, addr_len, &hash);
1223 	if (err < 0)
1224 		goto out;
1225 	addr_len = err;
1226 
1227 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1228 	    (err = unix_autobind(sock)) != 0)
1229 		goto out;
1230 
1231 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1232 
1233 	/* First of all allocate resources.
1234 	   If we will make it after state is locked,
1235 	   we will have to recheck all again in any case.
1236 	 */
1237 
1238 	err = -ENOMEM;
1239 
1240 	/* create new sock for complete connection */
1241 	newsk = unix_create1(sock_net(sk), NULL, 0);
1242 	if (newsk == NULL)
1243 		goto out;
1244 
1245 	/* Allocate skb for sending to listening sock */
1246 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1247 	if (skb == NULL)
1248 		goto out;
1249 
1250 restart:
1251 	/*  Find listening sock. */
1252 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1253 	if (!other)
1254 		goto out;
1255 
1256 	/* Latch state of peer */
1257 	unix_state_lock(other);
1258 
1259 	/* Apparently VFS overslept socket death. Retry. */
1260 	if (sock_flag(other, SOCK_DEAD)) {
1261 		unix_state_unlock(other);
1262 		sock_put(other);
1263 		goto restart;
1264 	}
1265 
1266 	err = -ECONNREFUSED;
1267 	if (other->sk_state != TCP_LISTEN)
1268 		goto out_unlock;
1269 	if (other->sk_shutdown & RCV_SHUTDOWN)
1270 		goto out_unlock;
1271 
1272 	if (unix_recvq_full(other)) {
1273 		err = -EAGAIN;
1274 		if (!timeo)
1275 			goto out_unlock;
1276 
		/* unix_wait_for_peer() releases other's state lock. */
1277 		timeo = unix_wait_for_peer(other, timeo);
1278 
1279 		err = sock_intr_errno(timeo);
1280 		if (signal_pending(current))
1281 			goto out;
1282 		sock_put(other);
1283 		goto restart;
1284 	}
1285 
1286 	/* Latch our state.
1287 
1288 	   It is tricky place. We need to grab our state lock and cannot
1289 	   drop lock on peer. It is dangerous because deadlock is
1290 	   possible. Connect to self case and simultaneous
1291 	   attempt to connect are eliminated by checking socket
1292 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1293 	   check this before attempt to grab lock.
1294 
1295 	   Well, and we have to recheck the state after socket locked.
1296 	 */
1297 	st = sk->sk_state;
1298 
1299 	switch (st) {
1300 	case TCP_CLOSE:
1301 		/* This is ok... continue with connect */
1302 		break;
1303 	case TCP_ESTABLISHED:
1304 		/* Socket is already connected */
1305 		err = -EISCONN;
1306 		goto out_unlock;
1307 	default:
1308 		err = -EINVAL;
1309 		goto out_unlock;
1310 	}
1311 
1312 	unix_state_lock_nested(sk);
1313 
	/* State changed between the unlocked read and taking the lock: retry. */
1314 	if (sk->sk_state != st) {
1315 		unix_state_unlock(sk);
1316 		unix_state_unlock(other);
1317 		sock_put(other);
1318 		goto restart;
1319 	}
1320 
1321 	err = security_unix_stream_connect(sk, other, newsk);
1322 	if (err) {
1323 		unix_state_unlock(sk);
1324 		goto out_unlock;
1325 	}
1326 
1327 	/* The way is open! Fastly set all the necessary fields... */
1328 
	/* Reference held on sk for newsk's peer pointer. */
1329 	sock_hold(sk);
1330 	unix_peer(newsk)	= sk;
1331 	newsk->sk_state		= TCP_ESTABLISHED;
1332 	newsk->sk_type		= sk->sk_type;
1333 	init_peercred(newsk);
1334 	newu = unix_sk(newsk);
1335 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1336 	otheru = unix_sk(other);
1337 
1338 	/* copy address information from listening to new sock*/
1339 	if (otheru->addr) {
1340 		atomic_inc(&otheru->addr->refcnt);
1341 		newu->addr = otheru->addr;
1342 	}
1343 	if (otheru->path.dentry) {
1344 		path_get(&otheru->path);
1345 		newu->path = otheru->path;
1346 	}
1347 
1348 	/* Set credentials */
1349 	copy_peercred(sk, other);
1350 
1351 	sock->state	= SS_CONNECTED;
1352 	sk->sk_state	= TCP_ESTABLISHED;
	/* Reference held on newsk for sk's peer pointer. */
1353 	sock_hold(newsk);
1354 
1355 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1356 	unix_peer(sk)	= newsk;
1357 
1358 	unix_state_unlock(sk);
1359 
1360 	/* take ten and send info to listening sock */
1361 	spin_lock(&other->sk_receive_queue.lock);
1362 	__skb_queue_tail(&other->sk_receive_queue, skb);
1363 	spin_unlock(&other->sk_receive_queue.lock);
1364 	unix_state_unlock(other);
1365 	other->sk_data_ready(other);
1366 	sock_put(other);
1367 	return 0;
1368 
1369 out_unlock:
1370 	if (other)
1371 		unix_state_unlock(other);
1372 
	/* Common failure exit: free the skb, release newsk, drop the lookup ref. */
1373 out:
1374 	kfree_skb(skb);
1375 	if (newsk)
1376 		unix_release_sock(newsk, 0);
1377 	if (other)
1378 		sock_put(other);
1379 	return err;
1380 }
1381 
1382 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1383 {
1384 	struct sock *ska = socka->sk, *skb = sockb->sk;
1385 
1386 	/* Join our sockets back to back */
1387 	sock_hold(ska);
1388 	sock_hold(skb);
1389 	unix_peer(ska) = skb;
1390 	unix_peer(skb) = ska;
1391 	init_peercred(ska);
1392 	init_peercred(skb);
1393 
1394 	if (ska->sk_type != SOCK_DGRAM) {
1395 		ska->sk_state = TCP_ESTABLISHED;
1396 		skb->sk_state = TCP_ESTABLISHED;
1397 		socka->state  = SS_CONNECTED;
1398 		sockb->state  = SS_CONNECTED;
1399 	}
1400 	return 0;
1401 }
1402 
1403 static void unix_sock_inherit_flags(const struct socket *old,
1404 				    struct socket *new)
1405 {
1406 	if (test_bit(SOCK_PASSCRED, &old->flags))
1407 		set_bit(SOCK_PASSCRED, &new->flags);
1408 	if (test_bit(SOCK_PASSSEC, &old->flags))
1409 		set_bit(SOCK_PASSSEC, &new->flags);
1410 }
1411 
1412 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1413 {
1414 	struct sock *sk = sock->sk;
1415 	struct sock *tsk;
1416 	struct sk_buff *skb;
1417 	int err;
1418 
1419 	err = -EOPNOTSUPP;
1420 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1421 		goto out;
1422 
1423 	err = -EINVAL;
1424 	if (sk->sk_state != TCP_LISTEN)
1425 		goto out;
1426 
1427 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1428 	 * so that no locks are necessary.
1429 	 */
1430 
1431 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1432 	if (!skb) {
1433 		/* This means receive shutdown. */
1434 		if (err == 0)
1435 			err = -EINVAL;
1436 		goto out;
1437 	}
1438 
1439 	tsk = skb->sk;
1440 	skb_free_datagram(sk, skb);
1441 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1442 
1443 	/* attach accepted sock to socket */
1444 	unix_state_lock(tsk);
1445 	newsock->state = SS_CONNECTED;
1446 	unix_sock_inherit_flags(sock, newsock);
1447 	sock_graft(tsk, newsock);
1448 	unix_state_unlock(tsk);
1449 	return 0;
1450 
1451 out:
1452 	return err;
1453 }
1454 
1455 
1456 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1457 {
1458 	struct sock *sk = sock->sk;
1459 	struct unix_sock *u;
1460 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1461 	int err = 0;
1462 
1463 	if (peer) {
1464 		sk = unix_peer_get(sk);
1465 
1466 		err = -ENOTCONN;
1467 		if (!sk)
1468 			goto out;
1469 		err = 0;
1470 	} else {
1471 		sock_hold(sk);
1472 	}
1473 
1474 	u = unix_sk(sk);
1475 	unix_state_lock(sk);
1476 	if (!u->addr) {
1477 		sunaddr->sun_family = AF_UNIX;
1478 		sunaddr->sun_path[0] = 0;
1479 		*uaddr_len = sizeof(short);
1480 	} else {
1481 		struct unix_address *addr = u->addr;
1482 
1483 		*uaddr_len = addr->len;
1484 		memcpy(sunaddr, addr->name, *uaddr_len);
1485 	}
1486 	unix_state_unlock(sk);
1487 	sock_put(sk);
1488 out:
1489 	return err;
1490 }
1491 
1492 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1493 {
1494 	int i;
1495 
1496 	scm->fp = UNIXCB(skb).fp;
1497 	UNIXCB(skb).fp = NULL;
1498 
1499 	for (i = scm->fp->count-1; i >= 0; i--)
1500 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1501 }
1502 
1503 static void unix_destruct_scm(struct sk_buff *skb)
1504 {
1505 	struct scm_cookie scm;
1506 	memset(&scm, 0, sizeof(scm));
1507 	scm.pid  = UNIXCB(skb).pid;
1508 	if (UNIXCB(skb).fp)
1509 		unix_detach_fds(&scm, skb);
1510 
1511 	/* Alas, it calls VFS */
1512 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1513 	scm_destroy(&scm);
1514 	sock_wfree(skb);
1515 }
1516 
1517 /*
1518  * The "user->unix_inflight" variable is protected by the garbage
1519  * collection lock, and we just read it locklessly here. If you go
1520  * over the limit, there might be a tiny race in actually noticing
1521  * it across threads. Tough.
1522  */
1523 static inline bool too_many_unix_fds(struct task_struct *p)
1524 {
1525 	struct user_struct *user = current_user();
1526 
1527 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
1528 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1529 	return false;
1530 }
1531 
1532 #define MAX_RECURSION_LEVEL 4
1533 
1534 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1535 {
1536 	int i;
1537 	unsigned char max_level = 0;
1538 
1539 	if (too_many_unix_fds(current))
1540 		return -ETOOMANYREFS;
1541 
1542 	for (i = scm->fp->count - 1; i >= 0; i--) {
1543 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1544 
1545 		if (sk)
1546 			max_level = max(max_level,
1547 					unix_sk(sk)->recursion_level);
1548 	}
1549 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1550 		return -ETOOMANYREFS;
1551 
1552 	/*
1553 	 * Need to duplicate file references for the sake of garbage
1554 	 * collection.  Otherwise a socket in the fps might become a
1555 	 * candidate for GC while the skb is not yet queued.
1556 	 */
1557 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1558 	if (!UNIXCB(skb).fp)
1559 		return -ENOMEM;
1560 
1561 	for (i = scm->fp->count - 1; i >= 0; i--)
1562 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1563 	return max_level;
1564 }
1565 
1566 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1567 {
1568 	int err = 0;
1569 
1570 	UNIXCB(skb).pid  = get_pid(scm->pid);
1571 	UNIXCB(skb).uid = scm->creds.uid;
1572 	UNIXCB(skb).gid = scm->creds.gid;
1573 	UNIXCB(skb).fp = NULL;
1574 	unix_get_secdata(scm, skb);
1575 	if (scm->fp && send_fds)
1576 		err = unix_attach_fds(scm, skb);
1577 
1578 	skb->destructor = unix_destruct_scm;
1579 	return err;
1580 }
1581 
1582 static bool unix_passcred_enabled(const struct socket *sock,
1583 				  const struct sock *other)
1584 {
1585 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1586 	       !other->sk_socket ||
1587 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1588 }
1589 
1590 /*
1591  * Some apps rely on write() giving SCM_CREDENTIALS
1592  * We include credentials if source or destination socket
1593  * asserted SOCK_PASSCRED.
1594  */
1595 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1596 			    const struct sock *other)
1597 {
1598 	if (UNIXCB(skb).pid)
1599 		return;
1600 	if (unix_passcred_enabled(sock, other)) {
1601 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1602 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1603 	}
1604 }
1605 
1606 static int maybe_init_creds(struct scm_cookie *scm,
1607 			    struct socket *socket,
1608 			    const struct sock *other)
1609 {
1610 	int err;
1611 	struct msghdr msg = { .msg_controllen = 0 };
1612 
1613 	err = scm_send(socket, &msg, scm, false);
1614 	if (err)
1615 		return err;
1616 
1617 	if (unix_passcred_enabled(socket, other)) {
1618 		scm->pid = get_pid(task_tgid(current));
1619 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1620 	}
1621 	return err;
1622 }
1623 
1624 static bool unix_skb_scm_eq(struct sk_buff *skb,
1625 			    struct scm_cookie *scm)
1626 {
1627 	const struct unix_skb_parms *u = &UNIXCB(skb);
1628 
1629 	return u->pid == scm->pid &&
1630 	       uid_eq(u->uid, scm->creds.uid) &&
1631 	       gid_eq(u->gid, scm->creds.gid) &&
1632 	       unix_secdata_eq(scm, skb);
1633 }
1634 
1635 /*
1636  *	Send AF_UNIX data.
1637  */
1638 
/*
 * Send one datagram of @len bytes described by @msg.
 *
 * The destination is msg->msg_name when a name is given, otherwise the
 * connected peer.  The whole message is copied into a single skb, which
 * is then queued on the receiver.  If the receiver's queue is full the
 * sender either sleeps (timeout available) or fails with -EAGAIN.
 *
 * Returns @len on success (also when the receiver's filter drops the
 * packet) or a negative errno such as -EOPNOTSUPP (MSG_OOB), -ENOTCONN,
 * -EMSGSIZE, -ECONNRESET, -ECONNREFUSED, -EPERM, -EPIPE or -EAGAIN.
 */
1639 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1640 			      size_t len)
1641 {
1642 	struct sock *sk = sock->sk;
1643 	struct net *net = sock_net(sk);
1644 	struct unix_sock *u = unix_sk(sk);
1645 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1646 	struct sock *other = NULL;
1647 	int namelen = 0; /* fake GCC */
1648 	int err;
1649 	unsigned int hash;
1650 	struct sk_buff *skb;
1651 	long timeo;
1652 	struct scm_cookie scm;
1653 	int max_level;
1654 	int data_len = 0;
	/* Non-zero while sk's own state lock is held in addition to other's. */
1655 	int sk_locked;
1656 
1657 	wait_for_unix_gc();
1658 	err = scm_send(sock, msg, &scm, false);
1659 	if (err < 0)
1660 		return err;
1661 
1662 	err = -EOPNOTSUPP;
1663 	if (msg->msg_flags&MSG_OOB)
1664 		goto out;
1665 
1666 	if (msg->msg_namelen) {
1667 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1668 		if (err < 0)
1669 			goto out;
1670 		namelen = err;
1671 	} else {
		/* No explicit destination: use the connected peer. */
1672 		sunaddr = NULL;
1673 		err = -ENOTCONN;
1674 		other = unix_peer_get(sk);
1675 		if (!other)
1676 			goto out;
1677 	}
1678 
1679 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1680 	    && (err = unix_autobind(sock)) != 0)
1681 		goto out;
1682 
1683 	err = -EMSGSIZE;
1684 	if (len > sk->sk_sndbuf - 32)
1685 		goto out;
1686 
	/* Anything beyond SKB_MAX_ALLOC goes into page fragments. */
1687 	if (len > SKB_MAX_ALLOC) {
1688 		data_len = min_t(size_t,
1689 				 len - SKB_MAX_ALLOC,
1690 				 MAX_SKB_FRAGS * PAGE_SIZE);
1691 		data_len = PAGE_ALIGN(data_len);
1692 
1693 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1694 	}
1695 
1696 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1697 				   msg->msg_flags & MSG_DONTWAIT, &err,
1698 				   PAGE_ALLOC_COSTLY_ORDER);
1699 	if (skb == NULL)
1700 		goto out;
1701 
1702 	err = unix_scm_to_skb(&scm, skb, true);
1703 	if (err < 0)
1704 		goto out_free;
	/* One more than the deepest level reported for the passed sockets. */
1705 	max_level = err + 1;
1706 
1707 	skb_put(skb, len - data_len);
1708 	skb->data_len = data_len;
1709 	skb->len = len;
1710 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1711 	if (err)
1712 		goto out_free;
1713 
1714 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1715 
1716 restart:
1717 	if (!other) {
1718 		err = -ECONNRESET;
1719 		if (sunaddr == NULL)
1720 			goto out_free;
1721 
1722 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1723 					hash, &err);
1724 		if (other == NULL)
1725 			goto out_free;
1726 	}
1727 
1728 	if (sk_filter(other, skb) < 0) {
1729 		/* Toss the packet but do not return any error to the sender */
1730 		err = len;
1731 		goto out_free;
1732 	}
1733 
1734 	sk_locked = 0;
1735 	unix_state_lock(other);
1736 restart_locked:
1737 	err = -EPERM;
1738 	if (!unix_may_send(sk, other))
1739 		goto out_unlock;
1740 
1741 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1742 		/*
1743 		 *	Check with 1003.1g - what should
1744 		 *	datagram error
1745 		 */
1746 		unix_state_unlock(other);
1747 		sock_put(other);
1748 
1749 		if (!sk_locked)
1750 			unix_state_lock(sk);
1751 
1752 		err = 0;
		/* The dead socket was our connected peer: disconnect and fail. */
1753 		if (unix_peer(sk) == other) {
1754 			unix_peer(sk) = NULL;
1755 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1756 
1757 			unix_state_unlock(sk);
1758 
1759 			unix_dgram_disconnected(sk, other);
1760 			sock_put(other);
1761 			err = -ECONNREFUSED;
1762 		} else {
1763 			unix_state_unlock(sk);
1764 		}
1765 
1766 		other = NULL;
1767 		if (err)
1768 			goto out_free;
		/* Otherwise look the name up again. */
1769 		goto restart;
1770 	}
1771 
1772 	err = -EPIPE;
1773 	if (other->sk_shutdown & RCV_SHUTDOWN)
1774 		goto out_unlock;
1775 
1776 	if (sk->sk_type != SOCK_SEQPACKET) {
1777 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1778 		if (err)
1779 			goto out_unlock;
1780 	}
1781 
1782 	/* other == sk && unix_peer(other) != sk if
1783 	 * - unix_peer(sk) == NULL, destination address bound to sk
1784 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
1785 	 */
1786 	if (other != sk &&
1787 	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1788 		if (timeo) {
			/* unix_wait_for_peer() releases other's state lock. */
1789 			timeo = unix_wait_for_peer(other, timeo);
1790 
1791 			err = sock_intr_errno(timeo);
1792 			if (signal_pending(current))
1793 				goto out_free;
1794 
1795 			goto restart;
1796 		}
1797 
		/* Re-take the locks in unix_state_double_lock() order. */
1798 		if (!sk_locked) {
1799 			unix_state_unlock(other);
1800 			unix_state_double_lock(sk, other);
1801 		}
1802 
1803 		if (unix_peer(sk) != other ||
1804 		    unix_dgram_peer_wake_me(sk, other)) {
1805 			err = -EAGAIN;
1806 			sk_locked = 1;
1807 			goto out_unlock;
1808 		}
1809 
		/* Both locks were dropped and re-taken above: redo the checks. */
1810 		if (!sk_locked) {
1811 			sk_locked = 1;
1812 			goto restart_locked;
1813 		}
1814 	}
1815 
1816 	if (unlikely(sk_locked))
1817 		unix_state_unlock(sk);
1818 
1819 	if (sock_flag(other, SOCK_RCVTSTAMP))
1820 		__net_timestamp(skb);
1821 	maybe_add_creds(skb, sock, other);
1822 	skb_queue_tail(&other->sk_receive_queue, skb);
1823 	if (max_level > unix_sk(other)->recursion_level)
1824 		unix_sk(other)->recursion_level = max_level;
1825 	unix_state_unlock(other);
1826 	other->sk_data_ready(other);
1827 	sock_put(other);
1828 	scm_destroy(&scm);
1829 	return len;
1830 
1831 out_unlock:
1832 	if (sk_locked)
1833 		unix_state_unlock(sk);
1834 	unix_state_unlock(other);
1835 out_free:
1836 	kfree_skb(skb);
1837 out:
1838 	if (other)
1839 		sock_put(other);
1840 	scm_destroy(&scm);
1841 	return err;
1842 }
1843 
1844 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1845  * bytes, and a minimum of a full page.
1846  */
1847 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1848 
/*
 * Send @len bytes from @msg on a connected stream socket.
 *
 * The data is cut into skbs, each limited by half the send buffer and
 * by SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ, and queued one by one on the
 * peer's receive queue.  Passed file descriptors travel with the first
 * skb only.
 *
 * Returns the number of bytes queued if any were, otherwise a negative
 * errno (-EOPNOTSUPP, -EISCONN, -ENOTCONN, -EPIPE, or an allocation or
 * copy error).  SIGPIPE is raised on the pipe-error path when nothing
 * was sent and MSG_NOSIGNAL is not set.
 */
1849 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1850 			       size_t len)
1851 {
1852 	struct sock *sk = sock->sk;
1853 	struct sock *other = NULL;
1854 	int err, size;
1855 	struct sk_buff *skb;
1856 	int sent = 0;
1857 	struct scm_cookie scm;
1858 	bool fds_sent = false;
1859 	int max_level;
1860 	int data_len;
1861 
1862 	wait_for_unix_gc();
1863 	err = scm_send(sock, msg, &scm, false);
1864 	if (err < 0)
1865 		return err;
1866 
1867 	err = -EOPNOTSUPP;
1868 	if (msg->msg_flags&MSG_OOB)
1869 		goto out_err;
1870 
	/* A destination address is never accepted on a stream socket. */
1871 	if (msg->msg_namelen) {
1872 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1873 		goto out_err;
1874 	} else {
1875 		err = -ENOTCONN;
1876 		other = unix_peer(sk);
1877 		if (!other)
1878 			goto out_err;
1879 	}
1880 
1881 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1882 		goto pipe_err;
1883 
1884 	while (sent < len) {
1885 		size = len - sent;
1886 
1887 		/* Keep two messages in the pipe so it schedules better */
1888 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1889 
1890 		/* allow fallback to order-0 allocations */
1891 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1892 
		/* Whatever exceeds the linear head goes into page fragments. */
1893 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1894 
1895 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1896 
1897 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1898 					   msg->msg_flags & MSG_DONTWAIT, &err,
1899 					   get_order(UNIX_SKB_FRAGS_SZ));
1900 		if (!skb)
1901 			goto out_err;
1902 
1903 		/* Only send the fds in the first buffer */
1904 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1905 		if (err < 0) {
1906 			kfree_skb(skb);
1907 			goto out_err;
1908 		}
1909 		max_level = err + 1;
1910 		fds_sent = true;
1911 
1912 		skb_put(skb, size - data_len);
1913 		skb->data_len = data_len;
1914 		skb->len = size;
1915 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1916 		if (err) {
1917 			kfree_skb(skb);
1918 			goto out_err;
1919 		}
1920 
1921 		unix_state_lock(other);
1922 
1923 		if (sock_flag(other, SOCK_DEAD) ||
1924 		    (other->sk_shutdown & RCV_SHUTDOWN))
1925 			goto pipe_err_free;
1926 
1927 		maybe_add_creds(skb, sock, other);
1928 		skb_queue_tail(&other->sk_receive_queue, skb);
1929 		if (max_level > unix_sk(other)->recursion_level)
1930 			unix_sk(other)->recursion_level = max_level;
1931 		unix_state_unlock(other);
1932 		other->sk_data_ready(other);
1933 		sent += size;
1934 	}
1935 
1936 	scm_destroy(&scm);
1937 
1938 	return sent;
1939 
1940 pipe_err_free:
1941 	unix_state_unlock(other);
1942 	kfree_skb(skb);
1943 pipe_err:
1944 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1945 		send_sig(SIGPIPE, current, 0);
1946 	err = -EPIPE;
1947 out_err:
1948 	scm_destroy(&scm);
	/* A partial send reports the byte count rather than the error. */
1949 	return sent ? : err;
1950 }
1951 
/*
 * Append @size bytes of @page, starting at @offset, to the peer's
 * receive queue without copying.
 *
 * The page is attached to the last queued skb when that skb carries the
 * same credentials (unix_skb_scm_eq()) and still has room; otherwise a
 * fresh, empty skb is allocated and queued.  The peer's readlock is held
 * because skbs already on the queue are modified.
 *
 * Returns @size on success or a negative errno (-EOPNOTSUPP, -ENOTCONN,
 * -EPIPE, -EAGAIN/-ERESTARTSYS when the mutex wait is interrupted, or an
 * allocation/credential error).  SIGPIPE is raised on -EPIPE unless
 * MSG_NOSIGNAL is set.
 */
1952 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1953 				    int offset, size_t size, int flags)
1954 {
1955 	int err;
1956 	bool send_sigpipe = false;
	/* True until scm has been initialised by maybe_init_creds(). */
1957 	bool init_scm = true;
1958 	struct scm_cookie scm;
1959 	struct sock *other, *sk = socket->sk;
	/* tail: the queue tail that made us go and allocate newskb. */
1960 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1961 
1962 	if (flags & MSG_OOB)
1963 		return -EOPNOTSUPP;
1964 
1965 	other = unix_peer(sk);
1966 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1967 		return -ENOTCONN;
1968 
	/* This block is entered only through "goto alloc_skb": it drops both
	 * locks, allocates an empty skb, then falls through to re-take them.
	 */
1969 	if (false) {
1970 alloc_skb:
1971 		unix_state_unlock(other);
1972 		mutex_unlock(&unix_sk(other)->readlock);
1973 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1974 					      &err, 0);
1975 		if (!newskb)
1976 			goto err;
1977 	}
1978 
1979 	/* we must acquire readlock as we modify already present
1980 	 * skbs in the sk_receive_queue and mess with skb->len
1981 	 */
1982 	err = mutex_lock_interruptible(&unix_sk(other)->readlock);
1983 	if (err) {
1984 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1985 		goto err;
1986 	}
1987 
1988 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1989 		err = -EPIPE;
1990 		send_sigpipe = true;
1991 		goto err_unlock;
1992 	}
1993 
1994 	unix_state_lock(other);
1995 
1996 	if (sock_flag(other, SOCK_DEAD) ||
1997 	    other->sk_shutdown & RCV_SHUTDOWN) {
1998 		err = -EPIPE;
1999 		send_sigpipe = true;
2000 		goto err_state_unlock;
2001 	}
2002 
2003 	if (init_scm) {
2004 		err = maybe_init_creds(&scm, socket, other);
2005 		if (err)
2006 			goto err_state_unlock;
2007 		init_scm = false;
2008 	}
2009 
2010 	skb = skb_peek_tail(&other->sk_receive_queue);
2011 	if (tail && tail == skb) {
		/* Tail unchanged since we went to allocate: use the new skb. */
2012 		skb = newskb;
2013 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
		/* Empty queue or different credentials: a separate skb is needed. */
2014 		if (newskb) {
2015 			skb = newskb;
2016 		} else {
2017 			tail = skb;
2018 			goto alloc_skb;
2019 		}
2020 	} else if (newskb) {
2021 		/* this is fast path, we don't necessarily need to
2022 		 * call to kfree_skb even though with newskb == NULL
2023 		 * this - does no harm
2024 		 */
2025 		consume_skb(newskb);
2026 		newskb = NULL;
2027 	}
2028 
	/* A non-zero result means the page could not be added to this skb. */
2029 	if (skb_append_pagefrags(skb, page, offset, size)) {
2030 		tail = skb;
2031 		goto alloc_skb;
2032 	}
2033 
	/* Account for the bytes just attached. */
2034 	skb->len += size;
2035 	skb->data_len += size;
2036 	skb->truesize += size;
2037 	atomic_add(size, &sk->sk_wmem_alloc);
2038 
2039 	if (newskb) {
2040 		err = unix_scm_to_skb(&scm, skb, false);
2041 		if (err)
2042 			goto err_state_unlock;
2043 		spin_lock(&other->sk_receive_queue.lock);
2044 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2045 		spin_unlock(&other->sk_receive_queue.lock);
2046 	}
2047 
2048 	unix_state_unlock(other);
2049 	mutex_unlock(&unix_sk(other)->readlock);
2050 
2051 	other->sk_data_ready(other);
2052 	scm_destroy(&scm);
2053 	return size;
2054 
2055 err_state_unlock:
2056 	unix_state_unlock(other);
2057 err_unlock:
2058 	mutex_unlock(&unix_sk(other)->readlock);
2059 err:
2060 	kfree_skb(newskb);
2061 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2062 		send_sig(SIGPIPE, current, 0);
	/* scm is only valid once maybe_init_creds() has succeeded. */
2063 	if (!init_scm)
2064 		scm_destroy(&scm);
2065 	return err;
2066 }
2067 
2068 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2069 				  size_t len)
2070 {
2071 	int err;
2072 	struct sock *sk = sock->sk;
2073 
2074 	err = sock_error(sk);
2075 	if (err)
2076 		return err;
2077 
2078 	if (sk->sk_state != TCP_ESTABLISHED)
2079 		return -ENOTCONN;
2080 
2081 	if (msg->msg_namelen)
2082 		msg->msg_namelen = 0;
2083 
2084 	return unix_dgram_sendmsg(sock, msg, len);
2085 }
2086 
2087 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2088 				  size_t size, int flags)
2089 {
2090 	struct sock *sk = sock->sk;
2091 
2092 	if (sk->sk_state != TCP_ESTABLISHED)
2093 		return -ENOTCONN;
2094 
2095 	return unix_dgram_recvmsg(sock, msg, size, flags);
2096 }
2097 
2098 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2099 {
2100 	struct unix_sock *u = unix_sk(sk);
2101 
2102 	if (u->addr) {
2103 		msg->msg_namelen = u->addr->len;
2104 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
2105 	}
2106 }
2107 
/*
 * Receive one datagram into @msg, copying at most @size bytes.
 *
 * u->readlock is held from the moment an skb is obtained until it has
 * been copied and released.  With MSG_PEEK the passed files are
 * duplicated rather than detached and the peek offset is advanced.
 *
 * Returns the number of bytes copied, or the full remaining datagram
 * length when MSG_TRUNC is set in @flags; 0 for a SEQPACKET socket whose
 * receive side is shut down and has nothing queued; otherwise a
 * negative errno.
 */
2108 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2109 			      size_t size, int flags)
2110 {
2111 	struct scm_cookie scm;
2112 	struct sock *sk = sock->sk;
2113 	struct unix_sock *u = unix_sk(sk);
2114 	struct sk_buff *skb, *last;
2115 	long timeo;
2116 	int err;
2117 	int peeked, skip;
2118 
2119 	err = -EOPNOTSUPP;
2120 	if (flags&MSG_OOB)
2121 		goto out;
2122 
2123 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2124 
	/* Try to dequeue under readlock; wait for more packets with it released. */
2125 	do {
2126 		mutex_lock(&u->readlock);
2127 
2128 		skip = sk_peek_offset(sk, flags);
2129 		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
2130 					      &last);
2131 		if (skb)
2132 			break;
2133 
2134 		mutex_unlock(&u->readlock);
2135 
2136 		if (err != -EAGAIN)
2137 			break;
2138 	} while (timeo &&
2139 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2140 
2141 	if (!skb) { /* implies readlock unlocked */
2142 		unix_state_lock(sk);
2143 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2144 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2145 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2146 			err = 0;
2147 		unix_state_unlock(sk);
2148 		goto out;
2149 	}
2150 
	/* Wake anyone sleeping on our peer_wait queue for write readiness. */
2151 	if (wq_has_sleeper(&u->peer_wait))
2152 		wake_up_interruptible_sync_poll(&u->peer_wait,
2153 						POLLOUT | POLLWRNORM |
2154 						POLLWRBAND);
2155 
2156 	if (msg->msg_name)
2157 		unix_copy_addr(msg, skb->sk);
2158 
	/* Clamp to what is left of the datagram; flag truncation otherwise. */
2159 	if (size > skb->len - skip)
2160 		size = skb->len - skip;
2161 	else if (size < skb->len - skip)
2162 		msg->msg_flags |= MSG_TRUNC;
2163 
2164 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2165 	if (err)
2166 		goto out_free;
2167 
2168 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2169 		__sock_recv_timestamp(msg, sk, skb);
2170 
2171 	memset(&scm, 0, sizeof(scm));
2172 
2173 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2174 	unix_set_secdata(&scm, skb);
2175 
2176 	if (!(flags & MSG_PEEK)) {
2177 		if (UNIXCB(skb).fp)
2178 			unix_detach_fds(&scm, skb);
2179 
2180 		sk_peek_offset_bwd(sk, skb->len);
2181 	} else {
2182 		/* It is questionable: on PEEK we could:
2183 		   - do not return fds - good, but too simple 8)
2184 		   - return fds, and do not return them on read (old strategy,
2185 		     apparently wrong)
2186 		   - clone fds (I chose it for now, it is the most universal
2187 		     solution)
2188 
2189 		   POSIX 1003.1g does not actually define this clearly
2190 		   at all. POSIX 1003.1g doesn't define a lot of things
2191 		   clearly however!
2192 
2193 		*/
2194 
2195 		sk_peek_offset_fwd(sk, size);
2196 
2197 		if (UNIXCB(skb).fp)
2198 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2199 	}
2200 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2201 
2202 	scm_recv(sock, msg, &scm, flags);
2203 
2204 out_free:
2205 	skb_free_datagram(sk, skb);
2206 	mutex_unlock(&u->readlock);
2207 out:
2208 	return err;
2209 }
2210 
2211 /*
2212  *	Sleep until more data has arrived. But check for races..
 *
 *	@last and @last_len describe the tail skb of the receive queue and
 *	its length as last seen by the caller.  The wait ends when the tail
 *	or its length differs, when an error, receive shutdown or signal is
 *	pending, when @timeo reaches zero, or when the socket is found
 *	SOCK_DEAD after a sleep.  The state lock is taken and released
 *	internally.  Returns the remaining timeout.
2213  */
2214 static long unix_stream_data_wait(struct sock *sk, long timeo,
2215 				  struct sk_buff *last, unsigned int last_len)
2216 {
2217 	struct sk_buff *tail;
2218 	DEFINE_WAIT(wait);
2219 
2220 	unix_state_lock(sk);
2221 
2222 	for (;;) {
2223 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2224 
2225 		tail = skb_peek_tail(&sk->sk_receive_queue);
2226 		if (tail != last ||
2227 		    (tail && tail->len != last_len) ||
2228 		    sk->sk_err ||
2229 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2230 		    signal_pending(current) ||
2231 		    !timeo)
2232 			break;
2233 
		/* The state lock is dropped for the duration of the sleep. */
2234 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2235 		unix_state_unlock(sk);
2236 		timeo = freezable_schedule_timeout(timeo);
2237 		unix_state_lock(sk);
2238 
2239 		if (sock_flag(sk, SOCK_DEAD))
2240 			break;
2241 
2242 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2243 	}
2244 
2245 	finish_wait(sk_sleep(sk), &wait);
2246 	unix_state_unlock(sk);
2247 	return timeo;
2248 }
2249 
2250 static unsigned int unix_skb_len(const struct sk_buff *skb)
2251 {
2252 	return skb->len - UNIXCB(skb).consumed;
2253 }
2254 
/*
 * Arguments for one unix_stream_read_generic() call. The recvmsg and
 * splice entry points fill this in with different recv_actor callbacks so
 * that both can share the same receive loop.
 */
struct unix_stream_read_state {
	/*
	 * Called as recv_actor(skb, skip, chunk, state). The caller treats a
	 * negative return as failure and any other value as the number of
	 * bytes that were moved.
	 */
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	/* Set by the recvmsg path; left NULL by the splice path. */
	struct msghdr *msg;
	/* Set by the splice path only. */
	struct pipe_inode_info *pipe;
	/* Maximum number of bytes to read. */
	size_t size;
	/* MSG_* flags for the read. */
	int flags;
	/* Flags passed through to skb_splice_bits() by the splice actor. */
	unsigned int splice_flags;
};
2265 
/*
 *	Shared receive loop behind unix_stream_recvmsg() and
 *	unix_stream_splice_read(). The bytes themselves are moved by
 *	state->recv_actor(); this function walks the receive queue, sleeps
 *	for more data, maintains the peek offset and collects credentials
 *	and passed file descriptors in a scm_cookie.
 *
 *	Returns "copied" when it is non-zero, otherwise "err" (0 or a
 *	negative errno). "copied" is normally the number of bytes handed to
 *	the actor, but is set to -EFAULT when the very first actor call
 *	fails, so that error leaves through the same return expression.
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
		goto out;
	}

	/* Once the queue runs dry we only keep waiting while copied < target. */
	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->readlock);

	/* Only a peeking read starts at the socket's peek offset. */
	if (flags & MSG_PEEK)
		skip = sk_peek_offset(sk, flags);
	else
		skip = 0;

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

		/* Entered at the top of each pass and again after sleeping. */
redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		/*
		 * last/last_len are the snapshot later handed to
		 * unix_stream_data_wait() so it can tell whether the queue
		 * changed while we were not holding the lock.
		 */
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;
		/* Entered with the state lock held and skb possibly NULL. */
again:
		if (skb == NULL) {
			unix_sk(sk)->recursion_level = 0;
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			/* Do not hold readlock while sleeping. */
			mutex_unlock(&u->readlock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len);

			/*
			 * readlock is already released here, so leave through
			 * "out", which skips both the unlock and scm_recv();
			 * the cookie therefore has to be destroyed by hand.
			 */
			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->readlock);
			goto redo;
			/* Only reached by goto, with the state lock held. */
unlock:
			unix_state_unlock(sk);
			break;
		}

		/* Step over whole skbs that lie entirely before the peek offset. */
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		/*
		 * NOTE(review): sunaddr is local to this block, so clearing
		 * it does not stop the branch from running again on a later
		 * pass; the address appears to be copied for every skb
		 * visited - confirm whether "just once" still describes this.
		 */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		/* Extra reference for the actor call; dropped by consume_skb() below. */
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		/* Nothing unread is left in the skb once the actor has returned. */
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			/* Only report the failure if nothing was read before it. */
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(&scm, skb);

			/* Partially read skb: it stays queued and the read ends here. */
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			/* Stop once file descriptors have been picked up. */
			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				scm.fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			/* Continue peeking at the start of the next queued skb. */
			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	/* Without a msghdr there is nowhere to deliver the cookie to. */
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
2465 
2466 static int unix_stream_read_actor(struct sk_buff *skb,
2467 				  int skip, int chunk,
2468 				  struct unix_stream_read_state *state)
2469 {
2470 	int ret;
2471 
2472 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2473 				    state->msg, chunk);
2474 	return ret ?: chunk;
2475 }
2476 
2477 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2478 			       size_t size, int flags)
2479 {
2480 	struct unix_stream_read_state state = {
2481 		.recv_actor = unix_stream_read_actor,
2482 		.socket = sock,
2483 		.msg = msg,
2484 		.size = size,
2485 		.flags = flags
2486 	};
2487 
2488 	return unix_stream_read_generic(&state);
2489 }
2490 
2491 static ssize_t skb_unix_socket_splice(struct sock *sk,
2492 				      struct pipe_inode_info *pipe,
2493 				      struct splice_pipe_desc *spd)
2494 {
2495 	int ret;
2496 	struct unix_sock *u = unix_sk(sk);
2497 
2498 	mutex_unlock(&u->readlock);
2499 	ret = splice_to_pipe(pipe, spd);
2500 	mutex_lock(&u->readlock);
2501 
2502 	return ret;
2503 }
2504 
2505 static int unix_stream_splice_actor(struct sk_buff *skb,
2506 				    int skip, int chunk,
2507 				    struct unix_stream_read_state *state)
2508 {
2509 	return skb_splice_bits(skb, state->socket->sk,
2510 			       UNIXCB(skb).consumed + skip,
2511 			       state->pipe, chunk, state->splice_flags,
2512 			       skb_unix_socket_splice);
2513 }
2514 
2515 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2516 				       struct pipe_inode_info *pipe,
2517 				       size_t size, unsigned int flags)
2518 {
2519 	struct unix_stream_read_state state = {
2520 		.recv_actor = unix_stream_splice_actor,
2521 		.socket = sock,
2522 		.pipe = pipe,
2523 		.size = size,
2524 		.splice_flags = flags,
2525 	};
2526 
2527 	if (unlikely(*ppos))
2528 		return -ESPIPE;
2529 
2530 	if (sock->file->f_flags & O_NONBLOCK ||
2531 	    flags & SPLICE_F_NONBLOCK)
2532 		state.flags = MSG_DONTWAIT;
2533 
2534 	return unix_stream_read_generic(&state);
2535 }
2536 
2537 static int unix_shutdown(struct socket *sock, int mode)
2538 {
2539 	struct sock *sk = sock->sk;
2540 	struct sock *other;
2541 
2542 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2543 		return -EINVAL;
2544 	/* This maps:
2545 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2546 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2547 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2548 	 */
2549 	++mode;
2550 
2551 	unix_state_lock(sk);
2552 	sk->sk_shutdown |= mode;
2553 	other = unix_peer(sk);
2554 	if (other)
2555 		sock_hold(other);
2556 	unix_state_unlock(sk);
2557 	sk->sk_state_change(sk);
2558 
2559 	if (other &&
2560 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2561 
2562 		int peer_mode = 0;
2563 
2564 		if (mode&RCV_SHUTDOWN)
2565 			peer_mode |= SEND_SHUTDOWN;
2566 		if (mode&SEND_SHUTDOWN)
2567 			peer_mode |= RCV_SHUTDOWN;
2568 		unix_state_lock(other);
2569 		other->sk_shutdown |= peer_mode;
2570 		unix_state_unlock(other);
2571 		other->sk_state_change(other);
2572 		if (peer_mode == SHUTDOWN_MASK)
2573 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2574 		else if (peer_mode & RCV_SHUTDOWN)
2575 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2576 	}
2577 	if (other)
2578 		sock_put(other);
2579 
2580 	return 0;
2581 }
2582 
2583 long unix_inq_len(struct sock *sk)
2584 {
2585 	struct sk_buff *skb;
2586 	long amount = 0;
2587 
2588 	if (sk->sk_state == TCP_LISTEN)
2589 		return -EINVAL;
2590 
2591 	spin_lock(&sk->sk_receive_queue.lock);
2592 	if (sk->sk_type == SOCK_STREAM ||
2593 	    sk->sk_type == SOCK_SEQPACKET) {
2594 		skb_queue_walk(&sk->sk_receive_queue, skb)
2595 			amount += unix_skb_len(skb);
2596 	} else {
2597 		skb = skb_peek(&sk->sk_receive_queue);
2598 		if (skb)
2599 			amount = skb->len;
2600 	}
2601 	spin_unlock(&sk->sk_receive_queue.lock);
2602 
2603 	return amount;
2604 }
2605 EXPORT_SYMBOL_GPL(unix_inq_len);
2606 
/* Outgoing queue size of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2612 
2613 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2614 {
2615 	struct sock *sk = sock->sk;
2616 	long amount = 0;
2617 	int err;
2618 
2619 	switch (cmd) {
2620 	case SIOCOUTQ:
2621 		amount = unix_outq_len(sk);
2622 		err = put_user(amount, (int __user *)arg);
2623 		break;
2624 	case SIOCINQ:
2625 		amount = unix_inq_len(sk);
2626 		if (amount < 0)
2627 			err = amount;
2628 		else
2629 			err = put_user(amount, (int __user *)arg);
2630 		break;
2631 	default:
2632 		err = -ENOIOCTLCMD;
2633 		break;
2634 	}
2635 	return err;
2636 }
2637 
2638 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2639 {
2640 	struct sock *sk = sock->sk;
2641 	unsigned int mask;
2642 
2643 	sock_poll_wait(file, sk_sleep(sk), wait);
2644 	mask = 0;
2645 
2646 	/* exceptional events? */
2647 	if (sk->sk_err)
2648 		mask |= POLLERR;
2649 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2650 		mask |= POLLHUP;
2651 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2652 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2653 
2654 	/* readable? */
2655 	if (!skb_queue_empty(&sk->sk_receive_queue))
2656 		mask |= POLLIN | POLLRDNORM;
2657 
2658 	/* Connection-based need to check for termination and startup */
2659 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2660 	    sk->sk_state == TCP_CLOSE)
2661 		mask |= POLLHUP;
2662 
2663 	/*
2664 	 * we set writable also when the other side has shut down the
2665 	 * connection. This prevents stuck sockets.
2666 	 */
2667 	if (unix_writable(sk))
2668 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2669 
2670 	return mask;
2671 }
2672 
2673 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2674 				    poll_table *wait)
2675 {
2676 	struct sock *sk = sock->sk, *other;
2677 	unsigned int mask, writable;
2678 
2679 	sock_poll_wait(file, sk_sleep(sk), wait);
2680 	mask = 0;
2681 
2682 	/* exceptional events? */
2683 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2684 		mask |= POLLERR |
2685 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2686 
2687 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2688 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2689 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2690 		mask |= POLLHUP;
2691 
2692 	/* readable? */
2693 	if (!skb_queue_empty(&sk->sk_receive_queue))
2694 		mask |= POLLIN | POLLRDNORM;
2695 
2696 	/* Connection-based need to check for termination and startup */
2697 	if (sk->sk_type == SOCK_SEQPACKET) {
2698 		if (sk->sk_state == TCP_CLOSE)
2699 			mask |= POLLHUP;
2700 		/* connection hasn't started yet? */
2701 		if (sk->sk_state == TCP_SYN_SENT)
2702 			return mask;
2703 	}
2704 
2705 	/* No write status requested, avoid expensive OUT tests. */
2706 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2707 		return mask;
2708 
2709 	writable = unix_writable(sk);
2710 	if (writable) {
2711 		unix_state_lock(sk);
2712 
2713 		other = unix_peer(sk);
2714 		if (other && unix_peer(other) != sk &&
2715 		    unix_recvq_full(other) &&
2716 		    unix_dgram_peer_wake_me(sk, other))
2717 			writable = 0;
2718 
2719 		unix_state_unlock(sk);
2720 	}
2721 
2722 	if (writable)
2723 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2724 	else
2725 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2726 
2727 	return mask;
2728 }
2729 
2730 #ifdef CONFIG_PROC_FS
2731 
/*
 * The seq_file position packs two values into one number: the hash bucket
 * index in the bits above BUCKET_SPACE and a 1-based offset of the socket
 * within that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2737 
2738 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2739 {
2740 	unsigned long offset = get_offset(*pos);
2741 	unsigned long bucket = get_bucket(*pos);
2742 	struct sock *sk;
2743 	unsigned long count = 0;
2744 
2745 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2746 		if (sock_net(sk) != seq_file_net(seq))
2747 			continue;
2748 		if (++count == offset)
2749 			break;
2750 	}
2751 
2752 	return sk;
2753 }
2754 
2755 static struct sock *unix_next_socket(struct seq_file *seq,
2756 				     struct sock *sk,
2757 				     loff_t *pos)
2758 {
2759 	unsigned long bucket;
2760 
2761 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2762 		sk = sk_next(sk);
2763 		if (!sk)
2764 			goto next_bucket;
2765 		if (sock_net(sk) == seq_file_net(seq))
2766 			return sk;
2767 	}
2768 
2769 	do {
2770 		sk = unix_from_bucket(seq, pos);
2771 		if (sk)
2772 			return sk;
2773 
2774 next_bucket:
2775 		bucket = get_bucket(*pos) + 1;
2776 		*pos = set_bucket_offset(bucket, 1);
2777 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2778 
2779 	return NULL;
2780 }
2781 
2782 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2783 	__acquires(unix_table_lock)
2784 {
2785 	spin_lock(&unix_table_lock);
2786 
2787 	if (!*pos)
2788 		return SEQ_START_TOKEN;
2789 
2790 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2791 		return NULL;
2792 
2793 	return unix_next_socket(seq, NULL, pos);
2794 }
2795 
2796 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2797 {
2798 	++*pos;
2799 	return unix_next_socket(seq, v, pos);
2800 }
2801 
2802 static void unix_seq_stop(struct seq_file *seq, void *v)
2803 	__releases(unix_table_lock)
2804 {
2805 	spin_unlock(&unix_table_lock);
2806 }
2807 
2808 static int unix_seq_show(struct seq_file *seq, void *v)
2809 {
2810 
2811 	if (v == SEQ_START_TOKEN)
2812 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2813 			 "Inode Path\n");
2814 	else {
2815 		struct sock *s = v;
2816 		struct unix_sock *u = unix_sk(s);
2817 		unix_state_lock(s);
2818 
2819 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2820 			s,
2821 			atomic_read(&s->sk_refcnt),
2822 			0,
2823 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2824 			s->sk_type,
2825 			s->sk_socket ?
2826 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2827 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2828 			sock_i_ino(s));
2829 
2830 		if (u->addr) {
2831 			int i, len;
2832 			seq_putc(seq, ' ');
2833 
2834 			i = 0;
2835 			len = u->addr->len - sizeof(short);
2836 			if (!UNIX_ABSTRACT(s))
2837 				len--;
2838 			else {
2839 				seq_putc(seq, '@');
2840 				i++;
2841 			}
2842 			for ( ; i < len; i++)
2843 				seq_putc(seq, u->addr->name->sun_path[i]);
2844 		}
2845 		unix_state_unlock(s);
2846 		seq_putc(seq, '\n');
2847 	}
2848 
2849 	return 0;
2850 }
2851 
/* seq_file iterator over the unix socket hash table (/proc/net/unix). */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
2858 
2859 static int unix_seq_open(struct inode *inode, struct file *file)
2860 {
2861 	return seq_open_net(inode, file, &unix_seq_ops,
2862 			    sizeof(struct seq_net_private));
2863 }
2864 
/* File operations of the "unix" proc entry created in unix_net_init(). */
static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
2872 
2873 #endif
2874 
/* Protocol family descriptor registered through sock_register(). */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
2880 
2881 
2882 static int __net_init unix_net_init(struct net *net)
2883 {
2884 	int error = -ENOMEM;
2885 
2886 	net->unx.sysctl_max_dgram_qlen = 10;
2887 	if (unix_sysctl_register(net))
2888 		goto out;
2889 
2890 #ifdef CONFIG_PROC_FS
2891 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2892 		unix_sysctl_unregister(net);
2893 		goto out;
2894 	}
2895 #endif
2896 	error = 0;
2897 out:
2898 	return error;
2899 }
2900 
2901 static void __net_exit unix_net_exit(struct net *net)
2902 {
2903 	unix_sysctl_unregister(net);
2904 	remove_proc_entry("unix", net->proc_net);
2905 }
2906 
/* Per-network-namespace init/exit hooks, registered in af_unix_init(). */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
2911 
2912 static int __init af_unix_init(void)
2913 {
2914 	int rc = -1;
2915 
2916 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2917 
2918 	rc = proto_register(&unix_proto, 1);
2919 	if (rc != 0) {
2920 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2921 		goto out;
2922 	}
2923 
2924 	sock_register(&unix_family_ops);
2925 	register_pernet_subsys(&unix_net_ops);
2926 out:
2927 	return rc;
2928 }
2929 
2930 static void __exit af_unix_exit(void)
2931 {
2932 	sock_unregister(PF_UNIX);
2933 	proto_unregister(&unix_proto);
2934 	unregister_pernet_subsys(&unix_net_ops);
2935 }
2936 
/*
 * Run earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries to use a
 * UNIX socket, but later than subsys_initcall() because we depend on
 * things that are initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
2946