xref: /openbmc/linux/net/unix/af_unix.c (revision 2891f2d5)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it avoids a huge number
38  *					of sockets being hashed (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skbs queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a zero byte, so that this namespace does not
80  *		  intersect with BSD names.
81  */
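/*
 * A minimal userspace sketch of binding in the abstract namespace described
 * above (hedged; the name "example" and its length are arbitrary). Note the
 * leading zero byte and that the address length covers only the bytes used:
 *
 *	struct sockaddr_un sun;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */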
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched/signal.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <linux/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 #include <linux/file.h>
121 
122 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
123 EXPORT_SYMBOL_GPL(unix_socket_table);
124 DEFINE_SPINLOCK(unix_table_lock);
125 EXPORT_SYMBOL_GPL(unix_table_lock);
126 static atomic_long_t unix_nr_socks;
127 
128 
129 static struct hlist_head *unix_sockets_unbound(void *addr)
130 {
131 	unsigned long hash = (unsigned long)addr;
132 
133 	hash ^= hash >> 16;
134 	hash ^= hash >> 8;
135 	hash %= UNIX_HASH_SIZE;
136 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
137 }
138 
139 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
140 
141 #ifdef CONFIG_SECURITY_NETWORK
142 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
143 {
144 	UNIXCB(skb).secid = scm->secid;
145 }
146 
147 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
148 {
149 	scm->secid = UNIXCB(skb).secid;
150 }
151 
152 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
153 {
154 	return (scm->secid == UNIXCB(skb).secid);
155 }
156 #else
157 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
158 { }
159 
160 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
161 { }
162 
163 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
164 {
165 	return true;
166 }
167 #endif /* CONFIG_SECURITY_NETWORK */
168 
169 /*
170  *  SMP locking strategy:
171  *    the hash table is protected by the spinlock unix_table_lock;
172  *    each socket's state is protected by a separate spin lock.
173  */
174 
175 static inline unsigned int unix_hash_fold(__wsum n)
176 {
177 	unsigned int hash = (__force unsigned int)csum_fold(n);
178 
179 	hash ^= hash>>8;
180 	return hash&(UNIX_HASH_SIZE-1);
181 }
182 
183 #define unix_peer(sk) (unix_sk(sk)->peer)
184 
185 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
186 {
187 	return unix_peer(osk) == sk;
188 }
189 
190 static inline int unix_may_send(struct sock *sk, struct sock *osk)
191 {
192 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
193 }
194 
195 static inline int unix_recvq_full(struct sock const *sk)
196 {
197 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
198 }
199 
200 struct sock *unix_peer_get(struct sock *s)
201 {
202 	struct sock *peer;
203 
204 	unix_state_lock(s);
205 	peer = unix_peer(s);
206 	if (peer)
207 		sock_hold(peer);
208 	unix_state_unlock(s);
209 	return peer;
210 }
211 EXPORT_SYMBOL_GPL(unix_peer_get);
212 
213 static inline void unix_release_addr(struct unix_address *addr)
214 {
215 	if (atomic_dec_and_test(&addr->refcnt))
216 		kfree(addr);
217 }
218 
219 /*
220  *	Check unix socket name:
221  *		- should not be zero length.
222  *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
223  *		- if it starts with a zero byte, it is an abstract name.
224  */
225 
226 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
227 {
228 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
229 		return -EINVAL;
230 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
231 		return -EINVAL;
232 	if (sunaddr->sun_path[0]) {
233 		/*
234 		 * This may look like an off by one error but it is a bit more
235 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
236 		 * sun_path[108] doesn't as such exist.  However in kernel space
237 		 * we are guaranteed that it is a valid memory location in our
238 		 * kernel address buffer.
239 		 */
240 		((char *)sunaddr)[len] = 0;
241 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
242 		return len;
243 	}
244 
245 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
246 	return len;
247 }
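/*
 * A worked sketch of the two unix_mkname() cases above (assuming
 * offsetof(struct sockaddr_un, sun_path) == sizeof(short) == 2):
 *
 *	sun_path = "/tmp/sock", len = 2 + 10  -- FS name; a NUL is forced at
 *	                                         sunaddr[len], and the return
 *	                                         value is strlen("/tmp/sock")
 *	                                         + 1 + sizeof(short) = 12
 *	sun_path = "\0name",    len = 2 + 5   -- abstract name; *hashp is set
 *	                                         from a checksum over all len
 *	                                         bytes, len returned unchanged
 */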
248 
249 static void __unix_remove_socket(struct sock *sk)
250 {
251 	sk_del_node_init(sk);
252 }
253 
254 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
255 {
256 	WARN_ON(!sk_unhashed(sk));
257 	sk_add_node(sk, list);
258 }
259 
260 static inline void unix_remove_socket(struct sock *sk)
261 {
262 	spin_lock(&unix_table_lock);
263 	__unix_remove_socket(sk);
264 	spin_unlock(&unix_table_lock);
265 }
266 
267 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
268 {
269 	spin_lock(&unix_table_lock);
270 	__unix_insert_socket(list, sk);
271 	spin_unlock(&unix_table_lock);
272 }
273 
274 static struct sock *__unix_find_socket_byname(struct net *net,
275 					      struct sockaddr_un *sunname,
276 					      int len, int type, unsigned int hash)
277 {
278 	struct sock *s;
279 
280 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
281 		struct unix_sock *u = unix_sk(s);
282 
283 		if (!net_eq(sock_net(s), net))
284 			continue;
285 
286 		if (u->addr->len == len &&
287 		    !memcmp(u->addr->name, sunname, len))
288 			goto found;
289 	}
290 	s = NULL;
291 found:
292 	return s;
293 }
294 
295 static inline struct sock *unix_find_socket_byname(struct net *net,
296 						   struct sockaddr_un *sunname,
297 						   int len, int type,
298 						   unsigned int hash)
299 {
300 	struct sock *s;
301 
302 	spin_lock(&unix_table_lock);
303 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
304 	if (s)
305 		sock_hold(s);
306 	spin_unlock(&unix_table_lock);
307 	return s;
308 }
309 
310 static struct sock *unix_find_socket_byinode(struct inode *i)
311 {
312 	struct sock *s;
313 
314 	spin_lock(&unix_table_lock);
315 	sk_for_each(s,
316 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
317 		struct dentry *dentry = unix_sk(s)->path.dentry;
318 
319 		if (dentry && d_backing_inode(dentry) == i) {
320 			sock_hold(s);
321 			goto found;
322 		}
323 	}
324 	s = NULL;
325 found:
326 	spin_unlock(&unix_table_lock);
327 	return s;
328 }
329 
330 /* Support code for asymmetrically connected dgram sockets
331  *
332  * If a datagram socket is connected to a socket not itself connected
333  * to the first socket (e.g., /dev/log), clients may only enqueue more
334  * messages if the present receive queue of the server socket is not
335  * "too large". This means there's a second writeability condition
336  * poll and sendmsg need to test. The dgram recv code will do a wake
337  * up on the peer_wait wait queue of a socket upon reception of a
338  * datagram which needs to be propagated to sleeping would-be writers
339  * since these might not have sent anything so far. This can't be
340  * accomplished via poll_wait because the lifetime of the server
341  * socket might be less than that of its clients if these break their
342  * association with it or if the server socket is closed while clients
343  * are still connected to it and there's no way to inform "a polling
344  * implementation" that it should let go of a certain wait queue.
345  *
346  * In order to propagate a wake up, a wait_queue_t of the client
347  * socket is enqueued on the peer_wait queue of the server socket
348  * whose wake function does a wake_up on the ordinary client socket
349  * wait queue. This connection is established whenever a write (or
350  * poll for write) hits the flow control condition and is broken when the
351  * association to the server socket is dissolved or after a wake up
352  * was relayed.
353  */
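/*
 * A condensed view of the relay described above, for a client C whose
 * dgram peer is server S:
 *
 *	C's send/poll hits flow control
 *	    -> C's peer_wake entry is queued on S's peer_wait
 *	S's reader dequeues a datagram
 *	    -> wake_up on S's peer_wait
 *	    -> unix_dgram_peer_wake_relay()
 *	    -> wake_up on sk_sleep(C), and C retries the write
 */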
354 
355 static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
356 				      void *key)
357 {
358 	struct unix_sock *u;
359 	wait_queue_head_t *u_sleep;
360 
361 	u = container_of(q, struct unix_sock, peer_wake);
362 
363 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
364 			    q);
365 	u->peer_wake.private = NULL;
366 
367 	/* relaying can only happen while the wq still exists */
368 	u_sleep = sk_sleep(&u->sk);
369 	if (u_sleep)
370 		wake_up_interruptible_poll(u_sleep, key);
371 
372 	return 0;
373 }
374 
375 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
376 {
377 	struct unix_sock *u, *u_other;
378 	int rc;
379 
380 	u = unix_sk(sk);
381 	u_other = unix_sk(other);
382 	rc = 0;
383 	spin_lock(&u_other->peer_wait.lock);
384 
385 	if (!u->peer_wake.private) {
386 		u->peer_wake.private = other;
387 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
388 
389 		rc = 1;
390 	}
391 
392 	spin_unlock(&u_other->peer_wait.lock);
393 	return rc;
394 }
395 
396 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
397 					    struct sock *other)
398 {
399 	struct unix_sock *u, *u_other;
400 
401 	u = unix_sk(sk);
402 	u_other = unix_sk(other);
403 	spin_lock(&u_other->peer_wait.lock);
404 
405 	if (u->peer_wake.private == other) {
406 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
407 		u->peer_wake.private = NULL;
408 	}
409 
410 	spin_unlock(&u_other->peer_wait.lock);
411 }
412 
413 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
414 						   struct sock *other)
415 {
416 	unix_dgram_peer_wake_disconnect(sk, other);
417 	wake_up_interruptible_poll(sk_sleep(sk),
418 				   POLLOUT |
419 				   POLLWRNORM |
420 				   POLLWRBAND);
421 }
422 
423 /* preconditions:
424  *	- unix_peer(sk) == other
425  *	- association is stable
426  */
427 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
428 {
429 	int connected;
430 
431 	connected = unix_dgram_peer_wake_connect(sk, other);
432 
433 	if (unix_recvq_full(other))
434 		return 1;
435 
436 	if (connected)
437 		unix_dgram_peer_wake_disconnect(sk, other);
438 
439 	return 0;
440 }
441 
442 static int unix_writable(const struct sock *sk)
443 {
444 	return sk->sk_state != TCP_LISTEN &&
445 	       (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;	/* writable while at most 1/4 of sndbuf is in flight */
446 }
447 
448 static void unix_write_space(struct sock *sk)
449 {
450 	struct socket_wq *wq;
451 
452 	rcu_read_lock();
453 	if (unix_writable(sk)) {
454 		wq = rcu_dereference(sk->sk_wq);
455 		if (skwq_has_sleeper(wq))
456 			wake_up_interruptible_sync_poll(&wq->wait,
457 				POLLOUT | POLLWRNORM | POLLWRBAND);
458 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
459 	}
460 	rcu_read_unlock();
461 }
462 
463 /* When a dgram socket disconnects (or changes its peer), we clear its receive
464  * queue of packets that arrived from the previous peer. First, this allows
465  * flow control based only on wmem_alloc; second, an sk connected to a peer
466  * may receive messages only from that peer. */
467 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
468 {
469 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
470 		skb_queue_purge(&sk->sk_receive_queue);
471 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
472 
473 		/* If one link of a bidirectional dgram pipe is disconnected,
474 		 * we signal an error. Messages are lost. Do not do this
475 		 * when the peer was not connected to us.
476 		 */
477 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
478 			other->sk_err = ECONNRESET;
479 			other->sk_error_report(other);
480 		}
481 	}
482 }
483 
484 static void unix_sock_destructor(struct sock *sk)
485 {
486 	struct unix_sock *u = unix_sk(sk);
487 
488 	skb_queue_purge(&sk->sk_receive_queue);
489 
490 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
491 	WARN_ON(!sk_unhashed(sk));
492 	WARN_ON(sk->sk_socket);
493 	if (!sock_flag(sk, SOCK_DEAD)) {
494 		pr_info("Attempt to release alive unix socket: %p\n", sk);
495 		return;
496 	}
497 
498 	if (u->addr)
499 		unix_release_addr(u->addr);
500 
501 	atomic_long_dec(&unix_nr_socks);
502 	local_bh_disable();
503 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
504 	local_bh_enable();
505 #ifdef UNIX_REFCNT_DEBUG
506 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
507 		atomic_long_read(&unix_nr_socks));
508 #endif
509 }
510 
511 static void unix_release_sock(struct sock *sk, int embrion)
512 {
513 	struct unix_sock *u = unix_sk(sk);
514 	struct path path;
515 	struct sock *skpair;
516 	struct sk_buff *skb;
517 	int state;
518 
519 	unix_remove_socket(sk);
520 
521 	/* Clear state */
522 	unix_state_lock(sk);
523 	sock_orphan(sk);
524 	sk->sk_shutdown = SHUTDOWN_MASK;
525 	path	     = u->path;
526 	u->path.dentry = NULL;
527 	u->path.mnt = NULL;
528 	state = sk->sk_state;
529 	sk->sk_state = TCP_CLOSE;
530 	unix_state_unlock(sk);
531 
532 	wake_up_interruptible_all(&u->peer_wait);
533 
534 	skpair = unix_peer(sk);
535 
536 	if (skpair != NULL) {
537 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
538 			unix_state_lock(skpair);
539 			/* No more writes */
540 			skpair->sk_shutdown = SHUTDOWN_MASK;
541 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
542 				skpair->sk_err = ECONNRESET;
543 			unix_state_unlock(skpair);
544 			skpair->sk_state_change(skpair);
545 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
546 		}
547 
548 		unix_dgram_peer_wake_disconnect(sk, skpair);
549 		sock_put(skpair); /* It may now die */
550 		unix_peer(sk) = NULL;
551 	}
552 
553 	/* Try to flush out this socket. Throw out buffers at least */
554 
555 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
556 		if (state == TCP_LISTEN)
557 			unix_release_sock(skb->sk, 1);
558 		/* passed fds are erased in the kfree_skb hook	      */
559 		UNIXCB(skb).consumed = skb->len;
560 		kfree_skb(skb);
561 	}
562 
563 	if (path.dentry)
564 		path_put(&path);
565 
566 	sock_put(sk);
567 
568 	/* ---- Socket is dead now and most probably destroyed ---- */
569 
570 	/*
571 	 * Fixme: BSD difference: In BSD all sockets connected to us get
572 	 *	  ECONNRESET and we die on the spot. In Linux we behave
573 	 *	  like files and pipes do and wait for the last
574 	 *	  dereference.
575 	 *
576 	 * Can't we simply set sock->err?
577 	 *
578 	 *	  What does the above comment talk about? --ANK(980817)
579 	 */
580 
581 	if (unix_tot_inflight)
582 		unix_gc();		/* Garbage collect fds */
583 }
584 
585 static void init_peercred(struct sock *sk)
586 {
587 	put_pid(sk->sk_peer_pid);
588 	if (sk->sk_peer_cred)
589 		put_cred(sk->sk_peer_cred);
590 	sk->sk_peer_pid  = get_pid(task_tgid(current));
591 	sk->sk_peer_cred = get_current_cred();
592 }
593 
594 static void copy_peercred(struct sock *sk, struct sock *peersk)
595 {
596 	put_pid(sk->sk_peer_pid);
597 	if (sk->sk_peer_cred)
598 		put_cred(sk->sk_peer_cred);
599 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
600 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
601 }
602 
603 static int unix_listen(struct socket *sock, int backlog)
604 {
605 	int err;
606 	struct sock *sk = sock->sk;
607 	struct unix_sock *u = unix_sk(sk);
608 	struct pid *old_pid = NULL;
609 
610 	err = -EOPNOTSUPP;
611 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
612 		goto out;	/* Only stream/seqpacket sockets accept */
613 	err = -EINVAL;
614 	if (!u->addr)
615 		goto out;	/* No listens on an unbound socket */
616 	unix_state_lock(sk);
617 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
618 		goto out_unlock;
619 	if (backlog > sk->sk_max_ack_backlog)
620 		wake_up_interruptible_all(&u->peer_wait);
621 	sk->sk_max_ack_backlog	= backlog;
622 	sk->sk_state		= TCP_LISTEN;
623 	/* set credentials so connect can copy them */
624 	init_peercred(sk);
625 	err = 0;
626 
627 out_unlock:
628 	unix_state_unlock(sk);
629 	put_pid(old_pid);
630 out:
631 	return err;
632 }
633 
634 static int unix_release(struct socket *);
635 static int unix_bind(struct socket *, struct sockaddr *, int);
636 static int unix_stream_connect(struct socket *, struct sockaddr *,
637 			       int addr_len, int flags);
638 static int unix_socketpair(struct socket *, struct socket *);
639 static int unix_accept(struct socket *, struct socket *, int, bool);
640 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
641 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
642 static unsigned int unix_dgram_poll(struct file *, struct socket *,
643 				    poll_table *);
644 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
645 static int unix_shutdown(struct socket *, int);
646 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
647 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
648 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
649 				    size_t size, int flags);
650 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
651 				       struct pipe_inode_info *, size_t size,
652 				       unsigned int flags);
653 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
654 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
655 static int unix_dgram_connect(struct socket *, struct sockaddr *,
656 			      int, int);
657 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
658 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
659 				  int);
660 
661 static int unix_set_peek_off(struct sock *sk, int val)
662 {
663 	struct unix_sock *u = unix_sk(sk);
664 
665 	if (mutex_lock_interruptible(&u->iolock))
666 		return -EINTR;
667 
668 	sk->sk_peek_off = val;
669 	mutex_unlock(&u->iolock);
670 
671 	return 0;
672 }
673 
674 
675 static const struct proto_ops unix_stream_ops = {
676 	.family =	PF_UNIX,
677 	.owner =	THIS_MODULE,
678 	.release =	unix_release,
679 	.bind =		unix_bind,
680 	.connect =	unix_stream_connect,
681 	.socketpair =	unix_socketpair,
682 	.accept =	unix_accept,
683 	.getname =	unix_getname,
684 	.poll =		unix_poll,
685 	.ioctl =	unix_ioctl,
686 	.listen =	unix_listen,
687 	.shutdown =	unix_shutdown,
688 	.setsockopt =	sock_no_setsockopt,
689 	.getsockopt =	sock_no_getsockopt,
690 	.sendmsg =	unix_stream_sendmsg,
691 	.recvmsg =	unix_stream_recvmsg,
692 	.mmap =		sock_no_mmap,
693 	.sendpage =	unix_stream_sendpage,
694 	.splice_read =	unix_stream_splice_read,
695 	.set_peek_off =	unix_set_peek_off,
696 };
697 
698 static const struct proto_ops unix_dgram_ops = {
699 	.family =	PF_UNIX,
700 	.owner =	THIS_MODULE,
701 	.release =	unix_release,
702 	.bind =		unix_bind,
703 	.connect =	unix_dgram_connect,
704 	.socketpair =	unix_socketpair,
705 	.accept =	sock_no_accept,
706 	.getname =	unix_getname,
707 	.poll =		unix_dgram_poll,
708 	.ioctl =	unix_ioctl,
709 	.listen =	sock_no_listen,
710 	.shutdown =	unix_shutdown,
711 	.setsockopt =	sock_no_setsockopt,
712 	.getsockopt =	sock_no_getsockopt,
713 	.sendmsg =	unix_dgram_sendmsg,
714 	.recvmsg =	unix_dgram_recvmsg,
715 	.mmap =		sock_no_mmap,
716 	.sendpage =	sock_no_sendpage,
717 	.set_peek_off =	unix_set_peek_off,
718 };
719 
720 static const struct proto_ops unix_seqpacket_ops = {
721 	.family =	PF_UNIX,
722 	.owner =	THIS_MODULE,
723 	.release =	unix_release,
724 	.bind =		unix_bind,
725 	.connect =	unix_stream_connect,
726 	.socketpair =	unix_socketpair,
727 	.accept =	unix_accept,
728 	.getname =	unix_getname,
729 	.poll =		unix_dgram_poll,
730 	.ioctl =	unix_ioctl,
731 	.listen =	unix_listen,
732 	.shutdown =	unix_shutdown,
733 	.setsockopt =	sock_no_setsockopt,
734 	.getsockopt =	sock_no_getsockopt,
735 	.sendmsg =	unix_seqpacket_sendmsg,
736 	.recvmsg =	unix_seqpacket_recvmsg,
737 	.mmap =		sock_no_mmap,
738 	.sendpage =	sock_no_sendpage,
739 	.set_peek_off =	unix_set_peek_off,
740 };
741 
742 static struct proto unix_proto = {
743 	.name			= "UNIX",
744 	.owner			= THIS_MODULE,
745 	.obj_size		= sizeof(struct unix_sock),
746 };
747 
748 /*
749  * AF_UNIX sockets do not interact with hardware, hence they
750  * don't trigger interrupts - so it's safe for them to have
751  * bh-unsafe locking for their sk_receive_queue.lock. Split off
752  * this special lock-class by reinitializing the spinlock key:
753  */
754 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
755 
756 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
757 {
758 	struct sock *sk = NULL;
759 	struct unix_sock *u;
760 
761 	atomic_long_inc(&unix_nr_socks);
762 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
763 		goto out;
764 
765 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
766 	if (!sk)
767 		goto out;
768 
769 	sock_init_data(sock, sk);
770 	lockdep_set_class(&sk->sk_receive_queue.lock,
771 				&af_unix_sk_receive_queue_lock_key);
772 
773 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
774 	sk->sk_write_space	= unix_write_space;
775 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
776 	sk->sk_destruct		= unix_sock_destructor;
777 	u	  = unix_sk(sk);
778 	u->path.dentry = NULL;
779 	u->path.mnt = NULL;
780 	spin_lock_init(&u->lock);
781 	atomic_long_set(&u->inflight, 0);
782 	INIT_LIST_HEAD(&u->link);
783 	mutex_init(&u->iolock); /* single task reading lock */
784 	mutex_init(&u->bindlock); /* single task binding lock */
785 	init_waitqueue_head(&u->peer_wait);
786 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
787 	unix_insert_socket(unix_sockets_unbound(sk), sk);
788 out:
789 	if (sk == NULL)
790 		atomic_long_dec(&unix_nr_socks);
791 	else {
792 		local_bh_disable();
793 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
794 		local_bh_enable();
795 	}
796 	return sk;
797 }
798 
799 static int unix_create(struct net *net, struct socket *sock, int protocol,
800 		       int kern)
801 {
802 	if (protocol && protocol != PF_UNIX)
803 		return -EPROTONOSUPPORT;
804 
805 	sock->state = SS_UNCONNECTED;
806 
807 	switch (sock->type) {
808 	case SOCK_STREAM:
809 		sock->ops = &unix_stream_ops;
810 		break;
811 		/*
812 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW,
813 		 *	though nothing uses it.
814 		 */
815 	case SOCK_RAW:
816 		sock->type = SOCK_DGRAM;	/* fall through */
817 	case SOCK_DGRAM:
818 		sock->ops = &unix_dgram_ops;
819 		break;
820 	case SOCK_SEQPACKET:
821 		sock->ops = &unix_seqpacket_ops;
822 		break;
823 	default:
824 		return -ESOCKTNOSUPPORT;
825 	}
826 
827 	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
828 }
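/*
 * Userspace view of the type switch above (a hedged sketch): SOCK_RAW is
 * silently downgraded to SOCK_DGRAM, while an unhandled type fails:
 *
 *	socket(AF_UNIX, SOCK_RAW, 0);	-- succeeds, behaves as SOCK_DGRAM
 *	socket(AF_UNIX, SOCK_RDM, 0);	-- fails with ESOCKTNOSUPPORT
 */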
829 
830 static int unix_release(struct socket *sock)
831 {
832 	struct sock *sk = sock->sk;
833 
834 	if (!sk)
835 		return 0;
836 
837 	unix_release_sock(sk, 0);
838 	sock->sk = NULL;
839 
840 	return 0;
841 }
842 
843 static int unix_autobind(struct socket *sock)
844 {
845 	struct sock *sk = sock->sk;
846 	struct net *net = sock_net(sk);
847 	struct unix_sock *u = unix_sk(sk);
848 	static u32 ordernum = 1;
849 	struct unix_address *addr;
850 	int err;
851 	unsigned int retries = 0;
852 
853 	err = mutex_lock_interruptible(&u->bindlock);
854 	if (err)
855 		return err;
856 
857 	err = 0;
858 	if (u->addr)
859 		goto out;
860 
861 	err = -ENOMEM;
862 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
863 	if (!addr)
864 		goto out;
865 
866 	addr->name->sun_family = AF_UNIX;
867 	atomic_set(&addr->refcnt, 1);
868 
869 retry:
870 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
871 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
872 
873 	spin_lock(&unix_table_lock);
874 	ordernum = (ordernum+1)&0xFFFFF;
875 
876 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
877 				      addr->hash)) {
878 		spin_unlock(&unix_table_lock);
879 		/*
880 		 * __unix_find_socket_byname() may take a long time if many names
881 		 * are already in use.
882 		 */
883 		cond_resched();
884 		/* Give up if all names seem to be in use. */
885 		if (retries++ == 0xFFFFF) {
886 			err = -ENOSPC;
887 			kfree(addr);
888 			goto out;
889 		}
890 		goto retry;
891 	}
892 	addr->hash ^= sk->sk_type;
893 
894 	__unix_remove_socket(sk);
895 	u->addr = addr;
896 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
897 	spin_unlock(&unix_table_lock);
898 	err = 0;
899 
900 out:	mutex_unlock(&u->bindlock);
901 	return err;
902 }
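/*
 * Userspace triggers the autobind path above by binding with only the
 * address family present (a hedged sketch; sizeof(sa_family_t) equals
 * sizeof(short) here):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sa_family_t));
 *
 * The kernel then picks an abstract name of the form "\0XXXXX", where
 * XXXXX is the five-hex-digit ordernum written by the sprintf above.
 */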
903 
904 static struct sock *unix_find_other(struct net *net,
905 				    struct sockaddr_un *sunname, int len,
906 				    int type, unsigned int hash, int *error)
907 {
908 	struct sock *u;
909 	struct path path;
910 	int err = 0;
911 
912 	if (sunname->sun_path[0]) {
913 		struct inode *inode;
914 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
915 		if (err)
916 			goto fail;
917 		inode = d_backing_inode(path.dentry);
918 		err = inode_permission(inode, MAY_WRITE);
919 		if (err)
920 			goto put_fail;
921 
922 		err = -ECONNREFUSED;
923 		if (!S_ISSOCK(inode->i_mode))
924 			goto put_fail;
925 		u = unix_find_socket_byinode(inode);
926 		if (!u)
927 			goto put_fail;
928 
929 		if (u->sk_type == type)
930 			touch_atime(&path);
931 
932 		path_put(&path);
933 
934 		err = -EPROTOTYPE;
935 		if (u->sk_type != type) {
936 			sock_put(u);
937 			goto fail;
938 		}
939 	} else {
940 		err = -ECONNREFUSED;
941 		u = unix_find_socket_byname(net, sunname, len, type, hash);
942 		if (u) {
943 			struct dentry *dentry;
944 			dentry = unix_sk(u)->path.dentry;
945 			if (dentry)
946 				touch_atime(&unix_sk(u)->path);
947 		} else
948 			goto fail;
949 	}
950 	return u;
951 
952 put_fail:
953 	path_put(&path);
954 fail:
955 	*error = err;
956 	return NULL;
957 }
958 
959 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
960 {
961 	struct dentry *dentry;
962 	struct path path;
963 	int err = 0;
964 	/*
965 	 * Get the parent directory, calculate the hash for the last
966 	 * component.
967 	 */
968 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
969 	err = PTR_ERR(dentry);
970 	if (IS_ERR(dentry))
971 		return err;
972 
973 	/*
974 	 * All right, let's create it.
975 	 */
976 	err = security_path_mknod(&path, dentry, mode, 0);
977 	if (!err) {
978 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
979 		if (!err) {
980 			res->mnt = mntget(path.mnt);
981 			res->dentry = dget(dentry);
982 		}
983 	}
984 	done_path_create(&path, dentry);
985 	return err;
986 }
987 
988 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
989 {
990 	struct sock *sk = sock->sk;
991 	struct net *net = sock_net(sk);
992 	struct unix_sock *u = unix_sk(sk);
993 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
994 	char *sun_path = sunaddr->sun_path;
995 	int err;
996 	unsigned int hash;
997 	struct unix_address *addr;
998 	struct hlist_head *list;
999 	struct path path = { NULL, NULL };
1000 
1001 	err = -EINVAL;
1002 	if (sunaddr->sun_family != AF_UNIX)
1003 		goto out;
1004 
1005 	if (addr_len == sizeof(short)) {
1006 		err = unix_autobind(sock);
1007 		goto out;
1008 	}
1009 
1010 	err = unix_mkname(sunaddr, addr_len, &hash);
1011 	if (err < 0)
1012 		goto out;
1013 	addr_len = err;
1014 
1015 	if (sun_path[0]) {
1016 		umode_t mode = S_IFSOCK |
1017 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
1018 		err = unix_mknod(sun_path, mode, &path);
1019 		if (err) {
1020 			if (err == -EEXIST)
1021 				err = -EADDRINUSE;
1022 			goto out;
1023 		}
1024 	}
1025 
1026 	err = mutex_lock_interruptible(&u->bindlock);
1027 	if (err)
1028 		goto out_put;
1029 
1030 	err = -EINVAL;
1031 	if (u->addr)
1032 		goto out_up;
1033 
1034 	err = -ENOMEM;
1035 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1036 	if (!addr)
1037 		goto out_up;
1038 
1039 	memcpy(addr->name, sunaddr, addr_len);
1040 	addr->len = addr_len;
1041 	addr->hash = hash ^ sk->sk_type;
1042 	atomic_set(&addr->refcnt, 1);
1043 
1044 	if (sun_path[0]) {
1045 		addr->hash = UNIX_HASH_SIZE;
1046 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1047 		spin_lock(&unix_table_lock);
1048 		u->path = path;
1049 		list = &unix_socket_table[hash];
1050 	} else {
1051 		spin_lock(&unix_table_lock);
1052 		err = -EADDRINUSE;
1053 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
1054 					      sk->sk_type, hash)) {
1055 			unix_release_addr(addr);
1056 			goto out_unlock;
1057 		}
1058 
1059 		list = &unix_socket_table[addr->hash];
1060 	}
1061 
1062 	err = 0;
1063 	__unix_remove_socket(sk);
1064 	u->addr = addr;
1065 	__unix_insert_socket(list, sk);
1066 
1067 out_unlock:
1068 	spin_unlock(&unix_table_lock);
1069 out_up:
1070 	mutex_unlock(&u->bindlock);
1071 out_put:
1072 	if (err)
1073 		path_put(&path);
1074 out:
1075 	return err;
1076 }
1077 
1078 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1079 {
1080 	if (unlikely(sk1 == sk2) || !sk2) {
1081 		unix_state_lock(sk1);
1082 		return;
1083 	}
1084 	if (sk1 < sk2) {
1085 		unix_state_lock(sk1);
1086 		unix_state_lock_nested(sk2);
1087 	} else {
1088 		unix_state_lock(sk2);
1089 		unix_state_lock_nested(sk1);
1090 	}
1091 }
1092 
1093 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1094 {
1095 	if (unlikely(sk1 == sk2) || !sk2) {
1096 		unix_state_unlock(sk1);
1097 		return;
1098 	}
1099 	unix_state_unlock(sk1);
1100 	unix_state_unlock(sk2);
1101 }
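/*
 * The helpers above order the two state locks by socket address, so two
 * tasks locking the same pair from opposite ends cannot deadlock:
 *
 *	task A: unix_state_double_lock(s1, s2)
 *	task B: unix_state_double_lock(s2, s1)
 *
 * With s1 < s2, both tasks take s1's lock first, so neither can hold one
 * lock while waiting for the other.
 */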
1102 
1103 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1104 			      int alen, int flags)
1105 {
1106 	struct sock *sk = sock->sk;
1107 	struct net *net = sock_net(sk);
1108 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1109 	struct sock *other;
1110 	unsigned int hash;
1111 	int err;
1112 
1113 	if (addr->sa_family != AF_UNSPEC) {
1114 		err = unix_mkname(sunaddr, alen, &hash);
1115 		if (err < 0)
1116 			goto out;
1117 		alen = err;
1118 
1119 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1120 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1121 			goto out;
1122 
1123 restart:
1124 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1125 		if (!other)
1126 			goto out;
1127 
1128 		unix_state_double_lock(sk, other);
1129 
1130 		/* Apparently VFS overslept socket death. Retry. */
1131 		if (sock_flag(other, SOCK_DEAD)) {
1132 			unix_state_double_unlock(sk, other);
1133 			sock_put(other);
1134 			goto restart;
1135 		}
1136 
1137 		err = -EPERM;
1138 		if (!unix_may_send(sk, other))
1139 			goto out_unlock;
1140 
1141 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1142 		if (err)
1143 			goto out_unlock;
1144 
1145 	} else {
1146 		/*
1147 		 *	1003.1g breaking connected state with AF_UNSPEC
1148 		 */
1149 		other = NULL;
1150 		unix_state_double_lock(sk, other);
1151 	}
1152 
1153 	/*
1154 	 * If it was connected, reconnect.
1155 	 */
1156 	if (unix_peer(sk)) {
1157 		struct sock *old_peer = unix_peer(sk);
1158 		unix_peer(sk) = other;
1159 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1160 
1161 		unix_state_double_unlock(sk, other);
1162 
1163 		if (other != old_peer)
1164 			unix_dgram_disconnected(sk, old_peer);
1165 		sock_put(old_peer);
1166 	} else {
1167 		unix_peer(sk) = other;
1168 		unix_state_double_unlock(sk, other);
1169 	}
1170 	return 0;
1171 
1172 out_unlock:
1173 	unix_state_double_unlock(sk, other);
1174 	sock_put(other);
1175 out:
1176 	return err;
1177 }
1178 
1179 static long unix_wait_for_peer(struct sock *other, long timeo)
1180 {
1181 	struct unix_sock *u = unix_sk(other);
1182 	int sched;
1183 	DEFINE_WAIT(wait);
1184 
1185 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1186 
1187 	sched = !sock_flag(other, SOCK_DEAD) &&
1188 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1189 		unix_recvq_full(other);
1190 
1191 	unix_state_unlock(other);
1192 
1193 	if (sched)
1194 		timeo = schedule_timeout(timeo);
1195 
1196 	finish_wait(&u->peer_wait, &wait);
1197 	return timeo;
1198 }
1199 
1200 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1201 			       int addr_len, int flags)
1202 {
1203 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1204 	struct sock *sk = sock->sk;
1205 	struct net *net = sock_net(sk);
1206 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1207 	struct sock *newsk = NULL;
1208 	struct sock *other = NULL;
1209 	struct sk_buff *skb = NULL;
1210 	unsigned int hash;
1211 	int st;
1212 	int err;
1213 	long timeo;
1214 
1215 	err = unix_mkname(sunaddr, addr_len, &hash);
1216 	if (err < 0)
1217 		goto out;
1218 	addr_len = err;
1219 
1220 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1221 	    (err = unix_autobind(sock)) != 0)
1222 		goto out;
1223 
1224 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1225 
1226 	/* First of all allocate resources.
1227 	   If we allocate after the state is locked, we will
1228 	   have to recheck everything again in any case.
1229 	 */
1230 
1231 	err = -ENOMEM;
1232 
1233 	/* create new sock for complete connection */
1234 	newsk = unix_create1(sock_net(sk), NULL, 0);
1235 	if (newsk == NULL)
1236 		goto out;
1237 
1238 	/* Allocate skb for sending to listening sock */
1239 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1240 	if (skb == NULL)
1241 		goto out;
1242 
1243 restart:
1244 	/*  Find listening sock. */
1245 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1246 	if (!other)
1247 		goto out;
1248 
1249 	/* Latch state of peer */
1250 	unix_state_lock(other);
1251 
1252 	/* Apparently VFS overslept socket death. Retry. */
1253 	if (sock_flag(other, SOCK_DEAD)) {
1254 		unix_state_unlock(other);
1255 		sock_put(other);
1256 		goto restart;
1257 	}
1258 
1259 	err = -ECONNREFUSED;
1260 	if (other->sk_state != TCP_LISTEN)
1261 		goto out_unlock;
1262 	if (other->sk_shutdown & RCV_SHUTDOWN)
1263 		goto out_unlock;
1264 
1265 	if (unix_recvq_full(other)) {
1266 		err = -EAGAIN;
1267 		if (!timeo)
1268 			goto out_unlock;
1269 
1270 		timeo = unix_wait_for_peer(other, timeo);
1271 
1272 		err = sock_intr_errno(timeo);
1273 		if (signal_pending(current))
1274 			goto out;
1275 		sock_put(other);
1276 		goto restart;
1277 	}
1278 
1279 	/* Latch our state.
1280 
1281 	   This is a tricky place. We need to grab our state lock and cannot
1282 	   drop the lock on the peer. It is dangerous because a deadlock is
1283 	   possible. The connect-to-self case and simultaneous connect
1284 	   attempts are eliminated by checking socket state: other is
1285 	   TCP_LISTEN, and if sk is TCP_LISTEN we check this before
1286 	   attempting to grab the lock.
1287 
1288 	   Well, and we have to recheck the state after the socket is locked.
1289 	 */
1290 	st = sk->sk_state;
1291 
1292 	switch (st) {
1293 	case TCP_CLOSE:
1294 		/* This is ok... continue with connect */
1295 		break;
1296 	case TCP_ESTABLISHED:
1297 		/* Socket is already connected */
1298 		err = -EISCONN;
1299 		goto out_unlock;
1300 	default:
1301 		err = -EINVAL;
1302 		goto out_unlock;
1303 	}
1304 
1305 	unix_state_lock_nested(sk);
1306 
1307 	if (sk->sk_state != st) {
1308 		unix_state_unlock(sk);
1309 		unix_state_unlock(other);
1310 		sock_put(other);
1311 		goto restart;
1312 	}
1313 
1314 	err = security_unix_stream_connect(sk, other, newsk);
1315 	if (err) {
1316 		unix_state_unlock(sk);
1317 		goto out_unlock;
1318 	}
1319 
1320 	/* The way is open! Quickly set all the necessary fields... */
1321 
1322 	sock_hold(sk);
1323 	unix_peer(newsk)	= sk;
1324 	newsk->sk_state		= TCP_ESTABLISHED;
1325 	newsk->sk_type		= sk->sk_type;
1326 	init_peercred(newsk);
1327 	newu = unix_sk(newsk);
1328 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1329 	otheru = unix_sk(other);
1330 
1331 	/* copy address information from the listening to the new sock */
1332 	if (otheru->addr) {
1333 		atomic_inc(&otheru->addr->refcnt);
1334 		newu->addr = otheru->addr;
1335 	}
1336 	if (otheru->path.dentry) {
1337 		path_get(&otheru->path);
1338 		newu->path = otheru->path;
1339 	}
1340 
1341 	/* Set credentials */
1342 	copy_peercred(sk, other);
1343 
1344 	sock->state	= SS_CONNECTED;
1345 	sk->sk_state	= TCP_ESTABLISHED;
1346 	sock_hold(newsk);
1347 
1348 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1349 	unix_peer(sk)	= newsk;
1350 
1351 	unix_state_unlock(sk);
1352 
1353 	/* take ten and send info to the listening sock */
1354 	spin_lock(&other->sk_receive_queue.lock);
1355 	__skb_queue_tail(&other->sk_receive_queue, skb);
1356 	spin_unlock(&other->sk_receive_queue.lock);
1357 	unix_state_unlock(other);
1358 	other->sk_data_ready(other);
1359 	sock_put(other);
1360 	return 0;
1361 
1362 out_unlock:
1363 	if (other)
1364 		unix_state_unlock(other);
1365 
1366 out:
1367 	kfree_skb(skb);
1368 	if (newsk)
1369 		unix_release_sock(newsk, 0);
1370 	if (other)
1371 		sock_put(other);
1372 	return err;
1373 }
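/*
 * Note on the backlog handling above: with O_NONBLOCK set, the sndtimeo is
 * zero, so connect() to a listener whose receive queue is full fails with
 * EAGAIN instead of sleeping in unix_wait_for_peer(). A hedged userspace
 * sketch:
 *
 *	fcntl(fd, F_SETFL, O_NONBLOCK);
 *	if (connect(fd, (struct sockaddr *)&sun, len) < 0 &&
 *	    errno == EAGAIN)
 *		;	-- server backlog full, retry later
 */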
1374 
1375 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1376 {
1377 	struct sock *ska = socka->sk, *skb = sockb->sk;
1378 
1379 	/* Join our sockets back to back */
1380 	sock_hold(ska);
1381 	sock_hold(skb);
1382 	unix_peer(ska) = skb;
1383 	unix_peer(skb) = ska;
1384 	init_peercred(ska);
1385 	init_peercred(skb);
1386 
1387 	if (ska->sk_type != SOCK_DGRAM) {
1388 		ska->sk_state = TCP_ESTABLISHED;
1389 		skb->sk_state = TCP_ESTABLISHED;
1390 		socka->state  = SS_CONNECTED;
1391 		sockb->state  = SS_CONNECTED;
1392 	}
1393 	return 0;
1394 }
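/*
 * Userspace counterpart of the above (a minimal sketch): both ends come
 * back already connected, with peer credentials initialized:
 *
 *	int sv[2];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		write(sv[0], "x", 1);
 *		read(sv[1], buf, 1);	-- buf is an assumed local buffer
 *	}
 */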
1395 
1396 static void unix_sock_inherit_flags(const struct socket *old,
1397 				    struct socket *new)
1398 {
1399 	if (test_bit(SOCK_PASSCRED, &old->flags))
1400 		set_bit(SOCK_PASSCRED, &new->flags);
1401 	if (test_bit(SOCK_PASSSEC, &old->flags))
1402 		set_bit(SOCK_PASSSEC, &new->flags);
1403 }
1404 
1405 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1406 		       bool kern)
1407 {
1408 	struct sock *sk = sock->sk;
1409 	struct sock *tsk;
1410 	struct sk_buff *skb;
1411 	int err;
1412 
1413 	err = -EOPNOTSUPP;
1414 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1415 		goto out;
1416 
1417 	err = -EINVAL;
1418 	if (sk->sk_state != TCP_LISTEN)
1419 		goto out;
1420 
1421 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1422 	 * so that no locks are necessary.
1423 	 */
1424 
1425 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1426 	if (!skb) {
1427 		/* This means receive shutdown. */
1428 		if (err == 0)
1429 			err = -EINVAL;
1430 		goto out;
1431 	}
1432 
1433 	tsk = skb->sk;
1434 	skb_free_datagram(sk, skb);
1435 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1436 
1437 	/* attach accepted sock to socket */
1438 	unix_state_lock(tsk);
1439 	newsock->state = SS_CONNECTED;
1440 	unix_sock_inherit_flags(sock, newsock);
1441 	sock_graft(tsk, newsock);
1442 	unix_state_unlock(tsk);
1443 	return 0;
1444 
1445 out:
1446 	return err;
1447 }
1448 
1449 
1450 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1451 {
1452 	struct sock *sk = sock->sk;
1453 	struct unix_sock *u;
1454 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1455 	int err = 0;
1456 
1457 	if (peer) {
1458 		sk = unix_peer_get(sk);
1459 
1460 		err = -ENOTCONN;
1461 		if (!sk)
1462 			goto out;
1463 		err = 0;
1464 	} else {
1465 		sock_hold(sk);
1466 	}
1467 
1468 	u = unix_sk(sk);
1469 	unix_state_lock(sk);
1470 	if (!u->addr) {
1471 		sunaddr->sun_family = AF_UNIX;
1472 		sunaddr->sun_path[0] = 0;
1473 		*uaddr_len = sizeof(short);
1474 	} else {
1475 		struct unix_address *addr = u->addr;
1476 
1477 		*uaddr_len = addr->len;
1478 		memcpy(sunaddr, addr->name, *uaddr_len);
1479 	}
1480 	unix_state_unlock(sk);
1481 	sock_put(sk);
1482 out:
1483 	return err;
1484 }
1485 
1486 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1487 {
1488 	int i;
1489 
1490 	scm->fp = UNIXCB(skb).fp;
1491 	UNIXCB(skb).fp = NULL;
1492 
1493 	for (i = scm->fp->count-1; i >= 0; i--)
1494 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
1495 }
1496 
1497 static void unix_destruct_scm(struct sk_buff *skb)
1498 {
1499 	struct scm_cookie scm;
1500 	memset(&scm, 0, sizeof(scm));
1501 	scm.pid  = UNIXCB(skb).pid;
1502 	if (UNIXCB(skb).fp)
1503 		unix_detach_fds(&scm, skb);
1504 
1505 	/* Alas, it calls VFS */
1506 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1507 	scm_destroy(&scm);
1508 	sock_wfree(skb);
1509 }
1510 
1511 /*
1512  * The "user->unix_inflight" variable is protected by the garbage
1513  * collection lock, and we just read it locklessly here. If you go
1514  * over the limit, there might be a tiny race in actually noticing
1515  * it across threads. Tough.
1516  */
1517 static inline bool too_many_unix_fds(struct task_struct *p)
1518 {
1519 	struct user_struct *user = current_user();
1520 
1521 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
1522 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1523 	return false;
1524 }
1525 
1526 #define MAX_RECURSION_LEVEL 4
1527 
1528 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1529 {
1530 	int i;
1531 	unsigned char max_level = 0;
1532 
1533 	if (too_many_unix_fds(current))
1534 		return -ETOOMANYREFS;
1535 
1536 	for (i = scm->fp->count - 1; i >= 0; i--) {
1537 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1538 
1539 		if (sk)
1540 			max_level = max(max_level,
1541 					unix_sk(sk)->recursion_level);
1542 	}
1543 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1544 		return -ETOOMANYREFS;
1545 
1546 	/*
1547 	 * Need to duplicate file references for the sake of garbage
1548 	 * collection.  Otherwise a socket in the fps might become a
1549 	 * candidate for GC while the skb is not yet queued.
1550 	 */
1551 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1552 	if (!UNIXCB(skb).fp)
1553 		return -ENOMEM;
1554 
1555 	for (i = scm->fp->count - 1; i >= 0; i--)
1556 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
1557 	return max_level;
1558 }
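/*
 * The userspace side of the fd passing handled above is SCM_RIGHTS; a
 * minimal (hedged) sender sketch, where "sock" and "fd_to_pass" are
 * assumed to exist:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = (void *)"x", .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sock, &msg, 0);
 */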
1559 
1560 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1561 {
1562 	int err = 0;
1563 
1564 	UNIXCB(skb).pid  = get_pid(scm->pid);
1565 	UNIXCB(skb).uid = scm->creds.uid;
1566 	UNIXCB(skb).gid = scm->creds.gid;
1567 	UNIXCB(skb).fp = NULL;
1568 	unix_get_secdata(scm, skb);
1569 	if (scm->fp && send_fds)
1570 		err = unix_attach_fds(scm, skb);
1571 
1572 	skb->destructor = unix_destruct_scm;
1573 	return err;
1574 }
1575 
1576 static bool unix_passcred_enabled(const struct socket *sock,
1577 				  const struct sock *other)
1578 {
1579 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1580 	       !other->sk_socket ||
1581 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1582 }
1583 
1584 /*
1585  * Some apps rely on write() giving SCM_CREDENTIALS.
1586  * We include credentials if the source or destination socket
1587  * asserted SOCK_PASSCRED.
1588  */
1589 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1590 			    const struct sock *other)
1591 {
1592 	if (UNIXCB(skb).pid)
1593 		return;
1594 	if (unix_passcred_enabled(sock, other)) {
1595 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1596 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1597 	}
1598 }
1599 
1600 static int maybe_init_creds(struct scm_cookie *scm,
1601 			    struct socket *socket,
1602 			    const struct sock *other)
1603 {
1604 	int err;
1605 	struct msghdr msg = { .msg_controllen = 0 };
1606 
1607 	err = scm_send(socket, &msg, scm, false);
1608 	if (err)
1609 		return err;
1610 
1611 	if (unix_passcred_enabled(socket, other)) {
1612 		scm->pid = get_pid(task_tgid(current));
1613 		current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1614 	}
1615 	return err;
1616 }
1617 
1618 static bool unix_skb_scm_eq(struct sk_buff *skb,
1619 			    struct scm_cookie *scm)
1620 {
1621 	const struct unix_skb_parms *u = &UNIXCB(skb);
1622 
1623 	return u->pid == scm->pid &&
1624 	       uid_eq(u->uid, scm->creds.uid) &&
1625 	       gid_eq(u->gid, scm->creds.gid) &&
1626 	       unix_secdata_eq(scm, skb);
1627 }
1628 
1629 /*
1630  *	Send AF_UNIX data.
1631  */
1632 
1633 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1634 			      size_t len)
1635 {
1636 	struct sock *sk = sock->sk;
1637 	struct net *net = sock_net(sk);
1638 	struct unix_sock *u = unix_sk(sk);
1639 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1640 	struct sock *other = NULL;
1641 	int namelen = 0; /* quiet a GCC uninitialized-variable warning */
1642 	int err;
1643 	unsigned int hash;
1644 	struct sk_buff *skb;
1645 	long timeo;
1646 	struct scm_cookie scm;
1647 	int max_level;
1648 	int data_len = 0;
1649 	int sk_locked;
1650 
1651 	wait_for_unix_gc();
1652 	err = scm_send(sock, msg, &scm, false);
1653 	if (err < 0)
1654 		return err;
1655 
1656 	err = -EOPNOTSUPP;
1657 	if (msg->msg_flags&MSG_OOB)
1658 		goto out;
1659 
1660 	if (msg->msg_namelen) {
1661 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1662 		if (err < 0)
1663 			goto out;
1664 		namelen = err;
1665 	} else {
1666 		sunaddr = NULL;
1667 		err = -ENOTCONN;
1668 		other = unix_peer_get(sk);
1669 		if (!other)
1670 			goto out;
1671 	}
1672 
1673 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1674 	    && (err = unix_autobind(sock)) != 0)
1675 		goto out;
1676 
1677 	err = -EMSGSIZE;
1678 	if (len > sk->sk_sndbuf - 32)
1679 		goto out;
1680 
1681 	if (len > SKB_MAX_ALLOC) {
1682 		data_len = min_t(size_t,
1683 				 len - SKB_MAX_ALLOC,
1684 				 MAX_SKB_FRAGS * PAGE_SIZE);
1685 		data_len = PAGE_ALIGN(data_len);
1686 
1687 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1688 	}
1689 
1690 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1691 				   msg->msg_flags & MSG_DONTWAIT, &err,
1692 				   PAGE_ALLOC_COSTLY_ORDER);
1693 	if (skb == NULL)
1694 		goto out;
1695 
1696 	err = unix_scm_to_skb(&scm, skb, true);
1697 	if (err < 0)
1698 		goto out_free;
1699 	max_level = err + 1;
1700 
1701 	skb_put(skb, len - data_len);
1702 	skb->data_len = data_len;
1703 	skb->len = len;
1704 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1705 	if (err)
1706 		goto out_free;
1707 
1708 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1709 
1710 restart:
1711 	if (!other) {
1712 		err = -ECONNRESET;
1713 		if (sunaddr == NULL)
1714 			goto out_free;
1715 
1716 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1717 					hash, &err);
1718 		if (other == NULL)
1719 			goto out_free;
1720 	}
1721 
1722 	if (sk_filter(other, skb) < 0) {
1723 		/* Toss the packet but do not return any error to the sender */
1724 		err = len;
1725 		goto out_free;
1726 	}
1727 
1728 	sk_locked = 0;
1729 	unix_state_lock(other);
1730 restart_locked:
1731 	err = -EPERM;
1732 	if (!unix_may_send(sk, other))
1733 		goto out_unlock;
1734 
1735 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
1736 		/*
1737 		 *	Check with 1003.1g - what should a
1738 		 *	datagram error return here?
1739 		 */
1740 		unix_state_unlock(other);
1741 		sock_put(other);
1742 
1743 		if (!sk_locked)
1744 			unix_state_lock(sk);
1745 
1746 		err = 0;
1747 		if (unix_peer(sk) == other) {
1748 			unix_peer(sk) = NULL;
1749 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1750 
1751 			unix_state_unlock(sk);
1752 
1753 			unix_dgram_disconnected(sk, other);
1754 			sock_put(other);
1755 			err = -ECONNREFUSED;
1756 		} else {
1757 			unix_state_unlock(sk);
1758 		}
1759 
1760 		other = NULL;
1761 		if (err)
1762 			goto out_free;
1763 		goto restart;
1764 	}
1765 
1766 	err = -EPIPE;
1767 	if (other->sk_shutdown & RCV_SHUTDOWN)
1768 		goto out_unlock;
1769 
1770 	if (sk->sk_type != SOCK_SEQPACKET) {
1771 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1772 		if (err)
1773 			goto out_unlock;
1774 	}
1775 
1776 	/* other == sk && unix_peer(other) != sk if
1777 	 * - unix_peer(sk) == NULL, destination address bound to sk
1778 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
1779 	 */
1780 	if (other != sk &&
1781 	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1782 		if (timeo) {
1783 			timeo = unix_wait_for_peer(other, timeo);
1784 
1785 			err = sock_intr_errno(timeo);
1786 			if (signal_pending(current))
1787 				goto out_free;
1788 
1789 			goto restart;
1790 		}
1791 
1792 		if (!sk_locked) {
1793 			unix_state_unlock(other);
1794 			unix_state_double_lock(sk, other);
1795 		}
1796 
1797 		if (unix_peer(sk) != other ||
1798 		    unix_dgram_peer_wake_me(sk, other)) {
1799 			err = -EAGAIN;
1800 			sk_locked = 1;
1801 			goto out_unlock;
1802 		}
1803 
1804 		if (!sk_locked) {
1805 			sk_locked = 1;
1806 			goto restart_locked;
1807 		}
1808 	}
1809 
1810 	if (unlikely(sk_locked))
1811 		unix_state_unlock(sk);
1812 
1813 	if (sock_flag(other, SOCK_RCVTSTAMP))
1814 		__net_timestamp(skb);
1815 	maybe_add_creds(skb, sock, other);
1816 	skb_queue_tail(&other->sk_receive_queue, skb);
1817 	if (max_level > unix_sk(other)->recursion_level)
1818 		unix_sk(other)->recursion_level = max_level;
1819 	unix_state_unlock(other);
1820 	other->sk_data_ready(other);
1821 	sock_put(other);
1822 	scm_destroy(&scm);
1823 	return len;
1824 
1825 out_unlock:
1826 	if (sk_locked)
1827 		unix_state_unlock(sk);
1828 	unix_state_unlock(other);
1829 out_free:
1830 	kfree_skb(skb);
1831 out:
1832 	if (other)
1833 		sock_put(other);
1834 	scm_destroy(&scm);
1835 	return err;
1836 }
1837 
1838 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1839  * bytes, and a minimum of a full page.
1840  */
1841 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1842 
1843 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1844 			       size_t len)
1845 {
1846 	struct sock *sk = sock->sk;
1847 	struct sock *other = NULL;
1848 	int err, size;
1849 	struct sk_buff *skb;
1850 	int sent = 0;
1851 	struct scm_cookie scm;
1852 	bool fds_sent = false;
1853 	int max_level;
1854 	int data_len;
1855 
1856 	wait_for_unix_gc();
1857 	err = scm_send(sock, msg, &scm, false);
1858 	if (err < 0)
1859 		return err;
1860 
1861 	err = -EOPNOTSUPP;
1862 	if (msg->msg_flags&MSG_OOB)
1863 		goto out_err;
1864 
1865 	if (msg->msg_namelen) {
1866 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1867 		goto out_err;
1868 	} else {
1869 		err = -ENOTCONN;
1870 		other = unix_peer(sk);
1871 		if (!other)
1872 			goto out_err;
1873 	}
1874 
1875 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1876 		goto pipe_err;
1877 
1878 	while (sent < len) {
1879 		size = len - sent;
1880 
1881 		/* Keep two messages in the pipe so it schedules better */
1882 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1883 
1884 		/* allow fallback to order-0 allocations */
1885 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1886 
1887 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1888 
1889 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1890 
1891 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1892 					   msg->msg_flags & MSG_DONTWAIT, &err,
1893 					   get_order(UNIX_SKB_FRAGS_SZ));
1894 		if (!skb)
1895 			goto out_err;
1896 
1897 		/* Only send the fds in the first buffer */
1898 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1899 		if (err < 0) {
1900 			kfree_skb(skb);
1901 			goto out_err;
1902 		}
1903 		max_level = err + 1;
1904 		fds_sent = true;
1905 
1906 		skb_put(skb, size - data_len);
1907 		skb->data_len = data_len;
1908 		skb->len = size;
1909 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1910 		if (err) {
1911 			kfree_skb(skb);
1912 			goto out_err;
1913 		}
1914 
1915 		unix_state_lock(other);
1916 
1917 		if (sock_flag(other, SOCK_DEAD) ||
1918 		    (other->sk_shutdown & RCV_SHUTDOWN))
1919 			goto pipe_err_free;
1920 
1921 		maybe_add_creds(skb, sock, other);
1922 		skb_queue_tail(&other->sk_receive_queue, skb);
1923 		if (max_level > unix_sk(other)->recursion_level)
1924 			unix_sk(other)->recursion_level = max_level;
1925 		unix_state_unlock(other);
1926 		other->sk_data_ready(other);
1927 		sent += size;
1928 	}
1929 
1930 	scm_destroy(&scm);
1931 
1932 	return sent;
1933 
1934 pipe_err_free:
1935 	unix_state_unlock(other);
1936 	kfree_skb(skb);
1937 pipe_err:
1938 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1939 		send_sig(SIGPIPE, current, 0);
1940 	err = -EPIPE;
1941 out_err:
1942 	scm_destroy(&scm);
1943 	return sent ? : err;
1944 }
1945 
1946 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1947 				    int offset, size_t size, int flags)
1948 {
1949 	int err;
1950 	bool send_sigpipe = false;
1951 	bool init_scm = true;
1952 	struct scm_cookie scm;
1953 	struct sock *other, *sk = socket->sk;
1954 	struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1955 
1956 	if (flags & MSG_OOB)
1957 		return -EOPNOTSUPP;
1958 
1959 	other = unix_peer(sk);
1960 	if (!other || sk->sk_state != TCP_ESTABLISHED)
1961 		return -ENOTCONN;
1962 
1963 	if (false) {
1964 alloc_skb:
1965 		unix_state_unlock(other);
1966 		mutex_unlock(&unix_sk(other)->iolock);
1967 		newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1968 					      &err, 0);
1969 		if (!newskb)
1970 			goto err;
1971 	}
1972 
1973 	/* We must acquire the iolock, as we modify skbs already
1974 	 * present in the sk_receive_queue and mess with skb->len.
1975 	 */
1976 	err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1977 	if (err) {
1978 		err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1979 		goto err;
1980 	}
1981 
1982 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
1983 		err = -EPIPE;
1984 		send_sigpipe = true;
1985 		goto err_unlock;
1986 	}
1987 
1988 	unix_state_lock(other);
1989 
1990 	if (sock_flag(other, SOCK_DEAD) ||
1991 	    other->sk_shutdown & RCV_SHUTDOWN) {
1992 		err = -EPIPE;
1993 		send_sigpipe = true;
1994 		goto err_state_unlock;
1995 	}
1996 
1997 	if (init_scm) {
1998 		err = maybe_init_creds(&scm, socket, other);
1999 		if (err)
2000 			goto err_state_unlock;
2001 		init_scm = false;
2002 	}
2003 
2004 	skb = skb_peek_tail(&other->sk_receive_queue);
2005 	if (tail && tail == skb) {
2006 		skb = newskb;
2007 	} else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2008 		if (newskb) {
2009 			skb = newskb;
2010 		} else {
2011 			tail = skb;
2012 			goto alloc_skb;
2013 		}
2014 	} else if (newskb) {
2015 		/* This is the fast path: the tail skb can take more data
2016 		 * directly, so the speculatively allocated newskb is not
2017 		 * needed; release it (consume_skb() tolerates NULL anyway).
2018 		 */
2019 		consume_skb(newskb);
2020 		newskb = NULL;
2021 	}
2022 
2023 	if (skb_append_pagefrags(skb, page, offset, size)) {
2024 		tail = skb;
2025 		goto alloc_skb;
2026 	}
2027 
2028 	skb->len += size;
2029 	skb->data_len += size;
2030 	skb->truesize += size;
2031 	atomic_add(size, &sk->sk_wmem_alloc);
2032 
2033 	if (newskb) {
2034 		err = unix_scm_to_skb(&scm, skb, false);
2035 		if (err)
2036 			goto err_state_unlock;
2037 		spin_lock(&other->sk_receive_queue.lock);
2038 		__skb_queue_tail(&other->sk_receive_queue, newskb);
2039 		spin_unlock(&other->sk_receive_queue.lock);
2040 	}
2041 
2042 	unix_state_unlock(other);
2043 	mutex_unlock(&unix_sk(other)->iolock);
2044 
2045 	other->sk_data_ready(other);
2046 	scm_destroy(&scm);
2047 	return size;
2048 
2049 err_state_unlock:
2050 	unix_state_unlock(other);
2051 err_unlock:
2052 	mutex_unlock(&unix_sk(other)->iolock);
2053 err:
2054 	kfree_skb(newskb);
2055 	if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2056 		send_sig(SIGPIPE, current, 0);
2057 	if (!init_scm)
2058 		scm_destroy(&scm);
2059 	return err;
2060 }
2061 
2062 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2063 				  size_t len)
2064 {
2065 	int err;
2066 	struct sock *sk = sock->sk;
2067 
2068 	err = sock_error(sk);
2069 	if (err)
2070 		return err;
2071 
2072 	if (sk->sk_state != TCP_ESTABLISHED)
2073 		return -ENOTCONN;
2074 
2075 	if (msg->msg_namelen)
2076 		msg->msg_namelen = 0;
2077 
2078 	return unix_dgram_sendmsg(sock, msg, len);
2079 }
2080 
2081 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2082 				  size_t size, int flags)
2083 {
2084 	struct sock *sk = sock->sk;
2085 
2086 	if (sk->sk_state != TCP_ESTABLISHED)
2087 		return -ENOTCONN;
2088 
2089 	return unix_dgram_recvmsg(sock, msg, size, flags);
2090 }
2091 
2092 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2093 {
2094 	struct unix_sock *u = unix_sk(sk);
2095 
2096 	if (u->addr) {
2097 		msg->msg_namelen = u->addr->len;
2098 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
2099 	}
2100 }
2101 
2102 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2103 			      size_t size, int flags)
2104 {
2105 	struct scm_cookie scm;
2106 	struct sock *sk = sock->sk;
2107 	struct unix_sock *u = unix_sk(sk);
2108 	struct sk_buff *skb, *last;
2109 	long timeo;
2110 	int err;
2111 	int peeked, skip;
2112 
2113 	err = -EOPNOTSUPP;
2114 	if (flags&MSG_OOB)
2115 		goto out;
2116 
2117 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2118 
2119 	do {
2120 		mutex_lock(&u->iolock);
2121 
2122 		skip = sk_peek_offset(sk, flags);
2123 		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
2124 					      &err, &last);
2125 		if (skb)
2126 			break;
2127 
2128 		mutex_unlock(&u->iolock);
2129 
2130 		if (err != -EAGAIN)
2131 			break;
2132 	} while (timeo &&
2133 		 !__skb_wait_for_more_packets(sk, &err, &timeo, last));
2134 
2135 	if (!skb) { /* implies iolock unlocked */
2136 		unix_state_lock(sk);
2137 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2138 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2139 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2140 			err = 0;
2141 		unix_state_unlock(sk);
2142 		goto out;
2143 	}
2144 
2145 	if (wq_has_sleeper(&u->peer_wait))
2146 		wake_up_interruptible_sync_poll(&u->peer_wait,
2147 						POLLOUT | POLLWRNORM |
2148 						POLLWRBAND);
2149 
2150 	if (msg->msg_name)
2151 		unix_copy_addr(msg, skb->sk);
2152 
2153 	if (size > skb->len - skip)
2154 		size = skb->len - skip;
2155 	else if (size < skb->len - skip)
2156 		msg->msg_flags |= MSG_TRUNC;
2157 
2158 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2159 	if (err)
2160 		goto out_free;
2161 
2162 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2163 		__sock_recv_timestamp(msg, sk, skb);
2164 
2165 	memset(&scm, 0, sizeof(scm));
2166 
2167 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2168 	unix_set_secdata(&scm, skb);
2169 
2170 	if (!(flags & MSG_PEEK)) {
2171 		if (UNIXCB(skb).fp)
2172 			unix_detach_fds(&scm, skb);
2173 
2174 		sk_peek_offset_bwd(sk, skb->len);
2175 	} else {
2176 		/* It is questionable: on PEEK we could:
2177 		   - not return fds - good, but too simple 8)
2178 		   - return fds, but not return them again on a
2179 		     subsequent read (the old strategy, apparently wrong)
2180 		   - clone fds (chosen for now, as the most universal
2181 		     solution)
2182 
2183 		   POSIX 1003.1g does not actually define this clearly
2184 		   at all; then again, POSIX 1003.1g doesn't define a
2185 		   lot of things clearly!
2186 
2187 		*/
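
		/* Note (not in the original source): with the cloning
		 * strategy a MSG_PEEK receiver gets its own references to
		 * any passed files and must close them, exactly as after a
		 * normal receive.
		 */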
2188 
2189 		sk_peek_offset_fwd(sk, size);
2190 
2191 		if (UNIXCB(skb).fp)
2192 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2193 	}
2194 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2195 
2196 	scm_recv(sock, msg, &scm, flags);
2197 
2198 out_free:
2199 	skb_free_datagram(sk, skb);
2200 	mutex_unlock(&u->iolock);
2201 out:
2202 	return err;
2203 }
2204 
2205 /*
2206  *	Sleep until more data has arrived, but check for races.
2207  */
2208 static long unix_stream_data_wait(struct sock *sk, long timeo,
2209 				  struct sk_buff *last, unsigned int last_len,
2210 				  bool freezable)
2211 {
2212 	struct sk_buff *tail;
2213 	DEFINE_WAIT(wait);
2214 
2215 	unix_state_lock(sk);
2216 
2217 	for (;;) {
2218 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2219 
2220 		tail = skb_peek_tail(&sk->sk_receive_queue);
2221 		if (tail != last ||
2222 		    (tail && tail->len != last_len) ||
2223 		    sk->sk_err ||
2224 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2225 		    signal_pending(current) ||
2226 		    !timeo)
2227 			break;
2228 
2229 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2230 		unix_state_unlock(sk);
2231 		if (freezable)
2232 			timeo = freezable_schedule_timeout(timeo);
2233 		else
2234 			timeo = schedule_timeout(timeo);
2235 		unix_state_lock(sk);
2236 
2237 		if (sock_flag(sk, SOCK_DEAD))
2238 			break;
2239 
2240 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2241 	}
2242 
2243 	finish_wait(sk_sleep(sk), &wait);
2244 	unix_state_unlock(sk);
2245 	return timeo;
2246 }
2247 
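/* Descriptive note (not in the original source): stream reads track
 * partial consumption in UNIXCB(skb).consumed instead of trimming the
 * skb, so unix_skb_len() is what is still unread in this skb.
 */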
2248 static unsigned int unix_skb_len(const struct sk_buff *skb)
2249 {
2250 	return skb->len - UNIXCB(skb).consumed;
2251 }
2252 
2253 struct unix_stream_read_state {
2254 	int (*recv_actor)(struct sk_buff *, int, int,
2255 			  struct unix_stream_read_state *);
2256 	struct socket *socket;
2257 	struct msghdr *msg;
2258 	struct pipe_inode_info *pipe;
2259 	size_t size;
2260 	int flags;
2261 	unsigned int splice_flags;
2262 };
2263 
2264 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2265 				    bool freezable)
2266 {
2267 	struct scm_cookie scm;
2268 	struct socket *sock = state->socket;
2269 	struct sock *sk = sock->sk;
2270 	struct unix_sock *u = unix_sk(sk);
2271 	int copied = 0;
2272 	int flags = state->flags;
2273 	int noblock = flags & MSG_DONTWAIT;
2274 	bool check_creds = false;
2275 	int target;
2276 	int err = 0;
2277 	long timeo;
2278 	int skip;
2279 	size_t size = state->size;
2280 	unsigned int last_len;
2281 
2282 	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2283 		err = -EINVAL;
2284 		goto out;
2285 	}
2286 
2287 	if (unlikely(flags & MSG_OOB)) {
2288 		err = -EOPNOTSUPP;
2289 		goto out;
2290 	}
2291 
2292 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2293 	timeo = sock_rcvtimeo(sk, noblock);
2294 
2295 	memset(&scm, 0, sizeof(scm));
2296 
2297 	/* Lock the socket to prevent the queue from being read out of
2298 	 * order while we sleep in the copy to the message.
2299 	 */
2300 	mutex_lock(&u->iolock);
2301 
2302 	if (flags & MSG_PEEK)
2303 		skip = sk_peek_offset(sk, flags);
2304 	else
2305 		skip = 0;
2306 
2307 	do {
2308 		int chunk;
2309 		bool drop_skb;
2310 		struct sk_buff *skb, *last;
2311 
2312 redo:
2313 		unix_state_lock(sk);
2314 		if (sock_flag(sk, SOCK_DEAD)) {
2315 			err = -ECONNRESET;
2316 			goto unlock;
2317 		}
2318 		last = skb = skb_peek(&sk->sk_receive_queue);
2319 		last_len = last ? last->len : 0;
2320 again:
2321 		if (skb == NULL) {
2322 			unix_sk(sk)->recursion_level = 0;
2323 			if (copied >= target)
2324 				goto unlock;
2325 
2326 			/*
2327 			 *	POSIX 1003.1g mandates this order.
2328 			 */
2329 
2330 			err = sock_error(sk);
2331 			if (err)
2332 				goto unlock;
2333 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2334 				goto unlock;
2335 
2336 			unix_state_unlock(sk);
2337 			if (!timeo) {
2338 				err = -EAGAIN;
2339 				break;
2340 			}
2341 
2342 			mutex_unlock(&u->iolock);
2343 
2344 			timeo = unix_stream_data_wait(sk, timeo, last,
2345 						      last_len, freezable);
2346 
2347 			if (signal_pending(current)) {
2348 				err = sock_intr_errno(timeo);
2349 				scm_destroy(&scm);
2350 				goto out;
2351 			}
2352 
2353 			mutex_lock(&u->iolock);
2354 			goto redo;
2355 unlock:
2356 			unix_state_unlock(sk);
2357 			break;
2358 		}
2359 
2360 		while (skip >= unix_skb_len(skb)) {
2361 			skip -= unix_skb_len(skb);
2362 			last = skb;
2363 			last_len = skb->len;
2364 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2365 			if (!skb)
2366 				goto again;
2367 		}
2368 
2369 		unix_state_unlock(sk);
2370 
2371 		if (check_creds) {
2372 			/* Never glue messages from different writers */
2373 			if (!unix_skb_scm_eq(skb, &scm))
2374 				break;
2375 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2376 			/* Copy credentials */
2377 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2378 			unix_set_secdata(&scm, skb);
2379 			check_creds = true;
2380 		}
2381 
2382 		/* Copy address just once */
2383 		if (state->msg && state->msg->msg_name) {
2384 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2385 					 state->msg->msg_name);
2386 			unix_copy_addr(state->msg, skb->sk);
2387 			sunaddr = NULL;
2388 		}
2389 
2390 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2391 		skb_get(skb);
2392 		chunk = state->recv_actor(skb, skip, chunk, state);
2393 		drop_skb = !unix_skb_len(skb);
2394 		/* skb is only safe to use if !drop_skb */
2395 		consume_skb(skb);
2396 		if (chunk < 0) {
2397 			if (copied == 0)
2398 				copied = -EFAULT;
2399 			break;
2400 		}
2401 		copied += chunk;
2402 		size -= chunk;
2403 
2404 		if (drop_skb) {
2405 			/* The skb was touched by a concurrent reader;
2406 			 * we should not expect anything from this skb
2407 			 * any more and must assume it invalid - we can
2408 			 * be sure it was dropped from the socket queue.
2409 			 *
2410 			 * Let's report a short read.
2411 			 */
2412 			err = 0;
2413 			break;
2414 		}
2415 
2416 		/* Mark read part of skb as used */
2417 		if (!(flags & MSG_PEEK)) {
2418 			UNIXCB(skb).consumed += chunk;
2419 
2420 			sk_peek_offset_bwd(sk, chunk);
2421 
2422 			if (UNIXCB(skb).fp)
2423 				unix_detach_fds(&scm, skb);
2424 
2425 			if (unix_skb_len(skb))
2426 				break;
2427 
2428 			skb_unlink(skb, &sk->sk_receive_queue);
2429 			consume_skb(skb);
2430 
2431 			if (scm.fp)
2432 				break;
2433 		} else {
2434 			/* It is questionable, see note in unix_dgram_recvmsg.
2435 			 */
2436 			if (UNIXCB(skb).fp)
2437 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2438 
2439 			sk_peek_offset_fwd(sk, chunk);
2440 
2441 			if (UNIXCB(skb).fp)
2442 				break;
2443 
2444 			skip = 0;
2445 			last = skb;
2446 			last_len = skb->len;
2447 			unix_state_lock(sk);
2448 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2449 			if (skb)
2450 				goto again;
2451 			unix_state_unlock(sk);
2452 			break;
2453 		}
2454 	} while (size);
2455 
2456 	mutex_unlock(&u->iolock);
2457 	if (state->msg)
2458 		scm_recv(sock, state->msg, &scm, flags);
2459 	else
2460 		scm_destroy(&scm);
2461 out:
2462 	return copied ? : err;
2463 }
2464 
2465 static int unix_stream_read_actor(struct sk_buff *skb,
2466 				  int skip, int chunk,
2467 				  struct unix_stream_read_state *state)
2468 {
2469 	int ret;
2470 
2471 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2472 				    state->msg, chunk);
2473 	return ret ?: chunk;
2474 }
2475 
2476 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2477 			       size_t size, int flags)
2478 {
2479 	struct unix_stream_read_state state = {
2480 		.recv_actor = unix_stream_read_actor,
2481 		.socket = sock,
2482 		.msg = msg,
2483 		.size = size,
2484 		.flags = flags
2485 	};
2486 
2487 	return unix_stream_read_generic(&state, true);
2488 }
2489 
2490 static int unix_stream_splice_actor(struct sk_buff *skb,
2491 				    int skip, int chunk,
2492 				    struct unix_stream_read_state *state)
2493 {
2494 	return skb_splice_bits(skb, state->socket->sk,
2495 			       UNIXCB(skb).consumed + skip,
2496 			       state->pipe, chunk, state->splice_flags);
2497 }
2498 
2499 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
2500 				       struct pipe_inode_info *pipe,
2501 				       size_t size, unsigned int flags)
2502 {
2503 	struct unix_stream_read_state state = {
2504 		.recv_actor = unix_stream_splice_actor,
2505 		.socket = sock,
2506 		.pipe = pipe,
2507 		.size = size,
2508 		.splice_flags = flags,
2509 	};
2510 
2511 	if (unlikely(*ppos))
2512 		return -ESPIPE;
2513 
2514 	if (sock->file->f_flags & O_NONBLOCK ||
2515 	    flags & SPLICE_F_NONBLOCK)
2516 		state.flags = MSG_DONTWAIT;
2517 
2518 	return unix_stream_read_generic(&state, false);
2519 }
2520 
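/* Illustrative userspace view (not part of the original source): on a
 * connected SOCK_STREAM pair,
 *
 *	shutdown(fd, SHUT_WR);
 *
 * sets SEND_SHUTDOWN on fd's sock and, below, RCV_SHUTDOWN on the peer,
 * so the peer's read() returns 0 (EOF) once its queue drains, while fd
 * can still receive.
 */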
2521 static int unix_shutdown(struct socket *sock, int mode)
2522 {
2523 	struct sock *sk = sock->sk;
2524 	struct sock *other;
2525 
2526 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2527 		return -EINVAL;
2528 	/* This maps:
2529 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2530 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2531 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2532 	 */
2533 	++mode;
2534 
2535 	unix_state_lock(sk);
2536 	sk->sk_shutdown |= mode;
2537 	other = unix_peer(sk);
2538 	if (other)
2539 		sock_hold(other);
2540 	unix_state_unlock(sk);
2541 	sk->sk_state_change(sk);
2542 
2543 	if (other &&
2544 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2545 
2546 		int peer_mode = 0;
2547 
2548 		if (mode&RCV_SHUTDOWN)
2549 			peer_mode |= SEND_SHUTDOWN;
2550 		if (mode&SEND_SHUTDOWN)
2551 			peer_mode |= RCV_SHUTDOWN;
2552 		unix_state_lock(other);
2553 		other->sk_shutdown |= peer_mode;
2554 		unix_state_unlock(other);
2555 		other->sk_state_change(other);
2556 		if (peer_mode == SHUTDOWN_MASK)
2557 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2558 		else if (peer_mode & RCV_SHUTDOWN)
2559 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2560 	}
2561 	if (other)
2562 		sock_put(other);
2563 
2564 	return 0;
2565 }
2566 
2567 long unix_inq_len(struct sock *sk)
2568 {
2569 	struct sk_buff *skb;
2570 	long amount = 0;
2571 
2572 	if (sk->sk_state == TCP_LISTEN)
2573 		return -EINVAL;
2574 
2575 	spin_lock(&sk->sk_receive_queue.lock);
2576 	if (sk->sk_type == SOCK_STREAM ||
2577 	    sk->sk_type == SOCK_SEQPACKET) {
2578 		skb_queue_walk(&sk->sk_receive_queue, skb)
2579 			amount += unix_skb_len(skb);
2580 	} else {
2581 		skb = skb_peek(&sk->sk_receive_queue);
2582 		if (skb)
2583 			amount = skb->len;
2584 	}
2585 	spin_unlock(&sk->sk_receive_queue.lock);
2586 
2587 	return amount;
2588 }
2589 EXPORT_SYMBOL_GPL(unix_inq_len);
2590 
2591 long unix_outq_len(struct sock *sk)
2592 {
2593 	return sk_wmem_alloc_get(sk);
2594 }
2595 EXPORT_SYMBOL_GPL(unix_outq_len);
2596 
2597 static int unix_open_file(struct sock *sk)
2598 {
2599 	struct path path;
2600 	struct file *f;
2601 	int fd;
2602 
2603 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2604 		return -EPERM;
2605 
2606 	unix_state_lock(sk);
2607 	path = unix_sk(sk)->path;
2608 	if (!path.dentry) {
2609 		unix_state_unlock(sk);
2610 		return -ENOENT;
2611 	}
2612 
2613 	path_get(&path);
2614 	unix_state_unlock(sk);
2615 
2616 	fd = get_unused_fd_flags(O_CLOEXEC);
2617 	if (fd < 0)
2618 		goto out;
2619 
2620 	f = dentry_open(&path, O_PATH, current_cred());
2621 	if (IS_ERR(f)) {
2622 		put_unused_fd(fd);
2623 		fd = PTR_ERR(f);
2624 		goto out;
2625 	}
2626 
2627 	fd_install(fd, f);
2628 out:
2629 	path_put(&path);
2630 
2631 	return fd;
2632 }
2633 
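/* Illustrative only (not in the original source): userspace can query
 * the queue lengths handled below with, e.g.:
 *
 *	int inq, outq;
 *	if (ioctl(fd, SIOCINQ, &inq) == 0 &&
 *	    ioctl(fd, SIOCOUTQ, &outq) == 0)
 *		printf("rx queued %d, tx unread by peer %d\n", inq, outq);
 *
 * For SOCK_STREAM, SIOCINQ sums the unread bytes of all queued skbs and
 * SIOCOUTQ reports sk_wmem_alloc, i.e. data the peer has not yet consumed.
 */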
2634 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2635 {
2636 	struct sock *sk = sock->sk;
2637 	long amount = 0;
2638 	int err;
2639 
2640 	switch (cmd) {
2641 	case SIOCOUTQ:
2642 		amount = unix_outq_len(sk);
2643 		err = put_user(amount, (int __user *)arg);
2644 		break;
2645 	case SIOCINQ:
2646 		amount = unix_inq_len(sk);
2647 		if (amount < 0)
2648 			err = amount;
2649 		else
2650 			err = put_user(amount, (int __user *)arg);
2651 		break;
2652 	case SIOCUNIXFILE:
2653 		err = unix_open_file(sk);
2654 		break;
2655 	default:
2656 		err = -ENOIOCTLCMD;
2657 		break;
2658 	}
2659 	return err;
2660 }
2661 
2662 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2663 {
2664 	struct sock *sk = sock->sk;
2665 	unsigned int mask;
2666 
2667 	sock_poll_wait(file, sk_sleep(sk), wait);
2668 	mask = 0;
2669 
2670 	/* exceptional events? */
2671 	if (sk->sk_err)
2672 		mask |= POLLERR;
2673 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2674 		mask |= POLLHUP;
2675 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2676 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2677 
2678 	/* readable? */
2679 	if (!skb_queue_empty(&sk->sk_receive_queue))
2680 		mask |= POLLIN | POLLRDNORM;
2681 
2682 	/* Connection-based sockets need to check for termination and startup */
2683 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2684 	    sk->sk_state == TCP_CLOSE)
2685 		mask |= POLLHUP;
2686 
2687 	/*
2688 	 * We set writable also when the other side has shut down the
2689 	 * connection. This prevents stuck sockets.
2690 	 */
2691 	if (unix_writable(sk))
2692 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2693 
2694 	return mask;
2695 }
2696 
2697 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2698 				    poll_table *wait)
2699 {
2700 	struct sock *sk = sock->sk, *other;
2701 	unsigned int mask, writable;
2702 
2703 	sock_poll_wait(file, sk_sleep(sk), wait);
2704 	mask = 0;
2705 
2706 	/* exceptional events? */
2707 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2708 		mask |= POLLERR |
2709 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2710 
2711 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2712 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2713 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2714 		mask |= POLLHUP;
2715 
2716 	/* readable? */
2717 	if (!skb_queue_empty(&sk->sk_receive_queue))
2718 		mask |= POLLIN | POLLRDNORM;
2719 
2720 	/* Connection-based sockets need to check for termination and startup */
2721 	if (sk->sk_type == SOCK_SEQPACKET) {
2722 		if (sk->sk_state == TCP_CLOSE)
2723 			mask |= POLLHUP;
2724 		/* connection hasn't started yet? */
2725 		if (sk->sk_state == TCP_SYN_SENT)
2726 			return mask;
2727 	}
2728 
2729 	/* No write status requested, avoid expensive OUT tests. */
2730 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2731 		return mask;
2732 
2733 	writable = unix_writable(sk);
2734 	if (writable) {
2735 		unix_state_lock(sk);
2736 
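		/* Descriptive note (not in the original source): a connected
		 * dgram sender stops being writable while the peer's queue is
		 * full; unix_dgram_peer_wake_me() also hooks this socket onto
		 * the peer's peer_wait queue so a later dequeue on the peer
		 * wakes this poller.
		 */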
2737 		other = unix_peer(sk);
2738 		if (other && unix_peer(other) != sk &&
2739 		    unix_recvq_full(other) &&
2740 		    unix_dgram_peer_wake_me(sk, other))
2741 			writable = 0;
2742 
2743 		unix_state_unlock(sk);
2744 	}
2745 
2746 	if (writable)
2747 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2748 	else
2749 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2750 
2751 	return mask;
2752 }
2753 
2754 #ifdef CONFIG_PROC_FS
2755 
2756 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2757 
2758 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2759 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2760 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
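
/* Illustrative note (not in the original source): the seq_file position
 * packs a bucket index and an intra-bucket offset into one loff_t.  The
 * "+ 1" accounts for the table having 2 * UNIX_HASH_SIZE buckets; with
 * UNIX_HASH_BITS == 8 and 64-bit longs, BUCKET_SPACE is 54, so e.g.
 * set_bucket_offset(2, 5) yields (2UL << 54) | 5, and get_bucket() /
 * get_offset() recover the pair.
 */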
2761 
2762 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2763 {
2764 	unsigned long offset = get_offset(*pos);
2765 	unsigned long bucket = get_bucket(*pos);
2766 	struct sock *sk;
2767 	unsigned long count = 0;
2768 
2769 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2770 		if (sock_net(sk) != seq_file_net(seq))
2771 			continue;
2772 		if (++count == offset)
2773 			break;
2774 	}
2775 
2776 	return sk;
2777 }
2778 
2779 static struct sock *unix_next_socket(struct seq_file *seq,
2780 				     struct sock *sk,
2781 				     loff_t *pos)
2782 {
2783 	unsigned long bucket;
2784 
2785 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2786 		sk = sk_next(sk);
2787 		if (!sk)
2788 			goto next_bucket;
2789 		if (sock_net(sk) == seq_file_net(seq))
2790 			return sk;
2791 	}
2792 
2793 	do {
2794 		sk = unix_from_bucket(seq, pos);
2795 		if (sk)
2796 			return sk;
2797 
2798 next_bucket:
2799 		bucket = get_bucket(*pos) + 1;
2800 		*pos = set_bucket_offset(bucket, 1);
2801 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2802 
2803 	return NULL;
2804 }
2805 
2806 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2807 	__acquires(unix_table_lock)
2808 {
2809 	spin_lock(&unix_table_lock);
2810 
2811 	if (!*pos)
2812 		return SEQ_START_TOKEN;
2813 
2814 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2815 		return NULL;
2816 
2817 	return unix_next_socket(seq, NULL, pos);
2818 }
2819 
2820 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2821 {
2822 	++*pos;
2823 	return unix_next_socket(seq, v, pos);
2824 }
2825 
2826 static void unix_seq_stop(struct seq_file *seq, void *v)
2827 	__releases(unix_table_lock)
2828 {
2829 	spin_unlock(&unix_table_lock);
2830 }
2831 
2832 static int unix_seq_show(struct seq_file *seq, void *v)
2833 {
2834 
2835 	if (v == SEQ_START_TOKEN)
2836 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2837 			 "Inode Path\n");
2838 	else {
2839 		struct sock *s = v;
2840 		struct unix_sock *u = unix_sk(s);
2841 		unix_state_lock(s);
2842 
2843 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2844 			s,
2845 			atomic_read(&s->sk_refcnt),
2846 			0,
2847 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2848 			s->sk_type,
2849 			s->sk_socket ?
2850 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2851 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2852 			sock_i_ino(s));
2853 
2854 		if (u->addr) {
2855 			int i, len;
2856 			seq_putc(seq, ' ');
2857 
2858 			i = 0;
2859 			len = u->addr->len - sizeof(short);
2860 			if (!UNIX_ABSTRACT(s))
2861 				len--;
2862 			else {
2863 				seq_putc(seq, '@');
2864 				i++;
2865 			}
2866 			for ( ; i < len; i++)
2867 				seq_putc(seq, u->addr->name->sun_path[i] ?:
2868 					 '@');
2869 		}
2870 		unix_state_unlock(s);
2871 		seq_putc(seq, '\n');
2872 	}
2873 
2874 	return 0;
2875 }
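
/* Example /proc/net/unix line as produced above (illustrative values):
 *
 *   Num       RefCount Protocol Flags    Type St Inode Path
 *   ffff8800b8d27c00: 00000002 00000000 00010000 0001 01 17890 /run/foo.sock
 *
 * Flags 00010000 is __SO_ACCEPTCON (a listener), Type 0001 is
 * SOCK_STREAM, St is the SS_* state (01 == SS_UNCONNECTED here).
 */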
2876 
2877 static const struct seq_operations unix_seq_ops = {
2878 	.start  = unix_seq_start,
2879 	.next   = unix_seq_next,
2880 	.stop   = unix_seq_stop,
2881 	.show   = unix_seq_show,
2882 };
2883 
2884 static int unix_seq_open(struct inode *inode, struct file *file)
2885 {
2886 	return seq_open_net(inode, file, &unix_seq_ops,
2887 			    sizeof(struct seq_net_private));
2888 }
2889 
2890 static const struct file_operations unix_seq_fops = {
2891 	.owner		= THIS_MODULE,
2892 	.open		= unix_seq_open,
2893 	.read		= seq_read,
2894 	.llseek		= seq_lseek,
2895 	.release	= seq_release_net,
2896 };
2897 
2898 #endif
2899 
2900 static const struct net_proto_family unix_family_ops = {
2901 	.family = PF_UNIX,
2902 	.create = unix_create,
2903 	.owner	= THIS_MODULE,
2904 };
2905 
2906 
2907 static int __net_init unix_net_init(struct net *net)
2908 {
2909 	int error = -ENOMEM;
2910 
2911 	net->unx.sysctl_max_dgram_qlen = 10;
2912 	if (unix_sysctl_register(net))
2913 		goto out;
2914 
2915 #ifdef CONFIG_PROC_FS
2916 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2917 		unix_sysctl_unregister(net);
2918 		goto out;
2919 	}
2920 #endif
2921 	error = 0;
2922 out:
2923 	return error;
2924 }
2925 
2926 static void __net_exit unix_net_exit(struct net *net)
2927 {
2928 	unix_sysctl_unregister(net);
2929 	remove_proc_entry("unix", net->proc_net);
2930 }
2931 
2932 static struct pernet_operations unix_net_ops = {
2933 	.init = unix_net_init,
2934 	.exit = unix_net_exit,
2935 };
2936 
2937 static int __init af_unix_init(void)
2938 {
2939 	int rc = -1;
2940 
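	/* Descriptive note (not in the original source): unix_skb_parms
	 * lives in skb->cb, a 48-byte scratch area, so fail the build if
	 * it ever outgrows that.
	 */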
2941 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2942 
2943 	rc = proto_register(&unix_proto, 1);
2944 	if (rc != 0) {
2945 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2946 		goto out;
2947 	}
2948 
2949 	sock_register(&unix_family_ops);
2950 	register_pernet_subsys(&unix_net_ops);
2951 out:
2952 	return rc;
2953 }
2954 
2955 static void __exit af_unix_exit(void)
2956 {
2957 	sock_unregister(PF_UNIX);
2958 	proto_unregister(&unix_proto);
2959 	unregister_pernet_subsys(&unix_net_ops);
2960 }
2961 
2962 /* Earlier than device_initcall() so that other drivers invoking
2963    request_module() don't end up in a loop when modprobe tries
2964    to use a UNIX socket. But later than subsys_initcall() because
2965    we depend on stuff initialised there. */
2966 fs_initcall(af_unix_init);
2967 module_exit(af_unix_exit);
2968 
2969 MODULE_LICENSE("GPL");
2970 MODULE_ALIAS_NETPROTO(PF_UNIX);
2971