xref: /openbmc/linux/net/unix/af_unix.c (revision 52fb57e7)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it avoids hashing huge
38  *					numbers of socks (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skbs queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, nor give the blksize as a high-water
59  *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix's connect apparently forgets to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero-terminated)
79  *		  starting with a zero byte, so that this name space does not
80  *		  intersect with BSD names.
81  */
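/*
 * Usage sketch (userspace, illustrative; assumes <sys/socket.h>,
 * <sys/un.h>, <string.h> and <stddef.h>, with "\0example" as a made-up
 * abstract name): an abstract binding passes a leading zero byte in
 * sun_path and sizes the address by the bytes actually used, since
 * abstract names are not zero-terminated:
 *
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	memcpy(sun.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */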
82 
83 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
84 
85 #include <linux/module.h>
86 #include <linux/kernel.h>
87 #include <linux/signal.h>
88 #include <linux/sched.h>
89 #include <linux/errno.h>
90 #include <linux/string.h>
91 #include <linux/stat.h>
92 #include <linux/dcache.h>
93 #include <linux/namei.h>
94 #include <linux/socket.h>
95 #include <linux/un.h>
96 #include <linux/fcntl.h>
97 #include <linux/termios.h>
98 #include <linux/sockios.h>
99 #include <linux/net.h>
100 #include <linux/in.h>
101 #include <linux/fs.h>
102 #include <linux/slab.h>
103 #include <asm/uaccess.h>
104 #include <linux/skbuff.h>
105 #include <linux/netdevice.h>
106 #include <net/net_namespace.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/rtnetlink.h>
116 #include <linux/mount.h>
117 #include <net/checksum.h>
118 #include <linux/security.h>
119 #include <linux/freezer.h>
120 
121 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
122 EXPORT_SYMBOL_GPL(unix_socket_table);
123 DEFINE_SPINLOCK(unix_table_lock);
124 EXPORT_SYMBOL_GPL(unix_table_lock);
125 static atomic_long_t unix_nr_socks;
126 
127 
128 static struct hlist_head *unix_sockets_unbound(void *addr)
129 {
130 	unsigned long hash = (unsigned long)addr;
131 
132 	hash ^= hash >> 16;
133 	hash ^= hash >> 8;
134 	hash %= UNIX_HASH_SIZE;
135 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
136 }
137 
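/* Sockets bound to a filesystem name store addr->hash == UNIX_HASH_SIZE
 * (see unix_bind()), so a hash below UNIX_HASH_SIZE identifies an
 * abstract (or autobound) name.
 */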
138 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139 
140 #ifdef CONFIG_SECURITY_NETWORK
141 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 {
143 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
144 }
145 
146 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 {
148 	scm->secid = *UNIXSID(skb);
149 }
150 #else
151 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
152 { }
153 
154 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
155 { }
156 #endif /* CONFIG_SECURITY_NETWORK */
157 
158 /*
159  *  SMP locking strategy:
160  *    the hash table is protected by the unix_table_lock spinlock;
161  *    each socket's state is protected by a separate spinlock.
162  */
163 
164 static inline unsigned int unix_hash_fold(__wsum n)
165 {
166 	unsigned int hash = (__force unsigned int)csum_fold(n);
167 
168 	hash ^= hash>>8;
169 	return hash&(UNIX_HASH_SIZE-1);
170 }
171 
172 #define unix_peer(sk) (unix_sk(sk)->peer)
173 
174 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
175 {
176 	return unix_peer(osk) == sk;
177 }
178 
179 static inline int unix_may_send(struct sock *sk, struct sock *osk)
180 {
181 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
182 }
183 
184 static inline int unix_recvq_full(struct sock const *sk)
185 {
186 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
187 }
188 
189 struct sock *unix_peer_get(struct sock *s)
190 {
191 	struct sock *peer;
192 
193 	unix_state_lock(s);
194 	peer = unix_peer(s);
195 	if (peer)
196 		sock_hold(peer);
197 	unix_state_unlock(s);
198 	return peer;
199 }
200 EXPORT_SYMBOL_GPL(unix_peer_get);
201 
202 static inline void unix_release_addr(struct unix_address *addr)
203 {
204 	if (atomic_dec_and_test(&addr->refcnt))
205 		kfree(addr);
206 }
207 
208 /*
209  *	Check unix socket name:
210  *		- it must not be zero length.
211  *		- if it does not start with a zero byte, it must be NUL-terminated (FS object)
212  *		- if it starts with a zero byte, it is an abstract name.
213  */
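/*
 * For example (illustrative values): with sun_family == AF_UNIX, a
 * sun_path of "/tmp/sock" names a filesystem object, while a sun_path
 * beginning with a zero byte such as "\0name" is an abstract name whose
 * length is given only by the passed address length. A bare
 * sizeof(short) address (autobind) is handled by the callers and would
 * be rejected as zero length here.
 */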
214 
215 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
216 {
217 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
218 		return -EINVAL;
219 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
220 		return -EINVAL;
221 	if (sunaddr->sun_path[0]) {
222 		/*
223 		 * This may look like an off-by-one error but it is a bit more
224 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
225 		 * sun_path[108] does not exist as such.  However, in kernel space
226 		 * we are guaranteed that it is a valid memory location in our
227 		 * kernel address buffer.
228 		 */
229 		((char *)sunaddr)[len] = 0;
230 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
231 		return len;
232 	}
233 
234 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
235 	return len;
236 }
237 
238 static void __unix_remove_socket(struct sock *sk)
239 {
240 	sk_del_node_init(sk);
241 }
242 
243 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
244 {
245 	WARN_ON(!sk_unhashed(sk));
246 	sk_add_node(sk, list);
247 }
248 
249 static inline void unix_remove_socket(struct sock *sk)
250 {
251 	spin_lock(&unix_table_lock);
252 	__unix_remove_socket(sk);
253 	spin_unlock(&unix_table_lock);
254 }
255 
256 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
257 {
258 	spin_lock(&unix_table_lock);
259 	__unix_insert_socket(list, sk);
260 	spin_unlock(&unix_table_lock);
261 }
262 
263 static struct sock *__unix_find_socket_byname(struct net *net,
264 					      struct sockaddr_un *sunname,
265 					      int len, int type, unsigned int hash)
266 {
267 	struct sock *s;
268 
269 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
270 		struct unix_sock *u = unix_sk(s);
271 
272 		if (!net_eq(sock_net(s), net))
273 			continue;
274 
275 		if (u->addr->len == len &&
276 		    !memcmp(u->addr->name, sunname, len))
277 			goto found;
278 	}
279 	s = NULL;
280 found:
281 	return s;
282 }
283 
284 static inline struct sock *unix_find_socket_byname(struct net *net,
285 						   struct sockaddr_un *sunname,
286 						   int len, int type,
287 						   unsigned int hash)
288 {
289 	struct sock *s;
290 
291 	spin_lock(&unix_table_lock);
292 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
293 	if (s)
294 		sock_hold(s);
295 	spin_unlock(&unix_table_lock);
296 	return s;
297 }
298 
299 static struct sock *unix_find_socket_byinode(struct inode *i)
300 {
301 	struct sock *s;
302 
303 	spin_lock(&unix_table_lock);
304 	sk_for_each(s,
305 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
306 		struct dentry *dentry = unix_sk(s)->path.dentry;
307 
308 		if (dentry && d_backing_inode(dentry) == i) {
309 			sock_hold(s);
310 			goto found;
311 		}
312 	}
313 	s = NULL;
314 found:
315 	spin_unlock(&unix_table_lock);
316 	return s;
317 }
318 
319 static inline int unix_writable(struct sock *sk)
320 {
321 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
322 }
323 
324 static void unix_write_space(struct sock *sk)
325 {
326 	struct socket_wq *wq;
327 
328 	rcu_read_lock();
329 	if (unix_writable(sk)) {
330 		wq = rcu_dereference(sk->sk_wq);
331 		if (wq_has_sleeper(wq))
332 			wake_up_interruptible_sync_poll(&wq->wait,
333 				POLLOUT | POLLWRNORM | POLLWRBAND);
334 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
335 	}
336 	rcu_read_unlock();
337 }
338 
339 /* When a dgram socket disconnects (or changes its peer), we clear its
340  * receive queue of packets that arrived from the previous peer. First,
341  * this allows flow control based only on wmem_alloc; second, an sk
342  * connected to a peer may receive messages only from that peer. */
343 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
344 {
345 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
346 		skb_queue_purge(&sk->sk_receive_queue);
347 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
348 
349 		/* If one link of a bidirectional dgram pipe is disconnected,
350 		 * we signal an error: messages are lost. Do not do this
351 		 * when the peer was not connected to us.
352 		 */
353 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
354 			other->sk_err = ECONNRESET;
355 			other->sk_error_report(other);
356 		}
357 	}
358 }
359 
360 static void unix_sock_destructor(struct sock *sk)
361 {
362 	struct unix_sock *u = unix_sk(sk);
363 
364 	skb_queue_purge(&sk->sk_receive_queue);
365 
366 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
367 	WARN_ON(!sk_unhashed(sk));
368 	WARN_ON(sk->sk_socket);
369 	if (!sock_flag(sk, SOCK_DEAD)) {
370 		pr_info("Attempt to release alive unix socket: %p\n", sk);
371 		return;
372 	}
373 
374 	if (u->addr)
375 		unix_release_addr(u->addr);
376 
377 	atomic_long_dec(&unix_nr_socks);
378 	local_bh_disable();
379 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
380 	local_bh_enable();
381 #ifdef UNIX_REFCNT_DEBUG
382 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
383 		atomic_long_read(&unix_nr_socks));
384 #endif
385 }
386 
387 static void unix_release_sock(struct sock *sk, int embrion)
388 {
389 	struct unix_sock *u = unix_sk(sk);
390 	struct path path;
391 	struct sock *skpair;
392 	struct sk_buff *skb;
393 	int state;
394 
395 	unix_remove_socket(sk);
396 
397 	/* Clear state */
398 	unix_state_lock(sk);
399 	sock_orphan(sk);
400 	sk->sk_shutdown = SHUTDOWN_MASK;
401 	path	     = u->path;
402 	u->path.dentry = NULL;
403 	u->path.mnt = NULL;
404 	state = sk->sk_state;
405 	sk->sk_state = TCP_CLOSE;
406 	unix_state_unlock(sk);
407 
408 	wake_up_interruptible_all(&u->peer_wait);
409 
410 	skpair = unix_peer(sk);
411 
412 	if (skpair != NULL) {
413 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
414 			unix_state_lock(skpair);
415 			/* No more writes */
416 			skpair->sk_shutdown = SHUTDOWN_MASK;
417 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
418 				skpair->sk_err = ECONNRESET;
419 			unix_state_unlock(skpair);
420 			skpair->sk_state_change(skpair);
421 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
422 		}
423 		sock_put(skpair); /* It may now die */
424 		unix_peer(sk) = NULL;
425 	}
426 
427 	/* Try to flush out this socket. Throw out buffers at least */
428 
429 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
430 		if (state == TCP_LISTEN)
431 			unix_release_sock(skb->sk, 1);
432 		/* passed fds are erased in the kfree_skb hook	      */
433 		kfree_skb(skb);
434 	}
435 
436 	if (path.dentry)
437 		path_put(&path);
438 
439 	sock_put(sk);
440 
441 	/* ---- Socket is dead now and most probably destroyed ---- */
442 
443 	/*
444 	 * Fixme: BSD difference: In BSD all sockets connected to us get
445 	 *	  ECONNRESET and we die on the spot. In Linux we behave
446 	 *	  like files and pipes do and wait for the last
447 	 *	  dereference.
448 	 *
449 	 * Can't we simply set sock->err?
450 	 *
451 	 *	  What is the above comment talking about? --ANK(980817)
452 	 */
453 
454 	if (unix_tot_inflight)
455 		unix_gc();		/* Garbage collect fds */
456 }
457 
458 static void init_peercred(struct sock *sk)
459 {
460 	put_pid(sk->sk_peer_pid);
461 	if (sk->sk_peer_cred)
462 		put_cred(sk->sk_peer_cred);
463 	sk->sk_peer_pid  = get_pid(task_tgid(current));
464 	sk->sk_peer_cred = get_current_cred();
465 }
466 
467 static void copy_peercred(struct sock *sk, struct sock *peersk)
468 {
469 	put_pid(sk->sk_peer_pid);
470 	if (sk->sk_peer_cred)
471 		put_cred(sk->sk_peer_cred);
472 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
473 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
474 }
475 
476 static int unix_listen(struct socket *sock, int backlog)
477 {
478 	int err;
479 	struct sock *sk = sock->sk;
480 	struct unix_sock *u = unix_sk(sk);
481 	struct pid *old_pid = NULL;
482 
483 	err = -EOPNOTSUPP;
484 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
485 		goto out;	/* Only stream/seqpacket sockets accept */
486 	err = -EINVAL;
487 	if (!u->addr)
488 		goto out;	/* No listens on an unbound socket */
489 	unix_state_lock(sk);
490 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
491 		goto out_unlock;
492 	if (backlog > sk->sk_max_ack_backlog)
493 		wake_up_interruptible_all(&u->peer_wait);
494 	sk->sk_max_ack_backlog	= backlog;
495 	sk->sk_state		= TCP_LISTEN;
496 	/* set credentials so connect can copy them */
497 	init_peercred(sk);
498 	err = 0;
499 
500 out_unlock:
501 	unix_state_unlock(sk);
502 	put_pid(old_pid);
503 out:
504 	return err;
505 }
506 
507 static int unix_release(struct socket *);
508 static int unix_bind(struct socket *, struct sockaddr *, int);
509 static int unix_stream_connect(struct socket *, struct sockaddr *,
510 			       int addr_len, int flags);
511 static int unix_socketpair(struct socket *, struct socket *);
512 static int unix_accept(struct socket *, struct socket *, int);
513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
516 				    poll_table *);
517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
518 static int unix_shutdown(struct socket *, int);
519 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
520 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
521 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
522 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
523 static int unix_dgram_connect(struct socket *, struct sockaddr *,
524 			      int, int);
525 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
526 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
527 				  int);
528 
529 static int unix_set_peek_off(struct sock *sk, int val)
530 {
531 	struct unix_sock *u = unix_sk(sk);
532 
533 	if (mutex_lock_interruptible(&u->readlock))
534 		return -EINTR;
535 
536 	sk->sk_peek_off = val;
537 	mutex_unlock(&u->readlock);
538 
539 	return 0;
540 }
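/*
 * Usage sketch (userspace, illustrative): with SO_PEEK_OFF set to 0,
 * successive MSG_PEEK reads start at the stored offset and advance it,
 * so the same bytes are not peeked twice:
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	(peeks bytes 0..15)
 *	recv(fd, buf, 16, MSG_PEEK);	(peeks bytes 16..31)
 */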
541 
542 
543 static const struct proto_ops unix_stream_ops = {
544 	.family =	PF_UNIX,
545 	.owner =	THIS_MODULE,
546 	.release =	unix_release,
547 	.bind =		unix_bind,
548 	.connect =	unix_stream_connect,
549 	.socketpair =	unix_socketpair,
550 	.accept =	unix_accept,
551 	.getname =	unix_getname,
552 	.poll =		unix_poll,
553 	.ioctl =	unix_ioctl,
554 	.listen =	unix_listen,
555 	.shutdown =	unix_shutdown,
556 	.setsockopt =	sock_no_setsockopt,
557 	.getsockopt =	sock_no_getsockopt,
558 	.sendmsg =	unix_stream_sendmsg,
559 	.recvmsg =	unix_stream_recvmsg,
560 	.mmap =		sock_no_mmap,
561 	.sendpage =	sock_no_sendpage,
562 	.set_peek_off =	unix_set_peek_off,
563 };
564 
565 static const struct proto_ops unix_dgram_ops = {
566 	.family =	PF_UNIX,
567 	.owner =	THIS_MODULE,
568 	.release =	unix_release,
569 	.bind =		unix_bind,
570 	.connect =	unix_dgram_connect,
571 	.socketpair =	unix_socketpair,
572 	.accept =	sock_no_accept,
573 	.getname =	unix_getname,
574 	.poll =		unix_dgram_poll,
575 	.ioctl =	unix_ioctl,
576 	.listen =	sock_no_listen,
577 	.shutdown =	unix_shutdown,
578 	.setsockopt =	sock_no_setsockopt,
579 	.getsockopt =	sock_no_getsockopt,
580 	.sendmsg =	unix_dgram_sendmsg,
581 	.recvmsg =	unix_dgram_recvmsg,
582 	.mmap =		sock_no_mmap,
583 	.sendpage =	sock_no_sendpage,
584 	.set_peek_off =	unix_set_peek_off,
585 };
586 
587 static const struct proto_ops unix_seqpacket_ops = {
588 	.family =	PF_UNIX,
589 	.owner =	THIS_MODULE,
590 	.release =	unix_release,
591 	.bind =		unix_bind,
592 	.connect =	unix_stream_connect,
593 	.socketpair =	unix_socketpair,
594 	.accept =	unix_accept,
595 	.getname =	unix_getname,
596 	.poll =		unix_dgram_poll,
597 	.ioctl =	unix_ioctl,
598 	.listen =	unix_listen,
599 	.shutdown =	unix_shutdown,
600 	.setsockopt =	sock_no_setsockopt,
601 	.getsockopt =	sock_no_getsockopt,
602 	.sendmsg =	unix_seqpacket_sendmsg,
603 	.recvmsg =	unix_seqpacket_recvmsg,
604 	.mmap =		sock_no_mmap,
605 	.sendpage =	sock_no_sendpage,
606 	.set_peek_off =	unix_set_peek_off,
607 };
608 
609 static struct proto unix_proto = {
610 	.name			= "UNIX",
611 	.owner			= THIS_MODULE,
612 	.obj_size		= sizeof(struct unix_sock),
613 };
614 
615 /*
616  * AF_UNIX sockets do not interact with hardware, hence they
617  * don't trigger interrupts - so it's safe for them to have
618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
619  * this special lock-class by reinitializing the spinlock key:
620  */
621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622 
623 static struct sock *unix_create1(struct net *net, struct socket *sock)
624 {
625 	struct sock *sk = NULL;
626 	struct unix_sock *u;
627 
628 	atomic_long_inc(&unix_nr_socks);
629 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630 		goto out;
631 
632 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633 	if (!sk)
634 		goto out;
635 
636 	sock_init_data(sock, sk);
637 	lockdep_set_class(&sk->sk_receive_queue.lock,
638 				&af_unix_sk_receive_queue_lock_key);
639 
640 	sk->sk_write_space	= unix_write_space;
641 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642 	sk->sk_destruct		= unix_sock_destructor;
643 	u	  = unix_sk(sk);
644 	u->path.dentry = NULL;
645 	u->path.mnt = NULL;
646 	spin_lock_init(&u->lock);
647 	atomic_long_set(&u->inflight, 0);
648 	INIT_LIST_HEAD(&u->link);
649 	mutex_init(&u->readlock); /* single task reading lock */
650 	init_waitqueue_head(&u->peer_wait);
651 	unix_insert_socket(unix_sockets_unbound(sk), sk);
652 out:
653 	if (sk == NULL)
654 		atomic_long_dec(&unix_nr_socks);
655 	else {
656 		local_bh_disable();
657 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658 		local_bh_enable();
659 	}
660 	return sk;
661 }
662 
663 static int unix_create(struct net *net, struct socket *sock, int protocol,
664 		       int kern)
665 {
666 	if (protocol && protocol != PF_UNIX)
667 		return -EPROTONOSUPPORT;
668 
669 	sock->state = SS_UNCONNECTED;
670 
671 	switch (sock->type) {
672 	case SOCK_STREAM:
673 		sock->ops = &unix_stream_ops;
674 		break;
675 		/*
676 		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
677 		 *	nothing uses it.
678 		 */
679 	case SOCK_RAW:
680 		sock->type = SOCK_DGRAM;	/* fall through */
681 	case SOCK_DGRAM:
682 		sock->ops = &unix_dgram_ops;
683 		break;
684 	case SOCK_SEQPACKET:
685 		sock->ops = &unix_seqpacket_ops;
686 		break;
687 	default:
688 		return -ESOCKTNOSUPPORT;
689 	}
690 
691 	return unix_create1(net, sock) ? 0 : -ENOMEM;
692 }
693 
694 static int unix_release(struct socket *sock)
695 {
696 	struct sock *sk = sock->sk;
697 
698 	if (!sk)
699 		return 0;
700 
701 	unix_release_sock(sk, 0);
702 	sock->sk = NULL;
703 
704 	return 0;
705 }
706 
707 static int unix_autobind(struct socket *sock)
708 {
709 	struct sock *sk = sock->sk;
710 	struct net *net = sock_net(sk);
711 	struct unix_sock *u = unix_sk(sk);
712 	static u32 ordernum = 1;
713 	struct unix_address *addr;
714 	int err;
715 	unsigned int retries = 0;
716 
717 	err = mutex_lock_interruptible(&u->readlock);
718 	if (err)
719 		return err;
720 
721 	err = 0;
722 	if (u->addr)
723 		goto out;
724 
725 	err = -ENOMEM;
726 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
727 	if (!addr)
728 		goto out;
729 
730 	addr->name->sun_family = AF_UNIX;
731 	atomic_set(&addr->refcnt, 1);
732 
733 retry:
734 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
735 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
736 
737 	spin_lock(&unix_table_lock);
738 	ordernum = (ordernum+1)&0xFFFFF;
739 
740 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
741 				      addr->hash)) {
742 		spin_unlock(&unix_table_lock);
743 		/*
744 		 * __unix_find_socket_byname() may take a long time if many names
745 		 * are already in use.
746 		 */
747 		cond_resched();
748 		/* Give up if all names seem to be in use. */
749 		if (retries++ == 0xFFFFF) {
750 			err = -ENOSPC;
751 			kfree(addr);
752 			goto out;
753 		}
754 		goto retry;
755 	}
756 	addr->hash ^= sk->sk_type;
757 
758 	__unix_remove_socket(sk);
759 	u->addr = addr;
760 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
761 	spin_unlock(&unix_table_lock);
762 	err = 0;
763 
764 out:	mutex_unlock(&u->readlock);
765 	return err;
766 }
767 
768 static struct sock *unix_find_other(struct net *net,
769 				    struct sockaddr_un *sunname, int len,
770 				    int type, unsigned int hash, int *error)
771 {
772 	struct sock *u;
773 	struct path path;
774 	int err = 0;
775 
776 	if (sunname->sun_path[0]) {
777 		struct inode *inode;
778 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
779 		if (err)
780 			goto fail;
781 		inode = d_backing_inode(path.dentry);
782 		err = inode_permission(inode, MAY_WRITE);
783 		if (err)
784 			goto put_fail;
785 
786 		err = -ECONNREFUSED;
787 		if (!S_ISSOCK(inode->i_mode))
788 			goto put_fail;
789 		u = unix_find_socket_byinode(inode);
790 		if (!u)
791 			goto put_fail;
792 
793 		if (u->sk_type == type)
794 			touch_atime(&path);
795 
796 		path_put(&path);
797 
798 		err = -EPROTOTYPE;
799 		if (u->sk_type != type) {
800 			sock_put(u);
801 			goto fail;
802 		}
803 	} else {
804 		err = -ECONNREFUSED;
805 		u = unix_find_socket_byname(net, sunname, len, type, hash);
806 		if (u) {
807 			struct dentry *dentry;
808 			dentry = unix_sk(u)->path.dentry;
809 			if (dentry)
810 				touch_atime(&unix_sk(u)->path);
811 		} else
812 			goto fail;
813 	}
814 	return u;
815 
816 put_fail:
817 	path_put(&path);
818 fail:
819 	*error = err;
820 	return NULL;
821 }
822 
823 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
824 {
825 	struct dentry *dentry;
826 	struct path path;
827 	int err = 0;
828 	/*
829 	 * Get the parent directory and calculate the hash for the
830 	 * last component.
831 	 */
832 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
833 	err = PTR_ERR(dentry);
834 	if (IS_ERR(dentry))
835 		return err;
836 
837 	/*
838 	 * All right, let's create it.
839 	 */
840 	err = security_path_mknod(&path, dentry, mode, 0);
841 	if (!err) {
842 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
843 		if (!err) {
844 			res->mnt = mntget(path.mnt);
845 			res->dentry = dget(dentry);
846 		}
847 	}
848 	done_path_create(&path, dentry);
849 	return err;
850 }
851 
852 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
853 {
854 	struct sock *sk = sock->sk;
855 	struct net *net = sock_net(sk);
856 	struct unix_sock *u = unix_sk(sk);
857 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
858 	char *sun_path = sunaddr->sun_path;
859 	int err;
860 	unsigned int hash;
861 	struct unix_address *addr;
862 	struct hlist_head *list;
863 
864 	err = -EINVAL;
865 	if (sunaddr->sun_family != AF_UNIX)
866 		goto out;
867 
868 	if (addr_len == sizeof(short)) {
869 		err = unix_autobind(sock);
870 		goto out;
871 	}
872 
873 	err = unix_mkname(sunaddr, addr_len, &hash);
874 	if (err < 0)
875 		goto out;
876 	addr_len = err;
877 
878 	err = mutex_lock_interruptible(&u->readlock);
879 	if (err)
880 		goto out;
881 
882 	err = -EINVAL;
883 	if (u->addr)
884 		goto out_up;
885 
886 	err = -ENOMEM;
887 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
888 	if (!addr)
889 		goto out_up;
890 
891 	memcpy(addr->name, sunaddr, addr_len);
892 	addr->len = addr_len;
893 	addr->hash = hash ^ sk->sk_type;
894 	atomic_set(&addr->refcnt, 1);
895 
896 	if (sun_path[0]) {
897 		struct path path;
898 		umode_t mode = S_IFSOCK |
899 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
900 		err = unix_mknod(sun_path, mode, &path);
901 		if (err) {
902 			if (err == -EEXIST)
903 				err = -EADDRINUSE;
904 			unix_release_addr(addr);
905 			goto out_up;
906 		}
907 		addr->hash = UNIX_HASH_SIZE;
908 		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
909 		spin_lock(&unix_table_lock);
910 		u->path = path;
911 		list = &unix_socket_table[hash];
912 	} else {
913 		spin_lock(&unix_table_lock);
914 		err = -EADDRINUSE;
915 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
916 					      sk->sk_type, hash)) {
917 			unix_release_addr(addr);
918 			goto out_unlock;
919 		}
920 
921 		list = &unix_socket_table[addr->hash];
922 	}
923 
924 	err = 0;
925 	__unix_remove_socket(sk);
926 	u->addr = addr;
927 	__unix_insert_socket(list, sk);
928 
929 out_unlock:
930 	spin_unlock(&unix_table_lock);
931 out_up:
932 	mutex_unlock(&u->readlock);
933 out:
934 	return err;
935 }
936 
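/* Lock the state of two sockets in a fixed (pointer) order so that two
 * tasks locking the same pair concurrently cannot deadlock; the
 * sk1 == sk2 and sk2 == NULL cases degenerate to a single lock.
 */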
937 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
938 {
939 	if (unlikely(sk1 == sk2) || !sk2) {
940 		unix_state_lock(sk1);
941 		return;
942 	}
943 	if (sk1 < sk2) {
944 		unix_state_lock(sk1);
945 		unix_state_lock_nested(sk2);
946 	} else {
947 		unix_state_lock(sk2);
948 		unix_state_lock_nested(sk1);
949 	}
950 }
951 
952 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
953 {
954 	if (unlikely(sk1 == sk2) || !sk2) {
955 		unix_state_unlock(sk1);
956 		return;
957 	}
958 	unix_state_unlock(sk1);
959 	unix_state_unlock(sk2);
960 }
961 
962 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
963 			      int alen, int flags)
964 {
965 	struct sock *sk = sock->sk;
966 	struct net *net = sock_net(sk);
967 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
968 	struct sock *other;
969 	unsigned int hash;
970 	int err;
971 
972 	if (addr->sa_family != AF_UNSPEC) {
973 		err = unix_mkname(sunaddr, alen, &hash);
974 		if (err < 0)
975 			goto out;
976 		alen = err;
977 
978 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
979 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
980 			goto out;
981 
982 restart:
983 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
984 		if (!other)
985 			goto out;
986 
987 		unix_state_double_lock(sk, other);
988 
989 		/* Apparently VFS overslept socket death. Retry. */
990 		if (sock_flag(other, SOCK_DEAD)) {
991 			unix_state_double_unlock(sk, other);
992 			sock_put(other);
993 			goto restart;
994 		}
995 
996 		err = -EPERM;
997 		if (!unix_may_send(sk, other))
998 			goto out_unlock;
999 
1000 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1001 		if (err)
1002 			goto out_unlock;
1003 
1004 	} else {
1005 		/*
1006 		 *	1003.1g: breaking the connected state with AF_UNSPEC
1007 		 */
1008 		other = NULL;
1009 		unix_state_double_lock(sk, other);
1010 	}
1011 
1012 	/*
1013 	 * If it was connected, reconnect.
1014 	 */
1015 	if (unix_peer(sk)) {
1016 		struct sock *old_peer = unix_peer(sk);
1017 		unix_peer(sk) = other;
1018 		unix_state_double_unlock(sk, other);
1019 
1020 		if (other != old_peer)
1021 			unix_dgram_disconnected(sk, old_peer);
1022 		sock_put(old_peer);
1023 	} else {
1024 		unix_peer(sk) = other;
1025 		unix_state_double_unlock(sk, other);
1026 	}
1027 	return 0;
1028 
1029 out_unlock:
1030 	unix_state_double_unlock(sk, other);
1031 	sock_put(other);
1032 out:
1033 	return err;
1034 }
1035 
1036 static long unix_wait_for_peer(struct sock *other, long timeo)
1037 {
1038 	struct unix_sock *u = unix_sk(other);
1039 	int sched;
1040 	DEFINE_WAIT(wait);
1041 
1042 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1043 
1044 	sched = !sock_flag(other, SOCK_DEAD) &&
1045 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1046 		unix_recvq_full(other);
1047 
1048 	unix_state_unlock(other);
1049 
1050 	if (sched)
1051 		timeo = schedule_timeout(timeo);
1052 
1053 	finish_wait(&u->peer_wait, &wait);
1054 	return timeo;
1055 }
1056 
1057 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1058 			       int addr_len, int flags)
1059 {
1060 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1061 	struct sock *sk = sock->sk;
1062 	struct net *net = sock_net(sk);
1063 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1064 	struct sock *newsk = NULL;
1065 	struct sock *other = NULL;
1066 	struct sk_buff *skb = NULL;
1067 	unsigned int hash;
1068 	int st;
1069 	int err;
1070 	long timeo;
1071 
1072 	err = unix_mkname(sunaddr, addr_len, &hash);
1073 	if (err < 0)
1074 		goto out;
1075 	addr_len = err;
1076 
1077 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1078 	    (err = unix_autobind(sock)) != 0)
1079 		goto out;
1080 
1081 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1082 
1083 	/* First of all, allocate resources.
1084 	   If we did this after the state was locked,
1085 	   we would have to recheck everything again in any case.
1086 	 */
1087 
1088 	err = -ENOMEM;
1089 
1090 	/* create new sock for complete connection */
1091 	newsk = unix_create1(sock_net(sk), NULL);
1092 	if (newsk == NULL)
1093 		goto out;
1094 
1095 	/* Allocate skb for sending to listening sock */
1096 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1097 	if (skb == NULL)
1098 		goto out;
1099 
1100 restart:
1101 	/*  Find listening sock. */
1102 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1103 	if (!other)
1104 		goto out;
1105 
1106 	/* Latch state of peer */
1107 	unix_state_lock(other);
1108 
1109 	/* Apparently VFS overslept socket death. Retry. */
1110 	if (sock_flag(other, SOCK_DEAD)) {
1111 		unix_state_unlock(other);
1112 		sock_put(other);
1113 		goto restart;
1114 	}
1115 
1116 	err = -ECONNREFUSED;
1117 	if (other->sk_state != TCP_LISTEN)
1118 		goto out_unlock;
1119 	if (other->sk_shutdown & RCV_SHUTDOWN)
1120 		goto out_unlock;
1121 
1122 	if (unix_recvq_full(other)) {
1123 		err = -EAGAIN;
1124 		if (!timeo)
1125 			goto out_unlock;
1126 
1127 		timeo = unix_wait_for_peer(other, timeo);
1128 
1129 		err = sock_intr_errno(timeo);
1130 		if (signal_pending(current))
1131 			goto out;
1132 		sock_put(other);
1133 		goto restart;
1134 	}
1135 
1136 	/* Latch our state.
1137 
1138 	   This is a tricky place. We need to grab our state lock and cannot
1139 	   drop the lock on the peer. It is dangerous because a deadlock is
1140 	   possible. The connect-to-self case and simultaneous connect
1141 	   attempts are eliminated by checking the socket state: other is
1142 	   TCP_LISTEN, and if sk is TCP_LISTEN we check this before
1143 	   attempting to grab the lock.
1144 
1145 	   Well, and we have to recheck the state after the socket is locked.
1146 	 */
1147 	st = sk->sk_state;
1148 
1149 	switch (st) {
1150 	case TCP_CLOSE:
1151 		/* This is ok... continue with connect */
1152 		break;
1153 	case TCP_ESTABLISHED:
1154 		/* Socket is already connected */
1155 		err = -EISCONN;
1156 		goto out_unlock;
1157 	default:
1158 		err = -EINVAL;
1159 		goto out_unlock;
1160 	}
1161 
1162 	unix_state_lock_nested(sk);
1163 
1164 	if (sk->sk_state != st) {
1165 		unix_state_unlock(sk);
1166 		unix_state_unlock(other);
1167 		sock_put(other);
1168 		goto restart;
1169 	}
1170 
1171 	err = security_unix_stream_connect(sk, other, newsk);
1172 	if (err) {
1173 		unix_state_unlock(sk);
1174 		goto out_unlock;
1175 	}
1176 
1177 	/* The way is open! Quickly set all the necessary fields... */
1178 
1179 	sock_hold(sk);
1180 	unix_peer(newsk)	= sk;
1181 	newsk->sk_state		= TCP_ESTABLISHED;
1182 	newsk->sk_type		= sk->sk_type;
1183 	init_peercred(newsk);
1184 	newu = unix_sk(newsk);
1185 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1186 	otheru = unix_sk(other);
1187 
1188 	/* copy address information from listening to new sock */
1189 	if (otheru->addr) {
1190 		atomic_inc(&otheru->addr->refcnt);
1191 		newu->addr = otheru->addr;
1192 	}
1193 	if (otheru->path.dentry) {
1194 		path_get(&otheru->path);
1195 		newu->path = otheru->path;
1196 	}
1197 
1198 	/* Set credentials */
1199 	copy_peercred(sk, other);
1200 
1201 	sock->state	= SS_CONNECTED;
1202 	sk->sk_state	= TCP_ESTABLISHED;
1203 	sock_hold(newsk);
1204 
1205 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1206 	unix_peer(sk)	= newsk;
1207 
1208 	unix_state_unlock(sk);
1209 
1210 	/* take ten and send info to the listening sock */
1211 	spin_lock(&other->sk_receive_queue.lock);
1212 	__skb_queue_tail(&other->sk_receive_queue, skb);
1213 	spin_unlock(&other->sk_receive_queue.lock);
1214 	unix_state_unlock(other);
1215 	other->sk_data_ready(other);
1216 	sock_put(other);
1217 	return 0;
1218 
1219 out_unlock:
1220 	if (other)
1221 		unix_state_unlock(other);
1222 
1223 out:
1224 	kfree_skb(skb);
1225 	if (newsk)
1226 		unix_release_sock(newsk, 0);
1227 	if (other)
1228 		sock_put(other);
1229 	return err;
1230 }
1231 
1232 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1233 {
1234 	struct sock *ska = socka->sk, *skb = sockb->sk;
1235 
1236 	/* Join our sockets back to back */
1237 	sock_hold(ska);
1238 	sock_hold(skb);
1239 	unix_peer(ska) = skb;
1240 	unix_peer(skb) = ska;
1241 	init_peercred(ska);
1242 	init_peercred(skb);
1243 
1244 	if (ska->sk_type != SOCK_DGRAM) {
1245 		ska->sk_state = TCP_ESTABLISHED;
1246 		skb->sk_state = TCP_ESTABLISHED;
1247 		socka->state  = SS_CONNECTED;
1248 		sockb->state  = SS_CONNECTED;
1249 	}
1250 	return 0;
1251 }
1252 
1253 static void unix_sock_inherit_flags(const struct socket *old,
1254 				    struct socket *new)
1255 {
1256 	if (test_bit(SOCK_PASSCRED, &old->flags))
1257 		set_bit(SOCK_PASSCRED, &new->flags);
1258 	if (test_bit(SOCK_PASSSEC, &old->flags))
1259 		set_bit(SOCK_PASSSEC, &new->flags);
1260 }
1261 
1262 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1263 {
1264 	struct sock *sk = sock->sk;
1265 	struct sock *tsk;
1266 	struct sk_buff *skb;
1267 	int err;
1268 
1269 	err = -EOPNOTSUPP;
1270 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1271 		goto out;
1272 
1273 	err = -EINVAL;
1274 	if (sk->sk_state != TCP_LISTEN)
1275 		goto out;
1276 
1277 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1278 	 * so that no locks are necessary.
1279 	 */
1280 
1281 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1282 	if (!skb) {
1283 		/* This means receive shutdown. */
1284 		if (err == 0)
1285 			err = -EINVAL;
1286 		goto out;
1287 	}
1288 
1289 	tsk = skb->sk;
1290 	skb_free_datagram(sk, skb);
1291 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1292 
1293 	/* attach accepted sock to socket */
1294 	unix_state_lock(tsk);
1295 	newsock->state = SS_CONNECTED;
1296 	unix_sock_inherit_flags(sock, newsock);
1297 	sock_graft(tsk, newsock);
1298 	unix_state_unlock(tsk);
1299 	return 0;
1300 
1301 out:
1302 	return err;
1303 }
1304 
1305 
1306 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1307 {
1308 	struct sock *sk = sock->sk;
1309 	struct unix_sock *u;
1310 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1311 	int err = 0;
1312 
1313 	if (peer) {
1314 		sk = unix_peer_get(sk);
1315 
1316 		err = -ENOTCONN;
1317 		if (!sk)
1318 			goto out;
1319 		err = 0;
1320 	} else {
1321 		sock_hold(sk);
1322 	}
1323 
1324 	u = unix_sk(sk);
1325 	unix_state_lock(sk);
1326 	if (!u->addr) {
1327 		sunaddr->sun_family = AF_UNIX;
1328 		sunaddr->sun_path[0] = 0;
1329 		*uaddr_len = sizeof(short);
1330 	} else {
1331 		struct unix_address *addr = u->addr;
1332 
1333 		*uaddr_len = addr->len;
1334 		memcpy(sunaddr, addr->name, *uaddr_len);
1335 	}
1336 	unix_state_unlock(sk);
1337 	sock_put(sk);
1338 out:
1339 	return err;
1340 }
1341 
1342 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1343 {
1344 	int i;
1345 
1346 	scm->fp = UNIXCB(skb).fp;
1347 	UNIXCB(skb).fp = NULL;
1348 
1349 	for (i = scm->fp->count-1; i >= 0; i--)
1350 		unix_notinflight(scm->fp->fp[i]);
1351 }
1352 
1353 static void unix_destruct_scm(struct sk_buff *skb)
1354 {
1355 	struct scm_cookie scm;
1356 	memset(&scm, 0, sizeof(scm));
1357 	scm.pid  = UNIXCB(skb).pid;
1358 	if (UNIXCB(skb).fp)
1359 		unix_detach_fds(&scm, skb);
1360 
1361 	/* Alas, it calls VFS */
1362 	/* So fscking what? fput() has been SMP-safe since last summer */
1363 	scm_destroy(&scm);
1364 	sock_wfree(skb);
1365 }
1366 
1367 #define MAX_RECURSION_LEVEL 4
1368 
1369 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1370 {
1371 	int i;
1372 	unsigned char max_level = 0;
1373 	int unix_sock_count = 0;
1374 
1375 	for (i = scm->fp->count - 1; i >= 0; i--) {
1376 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1377 
1378 		if (sk) {
1379 			unix_sock_count++;
1380 			max_level = max(max_level,
1381 					unix_sk(sk)->recursion_level);
1382 		}
1383 	}
1384 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1385 		return -ETOOMANYREFS;
1386 
1387 	/*
1388 	 * Need to duplicate file references for the sake of garbage
1389 	 * collection.  Otherwise a socket in the fps might become a
1390 	 * candidate for GC while the skb is not yet queued.
1391 	 */
1392 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1393 	if (!UNIXCB(skb).fp)
1394 		return -ENOMEM;
1395 
1396 	if (unix_sock_count) {
1397 		for (i = scm->fp->count - 1; i >= 0; i--)
1398 			unix_inflight(scm->fp->fp[i]);
1399 	}
1400 	return max_level;
1401 }
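
/*
 * Sending-side sketch (userspace, illustrative; sock_fd and fd_to_pass
 * are placeholder names, error handling elided): an SCM_RIGHTS control
 * message is what populates scm->fp on this path:
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
 *			     .msg_control = cbuf,
 *			     .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&mh);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type  = SCM_RIGHTS;
 *	cm->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &mh, 0);
 */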
1402 
1403 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1404 {
1405 	int err = 0;
1406 
1407 	UNIXCB(skb).pid  = get_pid(scm->pid);
1408 	UNIXCB(skb).uid = scm->creds.uid;
1409 	UNIXCB(skb).gid = scm->creds.gid;
1410 	UNIXCB(skb).fp = NULL;
1411 	if (scm->fp && send_fds)
1412 		err = unix_attach_fds(scm, skb);
1413 
1414 	skb->destructor = unix_destruct_scm;
1415 	return err;
1416 }
1417 
1418 /*
1419  * Some apps rely on write() giving SCM_CREDENTIALS.
1420  * We include credentials if the source or destination socket
1421  * asserted SOCK_PASSCRED.
1422  */
1423 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1424 			    const struct sock *other)
1425 {
1426 	if (UNIXCB(skb).pid)
1427 		return;
1428 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1429 	    !other->sk_socket ||
1430 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1431 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1432 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1433 	}
1434 }
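
/*
 * Receiving-side sketch (userspace, illustrative; mh is a msghdr
 * prepared as for a normal recvmsg() with a control buffer): a receiver
 * that set SO_PASSCRED gets the sender's credentials as an
 * SCM_CREDENTIALS control message:
 *
 *	int one = 1;
 *	struct ucred *uc = NULL;
 *	struct cmsghdr *cm;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(fd, &mh, 0);
 *	for (cm = CMSG_FIRSTHDR(&mh); cm; cm = CMSG_NXTHDR(&mh, cm))
 *		if (cm->cmsg_level == SOL_SOCKET &&
 *		    cm->cmsg_type == SCM_CREDENTIALS)
 *			uc = (struct ucred *)CMSG_DATA(cm);
 */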
1435 
1436 /*
1437  *	Send AF_UNIX data.
1438  */
1439 
1440 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1441 			      size_t len)
1442 {
1443 	struct sock *sk = sock->sk;
1444 	struct net *net = sock_net(sk);
1445 	struct unix_sock *u = unix_sk(sk);
1446 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1447 	struct sock *other = NULL;
1448 	int namelen = 0; /* fake GCC */
1449 	int err;
1450 	unsigned int hash;
1451 	struct sk_buff *skb;
1452 	long timeo;
1453 	struct scm_cookie scm;
1454 	int max_level;
1455 	int data_len = 0;
1456 
1457 	wait_for_unix_gc();
1458 	err = scm_send(sock, msg, &scm, false);
1459 	if (err < 0)
1460 		return err;
1461 
1462 	err = -EOPNOTSUPP;
1463 	if (msg->msg_flags&MSG_OOB)
1464 		goto out;
1465 
1466 	if (msg->msg_namelen) {
1467 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1468 		if (err < 0)
1469 			goto out;
1470 		namelen = err;
1471 	} else {
1472 		sunaddr = NULL;
1473 		err = -ENOTCONN;
1474 		other = unix_peer_get(sk);
1475 		if (!other)
1476 			goto out;
1477 	}
1478 
1479 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1480 	    && (err = unix_autobind(sock)) != 0)
1481 		goto out;
1482 
1483 	err = -EMSGSIZE;
1484 	if (len > sk->sk_sndbuf - 32)
1485 		goto out;
1486 
1487 	if (len > SKB_MAX_ALLOC) {
1488 		data_len = min_t(size_t,
1489 				 len - SKB_MAX_ALLOC,
1490 				 MAX_SKB_FRAGS * PAGE_SIZE);
1491 		data_len = PAGE_ALIGN(data_len);
1492 
1493 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1494 	}
1495 
1496 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1497 				   msg->msg_flags & MSG_DONTWAIT, &err,
1498 				   PAGE_ALLOC_COSTLY_ORDER);
1499 	if (skb == NULL)
1500 		goto out;
1501 
1502 	err = unix_scm_to_skb(&scm, skb, true);
1503 	if (err < 0)
1504 		goto out_free;
1505 	max_level = err + 1;
1506 	unix_get_secdata(&scm, skb);
1507 
1508 	skb_put(skb, len - data_len);
1509 	skb->data_len = data_len;
1510 	skb->len = len;
1511 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1512 	if (err)
1513 		goto out_free;
1514 
1515 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1516 
1517 restart:
1518 	if (!other) {
1519 		err = -ECONNRESET;
1520 		if (sunaddr == NULL)
1521 			goto out_free;
1522 
1523 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1524 					hash, &err);
1525 		if (other == NULL)
1526 			goto out_free;
1527 	}
1528 
1529 	if (sk_filter(other, skb) < 0) {
1530 		/* Toss the packet but do not return any error to the sender */
1531 		err = len;
1532 		goto out_free;
1533 	}
1534 
1535 	unix_state_lock(other);
1536 	err = -EPERM;
1537 	if (!unix_may_send(sk, other))
1538 		goto out_unlock;
1539 
1540 	if (sock_flag(other, SOCK_DEAD)) {
1541 		/*
1542 		 *	Check with 1003.1g - what should a
1543 		 *	datagram error do?
1544 		 */
1545 		unix_state_unlock(other);
1546 		sock_put(other);
1547 
1548 		err = 0;
1549 		unix_state_lock(sk);
1550 		if (unix_peer(sk) == other) {
1551 			unix_peer(sk) = NULL;
1552 			unix_state_unlock(sk);
1553 
1554 			unix_dgram_disconnected(sk, other);
1555 			sock_put(other);
1556 			err = -ECONNREFUSED;
1557 		} else {
1558 			unix_state_unlock(sk);
1559 		}
1560 
1561 		other = NULL;
1562 		if (err)
1563 			goto out_free;
1564 		goto restart;
1565 	}
1566 
1567 	err = -EPIPE;
1568 	if (other->sk_shutdown & RCV_SHUTDOWN)
1569 		goto out_unlock;
1570 
1571 	if (sk->sk_type != SOCK_SEQPACKET) {
1572 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1573 		if (err)
1574 			goto out_unlock;
1575 	}
1576 
1577 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1578 		if (!timeo) {
1579 			err = -EAGAIN;
1580 			goto out_unlock;
1581 		}
1582 
1583 		timeo = unix_wait_for_peer(other, timeo);
1584 
1585 		err = sock_intr_errno(timeo);
1586 		if (signal_pending(current))
1587 			goto out_free;
1588 
1589 		goto restart;
1590 	}
1591 
1592 	if (sock_flag(other, SOCK_RCVTSTAMP))
1593 		__net_timestamp(skb);
1594 	maybe_add_creds(skb, sock, other);
1595 	skb_queue_tail(&other->sk_receive_queue, skb);
1596 	if (max_level > unix_sk(other)->recursion_level)
1597 		unix_sk(other)->recursion_level = max_level;
1598 	unix_state_unlock(other);
1599 	other->sk_data_ready(other);
1600 	sock_put(other);
1601 	scm_destroy(&scm);
1602 	return len;
1603 
1604 out_unlock:
1605 	unix_state_unlock(other);
1606 out_free:
1607 	kfree_skb(skb);
1608 out:
1609 	if (other)
1610 		sock_put(other);
1611 	scm_destroy(&scm);
1612 	return err;
1613 }
1614 
1615 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1616  * bytes, with a minimum of a full page.
1617  */
1618 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
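/* E.g. with 4 KiB pages, get_order(32768) == 3, so UNIX_SKB_FRAGS_SZ is
 * 4096 << 3 == 32768 bytes; with 64 KiB pages, get_order(32768) == 0 and
 * the full-page minimum gives 65536 bytes.
 */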
1619 
1620 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1621 			       size_t len)
1622 {
1623 	struct sock *sk = sock->sk;
1624 	struct sock *other = NULL;
1625 	int err, size;
1626 	struct sk_buff *skb;
1627 	int sent = 0;
1628 	struct scm_cookie scm;
1629 	bool fds_sent = false;
1630 	int max_level;
1631 	int data_len;
1632 
1633 	wait_for_unix_gc();
1634 	err = scm_send(sock, msg, &scm, false);
1635 	if (err < 0)
1636 		return err;
1637 
1638 	err = -EOPNOTSUPP;
1639 	if (msg->msg_flags&MSG_OOB)
1640 		goto out_err;
1641 
1642 	if (msg->msg_namelen) {
1643 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1644 		goto out_err;
1645 	} else {
1646 		err = -ENOTCONN;
1647 		other = unix_peer(sk);
1648 		if (!other)
1649 			goto out_err;
1650 	}
1651 
1652 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1653 		goto pipe_err;
1654 
1655 	while (sent < len) {
1656 		size = len - sent;
1657 
1658 		/* Keep two messages in the pipe so it schedules better */
1659 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1660 
1661 		/* allow fallback to order-0 allocations */
1662 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1663 
1664 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1665 
1666 		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1667 
1668 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1669 					   msg->msg_flags & MSG_DONTWAIT, &err,
1670 					   get_order(UNIX_SKB_FRAGS_SZ));
1671 		if (!skb)
1672 			goto out_err;
1673 
1674 		/* Only send the fds in the first buffer */
1675 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
1676 		if (err < 0) {
1677 			kfree_skb(skb);
1678 			goto out_err;
1679 		}
1680 		max_level = err + 1;
1681 		fds_sent = true;
1682 
1683 		skb_put(skb, size - data_len);
1684 		skb->data_len = data_len;
1685 		skb->len = size;
1686 		err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1687 		if (err) {
1688 			kfree_skb(skb);
1689 			goto out_err;
1690 		}
1691 
1692 		unix_state_lock(other);
1693 
1694 		if (sock_flag(other, SOCK_DEAD) ||
1695 		    (other->sk_shutdown & RCV_SHUTDOWN))
1696 			goto pipe_err_free;
1697 
1698 		maybe_add_creds(skb, sock, other);
1699 		skb_queue_tail(&other->sk_receive_queue, skb);
1700 		if (max_level > unix_sk(other)->recursion_level)
1701 			unix_sk(other)->recursion_level = max_level;
1702 		unix_state_unlock(other);
1703 		other->sk_data_ready(other);
1704 		sent += size;
1705 	}
1706 
1707 	scm_destroy(&scm);
1708 
1709 	return sent;
1710 
1711 pipe_err_free:
1712 	unix_state_unlock(other);
1713 	kfree_skb(skb);
1714 pipe_err:
1715 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1716 		send_sig(SIGPIPE, current, 0);
1717 	err = -EPIPE;
1718 out_err:
1719 	scm_destroy(&scm);
1720 	return sent ? : err;
1721 }
1722 
1723 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
1724 				  size_t len)
1725 {
1726 	int err;
1727 	struct sock *sk = sock->sk;
1728 
1729 	err = sock_error(sk);
1730 	if (err)
1731 		return err;
1732 
1733 	if (sk->sk_state != TCP_ESTABLISHED)
1734 		return -ENOTCONN;
1735 
1736 	if (msg->msg_namelen)
1737 		msg->msg_namelen = 0;
1738 
1739 	return unix_dgram_sendmsg(sock, msg, len);
1740 }
1741 
1742 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
1743 				  size_t size, int flags)
1744 {
1745 	struct sock *sk = sock->sk;
1746 
1747 	if (sk->sk_state != TCP_ESTABLISHED)
1748 		return -ENOTCONN;
1749 
1750 	return unix_dgram_recvmsg(sock, msg, size, flags);
1751 }
1752 
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755 	struct unix_sock *u = unix_sk(sk);
1756 
1757 	if (u->addr) {
1758 		msg->msg_namelen = u->addr->len;
1759 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760 	}
1761 }
1762 
1763 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
1764 			      size_t size, int flags)
1765 {
1766 	struct scm_cookie scm;
1767 	struct sock *sk = sock->sk;
1768 	struct unix_sock *u = unix_sk(sk);
1769 	int noblock = flags & MSG_DONTWAIT;
1770 	struct sk_buff *skb;
1771 	int err;
1772 	int peeked, skip;
1773 
1774 	err = -EOPNOTSUPP;
1775 	if (flags&MSG_OOB)
1776 		goto out;
1777 
1778 	err = mutex_lock_interruptible(&u->readlock);
1779 	if (unlikely(err)) {
1780 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1781 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible().
1782 		 */
1783 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1784 		goto out;
1785 	}
1786 
1787 	skip = sk_peek_offset(sk, flags);
1788 
1789 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1790 	if (!skb) {
1791 		unix_state_lock(sk);
1792 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1793 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1794 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1795 			err = 0;
1796 		unix_state_unlock(sk);
1797 		goto out_unlock;
1798 	}
1799 
1800 	wake_up_interruptible_sync_poll(&u->peer_wait,
1801 					POLLOUT | POLLWRNORM | POLLWRBAND);
1802 
1803 	if (msg->msg_name)
1804 		unix_copy_addr(msg, skb->sk);
1805 
1806 	if (size > skb->len - skip)
1807 		size = skb->len - skip;
1808 	else if (size < skb->len - skip)
1809 		msg->msg_flags |= MSG_TRUNC;
1810 
1811 	err = skb_copy_datagram_msg(skb, skip, msg, size);
1812 	if (err)
1813 		goto out_free;
1814 
1815 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1816 		__sock_recv_timestamp(msg, sk, skb);
1817 
1818 	memset(&scm, 0, sizeof(scm));
1819 
1820 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1821 	unix_set_secdata(&scm, skb);
1822 
1823 	if (!(flags & MSG_PEEK)) {
1824 		if (UNIXCB(skb).fp)
1825 			unix_detach_fds(&scm, skb);
1826 
1827 		sk_peek_offset_bwd(sk, skb->len);
1828 	} else {
1829 		/* It is questionable: on PEEK we could:
1830 		   - not return fds - good, but too simple 8)
1831 		   - return fds, and not return them on read (old strategy,
1832 		     apparently wrong)
1833 		   - clone fds (I chose this for now; it is the most universal
1834 		     solution)
1835 
1836 		   POSIX 1003.1g does not actually define this clearly
1837 		   at all. POSIX 1003.1g doesn't define a lot of things
1838 		   clearly however!
1839 
1840 		*/
1841 
1842 		sk_peek_offset_fwd(sk, size);
1843 
1844 		if (UNIXCB(skb).fp)
1845 			scm.fp = scm_fp_dup(UNIXCB(skb).fp);
1846 	}
1847 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1848 
1849 	scm_recv(sock, msg, &scm, flags);
1850 
1851 out_free:
1852 	skb_free_datagram(sk, skb);
1853 out_unlock:
1854 	mutex_unlock(&u->readlock);
1855 out:
1856 	return err;
1857 }
1858 
1859 /*
1860  *	Sleep until more data has arrived. But check for races.
1861  */
1862 static long unix_stream_data_wait(struct sock *sk, long timeo,
1863 				  struct sk_buff *last)
1864 {
1865 	DEFINE_WAIT(wait);
1866 
1867 	unix_state_lock(sk);
1868 
1869 	for (;;) {
1870 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1871 
1872 		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1873 		    sk->sk_err ||
1874 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1875 		    signal_pending(current) ||
1876 		    !timeo)
1877 			break;
1878 
1879 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1880 		unix_state_unlock(sk);
1881 		timeo = freezable_schedule_timeout(timeo);
1882 		unix_state_lock(sk);
1883 
1884 		if (sock_flag(sk, SOCK_DEAD))
1885 			break;
1886 
1887 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1888 	}
1889 
1890 	finish_wait(sk_sleep(sk), &wait);
1891 	unix_state_unlock(sk);
1892 	return timeo;
1893 }
1894 
1895 static unsigned int unix_skb_len(const struct sk_buff *skb)
1896 {
1897 	return skb->len - UNIXCB(skb).consumed;
1898 }
1899 
1900 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
1901 			       size_t size, int flags)
1902 {
1903 	struct scm_cookie scm;
1904 	struct sock *sk = sock->sk;
1905 	struct unix_sock *u = unix_sk(sk);
1906 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1907 	int copied = 0;
1908 	int noblock = flags & MSG_DONTWAIT;
1909 	int check_creds = 0;
1910 	int target;
1911 	int err = 0;
1912 	long timeo;
1913 	int skip;
1914 
1915 	err = -EINVAL;
1916 	if (sk->sk_state != TCP_ESTABLISHED)
1917 		goto out;
1918 
1919 	err = -EOPNOTSUPP;
1920 	if (flags&MSG_OOB)
1921 		goto out;
1922 
1923 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1924 	timeo = sock_rcvtimeo(sk, noblock);
1925 
1926 	/* Lock the socket to prevent queue disordering
1927 	 * while we sleep copying data out to the msg
1928 	 */
1929 
1930 	memset(&scm, 0, sizeof(scm));
1931 
1932 	err = mutex_lock_interruptible(&u->readlock);
1933 	if (unlikely(err)) {
1934 		/* recvmsg() in non-blocking mode is supposed to return -EAGAIN;
1935 		 * sk_rcvtimeo is not honored by mutex_lock_interruptible().
1936 		 */
1937 		err = noblock ? -EAGAIN : -ERESTARTSYS;
1938 		goto out;
1939 	}
1940 
1941 	do {
1942 		int chunk;
1943 		struct sk_buff *skb, *last;
1944 
1945 		unix_state_lock(sk);
1946 		if (sock_flag(sk, SOCK_DEAD)) {
1947 			err = -ECONNRESET;
1948 			goto unlock;
1949 		}
1950 		last = skb = skb_peek(&sk->sk_receive_queue);
1951 again:
1952 		if (skb == NULL) {
1953 			unix_sk(sk)->recursion_level = 0;
1954 			if (copied >= target)
1955 				goto unlock;
1956 
1957 			/*
1958 			 *	POSIX 1003.1g mandates this order.
1959 			 */
1960 
1961 			err = sock_error(sk);
1962 			if (err)
1963 				goto unlock;
1964 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1965 				goto unlock;
1966 
1967 			unix_state_unlock(sk);
1968 			err = -EAGAIN;
1969 			if (!timeo)
1970 				break;
1971 			mutex_unlock(&u->readlock);
1972 
1973 			timeo = unix_stream_data_wait(sk, timeo, last);
1974 
1975 			if (signal_pending(current)
1976 			    ||  mutex_lock_interruptible(&u->readlock)) {
1977 				err = sock_intr_errno(timeo);
1978 				goto out;
1979 			}
1980 
1981 			continue;
1982  unlock:
1983 			unix_state_unlock(sk);
1984 			break;
1985 		}
1986 
1987 		skip = sk_peek_offset(sk, flags);
1988 		while (skip >= unix_skb_len(skb)) {
1989 			skip -= unix_skb_len(skb);
1990 			last = skb;
1991 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1992 			if (!skb)
1993 				goto again;
1994 		}
1995 
1996 		unix_state_unlock(sk);
1997 
1998 		if (check_creds) {
1999 			/* Never glue messages from different writers */
2000 			if ((UNIXCB(skb).pid  != scm.pid) ||
2001 			    !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
2002 			    !gid_eq(UNIXCB(skb).gid, scm.creds.gid))
2003 				break;
2004 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2005 			/* Copy credentials */
2006 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2007 			check_creds = 1;
2008 		}
2009 
2010 		/* Copy address just once */
2011 		if (sunaddr) {
2012 			unix_copy_addr(msg, skb->sk);
2013 			sunaddr = NULL;
2014 		}
2015 
2016 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2017 		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2018 					  msg, chunk)) {
2019 			if (copied == 0)
2020 				copied = -EFAULT;
2021 			break;
2022 		}
2023 		copied += chunk;
2024 		size -= chunk;
2025 
2026 		/* Mark read part of skb as used */
2027 		if (!(flags & MSG_PEEK)) {
2028 			UNIXCB(skb).consumed += chunk;
2029 
2030 			sk_peek_offset_bwd(sk, chunk);
2031 
2032 			if (UNIXCB(skb).fp)
2033 				unix_detach_fds(&scm, skb);
2034 
2035 			if (unix_skb_len(skb))
2036 				break;
2037 
2038 			skb_unlink(skb, &sk->sk_receive_queue);
2039 			consume_skb(skb);
2040 
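			/* If this message carried SCM_RIGHTS descriptors,
			 * stop here so they are delivered together with the
			 * data they were sent with, not glued to later data.
			 */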
2041 			if (scm.fp)
2042 				break;
2043 		} else {
2044 			/* It is questionable; see the note in unix_dgram_recvmsg().
2045 			 */
2046 			if (UNIXCB(skb).fp)
2047 				scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2048 
2049 			sk_peek_offset_fwd(sk, chunk);
2050 
2051 			break;
2052 		}
2053 	} while (size);
2054 
2055 	mutex_unlock(&u->readlock);
2056 	scm_recv(sock, msg, &scm, flags);
2057 out:
2058 	return copied ? : err;
2059 }
2060 
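/* For illustration only (not in the original source): on a connected
 * stream pair, shutdown(fd, SHUT_WR) in userspace arrives here as
 * SHUT_WR + 1 == SEND_SHUTDOWN, and the peer is additionally marked
 * RCV_SHUTDOWN below, so a subsequent read(2) on the peer returns 0.
 */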
2061 static int unix_shutdown(struct socket *sock, int mode)
2062 {
2063 	struct sock *sk = sock->sk;
2064 	struct sock *other;
2065 
2066 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2067 		return -EINVAL;
2068 	/* This maps:
2069 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2070 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2071 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2072 	 */
2073 	++mode;
2074 
2075 	unix_state_lock(sk);
2076 	sk->sk_shutdown |= mode;
2077 	other = unix_peer(sk);
2078 	if (other)
2079 		sock_hold(other);
2080 	unix_state_unlock(sk);
2081 	sk->sk_state_change(sk);
2082 
2083 	if (other &&
2084 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2085 
2086 		int peer_mode = 0;
2087 
2088 		if (mode&RCV_SHUTDOWN)
2089 			peer_mode |= SEND_SHUTDOWN;
2090 		if (mode&SEND_SHUTDOWN)
2091 			peer_mode |= RCV_SHUTDOWN;
2092 		unix_state_lock(other);
2093 		other->sk_shutdown |= peer_mode;
2094 		unix_state_unlock(other);
2095 		other->sk_state_change(other);
2096 		if (peer_mode == SHUTDOWN_MASK)
2097 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2098 		else if (peer_mode & RCV_SHUTDOWN)
2099 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2100 	}
2101 	if (other)
2102 		sock_put(other);
2103 
2104 	return 0;
2105 }
2106 
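/* Bytes available to read: for stream and seqpacket sockets, the sum of
 * the unconsumed bytes of every queued skb; for datagram sockets, the
 * length of the first queued datagram (what the next read would return).
 */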
2107 long unix_inq_len(struct sock *sk)
2108 {
2109 	struct sk_buff *skb;
2110 	long amount = 0;
2111 
2112 	if (sk->sk_state == TCP_LISTEN)
2113 		return -EINVAL;
2114 
2115 	spin_lock(&sk->sk_receive_queue.lock);
2116 	if (sk->sk_type == SOCK_STREAM ||
2117 	    sk->sk_type == SOCK_SEQPACKET) {
2118 		skb_queue_walk(&sk->sk_receive_queue, skb)
2119 			amount += unix_skb_len(skb);
2120 	} else {
2121 		skb = skb_peek(&sk->sk_receive_queue);
2122 		if (skb)
2123 			amount = skb->len;
2124 	}
2125 	spin_unlock(&sk->sk_receive_queue.lock);
2126 
2127 	return amount;
2128 }
2129 EXPORT_SYMBOL_GPL(unix_inq_len);
2130 
2131 long unix_outq_len(struct sock *sk)
2132 {
2133 	return sk_wmem_alloc_get(sk);
2134 }
2135 EXPORT_SYMBOL_GPL(unix_outq_len);
2136 
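/* For illustration only (not in the original source): userspace can read
 * these queue lengths through ioctl(2), e.g.
 *
 *	int pending;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes queued for reading\n", pending);
 */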
2137 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2138 {
2139 	struct sock *sk = sock->sk;
2140 	long amount = 0;
2141 	int err;
2142 
2143 	switch (cmd) {
2144 	case SIOCOUTQ:
2145 		amount = unix_outq_len(sk);
2146 		err = put_user(amount, (int __user *)arg);
2147 		break;
2148 	case SIOCINQ:
2149 		amount = unix_inq_len(sk);
2150 		if (amount < 0)
2151 			err = amount;
2152 		else
2153 			err = put_user(amount, (int __user *)arg);
2154 		break;
2155 	default:
2156 		err = -ENOIOCTLCMD;
2157 		break;
2158 	}
2159 	return err;
2160 }
2161 
2162 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2163 {
2164 	struct sock *sk = sock->sk;
2165 	unsigned int mask;
2166 
2167 	sock_poll_wait(file, sk_sleep(sk), wait);
2168 	mask = 0;
2169 
2170 	/* exceptional events? */
2171 	if (sk->sk_err)
2172 		mask |= POLLERR;
2173 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2174 		mask |= POLLHUP;
2175 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2176 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2177 
2178 	/* readable? */
2179 	if (!skb_queue_empty(&sk->sk_receive_queue))
2180 		mask |= POLLIN | POLLRDNORM;
2181 
2182 	/* Connection-based need to check for termination and startup */
2183 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2184 	    sk->sk_state == TCP_CLOSE)
2185 		mask |= POLLHUP;
2186 
2187 	/*
2188 	 * We also report the socket writable when the other side has shut
2189 	 * down the connection; this prevents stuck sockets.
2190 	 */
2191 	if (unix_writable(sk))
2192 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2193 
2194 	return mask;
2195 }
2196 
2197 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2198 				    poll_table *wait)
2199 {
2200 	struct sock *sk = sock->sk, *other;
2201 	unsigned int mask, writable;
2202 
2203 	sock_poll_wait(file, sk_sleep(sk), wait);
2204 	mask = 0;
2205 
2206 	/* exceptional events? */
2207 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2208 		mask |= POLLERR |
2209 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2210 
2211 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2212 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2213 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2214 		mask |= POLLHUP;
2215 
2216 	/* readable? */
2217 	if (!skb_queue_empty(&sk->sk_receive_queue))
2218 		mask |= POLLIN | POLLRDNORM;
2219 
2220 	/* Connection-based need to check for termination and startup */
2221 	if (sk->sk_type == SOCK_SEQPACKET) {
2222 		if (sk->sk_state == TCP_CLOSE)
2223 			mask |= POLLHUP;
2224 		/* connection hasn't started yet? */
2225 		if (sk->sk_state == TCP_SYN_SENT)
2226 			return mask;
2227 	}
2228 
2229 	/* No write status requested, avoid expensive OUT tests. */
2230 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2231 		return mask;
2232 
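	/* For an asymmetrically connected peer (unix_peer(other) != sk),
	 * writability also requires room in the peer's receive queue, and
	 * we poll the peer's peer_wait queue so we are woken when space
	 * frees up.
	 */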
2233 	writable = unix_writable(sk);
2234 	other = unix_peer_get(sk);
2235 	if (other) {
2236 		if (unix_peer(other) != sk) {
2237 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2238 			if (unix_recvq_full(other))
2239 				writable = 0;
2240 		}
2241 		sock_put(other);
2242 	}
2243 
2244 	if (writable)
2245 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2246 	else
2247 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2248 
2249 	return mask;
2250 }
2251 
2252 #ifdef CONFIG_PROC_FS
2253 
2254 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2255 
2256 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2257 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2258 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
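/* A seq_file position *pos packs a hash bucket index in its upper bits
 * and a 1-based offset within that bucket in the low BUCKET_SPACE bits.
 */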
2259 
2260 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2261 {
2262 	unsigned long offset = get_offset(*pos);
2263 	unsigned long bucket = get_bucket(*pos);
2264 	struct sock *sk;
2265 	unsigned long count = 0;
2266 
2267 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2268 		if (sock_net(sk) != seq_file_net(seq))
2269 			continue;
2270 		if (++count == offset)
2271 			break;
2272 	}
2273 
2274 	return sk;
2275 }
2276 
2277 static struct sock *unix_next_socket(struct seq_file *seq,
2278 				     struct sock *sk,
2279 				     loff_t *pos)
2280 {
2281 	unsigned long bucket;
2282 
2283 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2284 		sk = sk_next(sk);
2285 		if (!sk)
2286 			goto next_bucket;
2287 		if (sock_net(sk) == seq_file_net(seq))
2288 			return sk;
2289 	}
2290 
2291 	do {
2292 		sk = unix_from_bucket(seq, pos);
2293 		if (sk)
2294 			return sk;
2295 
2296 next_bucket:
2297 		bucket = get_bucket(*pos) + 1;
2298 		*pos = set_bucket_offset(bucket, 1);
2299 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2300 
2301 	return NULL;
2302 }
2303 
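/* unix_seq_start() yields SEQ_START_TOKEN for position zero so that
 * unix_seq_show() can emit the header line before any socket entries.
 */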
2304 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2305 	__acquires(unix_table_lock)
2306 {
2307 	spin_lock(&unix_table_lock);
2308 
2309 	if (!*pos)
2310 		return SEQ_START_TOKEN;
2311 
2312 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2313 		return NULL;
2314 
2315 	return unix_next_socket(seq, NULL, pos);
2316 }
2317 
2318 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2319 {
2320 	++*pos;
2321 	return unix_next_socket(seq, v, pos);
2322 }
2323 
2324 static void unix_seq_stop(struct seq_file *seq, void *v)
2325 	__releases(unix_table_lock)
2326 {
2327 	spin_unlock(&unix_table_lock);
2328 }
2329 
2330 static int unix_seq_show(struct seq_file *seq, void *v)
2331 {
2333 	if (v == SEQ_START_TOKEN)
2334 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2335 			 "Inode Path\n");
2336 	else {
2337 		struct sock *s = v;
2338 		struct unix_sock *u = unix_sk(s);
2339 		unix_state_lock(s);
2340 
2341 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2342 			s,
2343 			atomic_read(&s->sk_refcnt),
2344 			0,
2345 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2346 			s->sk_type,
2347 			s->sk_socket ?
2348 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2349 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2350 			sock_i_ino(s));
2351 
2352 		if (u->addr) {
2353 			int i, len;
2354 			seq_putc(seq, ' ');
2355 
2356 			i = 0;
2357 			len = u->addr->len - sizeof(short);
2358 			if (!UNIX_ABSTRACT(s))
2359 				len--;
2360 			else {
2361 				seq_putc(seq, '@');
2362 				i++;
2363 			}
2364 			for ( ; i < len; i++)
2365 				seq_putc(seq, u->addr->name->sun_path[i]);
2366 		}
2367 		unix_state_unlock(s);
2368 		seq_putc(seq, '\n');
2369 	}
2370 
2371 	return 0;
2372 }
2373 
2374 static const struct seq_operations unix_seq_ops = {
2375 	.start  = unix_seq_start,
2376 	.next   = unix_seq_next,
2377 	.stop   = unix_seq_stop,
2378 	.show   = unix_seq_show,
2379 };
2380 
2381 static int unix_seq_open(struct inode *inode, struct file *file)
2382 {
2383 	return seq_open_net(inode, file, &unix_seq_ops,
2384 			    sizeof(struct seq_net_private));
2385 }
2386 
2387 static const struct file_operations unix_seq_fops = {
2388 	.owner		= THIS_MODULE,
2389 	.open		= unix_seq_open,
2390 	.read		= seq_read,
2391 	.llseek		= seq_lseek,
2392 	.release	= seq_release_net,
2393 };
2394 
2395 #endif
2396 
2397 static const struct net_proto_family unix_family_ops = {
2398 	.family = PF_UNIX,
2399 	.create = unix_create,
2400 	.owner	= THIS_MODULE,
2401 };
2402 
2403 
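/* Per-namespace setup: pick the default datagram backlog (10), register
 * the unix sysctls, and create the /proc/net/unix seq file.
 */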
2404 static int __net_init unix_net_init(struct net *net)
2405 {
2406 	int error = -ENOMEM;
2407 
2408 	net->unx.sysctl_max_dgram_qlen = 10;
2409 	if (unix_sysctl_register(net))
2410 		goto out;
2411 
2412 #ifdef CONFIG_PROC_FS
2413 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2414 		unix_sysctl_unregister(net);
2415 		goto out;
2416 	}
2417 #endif
2418 	error = 0;
2419 out:
2420 	return error;
2421 }
2422 
2423 static void __net_exit unix_net_exit(struct net *net)
2424 {
2425 	unix_sysctl_unregister(net);
2426 	remove_proc_entry("unix", net->proc_net);
2427 }
2428 
2429 static struct pernet_operations unix_net_ops = {
2430 	.init = unix_net_init,
2431 	.exit = unix_net_exit,
2432 };
2433 
2434 static int __init af_unix_init(void)
2435 {
2436 	int rc = -1;
2437 
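	/* unix_skb_parms travels in the skb control block, so it must fit
	 * in skb->cb; catch any growth at compile time.
	 */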
2438 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2439 
2440 	rc = proto_register(&unix_proto, 1);
2441 	if (rc != 0) {
2442 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2443 		goto out;
2444 	}
2445 
2446 	sock_register(&unix_family_ops);
2447 	register_pernet_subsys(&unix_net_ops);
2448 out:
2449 	return rc;
2450 }
2451 
2452 static void __exit af_unix_exit(void)
2453 {
2454 	sock_unregister(PF_UNIX);
2455 	proto_unregister(&unix_proto);
2456 	unregister_pernet_subsys(&unix_net_ops);
2457 }
2458 
2459 /* Earlier than device_initcall() so that other drivers invoking
2460  * request_module() don't end up in a loop when modprobe tries
2461  * to use a UNIX socket. But later than subsys_initcall() because
2462  * we depend on stuff initialised there. */
2463 fs_initcall(af_unix_init);
2464 module_exit(af_unix_exit);
2465 
2466 MODULE_LICENSE("GPL");
2467 MODULE_ALIAS_NETPROTO(PF_UNIX);
2468