xref: /openbmc/linux/net/unix/af_unix.c (revision 840ef8b7)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko EiBfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it will avoid a huge number
38  *					of socks hashed (this is for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  started by 0, so that this name space does not intersect
80  *		  with BSD names.
81  */
82 
83 #include <linux/module.h>
84 #include <linux/kernel.h>
85 #include <linux/signal.h>
86 #include <linux/sched.h>
87 #include <linux/errno.h>
88 #include <linux/string.h>
89 #include <linux/stat.h>
90 #include <linux/dcache.h>
91 #include <linux/namei.h>
92 #include <linux/socket.h>
93 #include <linux/un.h>
94 #include <linux/fcntl.h>
95 #include <linux/termios.h>
96 #include <linux/sockios.h>
97 #include <linux/net.h>
98 #include <linux/in.h>
99 #include <linux/fs.h>
100 #include <linux/slab.h>
101 #include <asm/uaccess.h>
102 #include <linux/skbuff.h>
103 #include <linux/netdevice.h>
104 #include <net/net_namespace.h>
105 #include <net/sock.h>
106 #include <net/tcp_states.h>
107 #include <net/af_unix.h>
108 #include <linux/proc_fs.h>
109 #include <linux/seq_file.h>
110 #include <net/scm.h>
111 #include <linux/init.h>
112 #include <linux/poll.h>
113 #include <linux/rtnetlink.h>
114 #include <linux/mount.h>
115 #include <net/checksum.h>
116 #include <linux/security.h>
117 
/*
 * Global hash table of AF_UNIX sockets with 2 * UNIX_HASH_SIZE buckets.
 * unix_sockets_unbound() hands out buckets from the upper half
 * (index >= UNIX_HASH_SIZE).
 */
118 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
119 EXPORT_SYMBOL_GPL(unix_socket_table);
/* Spinlock held around the table insertions, removals and lookups below. */
120 DEFINE_SPINLOCK(unix_table_lock);
121 EXPORT_SYMBOL_GPL(unix_table_lock);
/* Socket count: raised in unix_create1(), lowered in unix_sock_destructor(). */
122 static atomic_long_t unix_nr_socks;
123 
124 
125 static struct hlist_head *unix_sockets_unbound(void *addr)
126 {
127 	unsigned long hash = (unsigned long)addr;
128 
129 	hash ^= hash >> 16;
130 	hash ^= hash >> 8;
131 	hash %= UNIX_HASH_SIZE;
132 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
133 }
134 
/*
 * True when the hash stored in the socket's bound address is below
 * UNIX_HASH_SIZE.  Dereferences unix_sk(sk)->addr without a NULL check.
 * NOTE(review): presumably this identifies abstract-namespace bindings,
 * since unix_bind() stores UNIX_HASH_SIZE for filesystem names - confirm.
 */
135 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
136 
137 #ifdef CONFIG_SECURITY_NETWORK
/* Copy the u32-sized security ID held in the scm cookie into UNIXSID(skb). */
138 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
139 {
140 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
141 }
142 
/* Load the security ID stored at UNIXSID(skb) back into the scm cookie. */
143 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
144 {
145 	scm->secid = *UNIXSID(skb);
146 }
147 #else
/* Without CONFIG_SECURITY_NETWORK both helpers are empty. */
148 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
149 { }
150 
151 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
152 { }
153 #endif /* CONFIG_SECURITY_NETWORK */
154 
155 /*
156  *  SMP locking strategy:
157  *    hash table is protected with spinlock unix_table_lock
158  *    each socket state is protected by separate spin lock.
159  */
160 
161 static inline unsigned int unix_hash_fold(__wsum n)
162 {
163 	unsigned int hash = (__force unsigned int)n;
164 
165 	hash ^= hash>>16;
166 	hash ^= hash>>8;
167 	return hash&(UNIX_HASH_SIZE-1);
168 }
169 
/*
 * Shorthand for the peer pointer kept in the socket's unix_sock.  The
 * expansion is an lvalue, so it is also used to assign the peer.
 */
170 #define unix_peer(sk) (unix_sk(sk)->peer)
171 
/* Return non-zero when @osk has @sk recorded as its peer. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return sk == unix_peer(osk);
}
176 
177 static inline int unix_may_send(struct sock *sk, struct sock *osk)
178 {
179 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
180 }
181 
182 static inline int unix_recvq_full(struct sock const *sk)
183 {
184 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
185 }
186 
187 struct sock *unix_peer_get(struct sock *s)
188 {
189 	struct sock *peer;
190 
191 	unix_state_lock(s);
192 	peer = unix_peer(s);
193 	if (peer)
194 		sock_hold(peer);
195 	unix_state_unlock(s);
196 	return peer;
197 }
198 EXPORT_SYMBOL_GPL(unix_peer_get);
199 
200 static inline void unix_release_addr(struct unix_address *addr)
201 {
202 	if (atomic_dec_and_test(&addr->refcnt))
203 		kfree(addr);
204 }
205 
206 /*
207  *	Check unix socket name:
208  *		- should be not zero length.
209  *	        - if started by not zero, should be NULL terminated (FS object)
210  *		- if started by zero, it is abstract name.
211  */
212 
213 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
214 {
215 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
216 		return -EINVAL;
217 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
218 		return -EINVAL;
219 	if (sunaddr->sun_path[0]) {
220 		/*
221 		 * This may look like an off by one error but it is a bit more
222 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
223 		 * sun_path[108] doesn't as such exist.  However in kernel space
224 		 * we are guaranteed that it is a valid memory location in our
225 		 * kernel address buffer.
226 		 */
227 		((char *)sunaddr)[len] = 0;
228 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
229 		return len;
230 	}
231 
232 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
233 	return len;
234 }
235 
/*
 * Unhash @sk from whatever bucket it is on.  The callers in this file hold
 * unix_table_lock around the call.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
240 
/*
 * Hash @sk onto the bucket @list.  Warns if @sk is still hashed somewhere.
 * The callers in this file hold unix_table_lock around the call.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));

	sk_add_node(sk, list);
}
246 
247 static inline void unix_remove_socket(struct sock *sk)
248 {
249 	spin_lock(&unix_table_lock);
250 	__unix_remove_socket(sk);
251 	spin_unlock(&unix_table_lock);
252 }
253 
254 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
255 {
256 	spin_lock(&unix_table_lock);
257 	__unix_insert_socket(list, sk);
258 	spin_unlock(&unix_table_lock);
259 }
260 
261 static struct sock *__unix_find_socket_byname(struct net *net,
262 					      struct sockaddr_un *sunname,
263 					      int len, int type, unsigned int hash)
264 {
265 	struct sock *s;
266 
267 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
268 		struct unix_sock *u = unix_sk(s);
269 
270 		if (!net_eq(sock_net(s), net))
271 			continue;
272 
273 		if (u->addr->len == len &&
274 		    !memcmp(u->addr->name, sunname, len))
275 			goto found;
276 	}
277 	s = NULL;
278 found:
279 	return s;
280 }
281 
282 static inline struct sock *unix_find_socket_byname(struct net *net,
283 						   struct sockaddr_un *sunname,
284 						   int len, int type,
285 						   unsigned int hash)
286 {
287 	struct sock *s;
288 
289 	spin_lock(&unix_table_lock);
290 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
291 	if (s)
292 		sock_hold(s);
293 	spin_unlock(&unix_table_lock);
294 	return s;
295 }
296 
297 static struct sock *unix_find_socket_byinode(struct inode *i)
298 {
299 	struct sock *s;
300 
301 	spin_lock(&unix_table_lock);
302 	sk_for_each(s,
303 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
304 		struct dentry *dentry = unix_sk(s)->path.dentry;
305 
306 		if (dentry && dentry->d_inode == i) {
307 			sock_hold(s);
308 			goto found;
309 		}
310 	}
311 	s = NULL;
312 found:
313 	spin_unlock(&unix_table_lock);
314 	return s;
315 }
316 
317 static inline int unix_writable(struct sock *sk)
318 {
319 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
320 }
321 
322 static void unix_write_space(struct sock *sk)
323 {
324 	struct socket_wq *wq;
325 
326 	rcu_read_lock();
327 	if (unix_writable(sk)) {
328 		wq = rcu_dereference(sk->sk_wq);
329 		if (wq_has_sleeper(wq))
330 			wake_up_interruptible_sync_poll(&wq->wait,
331 				POLLOUT | POLLWRNORM | POLLWRBAND);
332 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
333 	}
334 	rcu_read_unlock();
335 }
336 
337 /* When dgram socket disconnects (or changes its peer), we clear its receive
338  * queue of packets arrived from previous peer. First, it allows to do
339  * flow control based only on wmem_alloc; second, sk connected to peer
340  * may receive messages only from that peer. */
341 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
342 {
343 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
344 		skb_queue_purge(&sk->sk_receive_queue);
345 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
346 
347 		/* If one link of bidirectional dgram pipe is disconnected,
348 		 * we signal error. Messages are lost. Do not make this,
349 		 * when peer was not connected to us.
350 		 */
351 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
352 			other->sk_err = ECONNRESET;
353 			other->sk_error_report(other);
354 		}
355 	}
356 }
357 
358 static void unix_sock_destructor(struct sock *sk)
359 {
360 	struct unix_sock *u = unix_sk(sk);
361 
362 	skb_queue_purge(&sk->sk_receive_queue);
363 
364 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
365 	WARN_ON(!sk_unhashed(sk));
366 	WARN_ON(sk->sk_socket);
367 	if (!sock_flag(sk, SOCK_DEAD)) {
368 		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
369 		return;
370 	}
371 
372 	if (u->addr)
373 		unix_release_addr(u->addr);
374 
375 	atomic_long_dec(&unix_nr_socks);
376 	local_bh_disable();
377 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
378 	local_bh_enable();
379 #ifdef UNIX_REFCNT_DEBUG
380 	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
381 		atomic_long_read(&unix_nr_socks));
382 #endif
383 }
384 
/*
 * unix_release_sock - tear down one unix socket.
 * @sk:      socket to release
 * @embrion: non-zero when called for the socket attached to an skb that was
 *           still queued on a listening socket (the recursive call below
 *           passes 1, unix_release() passes 0)
 *
 * Unhashes the socket, orphans it and marks it closed, notifies a connected
 * stream/seqpacket peer, drains the receive queue, drops the bound path and
 * one reference on @sk, and finally runs unix_gc() if any fds are in flight.
 * Always returns 0.
 */
385 static int unix_release_sock(struct sock *sk, int embrion)
386 {
387 	struct unix_sock *u = unix_sk(sk);
388 	struct path path;
389 	struct sock *skpair;
390 	struct sk_buff *skb;
391 	int state;
392 
393 	unix_remove_socket(sk);
394 
395 	/* Clear state */
396 	unix_state_lock(sk);
397 	sock_orphan(sk);
398 	sk->sk_shutdown = SHUTDOWN_MASK;
	/* Take over the path; its reference is dropped with path_put() below. */
399 	path	     = u->path;
400 	u->path.dentry = NULL;
401 	u->path.mnt = NULL;
	/* Remember the old state: a former listener needs its queue released. */
402 	state = sk->sk_state;
403 	sk->sk_state = TCP_CLOSE;
404 	unix_state_unlock(sk);
405 
406 	wake_up_interruptible_all(&u->peer_wait);
407 
408 	skpair = unix_peer(sk);
409 
410 	if (skpair != NULL) {
411 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
412 			unix_state_lock(skpair);
413 			/* No more writes */
414 			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data (or an embryo) means the peer lost data. */
415 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
416 				skpair->sk_err = ECONNRESET;
417 			unix_state_unlock(skpair);
418 			skpair->sk_state_change(skpair);
419 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
420 		}
421 		sock_put(skpair); /* It may now die */
422 		unix_peer(sk) = NULL;
423 	}
424 
425 	/* Try to flush out this socket. Throw out buffers at least */
426 
427 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a former listener each queued skb carries a socket to release. */
428 		if (state == TCP_LISTEN)
429 			unix_release_sock(skb->sk, 1);
430 		/* passed fds are erased in the kfree_skb hook	      */
431 		kfree_skb(skb);
432 	}
433 
434 	if (path.dentry)
435 		path_put(&path);
436 
437 	sock_put(sk);
438 
439 	/* ---- Socket is dead now and most probably destroyed ---- */
440 
441 	/*
442 	 * Fixme: BSD difference: In BSD all sockets connected to us get
443 	 *	  ECONNRESET and we die on the spot. In Linux we behave
444 	 *	  like files and pipes do and wait for the last
445 	 *	  dereference.
446 	 *
447 	 * Can't we simply set sock->err?
448 	 *
449 	 *	  What the above comment does talk about? --ANK(980817)
450 	 */
451 
452 	if (unix_tot_inflight)
453 		unix_gc();		/* Garbage collect fds */
454 
455 	return 0;
456 }
457 
458 static void init_peercred(struct sock *sk)
459 {
460 	put_pid(sk->sk_peer_pid);
461 	if (sk->sk_peer_cred)
462 		put_cred(sk->sk_peer_cred);
463 	sk->sk_peer_pid  = get_pid(task_tgid(current));
464 	sk->sk_peer_cred = get_current_cred();
465 }
466 
467 static void copy_peercred(struct sock *sk, struct sock *peersk)
468 {
469 	put_pid(sk->sk_peer_pid);
470 	if (sk->sk_peer_cred)
471 		put_cred(sk->sk_peer_cred);
472 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
473 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
474 }
475 
476 static int unix_listen(struct socket *sock, int backlog)
477 {
478 	int err;
479 	struct sock *sk = sock->sk;
480 	struct unix_sock *u = unix_sk(sk);
481 	struct pid *old_pid = NULL;
482 
483 	err = -EOPNOTSUPP;
484 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
485 		goto out;	/* Only stream/seqpacket sockets accept */
486 	err = -EINVAL;
487 	if (!u->addr)
488 		goto out;	/* No listens on an unbound socket */
489 	unix_state_lock(sk);
490 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
491 		goto out_unlock;
492 	if (backlog > sk->sk_max_ack_backlog)
493 		wake_up_interruptible_all(&u->peer_wait);
494 	sk->sk_max_ack_backlog	= backlog;
495 	sk->sk_state		= TCP_LISTEN;
496 	/* set credentials so connect can copy them */
497 	init_peercred(sk);
498 	err = 0;
499 
500 out_unlock:
501 	unix_state_unlock(sk);
502 	put_pid(old_pid);
503 out:
504 	return err;
505 }
506 
/*
 * Forward declarations of the handlers named in the proto_ops tables below,
 * so the tables can reference them ahead of their definitions.
 */
507 static int unix_release(struct socket *);
508 static int unix_bind(struct socket *, struct sockaddr *, int);
509 static int unix_stream_connect(struct socket *, struct sockaddr *,
510 			       int addr_len, int flags);
511 static int unix_socketpair(struct socket *, struct socket *);
512 static int unix_accept(struct socket *, struct socket *, int);
513 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
514 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
515 static unsigned int unix_dgram_poll(struct file *, struct socket *,
516 				    poll_table *);
517 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
518 static int unix_shutdown(struct socket *, int);
519 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
520 			       struct msghdr *, size_t);
521 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
522 			       struct msghdr *, size_t, int);
523 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
524 			      struct msghdr *, size_t);
525 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
526 			      struct msghdr *, size_t, int);
527 static int unix_dgram_connect(struct socket *, struct sockaddr *,
528 			      int, int);
529 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
530 				  struct msghdr *, size_t);
531 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
532 				  struct msghdr *, size_t, int);
533 
534 static void unix_set_peek_off(struct sock *sk, int val)
535 {
536 	struct unix_sock *u = unix_sk(sk);
537 
538 	mutex_lock(&u->readlock);
539 	sk->sk_peek_off = val;
540 	mutex_unlock(&u->readlock);
541 }
542 
543 
/*
 * Operations installed by unix_create() for SOCK_STREAM sockets:
 * connection oriented (unix_stream_connect, unix_accept, unix_listen) with
 * the stream send/receive handlers and unix_poll.  setsockopt, getsockopt,
 * mmap and sendpage use the generic sock_no_* stubs.
 */
544 static const struct proto_ops unix_stream_ops = {
545 	.family =	PF_UNIX,
546 	.owner =	THIS_MODULE,
547 	.release =	unix_release,
548 	.bind =		unix_bind,
549 	.connect =	unix_stream_connect,
550 	.socketpair =	unix_socketpair,
551 	.accept =	unix_accept,
552 	.getname =	unix_getname,
553 	.poll =		unix_poll,
554 	.ioctl =	unix_ioctl,
555 	.listen =	unix_listen,
556 	.shutdown =	unix_shutdown,
557 	.setsockopt =	sock_no_setsockopt,
558 	.getsockopt =	sock_no_getsockopt,
559 	.sendmsg =	unix_stream_sendmsg,
560 	.recvmsg =	unix_stream_recvmsg,
561 	.mmap =		sock_no_mmap,
562 	.sendpage =	sock_no_sendpage,
563 	.set_peek_off =	unix_set_peek_off,
564 };
565 
/*
 * Operations installed by unix_create() for SOCK_DGRAM sockets (SOCK_RAW is
 * mapped to SOCK_DGRAM there): accept and listen are the sock_no_* stubs,
 * connect is unix_dgram_connect, and the dgram send/receive/poll handlers
 * are used.
 */
566 static const struct proto_ops unix_dgram_ops = {
567 	.family =	PF_UNIX,
568 	.owner =	THIS_MODULE,
569 	.release =	unix_release,
570 	.bind =		unix_bind,
571 	.connect =	unix_dgram_connect,
572 	.socketpair =	unix_socketpair,
573 	.accept =	sock_no_accept,
574 	.getname =	unix_getname,
575 	.poll =		unix_dgram_poll,
576 	.ioctl =	unix_ioctl,
577 	.listen =	sock_no_listen,
578 	.shutdown =	unix_shutdown,
579 	.setsockopt =	sock_no_setsockopt,
580 	.getsockopt =	sock_no_getsockopt,
581 	.sendmsg =	unix_dgram_sendmsg,
582 	.recvmsg =	unix_dgram_recvmsg,
583 	.mmap =		sock_no_mmap,
584 	.sendpage =	sock_no_sendpage,
585 	.set_peek_off =	unix_set_peek_off,
586 };
587 
/*
 * Operations installed by unix_create() for SOCK_SEQPACKET sockets: it
 * shares connect, accept and listen with the stream table, uses
 * unix_dgram_poll for polling, and has its own seqpacket send/receive
 * handlers.
 */
588 static const struct proto_ops unix_seqpacket_ops = {
589 	.family =	PF_UNIX,
590 	.owner =	THIS_MODULE,
591 	.release =	unix_release,
592 	.bind =		unix_bind,
593 	.connect =	unix_stream_connect,
594 	.socketpair =	unix_socketpair,
595 	.accept =	unix_accept,
596 	.getname =	unix_getname,
597 	.poll =		unix_dgram_poll,
598 	.ioctl =	unix_ioctl,
599 	.listen =	unix_listen,
600 	.shutdown =	unix_shutdown,
601 	.setsockopt =	sock_no_setsockopt,
602 	.getsockopt =	sock_no_getsockopt,
603 	.sendmsg =	unix_seqpacket_sendmsg,
604 	.recvmsg =	unix_seqpacket_recvmsg,
605 	.mmap =		sock_no_mmap,
606 	.sendpage =	sock_no_sendpage,
607 	.set_peek_off =	unix_set_peek_off,
608 };
609 
/*
 * Protocol descriptor passed to sk_alloc() in unix_create1(); obj_size makes
 * every allocated sock large enough to hold a struct unix_sock.
 */
610 static struct proto unix_proto = {
611 	.name			= "UNIX",
612 	.owner			= THIS_MODULE,
613 	.obj_size		= sizeof(struct unix_sock),
614 };
615 
616 /*
617  * AF_UNIX sockets do not interact with hardware, hence they
618  * don't trigger interrupts - so it's safe for them to have
619  * bh-unsafe locking for their sk_receive_queue.lock. Split off
620  * this special lock-class by reinitializing the spinlock key:
621  */
622 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
623 
/*
 * unix_create1 - allocate and initialise a unix sock.
 * @net:  network namespace the sock is allocated in
 * @sock: struct socket handed to sock_init_data(); unix_stream_connect()
 *        passes NULL here
 *
 * Returns the new sock, already hashed into its "unbound" bucket, or NULL
 * when the global socket count exceeds 2 * get_max_files() or sk_alloc()
 * fails.
 */
624 static struct sock *unix_create1(struct net *net, struct socket *sock)
625 {
626 	struct sock *sk = NULL;
627 	struct unix_sock *u;
628 
	/* Count first; the increment is undone at "out" if creation fails. */
629 	atomic_long_inc(&unix_nr_socks);
630 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
631 		goto out;
632 
633 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
634 	if (!sk)
635 		goto out;
636 
637 	sock_init_data(sock, sk);
638 	lockdep_set_class(&sk->sk_receive_queue.lock,
639 				&af_unix_sk_receive_queue_lock_key);
640 
641 	sk->sk_write_space	= unix_write_space;
642 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
643 	sk->sk_destruct		= unix_sock_destructor;
644 	u	  = unix_sk(sk);
645 	u->path.dentry = NULL;
646 	u->path.mnt = NULL;
647 	spin_lock_init(&u->lock);
648 	atomic_long_set(&u->inflight, 0);
649 	INIT_LIST_HEAD(&u->link);
650 	mutex_init(&u->readlock); /* single task reading lock */
651 	init_waitqueue_head(&u->peer_wait);
	/* No name yet: hash by the sock pointer into the unbound buckets. */
652 	unix_insert_socket(unix_sockets_unbound(sk), sk);
653 out:
654 	if (sk == NULL)
655 		atomic_long_dec(&unix_nr_socks);
656 	else {
		/* Per-protocol in-use counter is updated with BHs disabled. */
657 		local_bh_disable();
658 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
659 		local_bh_enable();
660 	}
661 	return sk;
662 }
663 
664 static int unix_create(struct net *net, struct socket *sock, int protocol,
665 		       int kern)
666 {
667 	if (protocol && protocol != PF_UNIX)
668 		return -EPROTONOSUPPORT;
669 
670 	sock->state = SS_UNCONNECTED;
671 
672 	switch (sock->type) {
673 	case SOCK_STREAM:
674 		sock->ops = &unix_stream_ops;
675 		break;
676 		/*
677 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
678 		 *	nothing uses it.
679 		 */
680 	case SOCK_RAW:
681 		sock->type = SOCK_DGRAM;
682 	case SOCK_DGRAM:
683 		sock->ops = &unix_dgram_ops;
684 		break;
685 	case SOCK_SEQPACKET:
686 		sock->ops = &unix_seqpacket_ops;
687 		break;
688 	default:
689 		return -ESOCKTNOSUPPORT;
690 	}
691 
692 	return unix_create1(net, sock) ? 0 : -ENOMEM;
693 }
694 
695 static int unix_release(struct socket *sock)
696 {
697 	struct sock *sk = sock->sk;
698 
699 	if (!sk)
700 		return 0;
701 
702 	sock->sk = NULL;
703 
704 	return unix_release_sock(sk, 0);
705 }
706 
/*
 * unix_autobind - bind the socket to an automatically chosen name.
 *
 * The name is a zero byte (the address is allocated with kzalloc) followed
 * by five hex digits printed from a static counter; the counter is advanced
 * and the name retried until no socket with that name is found.
 *
 * Returns 0 on success or when the socket already has an address, -ENOMEM
 * if the address cannot be allocated, and -ENOSPC once the retry counter
 * reaches 0xFFFFF.
 */
707 static int unix_autobind(struct socket *sock)
708 {
709 	struct sock *sk = sock->sk;
710 	struct net *net = sock_net(sk);
711 	struct unix_sock *u = unix_sk(sk);
712 	static u32 ordernum = 1;
713 	struct unix_address *addr;
714 	int err;
715 	unsigned int retries = 0;
716 
717 	mutex_lock(&u->readlock);
718 
719 	err = 0;
720 	if (u->addr)
721 		goto out;
722 
723 	err = -ENOMEM;
724 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
725 	if (!addr)
726 		goto out;
727 
728 	addr->name->sun_family = AF_UNIX;
729 	atomic_set(&addr->refcnt, 1);
730 
731 retry:
	/* sun_path[0] stays 0; the digits start at sun_path[1]. */
732 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
733 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
734 
735 	spin_lock(&unix_table_lock);
	/* Keep the counter within 20 bits, i.e. five hex digits. */
736 	ordernum = (ordernum+1)&0xFFFFF;
737 
738 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
739 				      addr->hash)) {
740 		spin_unlock(&unix_table_lock);
741 		/*
742 		 * __unix_find_socket_byname() may take long time if many names
743 		 * are already in use.
744 		 */
745 		cond_resched();
746 		/* Give up if all names seems to be in use. */
747 		if (retries++ == 0xFFFFF) {
748 			err = -ENOSPC;
749 			kfree(addr);
750 			goto out;
751 		}
752 		goto retry;
753 	}
	/* The bucket index mixes the socket type into the name hash. */
754 	addr->hash ^= sk->sk_type;
755 
	/* Move the socket from its unbound bucket to the named one. */
756 	__unix_remove_socket(sk);
757 	u->addr = addr;
758 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
759 	spin_unlock(&unix_table_lock);
760 	err = 0;
761 
762 out:	mutex_unlock(&u->readlock);
763 	return err;
764 }
765 
/*
 * unix_find_other - look up the socket that a name refers to.
 * @net:     network namespace used for the by-name lookup
 * @sunname: address to resolve
 * @len:     length of @sunname
 * @type:    socket type the result must have
 * @hash:    name hash, used for the by-name lookup only
 * @error:   receives the error code when NULL is returned
 *
 * A name whose first path byte is non-zero is resolved through the
 * filesystem: the inode must be writable for the caller and be a socket
 * inode.  Any other name is looked up in the hash table by its bytes.
 *
 * Returns the socket with a reference held, or NULL with *error set to the
 * kern_path()/inode_permission() error, -ECONNREFUSED (not a socket inode,
 * or no socket found) or -EPROTOTYPE (filesystem socket of another type).
 */
766 static struct sock *unix_find_other(struct net *net,
767 				    struct sockaddr_un *sunname, int len,
768 				    int type, unsigned int hash, int *error)
769 {
770 	struct sock *u;
771 	struct path path;
772 	int err = 0;
773 
774 	if (sunname->sun_path[0]) {
775 		struct inode *inode;
776 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
777 		if (err)
778 			goto fail;
779 		inode = path.dentry->d_inode;
780 		err = inode_permission(inode, MAY_WRITE);
781 		if (err)
782 			goto put_fail;
783 
784 		err = -ECONNREFUSED;
785 		if (!S_ISSOCK(inode->i_mode))
786 			goto put_fail;
787 		u = unix_find_socket_byinode(inode);
788 		if (!u)
789 			goto put_fail;
790 
		/* Only a lookup that will succeed updates the access time. */
791 		if (u->sk_type == type)
792 			touch_atime(&path);
793 
794 		path_put(&path);
795 
796 		err = -EPROTOTYPE;
797 		if (u->sk_type != type) {
			/* path was already put above, so use "fail", not "put_fail". */
798 			sock_put(u);
799 			goto fail;
800 		}
801 	} else {
802 		err = -ECONNREFUSED;
803 		u = unix_find_socket_byname(net, sunname, len, type, hash);
804 		if (u) {
805 			struct dentry *dentry;
806 			dentry = unix_sk(u)->path.dentry;
807 			if (dentry)
808 				touch_atime(&unix_sk(u)->path);
809 		} else
810 			goto fail;
811 	}
812 	return u;
813 
814 put_fail:
815 	path_put(&path);
816 fail:
817 	*error = err;
818 	return NULL;
819 }
820 
821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
822 {
823 	struct dentry *dentry;
824 	struct path path;
825 	int err = 0;
826 	/*
827 	 * Get the parent directory, calculate the hash for last
828 	 * component.
829 	 */
830 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
831 	err = PTR_ERR(dentry);
832 	if (IS_ERR(dentry))
833 		return err;
834 
835 	/*
836 	 * All right, let's create it.
837 	 */
838 	err = security_path_mknod(&path, dentry, mode, 0);
839 	if (!err) {
840 		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
841 		if (!err) {
842 			res->mnt = mntget(path.mnt);
843 			res->dentry = dget(dentry);
844 		}
845 	}
846 	done_path_create(&path, dentry);
847 	return err;
848 }
849 
/*
 * unix_bind - bind a unix socket to a filesystem or non-filesystem name.
 * @sock:     socket to bind
 * @uaddr:    address, interpreted as struct sockaddr_un
 * @addr_len: length of @uaddr
 *
 * An address consisting of the family field only (addr_len ==
 * sizeof(short)) requests an automatic name via unix_autobind().
 *
 * Returns 0 on success; -EINVAL for a wrong family, a name rejected by
 * unix_mkname() or an already bound socket; -ENOMEM when the address copy
 * cannot be allocated; -EADDRINUSE when the name is taken; or another error
 * from unix_mknod()/unix_autobind().
 */
850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
851 {
852 	struct sock *sk = sock->sk;
853 	struct net *net = sock_net(sk);
854 	struct unix_sock *u = unix_sk(sk);
855 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
856 	char *sun_path = sunaddr->sun_path;
857 	int err;
858 	unsigned int hash;
859 	struct unix_address *addr;
860 	struct hlist_head *list;
861 
862 	err = -EINVAL;
863 	if (sunaddr->sun_family != AF_UNIX)
864 		goto out;
865 
866 	if (addr_len == sizeof(short)) {
867 		err = unix_autobind(sock);
868 		goto out;
869 	}
870 
871 	err = unix_mkname(sunaddr, addr_len, &hash);
872 	if (err < 0)
873 		goto out;
	/* unix_mkname() returns the length to use from here on. */
874 	addr_len = err;
875 
876 	mutex_lock(&u->readlock);
877 
878 	err = -EINVAL;
879 	if (u->addr)
880 		goto out_up;
881 
882 	err = -ENOMEM;
883 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
884 	if (!addr)
885 		goto out_up;
886 
887 	memcpy(addr->name, sunaddr, addr_len);
888 	addr->len = addr_len;
889 	addr->hash = hash ^ sk->sk_type;
890 	atomic_set(&addr->refcnt, 1);
891 
892 	if (sun_path[0]) {
893 		struct path path;
894 		umode_t mode = S_IFSOCK |
895 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
896 		err = unix_mknod(sun_path, mode, &path);
897 		if (err) {
898 			if (err == -EEXIST)
899 				err = -EADDRINUSE;
900 			unix_release_addr(addr);
901 			goto out_up;
902 		}
		/* Filesystem names: fixed addr->hash, bucket chosen by inode number. */
903 		addr->hash = UNIX_HASH_SIZE;
904 		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
905 		spin_lock(&unix_table_lock);
906 		u->path = path;
907 		list = &unix_socket_table[hash];
908 	} else {
909 		spin_lock(&unix_table_lock);
910 		err = -EADDRINUSE;
911 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
912 					      sk->sk_type, hash)) {
913 			unix_release_addr(addr);
914 			goto out_unlock;
915 		}
916 
917 		list = &unix_socket_table[addr->hash];
918 	}
919 
920 	err = 0;
	/* Move the socket from its unbound bucket to the chosen one. */
921 	__unix_remove_socket(sk);
922 	u->addr = addr;
923 	__unix_insert_socket(list, sk);
924 
925 out_unlock:
926 	spin_unlock(&unix_table_lock);
927 out_up:
928 	mutex_unlock(&u->readlock);
929 out:
930 	return err;
931 }
932 
/*
 * Take the state locks of two sockets.  When @sk2 is NULL or equal to @sk1
 * only @sk1 is locked; otherwise the socket at the lower address is locked
 * first and the other one with the nested lock variant.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first, *second;

	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
947 
/*
 * Counterpart of unix_state_double_lock(): always unlocks @sk1, and also
 * @sk2 when it is non-NULL and different from @sk1.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
957 
/*
 * unix_dgram_connect - set, change or clear the peer of a datagram socket.
 * @sock:  socket to connect
 * @addr:  peer address; family AF_UNSPEC clears the current peer
 * @alen:  length of @addr
 * @flags: not used in this function
 *
 * The reference obtained from unix_find_other() is kept as the peer
 * reference on success; the reference on a previous peer is dropped.
 *
 * Returns 0 on success, otherwise the error from unix_mkname(),
 * unix_autobind(), unix_find_other() or security_unix_may_send(), or -EPERM
 * when the target is connected to some other socket.
 */
958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
959 			      int alen, int flags)
960 {
961 	struct sock *sk = sock->sk;
962 	struct net *net = sock_net(sk);
963 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
964 	struct sock *other;
965 	unsigned int hash;
966 	int err;
967 
968 	if (addr->sa_family != AF_UNSPEC) {
969 		err = unix_mkname(sunaddr, alen, &hash);
970 		if (err < 0)
971 			goto out;
972 		alen = err;
973 
		/* With SOCK_PASSCRED set, an unbound socket is autobound first. */
974 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
975 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
976 			goto out;
977 
978 restart:
979 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
980 		if (!other)
981 			goto out;
982 
983 		unix_state_double_lock(sk, other);
984 
985 		/* Apparently VFS overslept socket death. Retry. */
986 		if (sock_flag(other, SOCK_DEAD)) {
987 			unix_state_double_unlock(sk, other);
988 			sock_put(other);
989 			goto restart;
990 		}
991 
992 		err = -EPERM;
993 		if (!unix_may_send(sk, other))
994 			goto out_unlock;
995 
996 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
997 		if (err)
998 			goto out_unlock;
999 
1000 	} else {
1001 		/*
1002 		 *	1003.1g breaking connected state with AF_UNSPEC
1003 		 */
1004 		other = NULL;
		/* With other == NULL only sk's state lock is taken. */
1005 		unix_state_double_lock(sk, other);
1006 	}
1007 
1008 	/*
1009 	 * If it was connected, reconnect.
1010 	 */
1011 	if (unix_peer(sk)) {
1012 		struct sock *old_peer = unix_peer(sk);
1013 		unix_peer(sk) = other;
1014 		unix_state_double_unlock(sk, other);
1015 
		/* Purge what the old peer sent, then drop the old peer reference. */
1016 		if (other != old_peer)
1017 			unix_dgram_disconnected(sk, old_peer);
1018 		sock_put(old_peer);
1019 	} else {
1020 		unix_peer(sk) = other;
1021 		unix_state_double_unlock(sk, other);
1022 	}
1023 	return 0;
1024 
1025 out_unlock:
1026 	unix_state_double_unlock(sk, other);
1027 	sock_put(other);
1028 out:
1029 	return err;
1030 }
1031 
/*
 * unix_wait_for_peer - sleep until there may be room in @other's queue.
 * @other: socket whose receive queue is full
 * @timeo: maximum time to sleep, as passed to schedule_timeout()
 *
 * This function releases the state lock of @other without taking it, so the
 * caller must hold that lock on entry.  The task is put on @other's
 * peer_wait queue before the lock is dropped, and it only sleeps if @other
 * is still alive, not shut down for receive and still full.
 *
 * Returns the remaining timeout (unchanged when no sleep took place).
 */
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034 	struct unix_sock *u = unix_sk(other);
1035 	int sched;
1036 	DEFINE_WAIT(wait);
1037 
1038 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039 
	/* Decide under the lock whether sleeping is still warranted. */
1040 	sched = !sock_flag(other, SOCK_DEAD) &&
1041 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1042 		unix_recvq_full(other);
1043 
1044 	unix_state_unlock(other);
1045 
1046 	if (sched)
1047 		timeo = schedule_timeout(timeo);
1048 
1049 	finish_wait(&u->peer_wait, &wait);
1050 	return timeo;
1051 }
1052 
/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * The new server-side sock (newsk) and a one-byte skb are allocated
 * before any state lock is taken.  The listener is then looked up and
 * locked; if its receive queue is full the caller either fails with
 * -EAGAIN (zero send timeout) or sleeps in unix_wait_for_peer() and
 * starts over.  Once both state locks are held, sk and newsk are made
 * each other's peer, both are marked TCP_ESTABLISHED, and the skb is
 * queued on the listener's receive queue.
 *
 * Returns 0 on success or a negative errno.  On the failure paths the
 * skb and newsk are released and the reference on the listener, if one
 * is still held, is dropped.
 */
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054 			       int addr_len, int flags)
1055 {
1056 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057 	struct sock *sk = sock->sk;
1058 	struct net *net = sock_net(sk);
1059 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060 	struct sock *newsk = NULL;
1061 	struct sock *other = NULL;
1062 	struct sk_buff *skb = NULL;
1063 	unsigned int hash;
1064 	int st;
1065 	int err;
1066 	long timeo;
1067 
1068 	err = unix_mkname(sunaddr, addr_len, &hash);
1069 	if (err < 0)
1070 		goto out;
	/* A non-negative return from unix_mkname() is used as the length. */
1071 	addr_len = err;
1072 
	/* A SOCK_PASSCRED socket without an address gets autobound first. */
1073 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074 	    (err = unix_autobind(sock)) != 0)
1075 		goto out;
1076 
	/* Zero when O_NONBLOCK is set is what the !timeo test below relies on. */
1077 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078 
1079 	/* First of all allocate resources.
1080 	   If we will make it after state is locked,
1081 	   we will have to recheck all again in any case.
1082 	 */
1083 
1084 	err = -ENOMEM;
1085 
1086 	/* create new sock for complete connection */
1087 	newsk = unix_create1(sock_net(sk), NULL);
1088 	if (newsk == NULL)
1089 		goto out;
1090 
1091 	/* Allocate skb for sending to listening sock */
1092 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093 	if (skb == NULL)
1094 		goto out;
1095 
1096 restart:
1097 	/*  Find listening sock. */
1098 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099 	if (!other)
1100 		goto out;
1101 
1102 	/* Latch state of peer */
1103 	unix_state_lock(other);
1104 
1105 	/* Apparently VFS overslept socket death. Retry. */
1106 	if (sock_flag(other, SOCK_DEAD)) {
1107 		unix_state_unlock(other);
1108 		sock_put(other);
1109 		goto restart;
1110 	}
1111 
1112 	err = -ECONNREFUSED;
1113 	if (other->sk_state != TCP_LISTEN)
1114 		goto out_unlock;
1115 	if (other->sk_shutdown & RCV_SHUTDOWN)
1116 		goto out_unlock;
1117 
1118 	if (unix_recvq_full(other)) {
1119 		err = -EAGAIN;
1120 		if (!timeo)
1121 			goto out_unlock;
1122 
		/* unix_wait_for_peer() releases other's state lock for us. */
1123 		timeo = unix_wait_for_peer(other, timeo);
1124 
1125 		err = sock_intr_errno(timeo);
		/* Lock already dropped, so leave via "out", not "out_unlock". */
1126 		if (signal_pending(current))
1127 			goto out;
1128 		sock_put(other);
1129 		goto restart;
1130 	}
1131 
1132 	/* Latch our state.
1133 
1134 	   It is tricky place. We need to grab our state lock and cannot
1135 	   drop lock on peer. It is dangerous because deadlock is
1136 	   possible. Connect to self case and simultaneous
1137 	   attempt to connect are eliminated by checking socket
1138 	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1139 	   check this before attempt to grab lock.
1140 
1141 	   Well, and we have to recheck the state after socket locked.
1142 	 */
1143 	st = sk->sk_state;
1144 
1145 	switch (st) {
1146 	case TCP_CLOSE:
1147 		/* This is ok... continue with connect */
1148 		break;
1149 	case TCP_ESTABLISHED:
1150 		/* Socket is already connected */
1151 		err = -EISCONN;
1152 		goto out_unlock;
1153 	default:
1154 		err = -EINVAL;
1155 		goto out_unlock;
1156 	}
1157 
1158 	unix_state_lock_nested(sk);
1159 
	/* Our state changed between the unlocked read and the lock: retry. */
1160 	if (sk->sk_state != st) {
1161 		unix_state_unlock(sk);
1162 		unix_state_unlock(other);
1163 		sock_put(other);
1164 		goto restart;
1165 	}
1166 
1167 	err = security_unix_stream_connect(sk, other, newsk);
1168 	if (err) {
1169 		unix_state_unlock(sk);
1170 		goto out_unlock;
1171 	}
1172 
1173 	/* The way is open! Fastly set all the necessary fields... */
1174 
	/* Reference on sk that backs newsk's peer pointer set just below. */
1175 	sock_hold(sk);
1176 	unix_peer(newsk)	= sk;
1177 	newsk->sk_state		= TCP_ESTABLISHED;
1178 	newsk->sk_type		= sk->sk_type;
1179 	init_peercred(newsk);
1180 	newu = unix_sk(newsk);
1181 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182 	otheru = unix_sk(other);
1183 
1184 	/* copy address information from listening to new sock*/
1185 	if (otheru->addr) {
1186 		atomic_inc(&otheru->addr->refcnt);
1187 		newu->addr = otheru->addr;
1188 	}
1189 	if (otheru->path.dentry) {
1190 		path_get(&otheru->path);
1191 		newu->path = otheru->path;
1192 	}
1193 
1194 	/* Set credentials */
1195 	copy_peercred(sk, other);
1196 
1197 	sock->state	= SS_CONNECTED;
1198 	sk->sk_state	= TCP_ESTABLISHED;
	/* Reference on newsk that backs sk's peer pointer set below. */
1199 	sock_hold(newsk);
1200 
1201 	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1202 	unix_peer(sk)	= newsk;
1203 
1204 	unix_state_unlock(sk);
1205 
1206 	/* take ten and send info to listening sock */
1207 	spin_lock(&other->sk_receive_queue.lock);
1208 	__skb_queue_tail(&other->sk_receive_queue, skb);
1209 	spin_unlock(&other->sk_receive_queue.lock);
1210 	unix_state_unlock(other);
1211 	other->sk_data_ready(other, 0);
1212 	sock_put(other);
1213 	return 0;
1214 
	/* Failure with other's state lock still held. */
1215 out_unlock:
1216 	if (other)
1217 		unix_state_unlock(other);
1218 
	/* Common failure exit: skb, newsk and other may each still be NULL. */
1219 out:
1220 	kfree_skb(skb);
1221 	if (newsk)
1222 		unix_release_sock(newsk, 0);
1223 	if (other)
1224 		sock_put(other);
1225 	return err;
1226 }
1227 
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230 	struct sock *ska = socka->sk, *skb = sockb->sk;
1231 
1232 	/* Join our sockets back to back */
1233 	sock_hold(ska);
1234 	sock_hold(skb);
1235 	unix_peer(ska) = skb;
1236 	unix_peer(skb) = ska;
1237 	init_peercred(ska);
1238 	init_peercred(skb);
1239 
1240 	if (ska->sk_type != SOCK_DGRAM) {
1241 		ska->sk_state = TCP_ESTABLISHED;
1242 		skb->sk_state = TCP_ESTABLISHED;
1243 		socka->state  = SS_CONNECTED;
1244 		sockb->state  = SS_CONNECTED;
1245 	}
1246 	return 0;
1247 }
1248 
1249 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1250 {
1251 	struct sock *sk = sock->sk;
1252 	struct sock *tsk;
1253 	struct sk_buff *skb;
1254 	int err;
1255 
1256 	err = -EOPNOTSUPP;
1257 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1258 		goto out;
1259 
1260 	err = -EINVAL;
1261 	if (sk->sk_state != TCP_LISTEN)
1262 		goto out;
1263 
1264 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1265 	 * so that no locks are necessary.
1266 	 */
1267 
1268 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1269 	if (!skb) {
1270 		/* This means receive shutdown. */
1271 		if (err == 0)
1272 			err = -EINVAL;
1273 		goto out;
1274 	}
1275 
1276 	tsk = skb->sk;
1277 	skb_free_datagram(sk, skb);
1278 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1279 
1280 	/* attach accepted sock to socket */
1281 	unix_state_lock(tsk);
1282 	newsock->state = SS_CONNECTED;
1283 	sock_graft(tsk, newsock);
1284 	unix_state_unlock(tsk);
1285 	return 0;
1286 
1287 out:
1288 	return err;
1289 }
1290 
1291 
1292 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1293 {
1294 	struct sock *sk = sock->sk;
1295 	struct unix_sock *u;
1296 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1297 	int err = 0;
1298 
1299 	if (peer) {
1300 		sk = unix_peer_get(sk);
1301 
1302 		err = -ENOTCONN;
1303 		if (!sk)
1304 			goto out;
1305 		err = 0;
1306 	} else {
1307 		sock_hold(sk);
1308 	}
1309 
1310 	u = unix_sk(sk);
1311 	unix_state_lock(sk);
1312 	if (!u->addr) {
1313 		sunaddr->sun_family = AF_UNIX;
1314 		sunaddr->sun_path[0] = 0;
1315 		*uaddr_len = sizeof(short);
1316 	} else {
1317 		struct unix_address *addr = u->addr;
1318 
1319 		*uaddr_len = addr->len;
1320 		memcpy(sunaddr, addr->name, *uaddr_len);
1321 	}
1322 	unix_state_unlock(sk);
1323 	sock_put(sk);
1324 out:
1325 	return err;
1326 }
1327 
1328 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1329 {
1330 	int i;
1331 
1332 	scm->fp = UNIXCB(skb).fp;
1333 	UNIXCB(skb).fp = NULL;
1334 
1335 	for (i = scm->fp->count-1; i >= 0; i--)
1336 		unix_notinflight(scm->fp->fp[i]);
1337 }
1338 
1339 static void unix_destruct_scm(struct sk_buff *skb)
1340 {
1341 	struct scm_cookie scm;
1342 	memset(&scm, 0, sizeof(scm));
1343 	scm.pid  = UNIXCB(skb).pid;
1344 	scm.cred = UNIXCB(skb).cred;
1345 	if (UNIXCB(skb).fp)
1346 		unix_detach_fds(&scm, skb);
1347 
1348 	/* Alas, it calls VFS */
1349 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1350 	scm_destroy(&scm);
1351 	sock_wfree(skb);
1352 }
1353 
1354 #define MAX_RECURSION_LEVEL 4
1355 
1356 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1357 {
1358 	int i;
1359 	unsigned char max_level = 0;
1360 	int unix_sock_count = 0;
1361 
1362 	for (i = scm->fp->count - 1; i >= 0; i--) {
1363 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1364 
1365 		if (sk) {
1366 			unix_sock_count++;
1367 			max_level = max(max_level,
1368 					unix_sk(sk)->recursion_level);
1369 		}
1370 	}
1371 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1372 		return -ETOOMANYREFS;
1373 
1374 	/*
1375 	 * Need to duplicate file references for the sake of garbage
1376 	 * collection.  Otherwise a socket in the fps might become a
1377 	 * candidate for GC while the skb is not yet queued.
1378 	 */
1379 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1380 	if (!UNIXCB(skb).fp)
1381 		return -ENOMEM;
1382 
1383 	if (unix_sock_count) {
1384 		for (i = scm->fp->count - 1; i >= 0; i--)
1385 			unix_inflight(scm->fp->fp[i]);
1386 	}
1387 	return max_level;
1388 }
1389 
1390 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1391 {
1392 	int err = 0;
1393 
1394 	UNIXCB(skb).pid  = get_pid(scm->pid);
1395 	if (scm->cred)
1396 		UNIXCB(skb).cred = get_cred(scm->cred);
1397 	UNIXCB(skb).fp = NULL;
1398 	if (scm->fp && send_fds)
1399 		err = unix_attach_fds(scm, skb);
1400 
1401 	skb->destructor = unix_destruct_scm;
1402 	return err;
1403 }
1404 
1405 /*
1406  * Some apps rely on write() giving SCM_CREDENTIALS
1407  * We include credentials if source or destination socket
1408  * asserted SOCK_PASSCRED.
1409  */
1410 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1411 			    const struct sock *other)
1412 {
1413 	if (UNIXCB(skb).cred)
1414 		return;
1415 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1416 	    !other->sk_socket ||
1417 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1418 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1419 		UNIXCB(skb).cred = get_current_cred();
1420 	}
1421 }
1422 
1423 /*
1424  *	Send AF_UNIX data.
1425  */
1426 
/*
 * Send one datagram of @len bytes on @sock.
 *
 * The destination is the address in @msg when msg_namelen is non-zero,
 * otherwise the connected peer (-ENOTCONN when there is none).  The skb
 * is built and filled before the destination is locked.  If the
 * destination turns out to be dead, a connected peer is disconnected
 * and -ECONNREFUSED returned, while an addressed send looks the name up
 * again.  If the destination's receive queue is full (and it is not
 * connected back to us) the sender gets -EAGAIN on a zero timeout or
 * waits in unix_wait_for_peer() and retries.
 *
 * Returns @len on success (also when the receiver's socket filter drops
 * the packet) or a negative errno: -EOPNOTSUPP for MSG_OOB, -EMSGSIZE
 * when @len exceeds sk_sndbuf - 32, -EPERM, -EPIPE, -ECONNRESET, ...
 */
1427 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1428 			      struct msghdr *msg, size_t len)
1429 {
1430 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1431 	struct sock *sk = sock->sk;
1432 	struct net *net = sock_net(sk);
1433 	struct unix_sock *u = unix_sk(sk);
1434 	struct sockaddr_un *sunaddr = msg->msg_name;
1435 	struct sock *other = NULL;
1436 	int namelen = 0; /* fake GCC */
1437 	int err;
1438 	unsigned int hash;
1439 	struct sk_buff *skb;
1440 	long timeo;
1441 	struct scm_cookie tmp_scm;
1442 	int max_level;
1443 	int data_len = 0;
1444 
	/* Fall back to an on-stack cookie when the iocb does not supply one. */
1445 	if (NULL == siocb->scm)
1446 		siocb->scm = &tmp_scm;
1447 	wait_for_unix_gc();
1448 	err = scm_send(sock, msg, siocb->scm, false);
1449 	if (err < 0)
1450 		return err;
1451 
1452 	err = -EOPNOTSUPP;
1453 	if (msg->msg_flags&MSG_OOB)
1454 		goto out;
1455 
1456 	if (msg->msg_namelen) {
1457 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1458 		if (err < 0)
1459 			goto out;
1460 		namelen = err;
1461 	} else {
		/* No address given: use the connected peer, with a reference. */
1462 		sunaddr = NULL;
1463 		err = -ENOTCONN;
1464 		other = unix_peer_get(sk);
1465 		if (!other)
1466 			goto out;
1467 	}
1468 
1469 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1470 	    && (err = unix_autobind(sock)) != 0)
1471 		goto out;
1472 
1473 	err = -EMSGSIZE;
1474 	if (len > sk->sk_sndbuf - 32)
1475 		goto out;
1476 
	/*
	 * Anything beyond SKB_MAX_ALLOC goes into page fragments, capped at
	 * MAX_SKB_FRAGS pages.
	 */
1477 	if (len > SKB_MAX_ALLOC)
1478 		data_len = min_t(size_t,
1479 				 len - SKB_MAX_ALLOC,
1480 				 MAX_SKB_FRAGS * PAGE_SIZE);
1481 
1482 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1483 				   msg->msg_flags & MSG_DONTWAIT, &err);
1484 	if (skb == NULL)
1485 		goto out;
1486 
1487 	err = unix_scm_to_skb(siocb->scm, skb, true);
1488 	if (err < 0)
1489 		goto out_free;
	/* One level deeper than the deepest unix socket being passed. */
1490 	max_level = err + 1;
1491 	unix_get_secdata(siocb->scm, skb);
1492 
1493 	skb_put(skb, len - data_len);
1494 	skb->data_len = data_len;
1495 	skb->len = len;
1496 	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1497 	if (err)
1498 		goto out_free;
1499 
1500 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1501 
1502 restart:
	/*
	 * other is NULL here on an addressed send, and again after a dead
	 * destination was dropped below; look the name up (again).
	 */
1503 	if (!other) {
1504 		err = -ECONNRESET;
1505 		if (sunaddr == NULL)
1506 			goto out_free;
1507 
1508 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1509 					hash, &err);
1510 		if (other == NULL)
1511 			goto out_free;
1512 	}
1513 
1514 	if (sk_filter(other, skb) < 0) {
1515 		/* Toss the packet but do not return any error to the sender */
1516 		err = len;
1517 		goto out_free;
1518 	}
1519 
1520 	unix_state_lock(other);
1521 	err = -EPERM;
1522 	if (!unix_may_send(sk, other))
1523 		goto out_unlock;
1524 
1525 	if (sock_flag(other, SOCK_DEAD)) {
1526 		/*
1527 		 *	Check with 1003.1g - what should
1528 		 *	datagram error
1529 		 */
1530 		unix_state_unlock(other);
		/* Drops the reference taken by this function. */
1531 		sock_put(other);
1532 
1533 		err = 0;
1534 		unix_state_lock(sk);
		/* Only the pointer value of other is compared from here on. */
1535 		if (unix_peer(sk) == other) {
1536 			unix_peer(sk) = NULL;
1537 			unix_state_unlock(sk);
1538 
1539 			unix_dgram_disconnected(sk, other);
			/* Presumably the reference the peer pointer was holding. */
1540 			sock_put(other);
1541 			err = -ECONNREFUSED;
1542 		} else {
1543 			unix_state_unlock(sk);
1544 		}
1545 
1546 		other = NULL;
1547 		if (err)
1548 			goto out_free;
1549 		goto restart;
1550 	}
1551 
1552 	err = -EPIPE;
1553 	if (other->sk_shutdown & RCV_SHUTDOWN)
1554 		goto out_unlock;
1555 
1556 	if (sk->sk_type != SOCK_SEQPACKET) {
1557 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1558 		if (err)
1559 			goto out_unlock;
1560 	}
1561 
	/* A destination connected back to us is exempt from the queue limit. */
1562 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1563 		if (!timeo) {
1564 			err = -EAGAIN;
1565 			goto out_unlock;
1566 		}
1567 
		/* unix_wait_for_peer() releases other's state lock for us. */
1568 		timeo = unix_wait_for_peer(other, timeo);
1569 
1570 		err = sock_intr_errno(timeo);
1571 		if (signal_pending(current))
1572 			goto out_free;
1573 
		/* other stays referenced, so restart skips the lookup. */
1574 		goto restart;
1575 	}
1576 
1577 	if (sock_flag(other, SOCK_RCVTSTAMP))
1578 		__net_timestamp(skb);
1579 	maybe_add_creds(skb, sock, other);
1580 	skb_queue_tail(&other->sk_receive_queue, skb);
1581 	if (max_level > unix_sk(other)->recursion_level)
1582 		unix_sk(other)->recursion_level = max_level;
1583 	unix_state_unlock(other);
1584 	other->sk_data_ready(other, len);
1585 	sock_put(other);
1586 	scm_destroy(siocb->scm);
1587 	return len;
1588 
	/* Failure with other's state lock held. */
1589 out_unlock:
1590 	unix_state_unlock(other);
	/* Failure after the skb was allocated. */
1591 out_free:
1592 	kfree_skb(skb);
	/* Common exit: drop the destination reference (if any) and the cookie. */
1593 out:
1594 	if (other)
1595 		sock_put(other);
1596 	scm_destroy(siocb->scm);
1597 	return err;
1598 }
1599 
1600 
/*
 * Send @len bytes on a connected stream socket.
 *
 * The data is cut into skbs of at most (sk_sndbuf / 2) - 64 bytes (and
 * never more than SKB_MAX_ALLOC); each one is queued on the peer's
 * receive queue under the peer's state lock.  Passed file descriptors
 * travel with the first skb only.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise a
 * negative errno: -EOPNOTSUPP for MSG_OOB or for an address on an
 * unconnected socket, -EISCONN for an address on a connected one,
 * -ENOTCONN without a peer, -EPIPE when either side has shut the
 * direction down.  SIGPIPE is raised for -EPIPE only when nothing was
 * sent and MSG_NOSIGNAL is not set.
 */
1601 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1602 			       struct msghdr *msg, size_t len)
1603 {
1604 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1605 	struct sock *sk = sock->sk;
1606 	struct sock *other = NULL;
1607 	int err, size;
1608 	struct sk_buff *skb;
1609 	int sent = 0;
1610 	struct scm_cookie tmp_scm;
1611 	bool fds_sent = false;
1612 	int max_level;
1613 
	/* Fall back to an on-stack cookie when the iocb does not supply one. */
1614 	if (NULL == siocb->scm)
1615 		siocb->scm = &tmp_scm;
1616 	wait_for_unix_gc();
1617 	err = scm_send(sock, msg, siocb->scm, false);
1618 	if (err < 0)
1619 		return err;
1620 
1621 	err = -EOPNOTSUPP;
1622 	if (msg->msg_flags&MSG_OOB)
1623 		goto out_err;
1624 
	/* A destination address is never accepted here. */
1625 	if (msg->msg_namelen) {
1626 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1627 		goto out_err;
1628 	} else {
1629 		err = -ENOTCONN;
		/* Note: the peer pointer is read without taking a reference. */
1630 		other = unix_peer(sk);
1631 		if (!other)
1632 			goto out_err;
1633 	}
1634 
1635 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1636 		goto pipe_err;
1637 
1638 	while (sent < len) {
1639 		/*
1640 		 *	Optimisation for the fact that under 0.01% of X
1641 		 *	messages typically need breaking up.
1642 		 */
1643 
1644 		size = len-sent;
1645 
1646 		/* Keep two messages in the pipe so it schedules better */
1647 		if (size > ((sk->sk_sndbuf >> 1) - 64))
1648 			size = (sk->sk_sndbuf >> 1) - 64;
1649 
1650 		if (size > SKB_MAX_ALLOC)
1651 			size = SKB_MAX_ALLOC;
1652 
1653 		/*
1654 		 *	Grab a buffer
1655 		 */
1656 
1657 		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1658 					  &err);
1659 
1660 		if (skb == NULL)
1661 			goto out_err;
1662 
1663 		/*
1664 		 *	If you pass two values to the sock_alloc_send_skb
1665 		 *	it tries to grab the large buffer with GFP_NOFS
1666 		 *	(which can fail easily), and if it fails grab the
1667 		 *	fallback size buffer which is under a page and will
1668 		 *	succeed. [Alan]
1669 		 */
1670 		size = min_t(int, size, skb_tailroom(skb));
1671 
1672 
1673 		/* Only send the fds in the first buffer */
1674 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1675 		if (err < 0) {
1676 			kfree_skb(skb);
1677 			goto out_err;
1678 		}
		/* One level deeper than the deepest unix socket being passed. */
1679 		max_level = err + 1;
1680 		fds_sent = true;
1681 
1682 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1683 		if (err) {
1684 			kfree_skb(skb);
1685 			goto out_err;
1686 		}
1687 
1688 		unix_state_lock(other);
1689 
		/* Re-checked for every chunk, under the peer's state lock. */
1690 		if (sock_flag(other, SOCK_DEAD) ||
1691 		    (other->sk_shutdown & RCV_SHUTDOWN))
1692 			goto pipe_err_free;
1693 
1694 		maybe_add_creds(skb, sock, other);
1695 		skb_queue_tail(&other->sk_receive_queue, skb);
1696 		if (max_level > unix_sk(other)->recursion_level)
1697 			unix_sk(other)->recursion_level = max_level;
1698 		unix_state_unlock(other);
1699 		other->sk_data_ready(other, size);
1700 		sent += size;
1701 	}
1702 
1703 	scm_destroy(siocb->scm);
1704 	siocb->scm = NULL;
1705 
1706 	return sent;
1707 
	/* Peer gone while we hold its state lock and an unqueued skb. */
1708 pipe_err_free:
1709 	unix_state_unlock(other);
1710 	kfree_skb(skb);
1711 pipe_err:
1712 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1713 		send_sig(SIGPIPE, current, 0);
1714 	err = -EPIPE;
1715 out_err:
1716 	scm_destroy(siocb->scm);
1717 	siocb->scm = NULL;
	/* A partial send reports the byte count rather than the error. */
1718 	return sent ? : err;
1719 }
1720 
1721 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1722 				  struct msghdr *msg, size_t len)
1723 {
1724 	int err;
1725 	struct sock *sk = sock->sk;
1726 
1727 	err = sock_error(sk);
1728 	if (err)
1729 		return err;
1730 
1731 	if (sk->sk_state != TCP_ESTABLISHED)
1732 		return -ENOTCONN;
1733 
1734 	if (msg->msg_namelen)
1735 		msg->msg_namelen = 0;
1736 
1737 	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1738 }
1739 
1740 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1741 			      struct msghdr *msg, size_t size,
1742 			      int flags)
1743 {
1744 	struct sock *sk = sock->sk;
1745 
1746 	if (sk->sk_state != TCP_ESTABLISHED)
1747 		return -ENOTCONN;
1748 
1749 	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1750 }
1751 
1752 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1753 {
1754 	struct unix_sock *u = unix_sk(sk);
1755 
1756 	msg->msg_namelen = 0;
1757 	if (u->addr) {
1758 		msg->msg_namelen = u->addr->len;
1759 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760 	}
1761 }
1762 
/*
 * Receive one datagram from @sock into @msg.
 *
 * Serialised against other readers by u->readlock.  Honours the socket's
 * peek offset: the copy starts "skip" bytes into the skb.  At most @size
 * bytes are copied; MSG_TRUNC is set in msg_flags when the datagram had
 * more.  Sender credentials and any passed files are delivered through
 * the scm cookie; with MSG_PEEK the file list is duplicated rather than
 * detached.
 *
 * Returns the number of bytes copied (or, when the caller passed
 * MSG_TRUNC, the remaining length of the datagram), 0 for EOF on a
 * non-blocking SEQPACKET socket whose receive side is shut down, or a
 * negative errno (-EOPNOTSUPP for MSG_OOB).
 */
1763 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1764 			      struct msghdr *msg, size_t size,
1765 			      int flags)
1766 {
1767 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1768 	struct scm_cookie tmp_scm;
1769 	struct sock *sk = sock->sk;
1770 	struct unix_sock *u = unix_sk(sk);
1771 	int noblock = flags & MSG_DONTWAIT;
1772 	struct sk_buff *skb;
1773 	int err;
1774 	int peeked, skip;
1775 
1776 	err = -EOPNOTSUPP;
1777 	if (flags&MSG_OOB)
1778 		goto out;
1779 
1780 	msg->msg_namelen = 0;
1781 
1782 	err = mutex_lock_interruptible(&u->readlock);
1783 	if (err) {
		/* Translate the interruption using the receive timeout. */
1784 		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1785 		goto out;
1786 	}
1787 
1788 	skip = sk_peek_offset(sk, flags);
1789 
1790 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1791 	if (!skb) {
1792 		unix_state_lock(sk);
1793 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1794 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1795 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1796 			err = 0;
1797 		unix_state_unlock(sk);
1798 		goto out_unlock;
1799 	}
1800 
	/* Wake anyone sleeping on our peer_wait queue (see unix_wait_for_peer()). */
1801 	wake_up_interruptible_sync_poll(&u->peer_wait,
1802 					POLLOUT | POLLWRNORM | POLLWRBAND);
1803 
1804 	if (msg->msg_name)
1805 		unix_copy_addr(msg, skb->sk);
1806 
	/* Clamp to what is left past the peek offset; flag truncation. */
1807 	if (size > skb->len - skip)
1808 		size = skb->len - skip;
1809 	else if (size < skb->len - skip)
1810 		msg->msg_flags |= MSG_TRUNC;
1811 
1812 	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1813 	if (err)
1814 		goto out_free;
1815 
1816 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1817 		__sock_recv_timestamp(msg, sk, skb);
1818 
	/* Fall back to a zeroed on-stack cookie when the iocb has none. */
1819 	if (!siocb->scm) {
1820 		siocb->scm = &tmp_scm;
1821 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1822 	}
1823 	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1824 	unix_set_secdata(siocb->scm, skb);
1825 
1826 	if (!(flags & MSG_PEEK)) {
		/* Real read: the files move from the skb to the cookie. */
1827 		if (UNIXCB(skb).fp)
1828 			unix_detach_fds(siocb->scm, skb);
1829 
1830 		sk_peek_offset_bwd(sk, skb->len);
1831 	} else {
1832 		/* It is questionable: on PEEK we could:
1833 		   - do not return fds - good, but too simple 8)
1834 		   - return fds, and do not return them on read (old strategy,
1835 		     apparently wrong)
1836 		   - clone fds (I chose it for now, it is the most universal
1837 		     solution)
1838 
1839 		   POSIX 1003.1g does not actually define this clearly
1840 		   at all. POSIX 1003.1g doesn't define a lot of things
1841 		   clearly however!
1842 
1843 		*/
1844 
1845 		sk_peek_offset_fwd(sk, size);
1846 
1847 		if (UNIXCB(skb).fp)
1848 			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1849 	}
	/* With MSG_TRUNC from the caller, report the datagram's real length. */
1850 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1851 
1852 	scm_recv(sock, msg, siocb->scm, flags);
1853 
	/* Reached on success as well as on copy failure. */
1854 out_free:
1855 	skb_free_datagram(sk, skb);
1856 out_unlock:
1857 	mutex_unlock(&u->readlock);
1858 out:
1859 	return err;
1860 }
1861 
1862 /*
1863  *	Sleep until data has arrive. But check for races..
1864  */
1865 
1866 static long unix_stream_data_wait(struct sock *sk, long timeo)
1867 {
1868 	DEFINE_WAIT(wait);
1869 
1870 	unix_state_lock(sk);
1871 
1872 	for (;;) {
1873 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1874 
1875 		if (!skb_queue_empty(&sk->sk_receive_queue) ||
1876 		    sk->sk_err ||
1877 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1878 		    signal_pending(current) ||
1879 		    !timeo)
1880 			break;
1881 
1882 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1883 		unix_state_unlock(sk);
1884 		timeo = schedule_timeout(timeo);
1885 		unix_state_lock(sk);
1886 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1887 	}
1888 
1889 	finish_wait(sk_sleep(sk), &wait);
1890 	unix_state_unlock(sk);
1891 	return timeo;
1892 }
1893 
1894 
1895 
1896 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1897 			       struct msghdr *msg, size_t size,
1898 			       int flags)
1899 {
1900 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1901 	struct scm_cookie tmp_scm;
1902 	struct sock *sk = sock->sk;
1903 	struct unix_sock *u = unix_sk(sk);
1904 	struct sockaddr_un *sunaddr = msg->msg_name;
1905 	int copied = 0;
1906 	int check_creds = 0;
1907 	int target;
1908 	int err = 0;
1909 	long timeo;
1910 	int skip;
1911 
1912 	err = -EINVAL;
1913 	if (sk->sk_state != TCP_ESTABLISHED)
1914 		goto out;
1915 
1916 	err = -EOPNOTSUPP;
1917 	if (flags&MSG_OOB)
1918 		goto out;
1919 
1920 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1921 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1922 
1923 	msg->msg_namelen = 0;
1924 
1925 	/* Lock the socket to prevent queue disordering
1926 	 * while sleeps in memcpy_tomsg
1927 	 */
1928 
1929 	if (!siocb->scm) {
1930 		siocb->scm = &tmp_scm;
1931 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1932 	}
1933 
1934 	err = mutex_lock_interruptible(&u->readlock);
1935 	if (err) {
1936 		err = sock_intr_errno(timeo);
1937 		goto out;
1938 	}
1939 
1940 	skip = sk_peek_offset(sk, flags);
1941 
1942 	do {
1943 		int chunk;
1944 		struct sk_buff *skb;
1945 
1946 		unix_state_lock(sk);
1947 		skb = skb_peek(&sk->sk_receive_queue);
1948 again:
1949 		if (skb == NULL) {
1950 			unix_sk(sk)->recursion_level = 0;
1951 			if (copied >= target)
1952 				goto unlock;
1953 
1954 			/*
1955 			 *	POSIX 1003.1g mandates this order.
1956 			 */
1957 
1958 			err = sock_error(sk);
1959 			if (err)
1960 				goto unlock;
1961 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1962 				goto unlock;
1963 
1964 			unix_state_unlock(sk);
1965 			err = -EAGAIN;
1966 			if (!timeo)
1967 				break;
1968 			mutex_unlock(&u->readlock);
1969 
1970 			timeo = unix_stream_data_wait(sk, timeo);
1971 
1972 			if (signal_pending(current)
1973 			    ||  mutex_lock_interruptible(&u->readlock)) {
1974 				err = sock_intr_errno(timeo);
1975 				goto out;
1976 			}
1977 
1978 			continue;
1979  unlock:
1980 			unix_state_unlock(sk);
1981 			break;
1982 		}
1983 
1984 		if (skip >= skb->len) {
1985 			skip -= skb->len;
1986 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1987 			goto again;
1988 		}
1989 
1990 		unix_state_unlock(sk);
1991 
1992 		if (check_creds) {
1993 			/* Never glue messages from different writers */
1994 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1995 			    (UNIXCB(skb).cred != siocb->scm->cred))
1996 				break;
1997 		} else {
1998 			/* Copy credentials */
1999 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
2000 			check_creds = 1;
2001 		}
2002 
2003 		/* Copy address just once */
2004 		if (sunaddr) {
2005 			unix_copy_addr(msg, skb->sk);
2006 			sunaddr = NULL;
2007 		}
2008 
2009 		chunk = min_t(unsigned int, skb->len - skip, size);
2010 		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2011 			if (copied == 0)
2012 				copied = -EFAULT;
2013 			break;
2014 		}
2015 		copied += chunk;
2016 		size -= chunk;
2017 
2018 		/* Mark read part of skb as used */
2019 		if (!(flags & MSG_PEEK)) {
2020 			skb_pull(skb, chunk);
2021 
2022 			sk_peek_offset_bwd(sk, chunk);
2023 
2024 			if (UNIXCB(skb).fp)
2025 				unix_detach_fds(siocb->scm, skb);
2026 
2027 			if (skb->len)
2028 				break;
2029 
2030 			skb_unlink(skb, &sk->sk_receive_queue);
2031 			consume_skb(skb);
2032 
2033 			if (siocb->scm->fp)
2034 				break;
2035 		} else {
2036 			/* It is questionable, see note in unix_dgram_recvmsg.
2037 			 */
2038 			if (UNIXCB(skb).fp)
2039 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2040 
2041 			sk_peek_offset_fwd(sk, chunk);
2042 
2043 			break;
2044 		}
2045 	} while (size);
2046 
2047 	mutex_unlock(&u->readlock);
2048 	scm_recv(sock, msg, siocb->scm, flags);
2049 out:
2050 	return copied ? : err;
2051 }
2052 
2053 static int unix_shutdown(struct socket *sock, int mode)
2054 {
2055 	struct sock *sk = sock->sk;
2056 	struct sock *other;
2057 
2058 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2059 		return -EINVAL;
2060 	/* This maps:
2061 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2062 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2063 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2064 	 */
2065 	++mode;
2066 
2067 	unix_state_lock(sk);
2068 	sk->sk_shutdown |= mode;
2069 	other = unix_peer(sk);
2070 	if (other)
2071 		sock_hold(other);
2072 	unix_state_unlock(sk);
2073 	sk->sk_state_change(sk);
2074 
2075 	if (other &&
2076 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2077 
2078 		int peer_mode = 0;
2079 
2080 		if (mode&RCV_SHUTDOWN)
2081 			peer_mode |= SEND_SHUTDOWN;
2082 		if (mode&SEND_SHUTDOWN)
2083 			peer_mode |= RCV_SHUTDOWN;
2084 		unix_state_lock(other);
2085 		other->sk_shutdown |= peer_mode;
2086 		unix_state_unlock(other);
2087 		other->sk_state_change(other);
2088 		if (peer_mode == SHUTDOWN_MASK)
2089 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2090 		else if (peer_mode & RCV_SHUTDOWN)
2091 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2092 	}
2093 	if (other)
2094 		sock_put(other);
2095 
2096 	return 0;
2097 }
2098 
2099 long unix_inq_len(struct sock *sk)
2100 {
2101 	struct sk_buff *skb;
2102 	long amount = 0;
2103 
2104 	if (sk->sk_state == TCP_LISTEN)
2105 		return -EINVAL;
2106 
2107 	spin_lock(&sk->sk_receive_queue.lock);
2108 	if (sk->sk_type == SOCK_STREAM ||
2109 	    sk->sk_type == SOCK_SEQPACKET) {
2110 		skb_queue_walk(&sk->sk_receive_queue, skb)
2111 			amount += skb->len;
2112 	} else {
2113 		skb = skb_peek(&sk->sk_receive_queue);
2114 		if (skb)
2115 			amount = skb->len;
2116 	}
2117 	spin_unlock(&sk->sk_receive_queue.lock);
2118 
2119 	return amount;
2120 }
2121 EXPORT_SYMBOL_GPL(unix_inq_len);
2122 
/* Value of sk_wmem_alloc_get() for @sk, i.e. its allocated write memory. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2128 
2129 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2130 {
2131 	struct sock *sk = sock->sk;
2132 	long amount = 0;
2133 	int err;
2134 
2135 	switch (cmd) {
2136 	case SIOCOUTQ:
2137 		amount = unix_outq_len(sk);
2138 		err = put_user(amount, (int __user *)arg);
2139 		break;
2140 	case SIOCINQ:
2141 		amount = unix_inq_len(sk);
2142 		if (amount < 0)
2143 			err = amount;
2144 		else
2145 			err = put_user(amount, (int __user *)arg);
2146 		break;
2147 	default:
2148 		err = -ENOIOCTLCMD;
2149 		break;
2150 	}
2151 	return err;
2152 }
2153 
2154 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2155 {
2156 	struct sock *sk = sock->sk;
2157 	unsigned int mask;
2158 
2159 	sock_poll_wait(file, sk_sleep(sk), wait);
2160 	mask = 0;
2161 
2162 	/* exceptional events? */
2163 	if (sk->sk_err)
2164 		mask |= POLLERR;
2165 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2166 		mask |= POLLHUP;
2167 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2168 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2169 
2170 	/* readable? */
2171 	if (!skb_queue_empty(&sk->sk_receive_queue))
2172 		mask |= POLLIN | POLLRDNORM;
2173 
2174 	/* Connection-based need to check for termination and startup */
2175 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2176 	    sk->sk_state == TCP_CLOSE)
2177 		mask |= POLLHUP;
2178 
2179 	/*
2180 	 * we set writable also when the other side has shut down the
2181 	 * connection. This prevents stuck sockets.
2182 	 */
2183 	if (unix_writable(sk))
2184 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2185 
2186 	return mask;
2187 }
2188 
2189 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2190 				    poll_table *wait)
2191 {
2192 	struct sock *sk = sock->sk, *other;
2193 	unsigned int mask, writable;
2194 
2195 	sock_poll_wait(file, sk_sleep(sk), wait);
2196 	mask = 0;
2197 
2198 	/* exceptional events? */
2199 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2200 		mask |= POLLERR;
2201 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2202 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2203 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2204 		mask |= POLLHUP;
2205 
2206 	/* readable? */
2207 	if (!skb_queue_empty(&sk->sk_receive_queue))
2208 		mask |= POLLIN | POLLRDNORM;
2209 
2210 	/* Connection-based need to check for termination and startup */
2211 	if (sk->sk_type == SOCK_SEQPACKET) {
2212 		if (sk->sk_state == TCP_CLOSE)
2213 			mask |= POLLHUP;
2214 		/* connection hasn't started yet? */
2215 		if (sk->sk_state == TCP_SYN_SENT)
2216 			return mask;
2217 	}
2218 
2219 	/* No write status requested, avoid expensive OUT tests. */
2220 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2221 		return mask;
2222 
2223 	writable = unix_writable(sk);
2224 	other = unix_peer_get(sk);
2225 	if (other) {
2226 		if (unix_peer(other) != sk) {
2227 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2228 			if (unix_recvq_full(other))
2229 				writable = 0;
2230 		}
2231 		sock_put(other);
2232 	}
2233 
2234 	if (writable)
2235 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2236 	else
2237 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2238 
2239 	return mask;
2240 }
2241 
2242 #ifdef CONFIG_PROC_FS
2243 
/*
 * A seq_file position packs two values into one long: the bucket index
 * in the bits above BUCKET_SPACE and the offset within that bucket in
 * the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - 1 - (UNIX_HASH_BITS + 1))

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
2249 
2250 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2251 {
2252 	unsigned long offset = get_offset(*pos);
2253 	unsigned long bucket = get_bucket(*pos);
2254 	struct sock *sk;
2255 	unsigned long count = 0;
2256 
2257 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2258 		if (sock_net(sk) != seq_file_net(seq))
2259 			continue;
2260 		if (++count == offset)
2261 			break;
2262 	}
2263 
2264 	return sk;
2265 }
2266 
2267 static struct sock *unix_next_socket(struct seq_file *seq,
2268 				     struct sock *sk,
2269 				     loff_t *pos)
2270 {
2271 	unsigned long bucket;
2272 
2273 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2274 		sk = sk_next(sk);
2275 		if (!sk)
2276 			goto next_bucket;
2277 		if (sock_net(sk) == seq_file_net(seq))
2278 			return sk;
2279 	}
2280 
2281 	do {
2282 		sk = unix_from_bucket(seq, pos);
2283 		if (sk)
2284 			return sk;
2285 
2286 next_bucket:
2287 		bucket = get_bucket(*pos) + 1;
2288 		*pos = set_bucket_offset(bucket, 1);
2289 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2290 
2291 	return NULL;
2292 }
2293 
2294 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2295 	__acquires(unix_table_lock)
2296 {
2297 	spin_lock(&unix_table_lock);
2298 
2299 	if (!*pos)
2300 		return SEQ_START_TOKEN;
2301 
2302 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2303 		return NULL;
2304 
2305 	return unix_next_socket(seq, NULL, pos);
2306 }
2307 
2308 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2309 {
2310 	++*pos;
2311 	return unix_next_socket(seq, v, pos);
2312 }
2313 
/*
 * seq_file ->stop: release unix_table_lock, which unix_seq_start()
 * acquired.  Neither @seq nor @v is used.
 */
static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
2319 
2320 static int unix_seq_show(struct seq_file *seq, void *v)
2321 {
2322 
2323 	if (v == SEQ_START_TOKEN)
2324 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2325 			 "Inode Path\n");
2326 	else {
2327 		struct sock *s = v;
2328 		struct unix_sock *u = unix_sk(s);
2329 		unix_state_lock(s);
2330 
2331 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2332 			s,
2333 			atomic_read(&s->sk_refcnt),
2334 			0,
2335 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2336 			s->sk_type,
2337 			s->sk_socket ?
2338 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2339 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2340 			sock_i_ino(s));
2341 
2342 		if (u->addr) {
2343 			int i, len;
2344 			seq_putc(seq, ' ');
2345 
2346 			i = 0;
2347 			len = u->addr->len - sizeof(short);
2348 			if (!UNIX_ABSTRACT(s))
2349 				len--;
2350 			else {
2351 				seq_putc(seq, '@');
2352 				i++;
2353 			}
2354 			for ( ; i < len; i++)
2355 				seq_putc(seq, u->addr->name->sun_path[i]);
2356 		}
2357 		unix_state_unlock(s);
2358 		seq_putc(seq, '\n');
2359 	}
2360 
2361 	return 0;
2362 }
2363 
2364 static const struct seq_operations unix_seq_ops = {
2365 	.start  = unix_seq_start,
2366 	.next   = unix_seq_next,
2367 	.stop   = unix_seq_stop,
2368 	.show   = unix_seq_show,
2369 };
2370 
2371 static int unix_seq_open(struct inode *inode, struct file *file)
2372 {
2373 	return seq_open_net(inode, file, &unix_seq_ops,
2374 			    sizeof(struct seq_net_private));
2375 }
2376 
2377 static const struct file_operations unix_seq_fops = {
2378 	.owner		= THIS_MODULE,
2379 	.open		= unix_seq_open,
2380 	.read		= seq_read,
2381 	.llseek		= seq_lseek,
2382 	.release	= seq_release_net,
2383 };
2384 
2385 #endif
2386 
2387 static const struct net_proto_family unix_family_ops = {
2388 	.family = PF_UNIX,
2389 	.create = unix_create,
2390 	.owner	= THIS_MODULE,
2391 };
2392 
2393 
2394 static int __net_init unix_net_init(struct net *net)
2395 {
2396 	int error = -ENOMEM;
2397 
2398 	net->unx.sysctl_max_dgram_qlen = 10;
2399 	if (unix_sysctl_register(net))
2400 		goto out;
2401 
2402 #ifdef CONFIG_PROC_FS
2403 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2404 		unix_sysctl_unregister(net);
2405 		goto out;
2406 	}
2407 #endif
2408 	error = 0;
2409 out:
2410 	return error;
2411 }
2412 
/*
 * Per-namespace teardown: unregister the sysctl table and remove the
 * "unix" entry from the namespace's proc_net directory.
 */
static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
2418 
/* Per-network-namespace hooks: unix_net_init() on setup, unix_net_exit() on teardown. */
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
2423 
2424 static int __init af_unix_init(void)
2425 {
2426 	int rc = -1;
2427 
2428 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2429 
2430 	rc = proto_register(&unix_proto, 1);
2431 	if (rc != 0) {
2432 		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2433 		       __func__);
2434 		goto out;
2435 	}
2436 
2437 	sock_register(&unix_family_ops);
2438 	register_pernet_subsys(&unix_net_ops);
2439 out:
2440 	return rc;
2441 }
2442 
/*
 * Module teardown: unregister the PF_UNIX socket family, the unix
 * protocol and the per-namespace operations.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
2449 
/*
 * Run earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries to use
 * a UNIX socket, but later than subsys_initcall() because we depend
 * on things initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
2459