xref: /openbmc/linux/net/unix/af_unix.c (revision b34081f1)
1 /*
2  * NET4:	Implementation of BSD Unix domain sockets.
3  *
4  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *
6  *		This program is free software; you can redistribute it and/or
7  *		modify it under the terms of the GNU General Public License
8  *		as published by the Free Software Foundation; either version
9  *		2 of the License, or (at your option) any later version.
10  *
11  * Fixes:
12  *		Linus Torvalds	:	Assorted bug cures.
13  *		Niibe Yutaka	:	async I/O support.
14  *		Carsten Paeth	:	PF_UNIX check, address fixes.
15  *		Alan Cox	:	Limit size of allocated blocks.
16  *		Alan Cox	:	Fixed the stupid socketpair bug.
17  *		Alan Cox	:	BSD compatibility fine tuning.
18  *		Alan Cox	:	Fixed a bug in connect when interrupted.
19  *		Alan Cox	:	Sorted out a proper draft version of
20  *					file descriptor passing hacked up from
21  *					Mike Shaver's work.
22  *		Marty Leisner	:	Fixes to fd passing
23  *		Nick Nevin	:	recvmsg bugfix.
24  *		Alan Cox	:	Started proper garbage collector
25  *		Heiko Eißfeldt	:	Missing verify_area check
26  *		Alan Cox	:	Started POSIXisms
27  *		Andreas Schwab	:	Replace inode by dentry for proper
28  *					reference counting
29  *		Kirk Petersen	:	Made this a module
30  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31  *					Lots of bug fixes.
32  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33  *					by the above two patches.
34  *	     Andrea Arcangeli	:	If possible we block in connect(2)
35  *					if the max backlog of the listen socket
36  *					has been reached. This won't break
37  *					old apps and it avoids a huge number
38  *					of socks being hashed (for unix_gc()
39  *					performance reasons).
40  *					Security fix that limits the max
41  *					number of socks to 2*max_files and
42  *					the number of skb queueable in the
43  *					dgram receiver.
44  *		Artur Skawina   :	Hash function optimizations
45  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46  *	      Malcolm Beattie   :	Set peercred for socketpair
47  *	     Michal Ostrowski   :       Module initialization cleanup.
48  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49  *	     				the core infrastructure is doing that
50  *	     				for all net proto families now (2.5.69+)
51  *
52  *
53  * Known differences from reference BSD that was tested:
54  *
55  *	[TO FIX]
56  *	ECONNREFUSED is not returned from one end of a connected() socket to the
57  *		other the moment one end closes.
58  *	fstat() doesn't return st_dev=0, and gives the blksize as the high
59  *		water mark and a fake inode identifier (nor the BSD first socket fstat twice bug).
60  *	[NOT TO FIX]
61  *	accept() returns a path name even if the connecting socket has closed
62  *		in the meantime (BSD loses the path and gives up).
63  *	accept() returns 0 length path for an unbound connector. BSD returns 16
64  *		and a null first byte in the path (but not for getsockname/getpeername - BSD bug ??)
65  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66  *	BSD af_unix apparently has connect forgetting to block properly.
67  *		(need to check this with the POSIX spec in detail)
68  *
69  * Differences from 2.0.0-11-... (ANK)
70  *	Bug fixes and improvements.
71  *		- client shutdown killed server socket.
72  *		- removed all useless cli/sti pairs.
73  *
74  *	Semantic changes/extensions.
75  *		- generic control message passing.
76  *		- SCM_CREDENTIALS control message.
77  *		- "Abstract" (not FS based) socket bindings.
78  *		  Abstract names are sequences of bytes (not zero terminated)
79  *		  starting with a 0 byte, so that this name space does not
80  *		  intersect with BSD names.
81  */
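/*
 * Illustrative only (a userspace sketch, not kernel code; the names
 * "/tmp/demo.sock", "demo" and "fd" are assumptions): the two binding
 * flavours described above look like this from the outside:
 *
 *	struct sockaddr_un a;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	// FS binding: sun_path is a NUL-terminated pathname.
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	strcpy(a.sun_path, "/tmp/demo.sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// Abstract binding: sun_path[0] == 0 and the name is the raw
 *	// byte sequence that follows; the exact length is what matters.
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	memcpy(a.sun_path + 1, "demo", 4);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 4);
 */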
82 
83 #include <linux/module.h>
84 #include <linux/kernel.h>
85 #include <linux/signal.h>
86 #include <linux/sched.h>
87 #include <linux/errno.h>
88 #include <linux/string.h>
89 #include <linux/stat.h>
90 #include <linux/dcache.h>
91 #include <linux/namei.h>
92 #include <linux/socket.h>
93 #include <linux/un.h>
94 #include <linux/fcntl.h>
95 #include <linux/termios.h>
96 #include <linux/sockios.h>
97 #include <linux/net.h>
98 #include <linux/in.h>
99 #include <linux/fs.h>
100 #include <linux/slab.h>
101 #include <asm/uaccess.h>
102 #include <linux/skbuff.h>
103 #include <linux/netdevice.h>
104 #include <net/net_namespace.h>
105 #include <net/sock.h>
106 #include <net/tcp_states.h>
107 #include <net/af_unix.h>
108 #include <linux/proc_fs.h>
109 #include <linux/seq_file.h>
110 #include <net/scm.h>
111 #include <linux/init.h>
112 #include <linux/poll.h>
113 #include <linux/rtnetlink.h>
114 #include <linux/mount.h>
115 #include <net/checksum.h>
116 #include <linux/security.h>
117 #include <linux/freezer.h>
118 
119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120 EXPORT_SYMBOL_GPL(unix_socket_table);
121 DEFINE_SPINLOCK(unix_table_lock);
122 EXPORT_SYMBOL_GPL(unix_table_lock);
123 static atomic_long_t unix_nr_socks;
124 
125 
126 static struct hlist_head *unix_sockets_unbound(void *addr)
127 {
128 	unsigned long hash = (unsigned long)addr;
129 
130 	hash ^= hash >> 16;
131 	hash ^= hash >> 8;
132 	hash %= UNIX_HASH_SIZE;
133 	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134 }
135 
136 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137 
138 #ifdef CONFIG_SECURITY_NETWORK
139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140 {
141 	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
142 }
143 
144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145 {
146 	scm->secid = *UNIXSID(skb);
147 }
148 #else
149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
150 { }
151 
152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
153 { }
154 #endif /* CONFIG_SECURITY_NETWORK */
155 
156 /*
157  *  SMP locking strategy:
158  *    the hash table is protected by the unix_table_lock spinlock;
159  *    each socket's state is protected by its own spinlock.
160  */
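/*
 * (When two sockets must be locked at once, the one with the lower
 *  address is locked first; see unix_state_double_lock() below.)
 */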
161 
162 static inline unsigned int unix_hash_fold(__wsum n)
163 {
164 	unsigned int hash = (__force unsigned int)n;
165 
166 	hash ^= hash>>16;
167 	hash ^= hash>>8;
168 	return hash&(UNIX_HASH_SIZE-1);
169 }
170 
171 #define unix_peer(sk) (unix_sk(sk)->peer)
172 
173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
174 {
175 	return unix_peer(osk) == sk;
176 }
177 
178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
179 {
180 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
181 }
182 
183 static inline int unix_recvq_full(struct sock const *sk)
184 {
185 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
186 }
187 
188 struct sock *unix_peer_get(struct sock *s)
189 {
190 	struct sock *peer;
191 
192 	unix_state_lock(s);
193 	peer = unix_peer(s);
194 	if (peer)
195 		sock_hold(peer);
196 	unix_state_unlock(s);
197 	return peer;
198 }
199 EXPORT_SYMBOL_GPL(unix_peer_get);
200 
201 static inline void unix_release_addr(struct unix_address *addr)
202 {
203 	if (atomic_dec_and_test(&addr->refcnt))
204 		kfree(addr);
205 }
206 
207 /*
208  *	Check unix socket name:
209  *		- it should not be zero length.
210  *		- if it does not start with a zero byte, it should be NUL terminated (an FS object)
211  *		- if it starts with a zero byte, it is an abstract name.
212  */
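/*
 * A sketch of what callers pass in (illustrative only; lengths assume
 * struct sockaddr_un from <linux/un.h>):
 *
 *	addr_len == sizeof(short)   - no name at all; unix_bind()
 *	                              autobinds instead of calling this.
 *	sun_path = "/tmp/x", NUL    - FS name; the return value is
 *	                              strlen("/tmp/x") + 1 + sizeof(short).
 *	sun_path = "\0foo", len 6   - abstract name; len is returned
 *	                              unchanged and *hashp is set from a
 *	                              checksum of the whole address.
 */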
213 
214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
215 {
216 	if (len <= sizeof(short) || len > sizeof(*sunaddr))
217 		return -EINVAL;
218 	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
219 		return -EINVAL;
220 	if (sunaddr->sun_path[0]) {
221 		/*
222 		 * This may look like an off by one error but it is a bit more
223 		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
224 		 * sun_path[108] doesn't exist as such.  However, in kernel space
225 		 * we are guaranteed that it is a valid memory location in our
226 		 * kernel address buffer.
227 		 */
228 		((char *)sunaddr)[len] = 0;
229 		len = strlen(sunaddr->sun_path)+1+sizeof(short);
230 		return len;
231 	}
232 
233 	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
234 	return len;
235 }
236 
237 static void __unix_remove_socket(struct sock *sk)
238 {
239 	sk_del_node_init(sk);
240 }
241 
242 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
243 {
244 	WARN_ON(!sk_unhashed(sk));
245 	sk_add_node(sk, list);
246 }
247 
248 static inline void unix_remove_socket(struct sock *sk)
249 {
250 	spin_lock(&unix_table_lock);
251 	__unix_remove_socket(sk);
252 	spin_unlock(&unix_table_lock);
253 }
254 
255 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
256 {
257 	spin_lock(&unix_table_lock);
258 	__unix_insert_socket(list, sk);
259 	spin_unlock(&unix_table_lock);
260 }
261 
262 static struct sock *__unix_find_socket_byname(struct net *net,
263 					      struct sockaddr_un *sunname,
264 					      int len, int type, unsigned int hash)
265 {
266 	struct sock *s;
267 
268 	sk_for_each(s, &unix_socket_table[hash ^ type]) {
269 		struct unix_sock *u = unix_sk(s);
270 
271 		if (!net_eq(sock_net(s), net))
272 			continue;
273 
274 		if (u->addr->len == len &&
275 		    !memcmp(u->addr->name, sunname, len))
276 			goto found;
277 	}
278 	s = NULL;
279 found:
280 	return s;
281 }
282 
283 static inline struct sock *unix_find_socket_byname(struct net *net,
284 						   struct sockaddr_un *sunname,
285 						   int len, int type,
286 						   unsigned int hash)
287 {
288 	struct sock *s;
289 
290 	spin_lock(&unix_table_lock);
291 	s = __unix_find_socket_byname(net, sunname, len, type, hash);
292 	if (s)
293 		sock_hold(s);
294 	spin_unlock(&unix_table_lock);
295 	return s;
296 }
297 
298 static struct sock *unix_find_socket_byinode(struct inode *i)
299 {
300 	struct sock *s;
301 
302 	spin_lock(&unix_table_lock);
303 	sk_for_each(s,
304 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
305 		struct dentry *dentry = unix_sk(s)->path.dentry;
306 
307 		if (dentry && dentry->d_inode == i) {
308 			sock_hold(s);
309 			goto found;
310 		}
311 	}
312 	s = NULL;
313 found:
314 	spin_unlock(&unix_table_lock);
315 	return s;
316 }
317 
318 static inline int unix_writable(struct sock *sk)
319 {
320 	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
321 }
322 
323 static void unix_write_space(struct sock *sk)
324 {
325 	struct socket_wq *wq;
326 
327 	rcu_read_lock();
328 	if (unix_writable(sk)) {
329 		wq = rcu_dereference(sk->sk_wq);
330 		if (wq_has_sleeper(wq))
331 			wake_up_interruptible_sync_poll(&wq->wait,
332 				POLLOUT | POLLWRNORM | POLLWRBAND);
333 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
334 	}
335 	rcu_read_unlock();
336 }
337 
338 /* When a dgram socket disconnects (or changes its peer), we clear its receive
339  * queue of packets that arrived from the previous peer. First, this allows
340  * flow control based only on wmem_alloc; second, a sk connected to a peer
341  * may receive messages only from that peer. */
342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
343 {
344 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
345 		skb_queue_purge(&sk->sk_receive_queue);
346 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
347 
348 		/* If one link of a bidirectional dgram pipe is disconnected,
349 		 * we signal an error. Messages are lost. Do not do this
350 		 * when the peer was not connected to us.
351 		 */
352 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
353 			other->sk_err = ECONNRESET;
354 			other->sk_error_report(other);
355 		}
356 	}
357 }
358 
359 static void unix_sock_destructor(struct sock *sk)
360 {
361 	struct unix_sock *u = unix_sk(sk);
362 
363 	skb_queue_purge(&sk->sk_receive_queue);
364 
365 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
366 	WARN_ON(!sk_unhashed(sk));
367 	WARN_ON(sk->sk_socket);
368 	if (!sock_flag(sk, SOCK_DEAD)) {
369 		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
370 		return;
371 	}
372 
373 	if (u->addr)
374 		unix_release_addr(u->addr);
375 
376 	atomic_long_dec(&unix_nr_socks);
377 	local_bh_disable();
378 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
379 	local_bh_enable();
380 #ifdef UNIX_REFCNT_DEBUG
381 	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
382 		atomic_long_read(&unix_nr_socks));
383 #endif
384 }
385 
386 static void unix_release_sock(struct sock *sk, int embrion)
387 {
388 	struct unix_sock *u = unix_sk(sk);
389 	struct path path;
390 	struct sock *skpair;
391 	struct sk_buff *skb;
392 	int state;
393 
394 	unix_remove_socket(sk);
395 
396 	/* Clear state */
397 	unix_state_lock(sk);
398 	sock_orphan(sk);
399 	sk->sk_shutdown = SHUTDOWN_MASK;
400 	path	     = u->path;
401 	u->path.dentry = NULL;
402 	u->path.mnt = NULL;
403 	state = sk->sk_state;
404 	sk->sk_state = TCP_CLOSE;
405 	unix_state_unlock(sk);
406 
407 	wake_up_interruptible_all(&u->peer_wait);
408 
409 	skpair = unix_peer(sk);
410 
411 	if (skpair != NULL) {
412 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
413 			unix_state_lock(skpair);
414 			/* No more writes */
415 			skpair->sk_shutdown = SHUTDOWN_MASK;
416 			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
417 				skpair->sk_err = ECONNRESET;
418 			unix_state_unlock(skpair);
419 			skpair->sk_state_change(skpair);
420 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
421 		}
422 		sock_put(skpair); /* It may now die */
423 		unix_peer(sk) = NULL;
424 	}
425 
426 	/* Try to flush out this socket. Throw out buffers at least */
427 
428 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
429 		if (state == TCP_LISTEN)
430 			unix_release_sock(skb->sk, 1);
431 		/* passed fds are erased in the kfree_skb hook	      */
432 		kfree_skb(skb);
433 	}
434 
435 	if (path.dentry)
436 		path_put(&path);
437 
438 	sock_put(sk);
439 
440 	/* ---- Socket is dead now and most probably destroyed ---- */
441 
442 	/*
443 	 * Fixme: BSD difference: In BSD all sockets connected to us get
444 	 *	  ECONNRESET and we die on the spot. In Linux we behave
445 	 *	  like files and pipes do and wait for the last
446 	 *	  dereference.
447 	 *
448 	 * Can't we simply set sock->err?
449 	 *
450 	 *	  What is the above comment talking about? --ANK(980817)
451 	 */
452 
453 	if (unix_tot_inflight)
454 		unix_gc();		/* Garbage collect fds */
455 }
456 
457 static void init_peercred(struct sock *sk)
458 {
459 	put_pid(sk->sk_peer_pid);
460 	if (sk->sk_peer_cred)
461 		put_cred(sk->sk_peer_cred);
462 	sk->sk_peer_pid  = get_pid(task_tgid(current));
463 	sk->sk_peer_cred = get_current_cred();
464 }
465 
466 static void copy_peercred(struct sock *sk, struct sock *peersk)
467 {
468 	put_pid(sk->sk_peer_pid);
469 	if (sk->sk_peer_cred)
470 		put_cred(sk->sk_peer_cred);
471 	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
472 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
473 }
474 
475 static int unix_listen(struct socket *sock, int backlog)
476 {
477 	int err;
478 	struct sock *sk = sock->sk;
479 	struct unix_sock *u = unix_sk(sk);
480 	struct pid *old_pid = NULL;
481 
482 	err = -EOPNOTSUPP;
483 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
484 		goto out;	/* Only stream/seqpacket sockets accept */
485 	err = -EINVAL;
486 	if (!u->addr)
487 		goto out;	/* No listens on an unbound socket */
488 	unix_state_lock(sk);
489 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
490 		goto out_unlock;
491 	if (backlog > sk->sk_max_ack_backlog)
492 		wake_up_interruptible_all(&u->peer_wait);
493 	sk->sk_max_ack_backlog	= backlog;
494 	sk->sk_state		= TCP_LISTEN;
495 	/* set credentials so connect can copy them */
496 	init_peercred(sk);
497 	err = 0;
498 
499 out_unlock:
500 	unix_state_unlock(sk);
501 	put_pid(old_pid);
502 out:
503 	return err;
504 }
505 
506 static int unix_release(struct socket *);
507 static int unix_bind(struct socket *, struct sockaddr *, int);
508 static int unix_stream_connect(struct socket *, struct sockaddr *,
509 			       int addr_len, int flags);
510 static int unix_socketpair(struct socket *, struct socket *);
511 static int unix_accept(struct socket *, struct socket *, int);
512 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
513 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
514 static unsigned int unix_dgram_poll(struct file *, struct socket *,
515 				    poll_table *);
516 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
517 static int unix_shutdown(struct socket *, int);
518 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
519 			       struct msghdr *, size_t);
520 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
521 			       struct msghdr *, size_t, int);
522 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
523 			      struct msghdr *, size_t);
524 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
525 			      struct msghdr *, size_t, int);
526 static int unix_dgram_connect(struct socket *, struct sockaddr *,
527 			      int, int);
528 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
529 				  struct msghdr *, size_t);
530 static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
531 				  struct msghdr *, size_t, int);
532 
533 static void unix_set_peek_off(struct sock *sk, int val)
534 {
535 	struct unix_sock *u = unix_sk(sk);
536 
537 	mutex_lock(&u->readlock);
538 	sk->sk_peek_off = val;
539 	mutex_unlock(&u->readlock);
540 }
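/*
 * Illustrative only (a userspace sketch, not kernel code; "fd", "buf"
 * and the byte offsets are assumptions): with SO_PEEK_OFF armed,
 * successive MSG_PEEK reads walk forward through the queued data
 * instead of re-reading it, and a consuming read rewinds the offset:
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 4, MSG_PEEK);	// peeks bytes 0..3
 *	recv(fd, buf, 4, MSG_PEEK);	// peeks bytes 4..7
 *	recv(fd, buf, 8, 0);		// consumes bytes 0..7
 */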
541 
542 
543 static const struct proto_ops unix_stream_ops = {
544 	.family =	PF_UNIX,
545 	.owner =	THIS_MODULE,
546 	.release =	unix_release,
547 	.bind =		unix_bind,
548 	.connect =	unix_stream_connect,
549 	.socketpair =	unix_socketpair,
550 	.accept =	unix_accept,
551 	.getname =	unix_getname,
552 	.poll =		unix_poll,
553 	.ioctl =	unix_ioctl,
554 	.listen =	unix_listen,
555 	.shutdown =	unix_shutdown,
556 	.setsockopt =	sock_no_setsockopt,
557 	.getsockopt =	sock_no_getsockopt,
558 	.sendmsg =	unix_stream_sendmsg,
559 	.recvmsg =	unix_stream_recvmsg,
560 	.mmap =		sock_no_mmap,
561 	.sendpage =	sock_no_sendpage,
562 	.set_peek_off =	unix_set_peek_off,
563 };
564 
565 static const struct proto_ops unix_dgram_ops = {
566 	.family =	PF_UNIX,
567 	.owner =	THIS_MODULE,
568 	.release =	unix_release,
569 	.bind =		unix_bind,
570 	.connect =	unix_dgram_connect,
571 	.socketpair =	unix_socketpair,
572 	.accept =	sock_no_accept,
573 	.getname =	unix_getname,
574 	.poll =		unix_dgram_poll,
575 	.ioctl =	unix_ioctl,
576 	.listen =	sock_no_listen,
577 	.shutdown =	unix_shutdown,
578 	.setsockopt =	sock_no_setsockopt,
579 	.getsockopt =	sock_no_getsockopt,
580 	.sendmsg =	unix_dgram_sendmsg,
581 	.recvmsg =	unix_dgram_recvmsg,
582 	.mmap =		sock_no_mmap,
583 	.sendpage =	sock_no_sendpage,
584 	.set_peek_off =	unix_set_peek_off,
585 };
586 
587 static const struct proto_ops unix_seqpacket_ops = {
588 	.family =	PF_UNIX,
589 	.owner =	THIS_MODULE,
590 	.release =	unix_release,
591 	.bind =		unix_bind,
592 	.connect =	unix_stream_connect,
593 	.socketpair =	unix_socketpair,
594 	.accept =	unix_accept,
595 	.getname =	unix_getname,
596 	.poll =		unix_dgram_poll,
597 	.ioctl =	unix_ioctl,
598 	.listen =	unix_listen,
599 	.shutdown =	unix_shutdown,
600 	.setsockopt =	sock_no_setsockopt,
601 	.getsockopt =	sock_no_getsockopt,
602 	.sendmsg =	unix_seqpacket_sendmsg,
603 	.recvmsg =	unix_seqpacket_recvmsg,
604 	.mmap =		sock_no_mmap,
605 	.sendpage =	sock_no_sendpage,
606 	.set_peek_off =	unix_set_peek_off,
607 };
608 
609 static struct proto unix_proto = {
610 	.name			= "UNIX",
611 	.owner			= THIS_MODULE,
612 	.obj_size		= sizeof(struct unix_sock),
613 };
614 
615 /*
616  * AF_UNIX sockets do not interact with hardware, hence they
617  * dont trigger interrupts - so it's safe for them to have
618  * bh-unsafe locking for their sk_receive_queue.lock. Split off
619  * this special lock-class by reinitializing the spinlock key:
620  */
621 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622 
623 static struct sock *unix_create1(struct net *net, struct socket *sock)
624 {
625 	struct sock *sk = NULL;
626 	struct unix_sock *u;
627 
628 	atomic_long_inc(&unix_nr_socks);
629 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630 		goto out;
631 
632 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633 	if (!sk)
634 		goto out;
635 
636 	sock_init_data(sock, sk);
637 	lockdep_set_class(&sk->sk_receive_queue.lock,
638 				&af_unix_sk_receive_queue_lock_key);
639 
640 	sk->sk_write_space	= unix_write_space;
641 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642 	sk->sk_destruct		= unix_sock_destructor;
643 	u	  = unix_sk(sk);
644 	u->path.dentry = NULL;
645 	u->path.mnt = NULL;
646 	spin_lock_init(&u->lock);
647 	atomic_long_set(&u->inflight, 0);
648 	INIT_LIST_HEAD(&u->link);
649 	mutex_init(&u->readlock); /* single task reading lock */
650 	init_waitqueue_head(&u->peer_wait);
651 	unix_insert_socket(unix_sockets_unbound(sk), sk);
652 out:
653 	if (sk == NULL)
654 		atomic_long_dec(&unix_nr_socks);
655 	else {
656 		local_bh_disable();
657 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658 		local_bh_enable();
659 	}
660 	return sk;
661 }
662 
663 static int unix_create(struct net *net, struct socket *sock, int protocol,
664 		       int kern)
665 {
666 	if (protocol && protocol != PF_UNIX)
667 		return -EPROTONOSUPPORT;
668 
669 	sock->state = SS_UNCONNECTED;
670 
671 	switch (sock->type) {
672 	case SOCK_STREAM:
673 		sock->ops = &unix_stream_ops;
674 		break;
675 		/*
676 		 *	Believe it or not, BSD has AF_UNIX SOCK_RAW,
677 		 *	though nothing uses it.
678 		 */
679 	case SOCK_RAW:
680 		sock->type = SOCK_DGRAM;	/* fall through */
681 	case SOCK_DGRAM:
682 		sock->ops = &unix_dgram_ops;
683 		break;
684 	case SOCK_SEQPACKET:
685 		sock->ops = &unix_seqpacket_ops;
686 		break;
687 	default:
688 		return -ESOCKTNOSUPPORT;
689 	}
690 
691 	return unix_create1(net, sock) ? 0 : -ENOMEM;
692 }
693 
694 static int unix_release(struct socket *sock)
695 {
696 	struct sock *sk = sock->sk;
697 
698 	if (!sk)
699 		return 0;
700 
701 	unix_release_sock(sk, 0);
702 	sock->sk = NULL;
703 
704 	return 0;
705 }
706 
707 static int unix_autobind(struct socket *sock)
708 {
709 	struct sock *sk = sock->sk;
710 	struct net *net = sock_net(sk);
711 	struct unix_sock *u = unix_sk(sk);
712 	static u32 ordernum = 1;
713 	struct unix_address *addr;
714 	int err;
715 	unsigned int retries = 0;
716 
717 	mutex_lock(&u->readlock);
718 
719 	err = 0;
720 	if (u->addr)
721 		goto out;
722 
723 	err = -ENOMEM;
724 	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
725 	if (!addr)
726 		goto out;
727 
728 	addr->name->sun_family = AF_UNIX;
729 	atomic_set(&addr->refcnt, 1);
730 
731 retry:
732 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
733 	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
734 
735 	spin_lock(&unix_table_lock);
736 	ordernum = (ordernum+1)&0xFFFFF;
737 
738 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
739 				      addr->hash)) {
740 		spin_unlock(&unix_table_lock);
741 		/*
742 		 * __unix_find_socket_byname() may take a long time if many names
743 		 * are already in use.
744 		 */
745 		cond_resched();
746 		/* Give up if all names seem to be in use. */
747 		if (retries++ == 0xFFFFF) {
748 			err = -ENOSPC;
749 			kfree(addr);
750 			goto out;
751 		}
752 		goto retry;
753 	}
754 	addr->hash ^= sk->sk_type;
755 
756 	__unix_remove_socket(sk);
757 	u->addr = addr;
758 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
759 	spin_unlock(&unix_table_lock);
760 	err = 0;
761 
762 out:	mutex_unlock(&u->readlock);
763 	return err;
764 }
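/*
 * Illustrative only (userspace sketch; "fd" and the exact digits are
 * assumptions): an autobound socket ends up with an abstract name of
 * the form "\0" + five hex digits, visible through getsockname():
 *
 *	struct sockaddr_un a;
 *	socklen_t alen = sizeof(a);
 *	getsockname(fd, (struct sockaddr *)&a, &alen);
 *	// alen == sizeof(short) + 1 + 5,
 *	// a.sun_path[0] == '\0', a.sun_path[1..5] e.g. "00a31"
 */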
765 
766 static struct sock *unix_find_other(struct net *net,
767 				    struct sockaddr_un *sunname, int len,
768 				    int type, unsigned int hash, int *error)
769 {
770 	struct sock *u;
771 	struct path path;
772 	int err = 0;
773 
774 	if (sunname->sun_path[0]) {
775 		struct inode *inode;
776 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
777 		if (err)
778 			goto fail;
779 		inode = path.dentry->d_inode;
780 		err = inode_permission(inode, MAY_WRITE);
781 		if (err)
782 			goto put_fail;
783 
784 		err = -ECONNREFUSED;
785 		if (!S_ISSOCK(inode->i_mode))
786 			goto put_fail;
787 		u = unix_find_socket_byinode(inode);
788 		if (!u)
789 			goto put_fail;
790 
791 		if (u->sk_type == type)
792 			touch_atime(&path);
793 
794 		path_put(&path);
795 
796 		err = -EPROTOTYPE;
797 		if (u->sk_type != type) {
798 			sock_put(u);
799 			goto fail;
800 		}
801 	} else {
802 		err = -ECONNREFUSED;
803 		u = unix_find_socket_byname(net, sunname, len, type, hash);
804 		if (u) {
805 			struct dentry *dentry;
806 			dentry = unix_sk(u)->path.dentry;
807 			if (dentry)
808 				touch_atime(&unix_sk(u)->path);
809 		} else
810 			goto fail;
811 	}
812 	return u;
813 
814 put_fail:
815 	path_put(&path);
816 fail:
817 	*error = err;
818 	return NULL;
819 }
820 
821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
822 {
823 	struct dentry *dentry;
824 	struct path path;
825 	int err = 0;
826 	/*
827 	 * Get the parent directory, calculate the hash for last
828 	 * component.
829 	 */
830 	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
831 	err = PTR_ERR(dentry);
832 	if (IS_ERR(dentry))
833 		return err;
834 
835 	/*
836 	 * All right, let's create it.
837 	 */
838 	err = security_path_mknod(&path, dentry, mode, 0);
839 	if (!err) {
840 		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
841 		if (!err) {
842 			res->mnt = mntget(path.mnt);
843 			res->dentry = dget(dentry);
844 		}
845 	}
846 	done_path_create(&path, dentry);
847 	return err;
848 }
849 
850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
851 {
852 	struct sock *sk = sock->sk;
853 	struct net *net = sock_net(sk);
854 	struct unix_sock *u = unix_sk(sk);
855 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
856 	char *sun_path = sunaddr->sun_path;
857 	int err;
858 	unsigned int hash;
859 	struct unix_address *addr;
860 	struct hlist_head *list;
861 
862 	err = -EINVAL;
863 	if (sunaddr->sun_family != AF_UNIX)
864 		goto out;
865 
866 	if (addr_len == sizeof(short)) {
867 		err = unix_autobind(sock);
868 		goto out;
869 	}
870 
871 	err = unix_mkname(sunaddr, addr_len, &hash);
872 	if (err < 0)
873 		goto out;
874 	addr_len = err;
875 
876 	mutex_lock(&u->readlock);
877 
878 	err = -EINVAL;
879 	if (u->addr)
880 		goto out_up;
881 
882 	err = -ENOMEM;
883 	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
884 	if (!addr)
885 		goto out_up;
886 
887 	memcpy(addr->name, sunaddr, addr_len);
888 	addr->len = addr_len;
889 	addr->hash = hash ^ sk->sk_type;
890 	atomic_set(&addr->refcnt, 1);
891 
892 	if (sun_path[0]) {
893 		struct path path;
894 		umode_t mode = S_IFSOCK |
895 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
896 		err = unix_mknod(sun_path, mode, &path);
897 		if (err) {
898 			if (err == -EEXIST)
899 				err = -EADDRINUSE;
900 			unix_release_addr(addr);
901 			goto out_up;
902 		}
903 		addr->hash = UNIX_HASH_SIZE;
904 		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
905 		spin_lock(&unix_table_lock);
906 		u->path = path;
907 		list = &unix_socket_table[hash];
908 	} else {
909 		spin_lock(&unix_table_lock);
910 		err = -EADDRINUSE;
911 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
912 					      sk->sk_type, hash)) {
913 			unix_release_addr(addr);
914 			goto out_unlock;
915 		}
916 
917 		list = &unix_socket_table[addr->hash];
918 	}
919 
920 	err = 0;
921 	__unix_remove_socket(sk);
922 	u->addr = addr;
923 	__unix_insert_socket(list, sk);
924 
925 out_unlock:
926 	spin_unlock(&unix_table_lock);
927 out_up:
928 	mutex_unlock(&u->readlock);
929 out:
930 	return err;
931 }
932 
933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
934 {
935 	if (unlikely(sk1 == sk2) || !sk2) {
936 		unix_state_lock(sk1);
937 		return;
938 	}
939 	if (sk1 < sk2) {
940 		unix_state_lock(sk1);
941 		unix_state_lock_nested(sk2);
942 	} else {
943 		unix_state_lock(sk2);
944 		unix_state_lock_nested(sk1);
945 	}
946 }
947 
948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
949 {
950 	if (unlikely(sk1 == sk2) || !sk2) {
951 		unix_state_unlock(sk1);
952 		return;
953 	}
954 	unix_state_unlock(sk1);
955 	unix_state_unlock(sk2);
956 }
957 
958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
959 			      int alen, int flags)
960 {
961 	struct sock *sk = sock->sk;
962 	struct net *net = sock_net(sk);
963 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
964 	struct sock *other;
965 	unsigned int hash;
966 	int err;
967 
968 	if (addr->sa_family != AF_UNSPEC) {
969 		err = unix_mkname(sunaddr, alen, &hash);
970 		if (err < 0)
971 			goto out;
972 		alen = err;
973 
974 		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
975 		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
976 			goto out;
977 
978 restart:
979 		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
980 		if (!other)
981 			goto out;
982 
983 		unix_state_double_lock(sk, other);
984 
985 		/* Apparently VFS overslept socket death. Retry. */
986 		if (sock_flag(other, SOCK_DEAD)) {
987 			unix_state_double_unlock(sk, other);
988 			sock_put(other);
989 			goto restart;
990 		}
991 
992 		err = -EPERM;
993 		if (!unix_may_send(sk, other))
994 			goto out_unlock;
995 
996 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
997 		if (err)
998 			goto out_unlock;
999 
1000 	} else {
1001 		/*
1002 		 *	1003.1g breaking connected state with AF_UNSPEC
1003 		 */
1004 		other = NULL;
1005 		unix_state_double_lock(sk, other);
1006 	}
1007 
1008 	/*
1009 	 * If it was connected, reconnect.
1010 	 */
1011 	if (unix_peer(sk)) {
1012 		struct sock *old_peer = unix_peer(sk);
1013 		unix_peer(sk) = other;
1014 		unix_state_double_unlock(sk, other);
1015 
1016 		if (other != old_peer)
1017 			unix_dgram_disconnected(sk, old_peer);
1018 		sock_put(old_peer);
1019 	} else {
1020 		unix_peer(sk) = other;
1021 		unix_state_double_unlock(sk, other);
1022 	}
1023 	return 0;
1024 
1025 out_unlock:
1026 	unix_state_double_unlock(sk, other);
1027 	sock_put(other);
1028 out:
1029 	return err;
1030 }
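/*
 * Illustrative only (userspace sketch; "fd" is an assumption): the
 * 1003.1g AF_UNSPEC case above means a datagram socket can drop its
 * association like this:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(fd, &sa, sizeof(sa));	// disconnects; sendto() with an
 *					// explicit address works again
 */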
1031 
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034 	struct unix_sock *u = unix_sk(other);
1035 	int sched;
1036 	DEFINE_WAIT(wait);
1037 
1038 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039 
1040 	sched = !sock_flag(other, SOCK_DEAD) &&
1041 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1042 		unix_recvq_full(other);
1043 
1044 	unix_state_unlock(other);
1045 
1046 	if (sched)
1047 		timeo = schedule_timeout(timeo);
1048 
1049 	finish_wait(&u->peer_wait, &wait);
1050 	return timeo;
1051 }
1052 
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054 			       int addr_len, int flags)
1055 {
1056 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057 	struct sock *sk = sock->sk;
1058 	struct net *net = sock_net(sk);
1059 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060 	struct sock *newsk = NULL;
1061 	struct sock *other = NULL;
1062 	struct sk_buff *skb = NULL;
1063 	unsigned int hash;
1064 	int st;
1065 	int err;
1066 	long timeo;
1067 
1068 	err = unix_mkname(sunaddr, addr_len, &hash);
1069 	if (err < 0)
1070 		goto out;
1071 	addr_len = err;
1072 
1073 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074 	    (err = unix_autobind(sock)) != 0)
1075 		goto out;
1076 
1077 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078 
1079 	/* First of all allocate resources.
1080 	   If we were to do it after the state is locked,
1081 	   we would have to recheck everything again anyway.
1082 	 */
1083 
1084 	err = -ENOMEM;
1085 
1086 	/* create new sock for complete connection */
1087 	newsk = unix_create1(sock_net(sk), NULL);
1088 	if (newsk == NULL)
1089 		goto out;
1090 
1091 	/* Allocate skb for sending to listening sock */
1092 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093 	if (skb == NULL)
1094 		goto out;
1095 
1096 restart:
1097 	/*  Find listening sock. */
1098 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099 	if (!other)
1100 		goto out;
1101 
1102 	/* Latch state of peer */
1103 	unix_state_lock(other);
1104 
1105 	/* Apparently VFS overslept socket death. Retry. */
1106 	if (sock_flag(other, SOCK_DEAD)) {
1107 		unix_state_unlock(other);
1108 		sock_put(other);
1109 		goto restart;
1110 	}
1111 
1112 	err = -ECONNREFUSED;
1113 	if (other->sk_state != TCP_LISTEN)
1114 		goto out_unlock;
1115 	if (other->sk_shutdown & RCV_SHUTDOWN)
1116 		goto out_unlock;
1117 
1118 	if (unix_recvq_full(other)) {
1119 		err = -EAGAIN;
1120 		if (!timeo)
1121 			goto out_unlock;
1122 
1123 		timeo = unix_wait_for_peer(other, timeo);
1124 
1125 		err = sock_intr_errno(timeo);
1126 		if (signal_pending(current))
1127 			goto out;
1128 		sock_put(other);
1129 		goto restart;
1130 	}
1131 
1132 	/* Latch our state.
1133 
1134 	   This is a tricky place. We need to grab our state lock and cannot
1135 	   drop the lock on the peer. It is dangerous because deadlock is
1136 	   possible. The connect-to-self case and simultaneous
1137 	   connect attempts are eliminated by checking the socket
1138 	   state: other is TCP_LISTEN, and if sk is TCP_LISTEN we
1139 	   check this before attempting to grab the lock.
1140 
1141 	   Well, and we have to recheck the state after the socket is locked.
1142 	 */
1143 	st = sk->sk_state;
1144 
1145 	switch (st) {
1146 	case TCP_CLOSE:
1147 		/* This is ok... continue with connect */
1148 		break;
1149 	case TCP_ESTABLISHED:
1150 		/* Socket is already connected */
1151 		err = -EISCONN;
1152 		goto out_unlock;
1153 	default:
1154 		err = -EINVAL;
1155 		goto out_unlock;
1156 	}
1157 
1158 	unix_state_lock_nested(sk);
1159 
1160 	if (sk->sk_state != st) {
1161 		unix_state_unlock(sk);
1162 		unix_state_unlock(other);
1163 		sock_put(other);
1164 		goto restart;
1165 	}
1166 
1167 	err = security_unix_stream_connect(sk, other, newsk);
1168 	if (err) {
1169 		unix_state_unlock(sk);
1170 		goto out_unlock;
1171 	}
1172 
1173 	/* The way is open! Quickly set all the necessary fields... */
1174 
1175 	sock_hold(sk);
1176 	unix_peer(newsk)	= sk;
1177 	newsk->sk_state		= TCP_ESTABLISHED;
1178 	newsk->sk_type		= sk->sk_type;
1179 	init_peercred(newsk);
1180 	newu = unix_sk(newsk);
1181 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182 	otheru = unix_sk(other);
1183 
1184 	/* copy address information from listening to new sock */
1185 	if (otheru->addr) {
1186 		atomic_inc(&otheru->addr->refcnt);
1187 		newu->addr = otheru->addr;
1188 	}
1189 	if (otheru->path.dentry) {
1190 		path_get(&otheru->path);
1191 		newu->path = otheru->path;
1192 	}
1193 
1194 	/* Set credentials */
1195 	copy_peercred(sk, other);
1196 
1197 	sock->state	= SS_CONNECTED;
1198 	sk->sk_state	= TCP_ESTABLISHED;
1199 	sock_hold(newsk);
1200 
1201 	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1202 	unix_peer(sk)	= newsk;
1203 
1204 	unix_state_unlock(sk);
1205 
1206 	/* take ten and send info to listening sock */
1207 	spin_lock(&other->sk_receive_queue.lock);
1208 	__skb_queue_tail(&other->sk_receive_queue, skb);
1209 	spin_unlock(&other->sk_receive_queue.lock);
1210 	unix_state_unlock(other);
1211 	other->sk_data_ready(other, 0);
1212 	sock_put(other);
1213 	return 0;
1214 
1215 out_unlock:
1216 	if (other)
1217 		unix_state_unlock(other);
1218 
1219 out:
1220 	kfree_skb(skb);
1221 	if (newsk)
1222 		unix_release_sock(newsk, 0);
1223 	if (other)
1224 		sock_put(other);
1225 	return err;
1226 }
1227 
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
1230 	struct sock *ska = socka->sk, *skb = sockb->sk;
1231 
1232 	/* Join our sockets back to back */
1233 	sock_hold(ska);
1234 	sock_hold(skb);
1235 	unix_peer(ska) = skb;
1236 	unix_peer(skb) = ska;
1237 	init_peercred(ska);
1238 	init_peercred(skb);
1239 
1240 	if (ska->sk_type != SOCK_DGRAM) {
1241 		ska->sk_state = TCP_ESTABLISHED;
1242 		skb->sk_state = TCP_ESTABLISHED;
1243 		socka->state  = SS_CONNECTED;
1244 		sockb->state  = SS_CONNECTED;
1245 	}
1246 	return 0;
1247 }
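/*
 * Illustrative only (userspace sketch; "buf" is an assumption): the
 * back-to-back join above is what makes this work:
 *
 *	int sv[2];
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "ping", 4);
 *	read(sv[1], buf, 4);		// receives "ping"
 */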
1248 
1249 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1250 {
1251 	struct sock *sk = sock->sk;
1252 	struct sock *tsk;
1253 	struct sk_buff *skb;
1254 	int err;
1255 
1256 	err = -EOPNOTSUPP;
1257 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1258 		goto out;
1259 
1260 	err = -EINVAL;
1261 	if (sk->sk_state != TCP_LISTEN)
1262 		goto out;
1263 
1264 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1265 	 * so that no locks are necessary.
1266 	 */
1267 
1268 	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1269 	if (!skb) {
1270 		/* This means receive shutdown. */
1271 		if (err == 0)
1272 			err = -EINVAL;
1273 		goto out;
1274 	}
1275 
1276 	tsk = skb->sk;
1277 	skb_free_datagram(sk, skb);
1278 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1279 
1280 	/* attach accepted sock to socket */
1281 	unix_state_lock(tsk);
1282 	newsock->state = SS_CONNECTED;
1283 	sock_graft(tsk, newsock);
1284 	unix_state_unlock(tsk);
1285 	return 0;
1286 
1287 out:
1288 	return err;
1289 }
1290 
1291 
1292 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1293 {
1294 	struct sock *sk = sock->sk;
1295 	struct unix_sock *u;
1296 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1297 	int err = 0;
1298 
1299 	if (peer) {
1300 		sk = unix_peer_get(sk);
1301 
1302 		err = -ENOTCONN;
1303 		if (!sk)
1304 			goto out;
1305 		err = 0;
1306 	} else {
1307 		sock_hold(sk);
1308 	}
1309 
1310 	u = unix_sk(sk);
1311 	unix_state_lock(sk);
1312 	if (!u->addr) {
1313 		sunaddr->sun_family = AF_UNIX;
1314 		sunaddr->sun_path[0] = 0;
1315 		*uaddr_len = sizeof(short);
1316 	} else {
1317 		struct unix_address *addr = u->addr;
1318 
1319 		*uaddr_len = addr->len;
1320 		memcpy(sunaddr, addr->name, *uaddr_len);
1321 	}
1322 	unix_state_unlock(sk);
1323 	sock_put(sk);
1324 out:
1325 	return err;
1326 }
1327 
1328 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1329 {
1330 	int i;
1331 
1332 	scm->fp = UNIXCB(skb).fp;
1333 	UNIXCB(skb).fp = NULL;
1334 
1335 	for (i = scm->fp->count-1; i >= 0; i--)
1336 		unix_notinflight(scm->fp->fp[i]);
1337 }
1338 
1339 static void unix_destruct_scm(struct sk_buff *skb)
1340 {
1341 	struct scm_cookie scm;
1342 	memset(&scm, 0, sizeof(scm));
1343 	scm.pid  = UNIXCB(skb).pid;
1344 	if (UNIXCB(skb).fp)
1345 		unix_detach_fds(&scm, skb);
1346 
1347 	/* Alas, it calls VFS */
1348 	/* So fscking what? fput() has been SMP-safe since last summer */
1349 	scm_destroy(&scm);
1350 	sock_wfree(skb);
1351 }
1352 
1353 #define MAX_RECURSION_LEVEL 4
1354 
1355 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1356 {
1357 	int i;
1358 	unsigned char max_level = 0;
1359 	int unix_sock_count = 0;
1360 
1361 	for (i = scm->fp->count - 1; i >= 0; i--) {
1362 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1363 
1364 		if (sk) {
1365 			unix_sock_count++;
1366 			max_level = max(max_level,
1367 					unix_sk(sk)->recursion_level);
1368 		}
1369 	}
1370 	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1371 		return -ETOOMANYREFS;
1372 
1373 	/*
1374 	 * Need to duplicate file references for the sake of garbage
1375 	 * collection.  Otherwise a socket in the fps might become a
1376 	 * candidate for GC while the skb is not yet queued.
1377 	 */
1378 	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1379 	if (!UNIXCB(skb).fp)
1380 		return -ENOMEM;
1381 
1382 	if (unix_sock_count) {
1383 		for (i = scm->fp->count - 1; i >= 0; i--)
1384 			unix_inflight(scm->fp->fp[i]);
1385 	}
1386 	return max_level;
1387 }
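/*
 * Illustrative only (userspace sketch of the sending side of this fd
 * passing; "sock" and the file name are assumptions, error handling is
 * elided):
 *
 *	int fd_to_send = open("/etc/hostname", O_RDONLY);
 *	char data = 'x';
 *	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *	union {
 *		struct cmsghdr align;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} u;
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *	};
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type  = SCM_RIGHTS;
 *	cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *	sendmsg(sock, &msg, 0);
 */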
1388 
1389 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1390 {
1391 	int err = 0;
1392 
1393 	UNIXCB(skb).pid  = get_pid(scm->pid);
1394 	UNIXCB(skb).uid = scm->creds.uid;
1395 	UNIXCB(skb).gid = scm->creds.gid;
1396 	UNIXCB(skb).fp = NULL;
1397 	if (scm->fp && send_fds)
1398 		err = unix_attach_fds(scm, skb);
1399 
1400 	skb->destructor = unix_destruct_scm;
1401 	return err;
1402 }
1403 
1404 /*
1405  * Some apps rely on write() giving SCM_CREDENTIALS
1406  * We include credentials if source or destination socket
1407  * asserted SOCK_PASSCRED.
1408  */
1409 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1410 			    const struct sock *other)
1411 {
1412 	if (UNIXCB(skb).pid)
1413 		return;
1414 	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1415 	    !other->sk_socket ||
1416 	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1417 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1418 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1419 	}
1420 }
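/*
 * Illustrative only (userspace sketch of the receiving side; "fd" and
 * the msghdr setup are assumptions): a receiver that asserted
 * SOCK_PASSCRED sees the credentials added above as SCM_CREDENTIALS:
 *
 *	int on = 1;
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(cmsg);
 *			// uc->pid, uc->uid, uc->gid of the sender
 *		}
 */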
1421 
1422 /*
1423  *	Send AF_UNIX data.
1424  */
1425 
1426 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1427 			      struct msghdr *msg, size_t len)
1428 {
1429 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1430 	struct sock *sk = sock->sk;
1431 	struct net *net = sock_net(sk);
1432 	struct unix_sock *u = unix_sk(sk);
1433 	struct sockaddr_un *sunaddr = msg->msg_name;
1434 	struct sock *other = NULL;
1435 	int namelen = 0; /* fake initializer to quiet GCC */
1436 	int err;
1437 	unsigned int hash;
1438 	struct sk_buff *skb;
1439 	long timeo;
1440 	struct scm_cookie tmp_scm;
1441 	int max_level;
1442 	int data_len = 0;
1443 
1444 	if (NULL == siocb->scm)
1445 		siocb->scm = &tmp_scm;
1446 	wait_for_unix_gc();
1447 	err = scm_send(sock, msg, siocb->scm, false);
1448 	if (err < 0)
1449 		return err;
1450 
1451 	err = -EOPNOTSUPP;
1452 	if (msg->msg_flags&MSG_OOB)
1453 		goto out;
1454 
1455 	if (msg->msg_namelen) {
1456 		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1457 		if (err < 0)
1458 			goto out;
1459 		namelen = err;
1460 	} else {
1461 		sunaddr = NULL;
1462 		err = -ENOTCONN;
1463 		other = unix_peer_get(sk);
1464 		if (!other)
1465 			goto out;
1466 	}
1467 
1468 	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1469 	    && (err = unix_autobind(sock)) != 0)
1470 		goto out;
1471 
1472 	err = -EMSGSIZE;
1473 	if (len > sk->sk_sndbuf - 32)
1474 		goto out;
1475 
1476 	if (len > SKB_MAX_ALLOC)
1477 		data_len = min_t(size_t,
1478 				 len - SKB_MAX_ALLOC,
1479 				 MAX_SKB_FRAGS * PAGE_SIZE);
1480 
1481 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1482 				   msg->msg_flags & MSG_DONTWAIT, &err,
1483 				   PAGE_ALLOC_COSTLY_ORDER);
1484 	if (skb == NULL)
1485 		goto out;
1486 
1487 	err = unix_scm_to_skb(siocb->scm, skb, true);
1488 	if (err < 0)
1489 		goto out_free;
1490 	max_level = err + 1;
1491 	unix_get_secdata(siocb->scm, skb);
1492 
1493 	skb_put(skb, len - data_len);
1494 	skb->data_len = data_len;
1495 	skb->len = len;
1496 	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1497 	if (err)
1498 		goto out_free;
1499 
1500 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1501 
1502 restart:
1503 	if (!other) {
1504 		err = -ECONNRESET;
1505 		if (sunaddr == NULL)
1506 			goto out_free;
1507 
1508 		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1509 					hash, &err);
1510 		if (other == NULL)
1511 			goto out_free;
1512 	}
1513 
1514 	if (sk_filter(other, skb) < 0) {
1515 		/* Toss the packet but do not return any error to the sender */
1516 		err = len;
1517 		goto out_free;
1518 	}
1519 
1520 	unix_state_lock(other);
1521 	err = -EPERM;
1522 	if (!unix_may_send(sk, other))
1523 		goto out_unlock;
1524 
1525 	if (sock_flag(other, SOCK_DEAD)) {
1526 		/*
1527 		 *	Check with 1003.1g - what should
1528 		 *	a datagram error be here?
1529 		 */
1530 		unix_state_unlock(other);
1531 		sock_put(other);
1532 
1533 		err = 0;
1534 		unix_state_lock(sk);
1535 		if (unix_peer(sk) == other) {
1536 			unix_peer(sk) = NULL;
1537 			unix_state_unlock(sk);
1538 
1539 			unix_dgram_disconnected(sk, other);
1540 			sock_put(other);
1541 			err = -ECONNREFUSED;
1542 		} else {
1543 			unix_state_unlock(sk);
1544 		}
1545 
1546 		other = NULL;
1547 		if (err)
1548 			goto out_free;
1549 		goto restart;
1550 	}
1551 
1552 	err = -EPIPE;
1553 	if (other->sk_shutdown & RCV_SHUTDOWN)
1554 		goto out_unlock;
1555 
1556 	if (sk->sk_type != SOCK_SEQPACKET) {
1557 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1558 		if (err)
1559 			goto out_unlock;
1560 	}
1561 
1562 	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1563 		if (!timeo) {
1564 			err = -EAGAIN;
1565 			goto out_unlock;
1566 		}
1567 
1568 		timeo = unix_wait_for_peer(other, timeo);
1569 
1570 		err = sock_intr_errno(timeo);
1571 		if (signal_pending(current))
1572 			goto out_free;
1573 
1574 		goto restart;
1575 	}
1576 
1577 	if (sock_flag(other, SOCK_RCVTSTAMP))
1578 		__net_timestamp(skb);
1579 	maybe_add_creds(skb, sock, other);
1580 	skb_queue_tail(&other->sk_receive_queue, skb);
1581 	if (max_level > unix_sk(other)->recursion_level)
1582 		unix_sk(other)->recursion_level = max_level;
1583 	unix_state_unlock(other);
1584 	other->sk_data_ready(other, len);
1585 	sock_put(other);
1586 	scm_destroy(siocb->scm);
1587 	return len;
1588 
1589 out_unlock:
1590 	unix_state_unlock(other);
1591 out_free:
1592 	kfree_skb(skb);
1593 out:
1594 	if (other)
1595 		sock_put(other);
1596 	scm_destroy(siocb->scm);
1597 	return err;
1598 }
1599 
1600 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1601  * bytes, and a minimum of a full page.
1602  */
1603 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1604 
1605 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1606 			       struct msghdr *msg, size_t len)
1607 {
1608 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1609 	struct sock *sk = sock->sk;
1610 	struct sock *other = NULL;
1611 	int err, size;
1612 	struct sk_buff *skb;
1613 	int sent = 0;
1614 	struct scm_cookie tmp_scm;
1615 	bool fds_sent = false;
1616 	int max_level;
1617 	int data_len;
1618 
1619 	if (NULL == siocb->scm)
1620 		siocb->scm = &tmp_scm;
1621 	wait_for_unix_gc();
1622 	err = scm_send(sock, msg, siocb->scm, false);
1623 	if (err < 0)
1624 		return err;
1625 
1626 	err = -EOPNOTSUPP;
1627 	if (msg->msg_flags&MSG_OOB)
1628 		goto out_err;
1629 
1630 	if (msg->msg_namelen) {
1631 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1632 		goto out_err;
1633 	} else {
1634 		err = -ENOTCONN;
1635 		other = unix_peer(sk);
1636 		if (!other)
1637 			goto out_err;
1638 	}
1639 
1640 	if (sk->sk_shutdown & SEND_SHUTDOWN)
1641 		goto pipe_err;
1642 
1643 	while (sent < len) {
1644 		size = len - sent;
1645 
1646 		/* Keep two messages in the pipe so it schedules better */
1647 		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1648 
1649 		/* allow fallback to order-0 allocations */
1650 		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1651 
1652 		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1653 
1654 		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1655 					   msg->msg_flags & MSG_DONTWAIT, &err,
1656 					   get_order(UNIX_SKB_FRAGS_SZ));
1657 		if (!skb)
1658 			goto out_err;
1659 
1660 		/* Only send the fds in the first buffer */
1661 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1662 		if (err < 0) {
1663 			kfree_skb(skb);
1664 			goto out_err;
1665 		}
1666 		max_level = err + 1;
1667 		fds_sent = true;
1668 
1669 		skb_put(skb, size - data_len);
1670 		skb->data_len = data_len;
1671 		skb->len = size;
1672 		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1673 						   sent, size);
1674 		if (err) {
1675 			kfree_skb(skb);
1676 			goto out_err;
1677 		}
1678 
1679 		unix_state_lock(other);
1680 
1681 		if (sock_flag(other, SOCK_DEAD) ||
1682 		    (other->sk_shutdown & RCV_SHUTDOWN))
1683 			goto pipe_err_free;
1684 
1685 		maybe_add_creds(skb, sock, other);
1686 		skb_queue_tail(&other->sk_receive_queue, skb);
1687 		if (max_level > unix_sk(other)->recursion_level)
1688 			unix_sk(other)->recursion_level = max_level;
1689 		unix_state_unlock(other);
1690 		other->sk_data_ready(other, size);
1691 		sent += size;
1692 	}
1693 
1694 	scm_destroy(siocb->scm);
1695 	siocb->scm = NULL;
1696 
1697 	return sent;
1698 
1699 pipe_err_free:
1700 	unix_state_unlock(other);
1701 	kfree_skb(skb);
1702 pipe_err:
1703 	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1704 		send_sig(SIGPIPE, current, 0);
1705 	err = -EPIPE;
1706 out_err:
1707 	scm_destroy(siocb->scm);
1708 	siocb->scm = NULL;
1709 	return sent ? : err;
1710 }
1711 
1712 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1713 				  struct msghdr *msg, size_t len)
1714 {
1715 	int err;
1716 	struct sock *sk = sock->sk;
1717 
1718 	err = sock_error(sk);
1719 	if (err)
1720 		return err;
1721 
1722 	if (sk->sk_state != TCP_ESTABLISHED)
1723 		return -ENOTCONN;
1724 
1725 	if (msg->msg_namelen)
1726 		msg->msg_namelen = 0;
1727 
1728 	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1729 }
1730 
1731 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1732 			      struct msghdr *msg, size_t size,
1733 			      int flags)
1734 {
1735 	struct sock *sk = sock->sk;
1736 
1737 	if (sk->sk_state != TCP_ESTABLISHED)
1738 		return -ENOTCONN;
1739 
1740 	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1741 }
1742 
1743 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1744 {
1745 	struct unix_sock *u = unix_sk(sk);
1746 
1747 	msg->msg_namelen = 0;
1748 	if (u->addr) {
1749 		msg->msg_namelen = u->addr->len;
1750 		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1751 	}
1752 }
1753 
1754 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1755 			      struct msghdr *msg, size_t size,
1756 			      int flags)
1757 {
1758 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1759 	struct scm_cookie tmp_scm;
1760 	struct sock *sk = sock->sk;
1761 	struct unix_sock *u = unix_sk(sk);
1762 	int noblock = flags & MSG_DONTWAIT;
1763 	struct sk_buff *skb;
1764 	int err;
1765 	int peeked, skip;
1766 
1767 	err = -EOPNOTSUPP;
1768 	if (flags&MSG_OOB)
1769 		goto out;
1770 
1771 	msg->msg_namelen = 0;
1772 
1773 	err = mutex_lock_interruptible(&u->readlock);
1774 	if (err) {
1775 		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1776 		goto out;
1777 	}
1778 
1779 	skip = sk_peek_offset(sk, flags);
1780 
1781 	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1782 	if (!skb) {
1783 		unix_state_lock(sk);
1784 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1785 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1786 		    (sk->sk_shutdown & RCV_SHUTDOWN))
1787 			err = 0;
1788 		unix_state_unlock(sk);
1789 		goto out_unlock;
1790 	}
1791 
1792 	wake_up_interruptible_sync_poll(&u->peer_wait,
1793 					POLLOUT | POLLWRNORM | POLLWRBAND);
1794 
1795 	if (msg->msg_name)
1796 		unix_copy_addr(msg, skb->sk);
1797 
1798 	if (size > skb->len - skip)
1799 		size = skb->len - skip;
1800 	else if (size < skb->len - skip)
1801 		msg->msg_flags |= MSG_TRUNC;
1802 
1803 	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1804 	if (err)
1805 		goto out_free;
1806 
1807 	if (sock_flag(sk, SOCK_RCVTSTAMP))
1808 		__sock_recv_timestamp(msg, sk, skb);
1809 
1810 	if (!siocb->scm) {
1811 		siocb->scm = &tmp_scm;
1812 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1813 	}
1814 	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1815 	unix_set_secdata(siocb->scm, skb);
1816 
1817 	if (!(flags & MSG_PEEK)) {
1818 		if (UNIXCB(skb).fp)
1819 			unix_detach_fds(siocb->scm, skb);
1820 
1821 		sk_peek_offset_bwd(sk, skb->len);
1822 	} else {
1823 		/* It is questionable: on PEEK we could:
1824 		   - not return fds - good, but too simple 8)
1825 		   - return fds, and not return them on read (the old strategy,
1826 		     apparently wrong)
1827 		   - clone fds (I chose this for now; it is the most universal
1828 		     solution)
1829 
1830 		   POSIX 1003.1g does not actually define this clearly
1831 		   at all. POSIX 1003.1g doesn't define a lot of things
1832 		   clearly however!
1833 
1834 		*/
1835 
1836 		sk_peek_offset_fwd(sk, size);
1837 
1838 		if (UNIXCB(skb).fp)
1839 			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1840 	}
1841 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1842 
1843 	scm_recv(sock, msg, siocb->scm, flags);
1844 
1845 out_free:
1846 	skb_free_datagram(sk, skb);
1847 out_unlock:
1848 	mutex_unlock(&u->readlock);
1849 out:
1850 	return err;
1851 }
1852 
1853 /*
1854  *	Sleep until more data has arrived. But check for races..
1855  */
1856 static long unix_stream_data_wait(struct sock *sk, long timeo,
1857 				  struct sk_buff *last)
1858 {
1859 	DEFINE_WAIT(wait);
1860 
1861 	unix_state_lock(sk);
1862 
1863 	for (;;) {
1864 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1865 
1866 		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1867 		    sk->sk_err ||
1868 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1869 		    signal_pending(current) ||
1870 		    !timeo)
1871 			break;
1872 
1873 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1874 		unix_state_unlock(sk);
1875 		timeo = freezable_schedule_timeout(timeo);
1876 		unix_state_lock(sk);
1877 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1878 	}
1879 
1880 	finish_wait(sk_sleep(sk), &wait);
1881 	unix_state_unlock(sk);
1882 	return timeo;
1883 }
1884 
1885 static unsigned int unix_skb_len(const struct sk_buff *skb)
1886 {
1887 	return skb->len - UNIXCB(skb).consumed;
1888 }
1889 
1890 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1891 			       struct msghdr *msg, size_t size,
1892 			       int flags)
1893 {
1894 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1895 	struct scm_cookie tmp_scm;
1896 	struct sock *sk = sock->sk;
1897 	struct unix_sock *u = unix_sk(sk);
1898 	struct sockaddr_un *sunaddr = msg->msg_name;
1899 	int copied = 0;
1900 	int check_creds = 0;
1901 	int target;
1902 	int err = 0;
1903 	long timeo;
1904 	int skip;
1905 
1906 	err = -EINVAL;
1907 	if (sk->sk_state != TCP_ESTABLISHED)
1908 		goto out;
1909 
1910 	err = -EOPNOTSUPP;
1911 	if (flags&MSG_OOB)
1912 		goto out;
1913 
1914 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1915 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1916 
1917 	msg->msg_namelen = 0;
1918 
1919 	/* Lock the socket to prevent queue disordering
1920 	 * while we sleep in memcpy_tomsg
1921 	 */
1922 
1923 	if (!siocb->scm) {
1924 		siocb->scm = &tmp_scm;
1925 		memset(&tmp_scm, 0, sizeof(tmp_scm));
1926 	}
1927 
1928 	err = mutex_lock_interruptible(&u->readlock);
1929 	if (err) {
1930 		err = sock_intr_errno(timeo);
1931 		goto out;
1932 	}
1933 
1934 	do {
1935 		int chunk;
1936 		struct sk_buff *skb, *last;
1937 
1938 		unix_state_lock(sk);
1939 		last = skb = skb_peek(&sk->sk_receive_queue);
1940 again:
1941 		if (skb == NULL) {
1942 			unix_sk(sk)->recursion_level = 0;
1943 			if (copied >= target)
1944 				goto unlock;
1945 
1946 			/*
1947 			 *	POSIX 1003.1g mandates this order.
1948 			 */
1949 
1950 			err = sock_error(sk);
1951 			if (err)
1952 				goto unlock;
1953 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1954 				goto unlock;
1955 
1956 			unix_state_unlock(sk);
1957 			err = -EAGAIN;
1958 			if (!timeo)
1959 				break;
1960 			mutex_unlock(&u->readlock);
1961 
1962 			timeo = unix_stream_data_wait(sk, timeo, last);
1963 
1964 			if (signal_pending(current)
1965 			    ||  mutex_lock_interruptible(&u->readlock)) {
1966 				err = sock_intr_errno(timeo);
1967 				goto out;
1968 			}
1969 
1970 			continue;
1971  unlock:
1972 			unix_state_unlock(sk);
1973 			break;
1974 		}
1975 
1976 		skip = sk_peek_offset(sk, flags);
1977 		while (skip >= unix_skb_len(skb)) {
1978 			skip -= unix_skb_len(skb);
1979 			last = skb;
1980 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1981 			if (!skb)
1982 				goto again;
1983 		}
1984 
1985 		unix_state_unlock(sk);
1986 
1987 		if (check_creds) {
1988 			/* Never glue messages from different writers */
1989 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1990 			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1991 			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1992 				break;
1993 		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1994 			/* Copy credentials */
1995 			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1996 			check_creds = 1;
1997 		}
1998 
1999 		/* Copy address just once */
2000 		if (sunaddr) {
2001 			unix_copy_addr(msg, skb->sk);
2002 			sunaddr = NULL;
2003 		}
2004 
2005 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2006 		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2007 					    msg->msg_iov, chunk)) {
2008 			if (copied == 0)
2009 				copied = -EFAULT;
2010 			break;
2011 		}
2012 		copied += chunk;
2013 		size -= chunk;
2014 
2015 		/* Mark read part of skb as used */
2016 		if (!(flags & MSG_PEEK)) {
2017 			UNIXCB(skb).consumed += chunk;
2018 
2019 			sk_peek_offset_bwd(sk, chunk);
2020 
2021 			if (UNIXCB(skb).fp)
2022 				unix_detach_fds(siocb->scm, skb);
2023 
2024 			if (unix_skb_len(skb))
2025 				break;
2026 
2027 			skb_unlink(skb, &sk->sk_receive_queue);
2028 			consume_skb(skb);
2029 
2030 			if (siocb->scm->fp)
2031 				break;
2032 		} else {
2033 			/* Duplicating passed fds on MSG_PEEK is questionable;
2034 			 * see the note in unix_dgram_recvmsg(). */
2035 			if (UNIXCB(skb).fp)
2036 				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2037 
2038 			sk_peek_offset_fwd(sk, chunk);
2039 
2040 			break;
2041 		}
2042 	} while (size);
2043 
2044 	mutex_unlock(&u->readlock);
2045 	scm_recv(sock, msg, siocb->scm, flags);
2046 out:
2047 	return copied ? : err;
2048 }
2049 
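/*
 * shutdown(2).  For connection-oriented types the shutdown is mirrored
 * on the peer with the directions swapped, e.g. from userspace:
 *
 *	shutdown(fd, SHUT_WR);	/- peer subsequently reads EOF -/
 *
 * and both ends are woken so blocked readers and writers notice.
 */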
2050 static int unix_shutdown(struct socket *sock, int mode)
2051 {
2052 	struct sock *sk = sock->sk;
2053 	struct sock *other;
2054 
2055 	if (mode < SHUT_RD || mode > SHUT_RDWR)
2056 		return -EINVAL;
2057 	/* This maps:
2058 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2059 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2060 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2061 	 */
2062 	++mode;
2063 
2064 	unix_state_lock(sk);
2065 	sk->sk_shutdown |= mode;
2066 	other = unix_peer(sk);
2067 	if (other)
2068 		sock_hold(other);
2069 	unix_state_unlock(sk);
2070 	sk->sk_state_change(sk);
2071 
2072 	if (other &&
2073 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2074 
2075 		int peer_mode = 0;
2076 
2077 		if (mode & RCV_SHUTDOWN)
2078 			peer_mode |= SEND_SHUTDOWN;
2079 		if (mode & SEND_SHUTDOWN)
2080 			peer_mode |= RCV_SHUTDOWN;
2081 		unix_state_lock(other);
2082 		other->sk_shutdown |= peer_mode;
2083 		unix_state_unlock(other);
2084 		other->sk_state_change(other);
2085 		if (peer_mode == SHUTDOWN_MASK)
2086 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2087 		else if (peer_mode & RCV_SHUTDOWN)
2088 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2089 	}
2090 	if (other)
2091 		sock_put(other);
2092 
2093 	return 0;
2094 }
2095 
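/*
 * Unread payload queued on @sk, as reported by SIOCINQ: the sum of the
 * unread part of every skb for stream/seqpacket sockets, or the size
 * of the next datagram otherwise.  -EINVAL on listening sockets.
 */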
2096 long unix_inq_len(struct sock *sk)
2097 {
2098 	struct sk_buff *skb;
2099 	long amount = 0;
2100 
2101 	if (sk->sk_state == TCP_LISTEN)
2102 		return -EINVAL;
2103 
2104 	spin_lock(&sk->sk_receive_queue.lock);
2105 	if (sk->sk_type == SOCK_STREAM ||
2106 	    sk->sk_type == SOCK_SEQPACKET) {
2107 		skb_queue_walk(&sk->sk_receive_queue, skb)
2108 			amount += unix_skb_len(skb);
2109 	} else {
2110 		skb = skb_peek(&sk->sk_receive_queue);
2111 		if (skb)
2112 			amount = skb->len;
2113 	}
2114 	spin_unlock(&sk->sk_receive_queue.lock);
2115 
2116 	return amount;
2117 }
2118 EXPORT_SYMBOL_GPL(unix_inq_len);
2119 
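/*
 * Bytes written but not yet consumed by the receiver (SIOCOUTQ).  For
 * AF_UNIX the data sits on the peer's receive queue but remains
 * charged to the sender's sk_wmem_alloc until it is read.
 */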
2120 long unix_outq_len(struct sock *sk)
2121 {
2122 	return sk_wmem_alloc_get(sk);
2123 }
2124 EXPORT_SYMBOL_GPL(unix_outq_len);
2125 
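/*
 * ioctl(2) handler: only the queue-length queries are supported, e.g.
 *
 *	int pending;
 *	ioctl(fd, SIOCINQ, &pending);	/- bytes readable right now -/
 */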
2126 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2127 {
2128 	struct sock *sk = sock->sk;
2129 	long amount = 0;
2130 	int err;
2131 
2132 	switch (cmd) {
2133 	case SIOCOUTQ:
2134 		amount = unix_outq_len(sk);
2135 		err = put_user(amount, (int __user *)arg);
2136 		break;
2137 	case SIOCINQ:
2138 		amount = unix_inq_len(sk);
2139 		if (amount < 0)
2140 			err = amount;
2141 		else
2142 			err = put_user(amount, (int __user *)arg);
2143 		break;
2144 	default:
2145 		err = -ENOIOCTLCMD;
2146 		break;
2147 	}
2148 	return err;
2149 }
2150 
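/*
 * poll/select mask for stream/seqpacket sockets; see the in-function
 * comment on why writability ignores the peer's shutdown state.
 */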
2151 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2152 {
2153 	struct sock *sk = sock->sk;
2154 	unsigned int mask;
2155 
2156 	sock_poll_wait(file, sk_sleep(sk), wait);
2157 	mask = 0;
2158 
2159 	/* exceptional events? */
2160 	if (sk->sk_err)
2161 		mask |= POLLERR;
2162 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2163 		mask |= POLLHUP;
2164 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2165 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2166 
2167 	/* readable? */
2168 	if (!skb_queue_empty(&sk->sk_receive_queue))
2169 		mask |= POLLIN | POLLRDNORM;
2170 
2171 	/* Connection-based need to check for termination and startup */
2172 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2173 	    sk->sk_state == TCP_CLOSE)
2174 		mask |= POLLHUP;
2175 
2176 	/*
2177 	 * We also report the socket as writable when the other side has
2178 	 * shut down the connection.  This prevents stuck sockets.
2179 	 */
2180 	if (unix_writable(sk))
2181 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2182 
2183 	return mask;
2184 }
2185 
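/*
 * Datagram poll additionally suppresses POLLOUT when a one-way
 * connected peer's receive queue is full, registering on the peer's
 * peer_wait queue so the writer is woken when space appears.
 */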
2186 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2187 				    poll_table *wait)
2188 {
2189 	struct sock *sk = sock->sk, *other;
2190 	unsigned int mask, writable;
2191 
2192 	sock_poll_wait(file, sk_sleep(sk), wait);
2193 	mask = 0;
2194 
2195 	/* exceptional events? */
2196 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2197 		mask |= POLLERR |
2198 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2199 
2200 	if (sk->sk_shutdown & RCV_SHUTDOWN)
2201 		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2202 	if (sk->sk_shutdown == SHUTDOWN_MASK)
2203 		mask |= POLLHUP;
2204 
2205 	/* readable? */
2206 	if (!skb_queue_empty(&sk->sk_receive_queue))
2207 		mask |= POLLIN | POLLRDNORM;
2208 
2209 	/* Connection-based need to check for termination and startup */
2210 	if (sk->sk_type == SOCK_SEQPACKET) {
2211 		if (sk->sk_state == TCP_CLOSE)
2212 			mask |= POLLHUP;
2213 		/* connection hasn't started yet? */
2214 		if (sk->sk_state == TCP_SYN_SENT)
2215 			return mask;
2216 	}
2217 
2218 	/* No write status requested, avoid expensive OUT tests. */
2219 	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2220 		return mask;
2221 
2222 	writable = unix_writable(sk);
2223 	other = unix_peer_get(sk);
2224 	if (other) {
2225 		if (unix_peer(other) != sk) {
2226 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2227 			if (unix_recvq_full(other))
2228 				writable = 0;
2229 		}
2230 		sock_put(other);
2231 	}
2232 
2233 	if (writable)
2234 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2235 	else
2236 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2237 
2238 	return mask;
2239 }
2240 
2241 #ifdef CONFIG_PROC_FS
2242 
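/*
 * /proc/net/unix iterator state is packed into the seq_file position:
 * the high bits of *pos select a hash bucket and the low bits a
 * 1-based offset inside it; offset 0 is reserved for the header line.
 */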
2243 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2244 
2245 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2246 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2247 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2248 
2249 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2250 {
2251 	unsigned long offset = get_offset(*pos);
2252 	unsigned long bucket = get_bucket(*pos);
2253 	struct sock *sk;
2254 	unsigned long count = 0;
2255 
2256 	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2257 		if (sock_net(sk) != seq_file_net(seq))
2258 			continue;
2259 		if (++count == offset)
2260 			break;
2261 	}
2262 
2263 	return sk;
2264 }
2265 
2266 static struct sock *unix_next_socket(struct seq_file *seq,
2267 				     struct sock *sk,
2268 				     loff_t *pos)
2269 {
2270 	unsigned long bucket;
2271 
2272 	while (sk > (struct sock *)SEQ_START_TOKEN) {
2273 		sk = sk_next(sk);
2274 		if (!sk)
2275 			goto next_bucket;
2276 		if (sock_net(sk) == seq_file_net(seq))
2277 			return sk;
2278 	}
2279 
2280 	do {
2281 		sk = unix_from_bucket(seq, pos);
2282 		if (sk)
2283 			return sk;
2284 
2285 next_bucket:
2286 		bucket = get_bucket(*pos) + 1;
2287 		*pos = set_bucket_offset(bucket, 1);
2288 	} while (bucket < ARRAY_SIZE(unix_socket_table));
2289 
2290 	return NULL;
2291 }
2292 
2293 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2294 	__acquires(unix_table_lock)
2295 {
2296 	spin_lock(&unix_table_lock);
2297 
2298 	if (!*pos)
2299 		return SEQ_START_TOKEN;
2300 
2301 	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2302 		return NULL;
2303 
2304 	return unix_next_socket(seq, NULL, pos);
2305 }
2306 
2307 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308 {
2309 	++*pos;
2310 	return unix_next_socket(seq, v, pos);
2311 }
2312 
2313 static void unix_seq_stop(struct seq_file *seq, void *v)
2314 	__releases(unix_table_lock)
2315 {
2316 	spin_unlock(&unix_table_lock);
2317 }
2318 
2319 static int unix_seq_show(struct seq_file *seq, void *v)
2320 {
2321 
2322 	if (v == SEQ_START_TOKEN)
2323 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2324 			 "Inode Path\n");
2325 	else {
2326 		struct sock *s = v;
2327 		struct unix_sock *u = unix_sk(s);
2328 		unix_state_lock(s);
2329 
2330 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2331 			s,
2332 			atomic_read(&s->sk_refcnt),
2333 			0,
2334 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2335 			s->sk_type,
2336 			s->sk_socket ?
2337 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2338 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2339 			sock_i_ino(s));
2340 
2341 		if (u->addr) {
2342 			int i, len;
2343 			seq_putc(seq, ' ');
2344 
2345 			i = 0;
2346 			len = u->addr->len - sizeof(short);
2347 			if (!UNIX_ABSTRACT(s))
2348 				len--;
2349 			else {
2350 				seq_putc(seq, '@');
2351 				i++;
2352 			}
2353 			for ( ; i < len; i++)
2354 				seq_putc(seq, u->addr->name->sun_path[i]);
2355 		}
2356 		unix_state_unlock(s);
2357 		seq_putc(seq, '\n');
2358 	}
2359 
2360 	return 0;
2361 }
2362 
2363 static const struct seq_operations unix_seq_ops = {
2364 	.start  = unix_seq_start,
2365 	.next   = unix_seq_next,
2366 	.stop   = unix_seq_stop,
2367 	.show   = unix_seq_show,
2368 };
2369 
2370 static int unix_seq_open(struct inode *inode, struct file *file)
2371 {
2372 	return seq_open_net(inode, file, &unix_seq_ops,
2373 			    sizeof(struct seq_net_private));
2374 }
2375 
2376 static const struct file_operations unix_seq_fops = {
2377 	.owner		= THIS_MODULE,
2378 	.open		= unix_seq_open,
2379 	.read		= seq_read,
2380 	.llseek		= seq_lseek,
2381 	.release	= seq_release_net,
2382 };
2383 
2384 #endif
2385 
2386 static const struct net_proto_family unix_family_ops = {
2387 	.family = PF_UNIX,
2388 	.create = unix_create,
2389 	.owner	= THIS_MODULE,
2390 };
2391 
2392 
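/*
 * Per-namespace setup: default the datagram queue-length limit,
 * register the sysctl table and create /proc/net/unix.
 */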
2393 static int __net_init unix_net_init(struct net *net)
2394 {
2395 	int error = -ENOMEM;
2396 
2397 	net->unx.sysctl_max_dgram_qlen = 10;
2398 	if (unix_sysctl_register(net))
2399 		goto out;
2400 
2401 #ifdef CONFIG_PROC_FS
2402 	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2403 		unix_sysctl_unregister(net);
2404 		goto out;
2405 	}
2406 #endif
2407 	error = 0;
2408 out:
2409 	return error;
2410 }
2411 
2412 static void __net_exit unix_net_exit(struct net *net)
2413 {
2414 	unix_sysctl_unregister(net);
2415 	remove_proc_entry("unix", net->proc_net);
2416 }
2417 
2418 static struct pernet_operations unix_net_ops = {
2419 	.init = unix_net_init,
2420 	.exit = unix_net_exit,
2421 };
2422 
2423 static int __init af_unix_init(void)
2424 {
2425 	int rc;
2426 
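	/* Per-skb AF_UNIX state (UNIXCB: pid, credentials, fds, consumed)
	 * is stashed in the generic skb control buffer, so it must fit.
	 */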
2427 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2428 
2429 	rc = proto_register(&unix_proto, 1);
2430 	if (rc != 0) {
2431 		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2432 		       __func__);
2433 		goto out;
2434 	}
2435 
2436 	sock_register(&unix_family_ops);
2437 	register_pernet_subsys(&unix_net_ops);
2438 out:
2439 	return rc;
2440 }
2441 
2442 static void __exit af_unix_exit(void)
2443 {
2444 	sock_unregister(PF_UNIX);
2445 	proto_unregister(&unix_proto);
2446 	unregister_pernet_subsys(&unix_net_ops);
2447 }
2448 
2449 /* Earlier than device_initcall() so that other drivers invoking
2450    request_module() don't end up in a loop when modprobe tries
2451    to use a UNIX socket. But later than subsys_initcall() because
   we depend on infrastructure initialised there. */
2453 fs_initcall(af_unix_init);
2454 module_exit(af_unix_exit);
2455 
2456 MODULE_LICENSE("GPL");
2457 MODULE_ALIAS_NETPROTO(PF_UNIX);
2458