xref: /openbmc/linux/net/dccp/proto.c (revision 711aab1d)
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *	This program is free software; you can redistribute it and/or modify it
8  *	under the terms of the GNU General Public License version 2 as
9  *	published by the Free Software Foundation.
10  */
11 
12 #include <linux/dccp.h>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/skbuff.h>
18 #include <linux/netdevice.h>
19 #include <linux/in.h>
20 #include <linux/if_arp.h>
21 #include <linux/init.h>
22 #include <linux/random.h>
23 #include <linux/slab.h>
24 #include <net/checksum.h>
25 
26 #include <net/inet_sock.h>
27 #include <net/inet_common.h>
28 #include <net/sock.h>
29 #include <net/xfrm.h>
30 
31 #include <asm/ioctls.h>
32 #include <linux/spinlock.h>
33 #include <linux/timer.h>
34 #include <linux/delay.h>
35 #include <linux/poll.h>
36 
37 #include "ccid.h"
38 #include "dccp.h"
39 #include "feat.h"
40 
/* DCCP MIB counters; the per-CPU storage is allocated in dccp_mib_init(). */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Per-CPU counter set up by percpu_counter_init() in dccp_init(). */
struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);

/* Established and bind hash tables; sized and allocated in dccp_init(). */
struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/*
 * the maximum queue length for tx in packets. 0 is no limit
 * (copied into dccps_tx_qlen of every socket by dccp_init_sock())
 */
int sysctl_dccp_tx_qlen __read_mostly = 5;
53 
#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * dccp_state_name  -  printable name of a DCCP socket state (debug builds)
 * @state: state value, as stored in sk->sk_state
 *
 * Returns a pointer to a static string. Negative values, values at or
 * beyond DCCP_MAX_STATES, and table slots that have no name assigned all
 * yield "INVALID STATE!", so the result is never NULL and the table is
 * never indexed outside its bounds.
 */
static const char *dccp_state_name(const int state)
{
	static const char *const dccp_state_names[] = {
	[DCCP_OPEN]		= "OPEN",
	[DCCP_REQUESTING]	= "REQUESTING",
	[DCCP_PARTOPEN]		= "PARTOPEN",
	[DCCP_LISTEN]		= "LISTEN",
	[DCCP_RESPOND]		= "RESPOND",
	[DCCP_CLOSING]		= "CLOSING",
	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
	[DCCP_TIME_WAIT]	= "TIME_WAIT",
	[DCCP_CLOSED]		= "CLOSED",
	};

	/*
	 * The table is built with designated initialisers, so it may contain
	 * unnamed (NULL) slots and its length is set by the largest index
	 * used; check both in addition to the nominal state range.
	 */
	if (state < 0 || state >= DCCP_MAX_STATES ||
	    state >= (int)ARRAY_SIZE(dccp_state_names) ||
	    dccp_state_names[state] == NULL)
		return "INVALID STATE!";

	return dccp_state_names[state];
}
#endif
77 
78 void dccp_set_state(struct sock *sk, const int state)
79 {
80 	const int oldstate = sk->sk_state;
81 
82 	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
83 		      dccp_state_name(oldstate), dccp_state_name(state));
84 	WARN_ON(state == oldstate);
85 
86 	switch (state) {
87 	case DCCP_OPEN:
88 		if (oldstate != DCCP_OPEN)
89 			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
90 		/* Client retransmits all Confirm options until entering OPEN */
91 		if (oldstate == DCCP_PARTOPEN)
92 			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
93 		break;
94 
95 	case DCCP_CLOSED:
96 		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
97 		    oldstate == DCCP_CLOSING)
98 			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
99 
100 		sk->sk_prot->unhash(sk);
101 		if (inet_csk(sk)->icsk_bind_hash != NULL &&
102 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
103 			inet_put_port(sk);
104 		/* fall through */
105 	default:
106 		if (oldstate == DCCP_OPEN)
107 			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
108 	}
109 
110 	/* Change state AFTER socket is unhashed to avoid closed
111 	 * socket sitting in hash tables.
112 	 */
113 	sk->sk_state = state;
114 }
115 
116 EXPORT_SYMBOL_GPL(dccp_set_state);
117 
118 static void dccp_finish_passive_close(struct sock *sk)
119 {
120 	switch (sk->sk_state) {
121 	case DCCP_PASSIVE_CLOSE:
122 		/* Node (client or server) has received Close packet. */
123 		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
124 		dccp_set_state(sk, DCCP_CLOSED);
125 		break;
126 	case DCCP_PASSIVE_CLOSEREQ:
127 		/*
128 		 * Client received CloseReq. We set the `active' flag so that
129 		 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
130 		 */
131 		dccp_send_close(sk, 1);
132 		dccp_set_state(sk, DCCP_CLOSING);
133 	}
134 }
135 
136 void dccp_done(struct sock *sk)
137 {
138 	dccp_set_state(sk, DCCP_CLOSED);
139 	dccp_clear_xmit_timers(sk);
140 
141 	sk->sk_shutdown = SHUTDOWN_MASK;
142 
143 	if (!sock_flag(sk, SOCK_DEAD))
144 		sk->sk_state_change(sk);
145 	else
146 		inet_csk_destroy_sock(sk);
147 }
148 
149 EXPORT_SYMBOL_GPL(dccp_done);
150 
151 const char *dccp_packet_name(const int type)
152 {
153 	static const char *const dccp_packet_names[] = {
154 		[DCCP_PKT_REQUEST]  = "REQUEST",
155 		[DCCP_PKT_RESPONSE] = "RESPONSE",
156 		[DCCP_PKT_DATA]	    = "DATA",
157 		[DCCP_PKT_ACK]	    = "ACK",
158 		[DCCP_PKT_DATAACK]  = "DATAACK",
159 		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
160 		[DCCP_PKT_CLOSE]    = "CLOSE",
161 		[DCCP_PKT_RESET]    = "RESET",
162 		[DCCP_PKT_SYNC]	    = "SYNC",
163 		[DCCP_PKT_SYNCACK]  = "SYNCACK",
164 	};
165 
166 	if (type >= DCCP_NR_PKT_TYPES)
167 		return "INVALID";
168 	else
169 		return dccp_packet_names[type];
170 }
171 
172 EXPORT_SYMBOL_GPL(dccp_packet_name);
173 
174 static void dccp_sk_destruct(struct sock *sk)
175 {
176 	struct dccp_sock *dp = dccp_sk(sk);
177 
178 	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
179 	dp->dccps_hc_tx_ccid = NULL;
180 	inet_sock_destruct(sk);
181 }
182 
183 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
184 {
185 	struct dccp_sock *dp = dccp_sk(sk);
186 	struct inet_connection_sock *icsk = inet_csk(sk);
187 
188 	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
189 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
190 	sk->sk_state		= DCCP_CLOSED;
191 	sk->sk_write_space	= dccp_write_space;
192 	sk->sk_destruct		= dccp_sk_destruct;
193 	icsk->icsk_sync_mss	= dccp_sync_mss;
194 	dp->dccps_mss_cache	= 536;
195 	dp->dccps_rate_last	= jiffies;
196 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
197 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
198 	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
199 
200 	dccp_init_xmit_timers(sk);
201 
202 	INIT_LIST_HEAD(&dp->dccps_featneg);
203 	/* control socket doesn't need feat nego */
204 	if (likely(ctl_sock_initialized))
205 		return dccp_feat_init(sk);
206 	return 0;
207 }
208 
209 EXPORT_SYMBOL_GPL(dccp_init_sock);
210 
211 void dccp_destroy_sock(struct sock *sk)
212 {
213 	struct dccp_sock *dp = dccp_sk(sk);
214 
215 	__skb_queue_purge(&sk->sk_write_queue);
216 	if (sk->sk_send_head != NULL) {
217 		kfree_skb(sk->sk_send_head);
218 		sk->sk_send_head = NULL;
219 	}
220 
221 	/* Clean up a referenced DCCP bind bucket. */
222 	if (inet_csk(sk)->icsk_bind_hash != NULL)
223 		inet_put_port(sk);
224 
225 	kfree(dp->dccps_service_list);
226 	dp->dccps_service_list = NULL;
227 
228 	if (dp->dccps_hc_rx_ackvec != NULL) {
229 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
230 		dp->dccps_hc_rx_ackvec = NULL;
231 	}
232 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
233 	dp->dccps_hc_rx_ccid = NULL;
234 
235 	/* clean up feature negotiation state */
236 	dccp_feat_list_purge(&dp->dccps_featneg);
237 }
238 
239 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
240 
241 static inline int dccp_listen_start(struct sock *sk, int backlog)
242 {
243 	struct dccp_sock *dp = dccp_sk(sk);
244 
245 	dp->dccps_role = DCCP_ROLE_LISTEN;
246 	/* do not start to listen if feature negotiation setup fails */
247 	if (dccp_feat_finalise_settings(dp))
248 		return -EPROTO;
249 	return inet_csk_listen_start(sk, backlog);
250 }
251 
252 static inline int dccp_need_reset(int state)
253 {
254 	return state != DCCP_CLOSED && state != DCCP_LISTEN &&
255 	       state != DCCP_REQUESTING;
256 }
257 
258 int dccp_disconnect(struct sock *sk, int flags)
259 {
260 	struct inet_connection_sock *icsk = inet_csk(sk);
261 	struct inet_sock *inet = inet_sk(sk);
262 	int err = 0;
263 	const int old_state = sk->sk_state;
264 
265 	if (old_state != DCCP_CLOSED)
266 		dccp_set_state(sk, DCCP_CLOSED);
267 
268 	/*
269 	 * This corresponds to the ABORT function of RFC793, sec. 3.8
270 	 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
271 	 */
272 	if (old_state == DCCP_LISTEN) {
273 		inet_csk_listen_stop(sk);
274 	} else if (dccp_need_reset(old_state)) {
275 		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
276 		sk->sk_err = ECONNRESET;
277 	} else if (old_state == DCCP_REQUESTING)
278 		sk->sk_err = ECONNRESET;
279 
280 	dccp_clear_xmit_timers(sk);
281 
282 	__skb_queue_purge(&sk->sk_receive_queue);
283 	__skb_queue_purge(&sk->sk_write_queue);
284 	if (sk->sk_send_head != NULL) {
285 		__kfree_skb(sk->sk_send_head);
286 		sk->sk_send_head = NULL;
287 	}
288 
289 	inet->inet_dport = 0;
290 
291 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
292 		inet_reset_saddr(sk);
293 
294 	sk->sk_shutdown = 0;
295 	sock_reset_flag(sk, SOCK_DONE);
296 
297 	icsk->icsk_backoff = 0;
298 	inet_csk_delack_init(sk);
299 	__sk_dst_reset(sk);
300 
301 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
302 
303 	sk->sk_error_report(sk);
304 	return err;
305 }
306 
307 EXPORT_SYMBOL_GPL(dccp_disconnect);
308 
309 /*
310  *	Wait for a DCCP event.
311  *
312  *	Note that we don't need to lock the socket, as the upper poll layers
313  *	take care of normal races (between the test and the event) and we don't
314  *	go look at any of the socket buffers directly.
315  */
316 unsigned int dccp_poll(struct file *file, struct socket *sock,
317 		       poll_table *wait)
318 {
319 	unsigned int mask;
320 	struct sock *sk = sock->sk;
321 
322 	sock_poll_wait(file, sk_sleep(sk), wait);
323 	if (sk->sk_state == DCCP_LISTEN)
324 		return inet_csk_listen_poll(sk);
325 
326 	/* Socket is not locked. We are protected from async events
327 	   by poll logic and correct handling of state changes
328 	   made by another threads is impossible in any case.
329 	 */
330 
331 	mask = 0;
332 	if (sk->sk_err)
333 		mask = POLLERR;
334 
335 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
336 		mask |= POLLHUP;
337 	if (sk->sk_shutdown & RCV_SHUTDOWN)
338 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
339 
340 	/* Connected? */
341 	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
342 		if (atomic_read(&sk->sk_rmem_alloc) > 0)
343 			mask |= POLLIN | POLLRDNORM;
344 
345 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
346 			if (sk_stream_is_writeable(sk)) {
347 				mask |= POLLOUT | POLLWRNORM;
348 			} else {  /* send SIGIO later */
349 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
350 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
351 
352 				/* Race breaker. If space is freed after
353 				 * wspace test but before the flags are set,
354 				 * IO signal will be lost.
355 				 */
356 				if (sk_stream_is_writeable(sk))
357 					mask |= POLLOUT | POLLWRNORM;
358 			}
359 		}
360 	}
361 	return mask;
362 }
363 
364 EXPORT_SYMBOL_GPL(dccp_poll);
365 
366 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
367 {
368 	int rc = -ENOTCONN;
369 
370 	lock_sock(sk);
371 
372 	if (sk->sk_state == DCCP_LISTEN)
373 		goto out;
374 
375 	switch (cmd) {
376 	case SIOCINQ: {
377 		struct sk_buff *skb;
378 		unsigned long amount = 0;
379 
380 		skb = skb_peek(&sk->sk_receive_queue);
381 		if (skb != NULL) {
382 			/*
383 			 * We will only return the amount of this packet since
384 			 * that is all that will be read.
385 			 */
386 			amount = skb->len;
387 		}
388 		rc = put_user(amount, (int __user *)arg);
389 	}
390 		break;
391 	default:
392 		rc = -ENOIOCTLCMD;
393 		break;
394 	}
395 out:
396 	release_sock(sk);
397 	return rc;
398 }
399 
400 EXPORT_SYMBOL_GPL(dccp_ioctl);
401 
402 static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
403 				   char __user *optval, unsigned int optlen)
404 {
405 	struct dccp_sock *dp = dccp_sk(sk);
406 	struct dccp_service_list *sl = NULL;
407 
408 	if (service == DCCP_SERVICE_INVALID_VALUE ||
409 	    optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
410 		return -EINVAL;
411 
412 	if (optlen > sizeof(service)) {
413 		sl = kmalloc(optlen, GFP_KERNEL);
414 		if (sl == NULL)
415 			return -ENOMEM;
416 
417 		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
418 		if (copy_from_user(sl->dccpsl_list,
419 				   optval + sizeof(service),
420 				   optlen - sizeof(service)) ||
421 		    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
422 			kfree(sl);
423 			return -EFAULT;
424 		}
425 	}
426 
427 	lock_sock(sk);
428 	dp->dccps_service = service;
429 
430 	kfree(dp->dccps_service_list);
431 
432 	dp->dccps_service_list = sl;
433 	release_sock(sk);
434 	return 0;
435 }
436 
437 static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
438 {
439 	u8 *list, len;
440 	int i, rc;
441 
442 	if (cscov < 0 || cscov > 15)
443 		return -EINVAL;
444 	/*
445 	 * Populate a list of permissible values, in the range cscov...15. This
446 	 * is necessary since feature negotiation of single values only works if
447 	 * both sides incidentally choose the same value. Since the list starts
448 	 * lowest-value first, negotiation will pick the smallest shared value.
449 	 */
450 	if (cscov == 0)
451 		return 0;
452 	len = 16 - cscov;
453 
454 	list = kmalloc(len, GFP_KERNEL);
455 	if (list == NULL)
456 		return -ENOBUFS;
457 
458 	for (i = 0; i < len; i++)
459 		list[i] = cscov++;
460 
461 	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
462 
463 	if (rc == 0) {
464 		if (rx)
465 			dccp_sk(sk)->dccps_pcrlen = cscov;
466 		else
467 			dccp_sk(sk)->dccps_pcslen = cscov;
468 	}
469 	kfree(list);
470 	return rc;
471 }
472 
473 static int dccp_setsockopt_ccid(struct sock *sk, int type,
474 				char __user *optval, unsigned int optlen)
475 {
476 	u8 *val;
477 	int rc = 0;
478 
479 	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
480 		return -EINVAL;
481 
482 	val = memdup_user(optval, optlen);
483 	if (IS_ERR(val))
484 		return PTR_ERR(val);
485 
486 	lock_sock(sk);
487 	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
488 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
489 
490 	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
491 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
492 	release_sock(sk);
493 
494 	kfree(val);
495 	return rc;
496 }
497 
498 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
499 		char __user *optval, unsigned int optlen)
500 {
501 	struct dccp_sock *dp = dccp_sk(sk);
502 	int val, err = 0;
503 
504 	switch (optname) {
505 	case DCCP_SOCKOPT_PACKET_SIZE:
506 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
507 		return 0;
508 	case DCCP_SOCKOPT_CHANGE_L:
509 	case DCCP_SOCKOPT_CHANGE_R:
510 		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
511 		return 0;
512 	case DCCP_SOCKOPT_CCID:
513 	case DCCP_SOCKOPT_RX_CCID:
514 	case DCCP_SOCKOPT_TX_CCID:
515 		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
516 	}
517 
518 	if (optlen < (int)sizeof(int))
519 		return -EINVAL;
520 
521 	if (get_user(val, (int __user *)optval))
522 		return -EFAULT;
523 
524 	if (optname == DCCP_SOCKOPT_SERVICE)
525 		return dccp_setsockopt_service(sk, val, optval, optlen);
526 
527 	lock_sock(sk);
528 	switch (optname) {
529 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
530 		if (dp->dccps_role != DCCP_ROLE_SERVER)
531 			err = -EOPNOTSUPP;
532 		else
533 			dp->dccps_server_timewait = (val != 0);
534 		break;
535 	case DCCP_SOCKOPT_SEND_CSCOV:
536 		err = dccp_setsockopt_cscov(sk, val, false);
537 		break;
538 	case DCCP_SOCKOPT_RECV_CSCOV:
539 		err = dccp_setsockopt_cscov(sk, val, true);
540 		break;
541 	case DCCP_SOCKOPT_QPOLICY_ID:
542 		if (sk->sk_state != DCCP_CLOSED)
543 			err = -EISCONN;
544 		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
545 			err = -EINVAL;
546 		else
547 			dp->dccps_qpolicy = val;
548 		break;
549 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
550 		if (val < 0)
551 			err = -EINVAL;
552 		else
553 			dp->dccps_tx_qlen = val;
554 		break;
555 	default:
556 		err = -ENOPROTOOPT;
557 		break;
558 	}
559 	release_sock(sk);
560 
561 	return err;
562 }
563 
564 int dccp_setsockopt(struct sock *sk, int level, int optname,
565 		    char __user *optval, unsigned int optlen)
566 {
567 	if (level != SOL_DCCP)
568 		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
569 							     optname, optval,
570 							     optlen);
571 	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
572 }
573 
574 EXPORT_SYMBOL_GPL(dccp_setsockopt);
575 
576 #ifdef CONFIG_COMPAT
577 int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
578 			   char __user *optval, unsigned int optlen)
579 {
580 	if (level != SOL_DCCP)
581 		return inet_csk_compat_setsockopt(sk, level, optname,
582 						  optval, optlen);
583 	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
584 }
585 
586 EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
587 #endif
588 
589 static int dccp_getsockopt_service(struct sock *sk, int len,
590 				   __be32 __user *optval,
591 				   int __user *optlen)
592 {
593 	const struct dccp_sock *dp = dccp_sk(sk);
594 	const struct dccp_service_list *sl;
595 	int err = -ENOENT, slen = 0, total_len = sizeof(u32);
596 
597 	lock_sock(sk);
598 	if ((sl = dp->dccps_service_list) != NULL) {
599 		slen = sl->dccpsl_nr * sizeof(u32);
600 		total_len += slen;
601 	}
602 
603 	err = -EINVAL;
604 	if (total_len > len)
605 		goto out;
606 
607 	err = 0;
608 	if (put_user(total_len, optlen) ||
609 	    put_user(dp->dccps_service, optval) ||
610 	    (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
611 		err = -EFAULT;
612 out:
613 	release_sock(sk);
614 	return err;
615 }
616 
617 static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
618 		    char __user *optval, int __user *optlen)
619 {
620 	struct dccp_sock *dp;
621 	int val, len;
622 
623 	if (get_user(len, optlen))
624 		return -EFAULT;
625 
626 	if (len < (int)sizeof(int))
627 		return -EINVAL;
628 
629 	dp = dccp_sk(sk);
630 
631 	switch (optname) {
632 	case DCCP_SOCKOPT_PACKET_SIZE:
633 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
634 		return 0;
635 	case DCCP_SOCKOPT_SERVICE:
636 		return dccp_getsockopt_service(sk, len,
637 					       (__be32 __user *)optval, optlen);
638 	case DCCP_SOCKOPT_GET_CUR_MPS:
639 		val = dp->dccps_mss_cache;
640 		break;
641 	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
642 		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
643 	case DCCP_SOCKOPT_TX_CCID:
644 		val = ccid_get_current_tx_ccid(dp);
645 		if (val < 0)
646 			return -ENOPROTOOPT;
647 		break;
648 	case DCCP_SOCKOPT_RX_CCID:
649 		val = ccid_get_current_rx_ccid(dp);
650 		if (val < 0)
651 			return -ENOPROTOOPT;
652 		break;
653 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
654 		val = dp->dccps_server_timewait;
655 		break;
656 	case DCCP_SOCKOPT_SEND_CSCOV:
657 		val = dp->dccps_pcslen;
658 		break;
659 	case DCCP_SOCKOPT_RECV_CSCOV:
660 		val = dp->dccps_pcrlen;
661 		break;
662 	case DCCP_SOCKOPT_QPOLICY_ID:
663 		val = dp->dccps_qpolicy;
664 		break;
665 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
666 		val = dp->dccps_tx_qlen;
667 		break;
668 	case 128 ... 191:
669 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
670 					     len, (u32 __user *)optval, optlen);
671 	case 192 ... 255:
672 		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
673 					     len, (u32 __user *)optval, optlen);
674 	default:
675 		return -ENOPROTOOPT;
676 	}
677 
678 	len = sizeof(val);
679 	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
680 		return -EFAULT;
681 
682 	return 0;
683 }
684 
685 int dccp_getsockopt(struct sock *sk, int level, int optname,
686 		    char __user *optval, int __user *optlen)
687 {
688 	if (level != SOL_DCCP)
689 		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
690 							     optname, optval,
691 							     optlen);
692 	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
693 }
694 
695 EXPORT_SYMBOL_GPL(dccp_getsockopt);
696 
697 #ifdef CONFIG_COMPAT
698 int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
699 			   char __user *optval, int __user *optlen)
700 {
701 	if (level != SOL_DCCP)
702 		return inet_csk_compat_getsockopt(sk, level, optname,
703 						  optval, optlen);
704 	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
705 }
706 
707 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
708 #endif
709 
710 static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
711 {
712 	struct cmsghdr *cmsg;
713 
714 	/*
715 	 * Assign an (opaque) qpolicy priority value to skb->priority.
716 	 *
717 	 * We are overloading this skb field for use with the qpolicy subystem.
718 	 * The skb->priority is normally used for the SO_PRIORITY option, which
719 	 * is initialised from sk_priority. Since the assignment of sk_priority
720 	 * to skb->priority happens later (on layer 3), we overload this field
721 	 * for use with queueing priorities as long as the skb is on layer 4.
722 	 * The default priority value (if nothing is set) is 0.
723 	 */
724 	skb->priority = 0;
725 
726 	for_each_cmsghdr(cmsg, msg) {
727 		if (!CMSG_OK(msg, cmsg))
728 			return -EINVAL;
729 
730 		if (cmsg->cmsg_level != SOL_DCCP)
731 			continue;
732 
733 		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
734 		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
735 			return -EINVAL;
736 
737 		switch (cmsg->cmsg_type) {
738 		case DCCP_SCM_PRIORITY:
739 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
740 				return -EINVAL;
741 			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
742 			break;
743 		default:
744 			return -EINVAL;
745 		}
746 	}
747 	return 0;
748 }
749 
750 int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
751 {
752 	const struct dccp_sock *dp = dccp_sk(sk);
753 	const int flags = msg->msg_flags;
754 	const int noblock = flags & MSG_DONTWAIT;
755 	struct sk_buff *skb;
756 	int rc, size;
757 	long timeo;
758 
759 	if (len > dp->dccps_mss_cache)
760 		return -EMSGSIZE;
761 
762 	lock_sock(sk);
763 
764 	if (dccp_qpolicy_full(sk)) {
765 		rc = -EAGAIN;
766 		goto out_release;
767 	}
768 
769 	timeo = sock_sndtimeo(sk, noblock);
770 
771 	/*
772 	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
773 	 * so that the trick in dccp_rcv_request_sent_state_process.
774 	 */
775 	/* Wait for a connection to finish. */
776 	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
777 		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
778 			goto out_release;
779 
780 	size = sk->sk_prot->max_header + len;
781 	release_sock(sk);
782 	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
783 	lock_sock(sk);
784 	if (skb == NULL)
785 		goto out_release;
786 
787 	skb_reserve(skb, sk->sk_prot->max_header);
788 	rc = memcpy_from_msg(skb_put(skb, len), msg, len);
789 	if (rc != 0)
790 		goto out_discard;
791 
792 	rc = dccp_msghdr_parse(msg, skb);
793 	if (rc != 0)
794 		goto out_discard;
795 
796 	dccp_qpolicy_push(sk, skb);
797 	/*
798 	 * The xmit_timer is set if the TX CCID is rate-based and will expire
799 	 * when congestion control permits to release further packets into the
800 	 * network. Window-based CCIDs do not use this timer.
801 	 */
802 	if (!timer_pending(&dp->dccps_xmit_timer))
803 		dccp_write_xmit(sk);
804 out_release:
805 	release_sock(sk);
806 	return rc ? : len;
807 out_discard:
808 	kfree_skb(skb);
809 	goto out_release;
810 }
811 
812 EXPORT_SYMBOL_GPL(dccp_sendmsg);
813 
/*
 * dccp_recvmsg  -  receive one datagram
 * @sk:       socket
 * @msg:      destination message; MSG_TRUNC is set in msg_flags when the
 *            packet is longer than @len
 * @len:      size of the caller's buffer
 * @nonblock: passed to sock_rcvtimeo() to select the receive timeout
 * @flags:    MSG_PEEK and MSG_TRUNC are examined
 * @addr_len: not used by this function
 *
 * Returns the number of bytes copied (or the full packet length when
 * MSG_TRUNC was requested), 0 when the connection is closing, has been
 * reset or shut down for reading, or a negative errno.
 */
int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		 int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	/* A listening socket never carries data */
	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		/* Nothing queued: decide whether to fail, finish or wait */
		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		switch (dh->dccph_type) {
		case DCCP_PKT_DATA:
		case DCCP_PKT_DATAACK:
			goto found_ok_skb;

		case DCCP_PKT_CLOSE:
		case DCCP_PKT_CLOSEREQ:
			/* Peeking must not advance the closing handshake */
			if (!(flags & MSG_PEEK))
				dccp_finish_passive_close(sk);
			/* fall through */
		case DCCP_PKT_RESET:
			dccp_pr_debug("found fin (%s) ok!\n",
				      dccp_packet_name(dh->dccph_type));
			len = 0;
			goto found_fin_ok;
		default:
			/* Any other packet type is dropped from the queue */
			dccp_pr_debug("packet_type=%s\n",
				      dccp_packet_name(dh->dccph_type));
			sk_eat_skb(sk, skb);
		}
verify_sock_status:
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		/* Timeout of zero: do not wait */
		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		sk_wait_data(sk, &timeo, NULL);
		continue;
	found_ok_skb:
		/* Copy at most one packet; flag truncation to the caller */
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_msg(skb, 0, msg, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
		/* With MSG_TRUNC the caller gets the real packet length */
		if (flags & MSG_TRUNC)
			len = skb->len;
	found_fin_ok:
		/* The packet is consumed unless the caller only peeked */
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);
921 
922 int inet_dccp_listen(struct socket *sock, int backlog)
923 {
924 	struct sock *sk = sock->sk;
925 	unsigned char old_state;
926 	int err;
927 
928 	lock_sock(sk);
929 
930 	err = -EINVAL;
931 	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
932 		goto out;
933 
934 	old_state = sk->sk_state;
935 	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
936 		goto out;
937 
938 	/* Really, if the socket is already in listen state
939 	 * we can only allow the backlog to be adjusted.
940 	 */
941 	if (old_state != DCCP_LISTEN) {
942 		/*
943 		 * FIXME: here it probably should be sk->sk_prot->listen_start
944 		 * see tcp_listen_start
945 		 */
946 		err = dccp_listen_start(sk, backlog);
947 		if (err)
948 			goto out;
949 	}
950 	sk->sk_max_ack_backlog = backlog;
951 	err = 0;
952 
953 out:
954 	release_sock(sk);
955 	return err;
956 }
957 
958 EXPORT_SYMBOL_GPL(inet_dccp_listen);
959 
960 static void dccp_terminate_connection(struct sock *sk)
961 {
962 	u8 next_state = DCCP_CLOSED;
963 
964 	switch (sk->sk_state) {
965 	case DCCP_PASSIVE_CLOSE:
966 	case DCCP_PASSIVE_CLOSEREQ:
967 		dccp_finish_passive_close(sk);
968 		break;
969 	case DCCP_PARTOPEN:
970 		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
971 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
972 		/* fall through */
973 	case DCCP_OPEN:
974 		dccp_send_close(sk, 1);
975 
976 		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
977 		    !dccp_sk(sk)->dccps_server_timewait)
978 			next_state = DCCP_ACTIVE_CLOSEREQ;
979 		else
980 			next_state = DCCP_CLOSING;
981 		/* fall through */
982 	default:
983 		dccp_set_state(sk, next_state);
984 	}
985 }
986 
/*
 * dccp_close  -  close() for DCCP sockets
 * @sk:      socket being closed by its owner
 * @timeout: time allowed for flushing the write queue and for
 *           sk_stream_wait_close()
 *
 * A listening socket is simply stopped. Otherwise unread receive data is
 * discarded and the connection is either aborted (unread data, or linger
 * enabled with zero linger time) or terminated normally. The socket is
 * then orphaned and, if it has reached CLOSED, destroyed.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct sk_buff *skb;
	u32 data_was_unread = 0;
	int state;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	sk_stop_timer(sk, &dp->dccps_xmit_timer);

	/*
	 * We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		data_was_unread += skb->len;
		__kfree_skb(skb);
	}

	/* If socket has been already reset kill it. */
	if (sk->sk_state == DCCP_CLOSED)
		goto adjudge_to_death;

	if (data_was_unread) {
		/* Unread data was tossed, send an appropriate Reset Code */
		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
		dccp_set_state(sk, DCCP_CLOSED);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (sk->sk_state != DCCP_CLOSED) {
		/*
		 * Normal connection termination. May need to wait if there are
		 * still packets in the TX queue that are delayed by the CCID.
		 */
		dccp_flush_write_queue(sk, &timeout);
		dccp_terminate_connection(sk);
	}

	/*
	 * Flush write queue. This may be necessary in several cases:
	 * - we have been closed by the peer but still have application data;
	 * - abortive termination (unread data or zero linger time),
	 * - normal termination but queue could not be flushed within time limit
	 */
	__skb_queue_purge(&sk->sk_write_queue);

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* Remember the state before the lock is dropped (compared below) */
	state = sk->sk_state;
	/* Extra reference, dropped by the sock_put() at the end */
	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
		goto out;

	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}

EXPORT_SYMBOL_GPL(dccp_close);
1085 
/*
 * dccp_shutdown  -  shutdown() hook for DCCP sockets
 * @sk:  socket (not used)
 * @how: the shutdown mode requested by the caller
 *
 * Only emits a debug message; no socket state is changed here.
 */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("called shutdown(%x)\n", how);
}

EXPORT_SYMBOL_GPL(dccp_shutdown);
1092 
1093 static inline int __init dccp_mib_init(void)
1094 {
1095 	dccp_statistics = alloc_percpu(struct dccp_mib);
1096 	if (!dccp_statistics)
1097 		return -ENOMEM;
1098 	return 0;
1099 }
1100 
/* Free the per-CPU MIB counters allocated by dccp_mib_init(). */
static inline void dccp_mib_exit(void)
{
	free_percpu(dccp_statistics);
}
1105 
/*
 * Optional override for the size of the established hash table; when
 * non-zero, dccp_init() derives its allocation goal from this value.
 * Read-only after load (mode 0444).
 */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/* Debug message switch, writable at run time (mode 0644). */
bool dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif
1117 
/*
 * dccp_init  -  module initialisation
 *
 * Sets up, in this order: the orphan counter, the bind bucket slab cache,
 * the established hash table and its locks, the bind hash table, the MIB
 * counters, the Ack Vector code, the sysctl interface and the built-in
 * CCIDs. On failure everything set up so far is released in reverse order
 * and the hash table pointers are reset to NULL.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init dccp_init(void)
{
	unsigned long goal;
	int ehash_order, bhash_order, i;
	int rc;

	/* The DCCP control block must fit into the skb's cb[] area */
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));
	rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
	if (rc)
		goto out_fail;
	/* Error code for the allocation failures that follow */
	rc = -ENOBUFS;
	inet_hashinfo_init(&dccp_hashinfo);
	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!dccp_hashinfo.bind_bucket_cachep)
		goto out_free_percpu;

	/*
	 * Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	if (totalram_pages >= (128 * 1024))
		goal = totalram_pages >> (21 - PAGE_SHIFT);
	else
		goal = totalram_pages >> (23 - PAGE_SHIFT);

	/* A thash_entries module parameter overrides the computed goal */
	if (thash_entries)
		goal = (thash_entries *
			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
	/* Smallest page order whose page count reaches the goal */
	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
		;
	/* Try the allocation, lowering the order each time it fails */
	do {
		unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
					sizeof(struct inet_ehash_bucket);

		/* Round down to a power of two so that size - 1 is a mask */
		while (hash_size & (hash_size - 1))
			hash_size--;
		dccp_hashinfo.ehash_mask = hash_size - 1;
		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
	} while (!dccp_hashinfo.ehash && --ehash_order > 0);

	if (!dccp_hashinfo.ehash) {
		DCCP_CRIT("Failed to allocate DCCP established hash table");
		goto out_free_bind_bucket_cachep;
	}

	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);

	if (inet_ehash_locks_alloc(&dccp_hashinfo))
			goto out_free_dccp_ehash;

	/* The bind hash starts from the order that worked for the ehash */
	bhash_order = ehash_order;

	do {
		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
					sizeof(struct inet_bind_hashbucket);
		/* Skip sizes above 64K buckets while the order can shrink */
		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
		    bhash_order > 0)
			continue;
		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);

	if (!dccp_hashinfo.bhash) {
		DCCP_CRIT("Failed to allocate DCCP bind hash table");
		goto out_free_dccp_locks;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
	}

	rc = dccp_mib_init();
	if (rc)
		goto out_free_dccp_bhash;

	rc = dccp_ackvec_init();
	if (rc)
		goto out_free_dccp_mib;

	rc = dccp_sysctl_init();
	if (rc)
		goto out_ackvec_exit;

	rc = ccid_initialize_builtins();
	if (rc)
		goto out_sysctl_exit;

	dccp_timestamping_init();

	return 0;

	/* Error unwinding: each label undoes one step and falls through */
out_sysctl_exit:
	dccp_sysctl_exit();
out_ackvec_exit:
	dccp_ackvec_exit();
out_free_dccp_mib:
	dccp_mib_exit();
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
	inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind_bucket_cachep:
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_percpu:
	percpu_counter_destroy(&dccp_orphan_count);
out_fail:
	dccp_hashinfo.bhash = NULL;
	dccp_hashinfo.ehash = NULL;
	dccp_hashinfo.bind_bucket_cachep = NULL;
	return rc;
}
1240 
/*
 * dccp_fini  -  module exit
 *
 * Releases what dccp_init() set up: built-in CCIDs, MIB counters, both
 * hash tables (the page order is recomputed from the stored table sizes),
 * the ehash locks, the bind bucket cache, the Ack Vector code, the sysctl
 * interface and the orphan counter.
 */
static void __exit dccp_fini(void)
{
	ccid_cleanup_builtins();
	dccp_mib_exit();
	free_pages((unsigned long)dccp_hashinfo.bhash,
		   get_order(dccp_hashinfo.bhash_size *
			     sizeof(struct inet_bind_hashbucket)));
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order((dccp_hashinfo.ehash_mask + 1) *
			     sizeof(struct inet_ehash_bucket)));
	inet_ehash_locks_free(&dccp_hashinfo);
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_ackvec_exit();
	dccp_sysctl_exit();
	percpu_counter_destroy(&dccp_orphan_count);
}
1257 
/* Register dccp_init()/dccp_fini() as the module's entry and exit points. */
module_init(dccp_init);
module_exit(dccp_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
1264