xref: /openbmc/linux/net/dccp/proto.c (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *	This program is free software; you can redistribute it and/or modify it
8  *	under the terms of the GNU General Public License version 2 as
9  *	published by the Free Software Foundation.
10  */
11 
12 #include <linux/dccp.h>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/skbuff.h>
18 #include <linux/netdevice.h>
19 #include <linux/in.h>
20 #include <linux/if_arp.h>
21 #include <linux/init.h>
22 #include <linux/random.h>
23 #include <linux/slab.h>
24 #include <net/checksum.h>
25 
26 #include <net/inet_sock.h>
27 #include <net/sock.h>
28 #include <net/xfrm.h>
29 
30 #include <asm/ioctls.h>
31 #include <linux/spinlock.h>
32 #include <linux/timer.h>
33 #include <linux/delay.h>
34 #include <linux/poll.h>
35 
36 #include "ccid.h"
37 #include "dccp.h"
38 #include "feat.h"
39 
/*
 * DCCP MIB counters. dccp_mib_init()/dccp_mib_exit() below hand this object
 * to snmp_mib_init()/snmp_mib_free() as per-CPU storage.
 */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Per-CPU counter for orphaned sockets; set up by dccp_init(). */
struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);

/*
 * Protocol-wide hash tables. dccp_init() allocates the established (ehash)
 * and bind (bhash) tables that hang off this structure.
 */
struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/*
 * the maximum queue length for tx in packets. 0 is no limit
 * dccp_init_sock() copies this value into each socket's dccps_tx_qlen.
 */
int sysctl_dccp_tx_qlen __read_mostly = 5;
52 
53 #ifdef CONFIG_IP_DCCP_DEBUG
54 static const char *dccp_state_name(const int state)
55 {
56 	static const char *const dccp_state_names[] = {
57 	[DCCP_OPEN]		= "OPEN",
58 	[DCCP_REQUESTING]	= "REQUESTING",
59 	[DCCP_PARTOPEN]		= "PARTOPEN",
60 	[DCCP_LISTEN]		= "LISTEN",
61 	[DCCP_RESPOND]		= "RESPOND",
62 	[DCCP_CLOSING]		= "CLOSING",
63 	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
64 	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
65 	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
66 	[DCCP_TIME_WAIT]	= "TIME_WAIT",
67 	[DCCP_CLOSED]		= "CLOSED",
68 	};
69 
70 	if (state >= DCCP_MAX_STATES)
71 		return "INVALID STATE!";
72 	else
73 		return dccp_state_names[state];
74 }
75 #endif
76 
77 void dccp_set_state(struct sock *sk, const int state)
78 {
79 	const int oldstate = sk->sk_state;
80 
81 	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
82 		      dccp_state_name(oldstate), dccp_state_name(state));
83 	WARN_ON(state == oldstate);
84 
85 	switch (state) {
86 	case DCCP_OPEN:
87 		if (oldstate != DCCP_OPEN)
88 			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
89 		/* Client retransmits all Confirm options until entering OPEN */
90 		if (oldstate == DCCP_PARTOPEN)
91 			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
92 		break;
93 
94 	case DCCP_CLOSED:
95 		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
96 		    oldstate == DCCP_CLOSING)
97 			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
98 
99 		sk->sk_prot->unhash(sk);
100 		if (inet_csk(sk)->icsk_bind_hash != NULL &&
101 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
102 			inet_put_port(sk);
103 		/* fall through */
104 	default:
105 		if (oldstate == DCCP_OPEN)
106 			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
107 	}
108 
109 	/* Change state AFTER socket is unhashed to avoid closed
110 	 * socket sitting in hash tables.
111 	 */
112 	sk->sk_state = state;
113 }
114 
115 EXPORT_SYMBOL_GPL(dccp_set_state);
116 
117 static void dccp_finish_passive_close(struct sock *sk)
118 {
119 	switch (sk->sk_state) {
120 	case DCCP_PASSIVE_CLOSE:
121 		/* Node (client or server) has received Close packet. */
122 		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
123 		dccp_set_state(sk, DCCP_CLOSED);
124 		break;
125 	case DCCP_PASSIVE_CLOSEREQ:
126 		/*
127 		 * Client received CloseReq. We set the `active' flag so that
128 		 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
129 		 */
130 		dccp_send_close(sk, 1);
131 		dccp_set_state(sk, DCCP_CLOSING);
132 	}
133 }
134 
135 void dccp_done(struct sock *sk)
136 {
137 	dccp_set_state(sk, DCCP_CLOSED);
138 	dccp_clear_xmit_timers(sk);
139 
140 	sk->sk_shutdown = SHUTDOWN_MASK;
141 
142 	if (!sock_flag(sk, SOCK_DEAD))
143 		sk->sk_state_change(sk);
144 	else
145 		inet_csk_destroy_sock(sk);
146 }
147 
148 EXPORT_SYMBOL_GPL(dccp_done);
149 
150 const char *dccp_packet_name(const int type)
151 {
152 	static const char *const dccp_packet_names[] = {
153 		[DCCP_PKT_REQUEST]  = "REQUEST",
154 		[DCCP_PKT_RESPONSE] = "RESPONSE",
155 		[DCCP_PKT_DATA]	    = "DATA",
156 		[DCCP_PKT_ACK]	    = "ACK",
157 		[DCCP_PKT_DATAACK]  = "DATAACK",
158 		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
159 		[DCCP_PKT_CLOSE]    = "CLOSE",
160 		[DCCP_PKT_RESET]    = "RESET",
161 		[DCCP_PKT_SYNC]	    = "SYNC",
162 		[DCCP_PKT_SYNCACK]  = "SYNCACK",
163 	};
164 
165 	if (type >= DCCP_NR_PKT_TYPES)
166 		return "INVALID";
167 	else
168 		return dccp_packet_names[type];
169 }
170 
171 EXPORT_SYMBOL_GPL(dccp_packet_name);
172 
173 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
174 {
175 	struct dccp_sock *dp = dccp_sk(sk);
176 	struct inet_connection_sock *icsk = inet_csk(sk);
177 
178 	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
179 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
180 	sk->sk_state		= DCCP_CLOSED;
181 	sk->sk_write_space	= dccp_write_space;
182 	icsk->icsk_sync_mss	= dccp_sync_mss;
183 	dp->dccps_mss_cache	= 536;
184 	dp->dccps_rate_last	= jiffies;
185 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
186 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
187 	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
188 	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
189 
190 	dccp_init_xmit_timers(sk);
191 
192 	INIT_LIST_HEAD(&dp->dccps_featneg);
193 	/* control socket doesn't need feat nego */
194 	if (likely(ctl_sock_initialized))
195 		return dccp_feat_init(sk);
196 	return 0;
197 }
198 
199 EXPORT_SYMBOL_GPL(dccp_init_sock);
200 
201 void dccp_destroy_sock(struct sock *sk)
202 {
203 	struct dccp_sock *dp = dccp_sk(sk);
204 
205 	/*
206 	 * DCCP doesn't use sk_write_queue, just sk_send_head
207 	 * for retransmissions
208 	 */
209 	if (sk->sk_send_head != NULL) {
210 		kfree_skb(sk->sk_send_head);
211 		sk->sk_send_head = NULL;
212 	}
213 
214 	/* Clean up a referenced DCCP bind bucket. */
215 	if (inet_csk(sk)->icsk_bind_hash != NULL)
216 		inet_put_port(sk);
217 
218 	kfree(dp->dccps_service_list);
219 	dp->dccps_service_list = NULL;
220 
221 	if (dp->dccps_hc_rx_ackvec != NULL) {
222 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
223 		dp->dccps_hc_rx_ackvec = NULL;
224 	}
225 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
226 	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
227 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
228 
229 	/* clean up feature negotiation state */
230 	dccp_feat_list_purge(&dp->dccps_featneg);
231 }
232 
233 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
234 
235 static inline int dccp_listen_start(struct sock *sk, int backlog)
236 {
237 	struct dccp_sock *dp = dccp_sk(sk);
238 
239 	dp->dccps_role = DCCP_ROLE_LISTEN;
240 	/* do not start to listen if feature negotiation setup fails */
241 	if (dccp_feat_finalise_settings(dp))
242 		return -EPROTO;
243 	return inet_csk_listen_start(sk, backlog);
244 }
245 
246 static inline int dccp_need_reset(int state)
247 {
248 	return state != DCCP_CLOSED && state != DCCP_LISTEN &&
249 	       state != DCCP_REQUESTING;
250 }
251 
252 int dccp_disconnect(struct sock *sk, int flags)
253 {
254 	struct inet_connection_sock *icsk = inet_csk(sk);
255 	struct inet_sock *inet = inet_sk(sk);
256 	int err = 0;
257 	const int old_state = sk->sk_state;
258 
259 	if (old_state != DCCP_CLOSED)
260 		dccp_set_state(sk, DCCP_CLOSED);
261 
262 	/*
263 	 * This corresponds to the ABORT function of RFC793, sec. 3.8
264 	 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
265 	 */
266 	if (old_state == DCCP_LISTEN) {
267 		inet_csk_listen_stop(sk);
268 	} else if (dccp_need_reset(old_state)) {
269 		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
270 		sk->sk_err = ECONNRESET;
271 	} else if (old_state == DCCP_REQUESTING)
272 		sk->sk_err = ECONNRESET;
273 
274 	dccp_clear_xmit_timers(sk);
275 
276 	__skb_queue_purge(&sk->sk_receive_queue);
277 	__skb_queue_purge(&sk->sk_write_queue);
278 	if (sk->sk_send_head != NULL) {
279 		__kfree_skb(sk->sk_send_head);
280 		sk->sk_send_head = NULL;
281 	}
282 
283 	inet->inet_dport = 0;
284 
285 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
286 		inet_reset_saddr(sk);
287 
288 	sk->sk_shutdown = 0;
289 	sock_reset_flag(sk, SOCK_DONE);
290 
291 	icsk->icsk_backoff = 0;
292 	inet_csk_delack_init(sk);
293 	__sk_dst_reset(sk);
294 
295 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
296 
297 	sk->sk_error_report(sk);
298 	return err;
299 }
300 
301 EXPORT_SYMBOL_GPL(dccp_disconnect);
302 
/*
 *	Wait for a DCCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 *
 *	@file: file the poll was issued on, passed on to sock_poll_wait()
 *	@sock: socket being polled
 *	@wait: poll table, passed on to sock_poll_wait()
 *
 *	Returns the POLL* event mask. For a listening socket the mask comes
 *	from inet_csk_listen_poll() instead.
 */
unsigned int dccp_poll(struct file *file, struct socket *sock,
		       poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;

	sock_poll_wait(file, sk_sleep(sk), wait);
	if (sk->sk_state == DCCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by another threads is impossible in any case.
	 */

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/* Hang-up: both directions shut down, or the socket is closed. */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
		mask |= POLLHUP;
	/* A receive-side shutdown is reported as readable plus POLLRDHUP. */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM | POLLRDHUP;

	/* Connected? */
	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
		/* Readable as soon as any receive memory is in use. */
		if (atomic_read(&sk->sk_rmem_alloc) > 0)
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}
	}
	return mask;
}
358 
359 EXPORT_SYMBOL_GPL(dccp_poll);
360 
361 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
362 {
363 	int rc = -ENOTCONN;
364 
365 	lock_sock(sk);
366 
367 	if (sk->sk_state == DCCP_LISTEN)
368 		goto out;
369 
370 	switch (cmd) {
371 	case SIOCINQ: {
372 		struct sk_buff *skb;
373 		unsigned long amount = 0;
374 
375 		skb = skb_peek(&sk->sk_receive_queue);
376 		if (skb != NULL) {
377 			/*
378 			 * We will only return the amount of this packet since
379 			 * that is all that will be read.
380 			 */
381 			amount = skb->len;
382 		}
383 		rc = put_user(amount, (int __user *)arg);
384 	}
385 		break;
386 	default:
387 		rc = -ENOIOCTLCMD;
388 		break;
389 	}
390 out:
391 	release_sock(sk);
392 	return rc;
393 }
394 
395 EXPORT_SYMBOL_GPL(dccp_ioctl);
396 
397 static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
398 				   char __user *optval, unsigned int optlen)
399 {
400 	struct dccp_sock *dp = dccp_sk(sk);
401 	struct dccp_service_list *sl = NULL;
402 
403 	if (service == DCCP_SERVICE_INVALID_VALUE ||
404 	    optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
405 		return -EINVAL;
406 
407 	if (optlen > sizeof(service)) {
408 		sl = kmalloc(optlen, GFP_KERNEL);
409 		if (sl == NULL)
410 			return -ENOMEM;
411 
412 		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
413 		if (copy_from_user(sl->dccpsl_list,
414 				   optval + sizeof(service),
415 				   optlen - sizeof(service)) ||
416 		    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
417 			kfree(sl);
418 			return -EFAULT;
419 		}
420 	}
421 
422 	lock_sock(sk);
423 	dp->dccps_service = service;
424 
425 	kfree(dp->dccps_service_list);
426 
427 	dp->dccps_service_list = sl;
428 	release_sock(sk);
429 	return 0;
430 }
431 
432 static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
433 {
434 	u8 *list, len;
435 	int i, rc;
436 
437 	if (cscov < 0 || cscov > 15)
438 		return -EINVAL;
439 	/*
440 	 * Populate a list of permissible values, in the range cscov...15. This
441 	 * is necessary since feature negotiation of single values only works if
442 	 * both sides incidentally choose the same value. Since the list starts
443 	 * lowest-value first, negotiation will pick the smallest shared value.
444 	 */
445 	if (cscov == 0)
446 		return 0;
447 	len = 16 - cscov;
448 
449 	list = kmalloc(len, GFP_KERNEL);
450 	if (list == NULL)
451 		return -ENOBUFS;
452 
453 	for (i = 0; i < len; i++)
454 		list[i] = cscov++;
455 
456 	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
457 
458 	if (rc == 0) {
459 		if (rx)
460 			dccp_sk(sk)->dccps_pcrlen = cscov;
461 		else
462 			dccp_sk(sk)->dccps_pcslen = cscov;
463 	}
464 	kfree(list);
465 	return rc;
466 }
467 
468 static int dccp_setsockopt_ccid(struct sock *sk, int type,
469 				char __user *optval, unsigned int optlen)
470 {
471 	u8 *val;
472 	int rc = 0;
473 
474 	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
475 		return -EINVAL;
476 
477 	val = memdup_user(optval, optlen);
478 	if (IS_ERR(val))
479 		return PTR_ERR(val);
480 
481 	lock_sock(sk);
482 	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
483 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
484 
485 	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
486 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
487 	release_sock(sk);
488 
489 	kfree(val);
490 	return rc;
491 }
492 
493 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
494 		char __user *optval, unsigned int optlen)
495 {
496 	struct dccp_sock *dp = dccp_sk(sk);
497 	int val, err = 0;
498 
499 	switch (optname) {
500 	case DCCP_SOCKOPT_PACKET_SIZE:
501 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
502 		return 0;
503 	case DCCP_SOCKOPT_CHANGE_L:
504 	case DCCP_SOCKOPT_CHANGE_R:
505 		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
506 		return 0;
507 	case DCCP_SOCKOPT_CCID:
508 	case DCCP_SOCKOPT_RX_CCID:
509 	case DCCP_SOCKOPT_TX_CCID:
510 		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
511 	}
512 
513 	if (optlen < (int)sizeof(int))
514 		return -EINVAL;
515 
516 	if (get_user(val, (int __user *)optval))
517 		return -EFAULT;
518 
519 	if (optname == DCCP_SOCKOPT_SERVICE)
520 		return dccp_setsockopt_service(sk, val, optval, optlen);
521 
522 	lock_sock(sk);
523 	switch (optname) {
524 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
525 		if (dp->dccps_role != DCCP_ROLE_SERVER)
526 			err = -EOPNOTSUPP;
527 		else
528 			dp->dccps_server_timewait = (val != 0);
529 		break;
530 	case DCCP_SOCKOPT_SEND_CSCOV:
531 		err = dccp_setsockopt_cscov(sk, val, false);
532 		break;
533 	case DCCP_SOCKOPT_RECV_CSCOV:
534 		err = dccp_setsockopt_cscov(sk, val, true);
535 		break;
536 	case DCCP_SOCKOPT_QPOLICY_ID:
537 		if (sk->sk_state != DCCP_CLOSED)
538 			err = -EISCONN;
539 		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
540 			err = -EINVAL;
541 		else
542 			dp->dccps_qpolicy = val;
543 		break;
544 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
545 		if (val < 0)
546 			err = -EINVAL;
547 		else
548 			dp->dccps_tx_qlen = val;
549 		break;
550 	default:
551 		err = -ENOPROTOOPT;
552 		break;
553 	}
554 	release_sock(sk);
555 
556 	return err;
557 }
558 
559 int dccp_setsockopt(struct sock *sk, int level, int optname,
560 		    char __user *optval, unsigned int optlen)
561 {
562 	if (level != SOL_DCCP)
563 		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
564 							     optname, optval,
565 							     optlen);
566 	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
567 }
568 
569 EXPORT_SYMBOL_GPL(dccp_setsockopt);
570 
571 #ifdef CONFIG_COMPAT
572 int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
573 			   char __user *optval, unsigned int optlen)
574 {
575 	if (level != SOL_DCCP)
576 		return inet_csk_compat_setsockopt(sk, level, optname,
577 						  optval, optlen);
578 	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
579 }
580 
581 EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
582 #endif
583 
584 static int dccp_getsockopt_service(struct sock *sk, int len,
585 				   __be32 __user *optval,
586 				   int __user *optlen)
587 {
588 	const struct dccp_sock *dp = dccp_sk(sk);
589 	const struct dccp_service_list *sl;
590 	int err = -ENOENT, slen = 0, total_len = sizeof(u32);
591 
592 	lock_sock(sk);
593 	if ((sl = dp->dccps_service_list) != NULL) {
594 		slen = sl->dccpsl_nr * sizeof(u32);
595 		total_len += slen;
596 	}
597 
598 	err = -EINVAL;
599 	if (total_len > len)
600 		goto out;
601 
602 	err = 0;
603 	if (put_user(total_len, optlen) ||
604 	    put_user(dp->dccps_service, optval) ||
605 	    (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
606 		err = -EFAULT;
607 out:
608 	release_sock(sk);
609 	return err;
610 }
611 
612 static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
613 		    char __user *optval, int __user *optlen)
614 {
615 	struct dccp_sock *dp;
616 	int val, len;
617 
618 	if (get_user(len, optlen))
619 		return -EFAULT;
620 
621 	if (len < (int)sizeof(int))
622 		return -EINVAL;
623 
624 	dp = dccp_sk(sk);
625 
626 	switch (optname) {
627 	case DCCP_SOCKOPT_PACKET_SIZE:
628 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
629 		return 0;
630 	case DCCP_SOCKOPT_SERVICE:
631 		return dccp_getsockopt_service(sk, len,
632 					       (__be32 __user *)optval, optlen);
633 	case DCCP_SOCKOPT_GET_CUR_MPS:
634 		val = dp->dccps_mss_cache;
635 		break;
636 	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
637 		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
638 	case DCCP_SOCKOPT_TX_CCID:
639 		val = ccid_get_current_tx_ccid(dp);
640 		if (val < 0)
641 			return -ENOPROTOOPT;
642 		break;
643 	case DCCP_SOCKOPT_RX_CCID:
644 		val = ccid_get_current_rx_ccid(dp);
645 		if (val < 0)
646 			return -ENOPROTOOPT;
647 		break;
648 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
649 		val = dp->dccps_server_timewait;
650 		break;
651 	case DCCP_SOCKOPT_SEND_CSCOV:
652 		val = dp->dccps_pcslen;
653 		break;
654 	case DCCP_SOCKOPT_RECV_CSCOV:
655 		val = dp->dccps_pcrlen;
656 		break;
657 	case DCCP_SOCKOPT_QPOLICY_ID:
658 		val = dp->dccps_qpolicy;
659 		break;
660 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
661 		val = dp->dccps_tx_qlen;
662 		break;
663 	case 128 ... 191:
664 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
665 					     len, (u32 __user *)optval, optlen);
666 	case 192 ... 255:
667 		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
668 					     len, (u32 __user *)optval, optlen);
669 	default:
670 		return -ENOPROTOOPT;
671 	}
672 
673 	len = sizeof(val);
674 	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
675 		return -EFAULT;
676 
677 	return 0;
678 }
679 
680 int dccp_getsockopt(struct sock *sk, int level, int optname,
681 		    char __user *optval, int __user *optlen)
682 {
683 	if (level != SOL_DCCP)
684 		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
685 							     optname, optval,
686 							     optlen);
687 	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
688 }
689 
690 EXPORT_SYMBOL_GPL(dccp_getsockopt);
691 
692 #ifdef CONFIG_COMPAT
693 int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
694 			   char __user *optval, int __user *optlen)
695 {
696 	if (level != SOL_DCCP)
697 		return inet_csk_compat_getsockopt(sk, level, optname,
698 						  optval, optlen);
699 	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
700 }
701 
702 EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
703 #endif
704 
705 static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
706 {
707 	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
708 
709 	/*
710 	 * Assign an (opaque) qpolicy priority value to skb->priority.
711 	 *
712 	 * We are overloading this skb field for use with the qpolicy subystem.
713 	 * The skb->priority is normally used for the SO_PRIORITY option, which
714 	 * is initialised from sk_priority. Since the assignment of sk_priority
715 	 * to skb->priority happens later (on layer 3), we overload this field
716 	 * for use with queueing priorities as long as the skb is on layer 4.
717 	 * The default priority value (if nothing is set) is 0.
718 	 */
719 	skb->priority = 0;
720 
721 	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
722 
723 		if (!CMSG_OK(msg, cmsg))
724 			return -EINVAL;
725 
726 		if (cmsg->cmsg_level != SOL_DCCP)
727 			continue;
728 
729 		if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
730 		    !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
731 			return -EINVAL;
732 
733 		switch (cmsg->cmsg_type) {
734 		case DCCP_SCM_PRIORITY:
735 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
736 				return -EINVAL;
737 			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
738 			break;
739 		default:
740 			return -EINVAL;
741 		}
742 	}
743 	return 0;
744 }
745 
/**
 * dccp_sendmsg  -  queue one datagram for transmission
 * @iocb: not referenced in this function
 * @sk:   sending socket
 * @msg:  message header with the payload iovec, flags and control messages
 * @len:  payload length in bytes
 *
 * Returns @len on success or a negative errno: -EMSGSIZE if @len exceeds
 * the cached MPS, -EAGAIN if the TX queue is full according to the
 * queueing policy, or an error from waiting for the connection, allocating
 * the skb, copying the payload or parsing the control messages.
 */
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const int flags = msg->msg_flags;
	const int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int rc, size;
	long timeo;

	/* A datagram is never split: it has to fit into a single packet. */
	if (len > dp->dccps_mss_cache)
		return -EMSGSIZE;

	lock_sock(sk);

	if (dccp_qpolicy_full(sk)) {
		rc = -EAGAIN;
		goto out_release;
	}

	timeo = sock_sndtimeo(sk, noblock);

	/*
	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
	 * so that the trick in dccp_rcv_request_sent_state_process works
	 * (NOTE(review): the original sentence was left unfinished; "works"
	 * is the presumed ending - confirm).
	 */
	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_release;

	size = sk->sk_prot->max_header + len;
	/* The socket lock is dropped while the skb is being allocated. */
	release_sock(sk);
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	/* On failure sock_alloc_send_skb() has stored the error in rc. */
	if (skb == NULL)
		goto out_release;

	skb_reserve(skb, sk->sk_prot->max_header);
	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (rc != 0)
		goto out_discard;

	rc = dccp_msghdr_parse(msg, skb);
	if (rc != 0)
		goto out_discard;

	dccp_qpolicy_push(sk, skb);
	/*
	 * The xmit_timer is set if the TX CCID is rate-based and will expire
	 * when congestion control permits to release further packets into the
	 * network. Window-based CCIDs do not use this timer.
	 */
	if (!timer_pending(&dp->dccps_xmit_timer))
		dccp_write_xmit(sk);
out_release:
	release_sock(sk);
	/* rc is 0 on the success path, in which case len is returned. */
	return rc ? : len;
out_discard:
	kfree_skb(skb);
	goto out_release;
}
808 
809 EXPORT_SYMBOL_GPL(dccp_sendmsg);
810 
/**
 * dccp_recvmsg  -  receive one datagram
 * @iocb:     not referenced in this function
 * @sk:       receiving socket
 * @msg:      message header describing the destination iovec
 * @len:      size of the user buffer in bytes
 * @nonblock: non-zero for a non-blocking receive
 * @flags:    MSG_* flags; MSG_PEEK and MSG_TRUNC are examined here
 * @addr_len: not referenced in this function
 *
 * Returns the number of bytes copied (the full packet length if MSG_TRUNC
 * was passed in @flags), 0 when a Close/CloseReq/Reset packet is found or
 * the socket is done or shut down for receiving, or a negative errno.
 * At most one packet is consumed per call.
 */
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len, int nonblock, int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		switch (dh->dccph_type) {
		case DCCP_PKT_DATA:
		case DCCP_PKT_DATAACK:
			goto found_ok_skb;

		case DCCP_PKT_CLOSE:
		case DCCP_PKT_CLOSEREQ:
			/* Peeking must not advance the close handshake. */
			if (!(flags & MSG_PEEK))
				dccp_finish_passive_close(sk);
			/* fall through */
		case DCCP_PKT_RESET:
			dccp_pr_debug("found fin (%s) ok!\n",
				      dccp_packet_name(dh->dccph_type));
			len = 0;
			goto found_fin_ok;
		default:
			/* Any other packet type is dropped from the queue. */
			dccp_pr_debug("packet_type=%s\n",
				      dccp_packet_name(dh->dccph_type));
			sk_eat_skb(sk, skb, 0);
		}
verify_sock_status:
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		/* A zero timeout means the caller does not want to block. */
		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		sk_wait_data(sk, &timeo);
		continue;
	found_ok_skb:
		/* Clamp to the packet size, or flag the truncation. */
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
		/* With MSG_TRUNC the caller is told the real packet length. */
		if (flags & MSG_TRUNC)
			len = skb->len;
	found_fin_ok:
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb, 0);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}
916 
917 EXPORT_SYMBOL_GPL(dccp_recvmsg);
918 
919 int inet_dccp_listen(struct socket *sock, int backlog)
920 {
921 	struct sock *sk = sock->sk;
922 	unsigned char old_state;
923 	int err;
924 
925 	lock_sock(sk);
926 
927 	err = -EINVAL;
928 	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
929 		goto out;
930 
931 	old_state = sk->sk_state;
932 	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
933 		goto out;
934 
935 	/* Really, if the socket is already in listen state
936 	 * we can only allow the backlog to be adjusted.
937 	 */
938 	if (old_state != DCCP_LISTEN) {
939 		/*
940 		 * FIXME: here it probably should be sk->sk_prot->listen_start
941 		 * see tcp_listen_start
942 		 */
943 		err = dccp_listen_start(sk, backlog);
944 		if (err)
945 			goto out;
946 	}
947 	sk->sk_max_ack_backlog = backlog;
948 	err = 0;
949 
950 out:
951 	release_sock(sk);
952 	return err;
953 }
954 
955 EXPORT_SYMBOL_GPL(inet_dccp_listen);
956 
957 static void dccp_terminate_connection(struct sock *sk)
958 {
959 	u8 next_state = DCCP_CLOSED;
960 
961 	switch (sk->sk_state) {
962 	case DCCP_PASSIVE_CLOSE:
963 	case DCCP_PASSIVE_CLOSEREQ:
964 		dccp_finish_passive_close(sk);
965 		break;
966 	case DCCP_PARTOPEN:
967 		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
968 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
969 		/* fall through */
970 	case DCCP_OPEN:
971 		dccp_send_close(sk, 1);
972 
973 		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
974 		    !dccp_sk(sk)->dccps_server_timewait)
975 			next_state = DCCP_ACTIVE_CLOSEREQ;
976 		else
977 			next_state = DCCP_CLOSING;
978 		/* fall through */
979 	default:
980 		dccp_set_state(sk, next_state);
981 	}
982 }
983 
/**
 * dccp_close  -  close() implementation for DCCP sockets
 * @sk:      socket being closed
 * @timeout: time allowed for flushing the TX queue and for
 *           sk_stream_wait_close()
 *
 * A listening socket is closed and its listen state torn down at once.
 * Otherwise unread receive data leads to an abortive close (Reset with
 * code "Aborted"), a zero linger time leads to a disconnect, and in all
 * other cases normal connection termination is started. Afterwards the
 * socket is orphaned and, if it has reached DCCP_CLOSED, destroyed.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct sk_buff *skb;
	u32 data_was_unread = 0;
	int state;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	sk_stop_timer(sk, &dp->dccps_xmit_timer);

	/*
	 * We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		data_was_unread += skb->len;
		__kfree_skb(skb);
	}

	if (data_was_unread) {
		/* Unread data was tossed, send an appropriate Reset Code */
		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
		dccp_set_state(sk, DCCP_CLOSED);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (sk->sk_state != DCCP_CLOSED) {
		/*
		 * Normal connection termination. May need to wait if there are
		 * still packets in the TX queue that are delayed by the CCID.
		 */
		dccp_flush_write_queue(sk, &timeout);
		dccp_terminate_connection(sk);
	}

	/*
	 * Flush write queue. This may be necessary in several cases:
	 * - we have been closed by the peer but still have application data;
	 * - abortive termination (unread data or zero linger time),
	 * - normal termination but queue could not be flushed within time limit
	 */
	__skb_queue_purge(&sk->sk_write_queue);

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* Remember the state as it was while the socket lock was still held. */
	state = sk->sk_state;
	/* Extra reference; dropped by the sock_put() at the end. */
	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
		goto out;

	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
1076 
1077 EXPORT_SYMBOL_GPL(dccp_close);
1078 
/**
 * dccp_shutdown  -  shutdown() hook for DCCP sockets
 * @sk:  socket the call was made on (not touched here)
 * @how: shutdown mode requested by the caller
 *
 * Currently a stub: it only emits a debug message and changes no state.
 */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("called shutdown(%x)\n", how);
}
1083 
1084 EXPORT_SYMBOL_GPL(dccp_shutdown);
1085 
1086 static inline int dccp_mib_init(void)
1087 {
1088 	return snmp_mib_init((void __percpu **)dccp_statistics,
1089 			     sizeof(struct dccp_mib),
1090 			     __alignof__(struct dccp_mib));
1091 }
1092 
1093 static inline void dccp_mib_exit(void)
1094 {
1095 	snmp_mib_free((void __percpu **)dccp_statistics);
1096 }
1097 
/*
 * Module parameter (mode 0444). When non-zero, dccp_init() derives the
 * size goal for the established hash table from it instead of from the
 * amount of RAM.
 */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/* Debug message switch, exposed as a module parameter (mode 0644). */
int dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif
1109 
1110 static int __init dccp_init(void)
1111 {
1112 	unsigned long goal;
1113 	int ehash_order, bhash_order, i;
1114 	int rc;
1115 
1116 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
1117 		     FIELD_SIZEOF(struct sk_buff, cb));
1118 	rc = percpu_counter_init(&dccp_orphan_count, 0);
1119 	if (rc)
1120 		goto out_fail;
1121 	rc = -ENOBUFS;
1122 	inet_hashinfo_init(&dccp_hashinfo);
1123 	dccp_hashinfo.bind_bucket_cachep =
1124 		kmem_cache_create("dccp_bind_bucket",
1125 				  sizeof(struct inet_bind_bucket), 0,
1126 				  SLAB_HWCACHE_ALIGN, NULL);
1127 	if (!dccp_hashinfo.bind_bucket_cachep)
1128 		goto out_free_percpu;
1129 
1130 	/*
1131 	 * Size and allocate the main established and bind bucket
1132 	 * hash tables.
1133 	 *
1134 	 * The methodology is similar to that of the buffer cache.
1135 	 */
1136 	if (totalram_pages >= (128 * 1024))
1137 		goal = totalram_pages >> (21 - PAGE_SHIFT);
1138 	else
1139 		goal = totalram_pages >> (23 - PAGE_SHIFT);
1140 
1141 	if (thash_entries)
1142 		goal = (thash_entries *
1143 			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
1144 	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
1145 		;
1146 	do {
1147 		unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
1148 					sizeof(struct inet_ehash_bucket);
1149 
1150 		while (hash_size & (hash_size - 1))
1151 			hash_size--;
1152 		dccp_hashinfo.ehash_mask = hash_size - 1;
1153 		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
1154 			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
1155 	} while (!dccp_hashinfo.ehash && --ehash_order > 0);
1156 
1157 	if (!dccp_hashinfo.ehash) {
1158 		DCCP_CRIT("Failed to allocate DCCP established hash table");
1159 		goto out_free_bind_bucket_cachep;
1160 	}
1161 
1162 	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
1163 		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
1164 		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
1165 	}
1166 
1167 	if (inet_ehash_locks_alloc(&dccp_hashinfo))
1168 			goto out_free_dccp_ehash;
1169 
1170 	bhash_order = ehash_order;
1171 
1172 	do {
1173 		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
1174 					sizeof(struct inet_bind_hashbucket);
1175 		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
1176 		    bhash_order > 0)
1177 			continue;
1178 		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
1179 			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
1180 	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
1181 
1182 	if (!dccp_hashinfo.bhash) {
1183 		DCCP_CRIT("Failed to allocate DCCP bind hash table");
1184 		goto out_free_dccp_locks;
1185 	}
1186 
1187 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
1188 		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
1189 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
1190 	}
1191 
1192 	rc = dccp_mib_init();
1193 	if (rc)
1194 		goto out_free_dccp_bhash;
1195 
1196 	rc = dccp_ackvec_init();
1197 	if (rc)
1198 		goto out_free_dccp_mib;
1199 
1200 	rc = dccp_sysctl_init();
1201 	if (rc)
1202 		goto out_ackvec_exit;
1203 
1204 	rc = ccid_initialize_builtins();
1205 	if (rc)
1206 		goto out_sysctl_exit;
1207 
1208 	dccp_timestamping_init();
1209 
1210 	return 0;
1211 
1212 out_sysctl_exit:
1213 	dccp_sysctl_exit();
1214 out_ackvec_exit:
1215 	dccp_ackvec_exit();
1216 out_free_dccp_mib:
1217 	dccp_mib_exit();
1218 out_free_dccp_bhash:
1219 	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
1220 out_free_dccp_locks:
1221 	inet_ehash_locks_free(&dccp_hashinfo);
1222 out_free_dccp_ehash:
1223 	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
1224 out_free_bind_bucket_cachep:
1225 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
1226 out_free_percpu:
1227 	percpu_counter_destroy(&dccp_orphan_count);
1228 out_fail:
1229 	dccp_hashinfo.bhash = NULL;
1230 	dccp_hashinfo.ehash = NULL;
1231 	dccp_hashinfo.bind_bucket_cachep = NULL;
1232 	return rc;
1233 }
1234 
1235 static void __exit dccp_fini(void)
1236 {
1237 	ccid_cleanup_builtins();
1238 	dccp_mib_exit();
1239 	free_pages((unsigned long)dccp_hashinfo.bhash,
1240 		   get_order(dccp_hashinfo.bhash_size *
1241 			     sizeof(struct inet_bind_hashbucket)));
1242 	free_pages((unsigned long)dccp_hashinfo.ehash,
1243 		   get_order((dccp_hashinfo.ehash_mask + 1) *
1244 			     sizeof(struct inet_ehash_bucket)));
1245 	inet_ehash_locks_free(&dccp_hashinfo);
1246 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
1247 	dccp_ackvec_exit();
1248 	dccp_sysctl_exit();
1249 	percpu_counter_destroy(&dccp_orphan_count);
1250 }
1251 
1252 module_init(dccp_init);
1253 module_exit(dccp_fini);
1254 
1255 MODULE_LICENSE("GPL");
1256 MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
1257 MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
1258