xref: /openbmc/linux/net/dccp/proto.c (revision 871a2c16c21b988688b4ab1a78eadd969765c0a3)
1 /*
2  *  net/dccp/proto.c
3  *
4  *  An implementation of the DCCP protocol
5  *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
6  *
7  *	This program is free software; you can redistribute it and/or modify it
8  *	under the terms of the GNU General Public License version 2 as
9  *	published by the Free Software Foundation.
10  */
11 
12 #include <linux/dccp.h>
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/sched.h>
16 #include <linux/kernel.h>
17 #include <linux/skbuff.h>
18 #include <linux/netdevice.h>
19 #include <linux/in.h>
20 #include <linux/if_arp.h>
21 #include <linux/init.h>
22 #include <linux/random.h>
23 #include <linux/slab.h>
24 #include <net/checksum.h>
25 
26 #include <net/inet_sock.h>
27 #include <net/sock.h>
28 #include <net/xfrm.h>
29 
30 #include <asm/ioctls.h>
31 #include <linux/spinlock.h>
32 #include <linux/timer.h>
33 #include <linux/delay.h>
34 #include <linux/poll.h>
35 
36 #include "ccid.h"
37 #include "dccp.h"
38 #include "feat.h"
39 
/*
 * DCCP MIB counters. Storage is set up by dccp_mib_init() and released by
 * dccp_mib_exit(); updated through DCCP_INC_STATS()/DCCP_DEC_STATS().
 */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Orphan counter; initialised in dccp_init(), destroyed in dccp_fini(). */
struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);

/*
 * Hash state shared by the DCCP code: the established (ehash) and bind
 * (bhash) tables and the bind-bucket slab cache, all set up in dccp_init().
 */
struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/*
 * The maximum queue length for tx in packets. 0 is no limit.
 * Copied into dccps_tx_qlen of every new socket by dccp_init_sock().
 */
int sysctl_dccp_tx_qlen __read_mostly = 5;
52 
#ifdef CONFIG_IP_DCCP_DEBUG
/**
 * dccp_state_name  -  Map a socket state to a printable name (debug only)
 * @state: socket state value, normally one of the DCCP_* state constants
 *
 * Returns the name registered for @state in the table below, or
 * "INVALID STATE!" when @state lies outside [0, DCCP_MAX_STATES).
 */
static const char *dccp_state_name(const int state)
{
	static const char *const dccp_state_names[] = {
	[DCCP_OPEN]		= "OPEN",
	[DCCP_REQUESTING]	= "REQUESTING",
	[DCCP_PARTOPEN]		= "PARTOPEN",
	[DCCP_LISTEN]		= "LISTEN",
	[DCCP_RESPOND]		= "RESPOND",
	[DCCP_CLOSING]		= "CLOSING",
	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
	[DCCP_TIME_WAIT]	= "TIME_WAIT",
	[DCCP_CLOSED]		= "CLOSED",
	};

	/*
	 * @state is a signed int: a negative value would pass a plain
	 * upper-bound check and index in front of the table.
	 */
	if (state < 0 || state >= DCCP_MAX_STATES)
		return "INVALID STATE!";

	return dccp_state_names[state];
}
#endif
76 
77 void dccp_set_state(struct sock *sk, const int state)
78 {
79 	const int oldstate = sk->sk_state;
80 
81 	dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
82 		      dccp_state_name(oldstate), dccp_state_name(state));
83 	WARN_ON(state == oldstate);
84 
85 	switch (state) {
86 	case DCCP_OPEN:
87 		if (oldstate != DCCP_OPEN)
88 			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
89 		/* Client retransmits all Confirm options until entering OPEN */
90 		if (oldstate == DCCP_PARTOPEN)
91 			dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
92 		break;
93 
94 	case DCCP_CLOSED:
95 		if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
96 		    oldstate == DCCP_CLOSING)
97 			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
98 
99 		sk->sk_prot->unhash(sk);
100 		if (inet_csk(sk)->icsk_bind_hash != NULL &&
101 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
102 			inet_put_port(sk);
103 		/* fall through */
104 	default:
105 		if (oldstate == DCCP_OPEN)
106 			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
107 	}
108 
109 	/* Change state AFTER socket is unhashed to avoid closed
110 	 * socket sitting in hash tables.
111 	 */
112 	sk->sk_state = state;
113 }
114 
115 EXPORT_SYMBOL_GPL(dccp_set_state);
116 
117 static void dccp_finish_passive_close(struct sock *sk)
118 {
119 	switch (sk->sk_state) {
120 	case DCCP_PASSIVE_CLOSE:
121 		/* Node (client or server) has received Close packet. */
122 		dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
123 		dccp_set_state(sk, DCCP_CLOSED);
124 		break;
125 	case DCCP_PASSIVE_CLOSEREQ:
126 		/*
127 		 * Client received CloseReq. We set the `active' flag so that
128 		 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
129 		 */
130 		dccp_send_close(sk, 1);
131 		dccp_set_state(sk, DCCP_CLOSING);
132 	}
133 }
134 
135 void dccp_done(struct sock *sk)
136 {
137 	dccp_set_state(sk, DCCP_CLOSED);
138 	dccp_clear_xmit_timers(sk);
139 
140 	sk->sk_shutdown = SHUTDOWN_MASK;
141 
142 	if (!sock_flag(sk, SOCK_DEAD))
143 		sk->sk_state_change(sk);
144 	else
145 		inet_csk_destroy_sock(sk);
146 }
147 
148 EXPORT_SYMBOL_GPL(dccp_done);
149 
150 const char *dccp_packet_name(const int type)
151 {
152 	static const char *const dccp_packet_names[] = {
153 		[DCCP_PKT_REQUEST]  = "REQUEST",
154 		[DCCP_PKT_RESPONSE] = "RESPONSE",
155 		[DCCP_PKT_DATA]	    = "DATA",
156 		[DCCP_PKT_ACK]	    = "ACK",
157 		[DCCP_PKT_DATAACK]  = "DATAACK",
158 		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
159 		[DCCP_PKT_CLOSE]    = "CLOSE",
160 		[DCCP_PKT_RESET]    = "RESET",
161 		[DCCP_PKT_SYNC]	    = "SYNC",
162 		[DCCP_PKT_SYNCACK]  = "SYNCACK",
163 	};
164 
165 	if (type >= DCCP_NR_PKT_TYPES)
166 		return "INVALID";
167 	else
168 		return dccp_packet_names[type];
169 }
170 
171 EXPORT_SYMBOL_GPL(dccp_packet_name);
172 
173 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
174 {
175 	struct dccp_sock *dp = dccp_sk(sk);
176 	struct inet_connection_sock *icsk = inet_csk(sk);
177 
178 	icsk->icsk_rto		= DCCP_TIMEOUT_INIT;
179 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
180 	sk->sk_state		= DCCP_CLOSED;
181 	sk->sk_write_space	= dccp_write_space;
182 	icsk->icsk_sync_mss	= dccp_sync_mss;
183 	dp->dccps_mss_cache	= 536;
184 	dp->dccps_rate_last	= jiffies;
185 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
186 	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
187 	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
188 	dp->dccps_tx_qlen	= sysctl_dccp_tx_qlen;
189 
190 	dccp_init_xmit_timers(sk);
191 
192 	INIT_LIST_HEAD(&dp->dccps_featneg);
193 	/* control socket doesn't need feat nego */
194 	if (likely(ctl_sock_initialized))
195 		return dccp_feat_init(sk);
196 	return 0;
197 }
198 
199 EXPORT_SYMBOL_GPL(dccp_init_sock);
200 
201 void dccp_destroy_sock(struct sock *sk)
202 {
203 	struct dccp_sock *dp = dccp_sk(sk);
204 
205 	/*
206 	 * DCCP doesn't use sk_write_queue, just sk_send_head
207 	 * for retransmissions
208 	 */
209 	if (sk->sk_send_head != NULL) {
210 		kfree_skb(sk->sk_send_head);
211 		sk->sk_send_head = NULL;
212 	}
213 
214 	/* Clean up a referenced DCCP bind bucket. */
215 	if (inet_csk(sk)->icsk_bind_hash != NULL)
216 		inet_put_port(sk);
217 
218 	kfree(dp->dccps_service_list);
219 	dp->dccps_service_list = NULL;
220 
221 	if (dp->dccps_hc_rx_ackvec != NULL) {
222 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
223 		dp->dccps_hc_rx_ackvec = NULL;
224 	}
225 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
226 	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
227 	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
228 
229 	/* clean up feature negotiation state */
230 	dccp_feat_list_purge(&dp->dccps_featneg);
231 }
232 
233 EXPORT_SYMBOL_GPL(dccp_destroy_sock);
234 
235 static inline int dccp_listen_start(struct sock *sk, int backlog)
236 {
237 	struct dccp_sock *dp = dccp_sk(sk);
238 
239 	dp->dccps_role = DCCP_ROLE_LISTEN;
240 	/* do not start to listen if feature negotiation setup fails */
241 	if (dccp_feat_finalise_settings(dp))
242 		return -EPROTO;
243 	return inet_csk_listen_start(sk, backlog);
244 }
245 
246 static inline int dccp_need_reset(int state)
247 {
248 	return state != DCCP_CLOSED && state != DCCP_LISTEN &&
249 	       state != DCCP_REQUESTING;
250 }
251 
252 int dccp_disconnect(struct sock *sk, int flags)
253 {
254 	struct inet_connection_sock *icsk = inet_csk(sk);
255 	struct inet_sock *inet = inet_sk(sk);
256 	int err = 0;
257 	const int old_state = sk->sk_state;
258 
259 	if (old_state != DCCP_CLOSED)
260 		dccp_set_state(sk, DCCP_CLOSED);
261 
262 	/*
263 	 * This corresponds to the ABORT function of RFC793, sec. 3.8
264 	 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
265 	 */
266 	if (old_state == DCCP_LISTEN) {
267 		inet_csk_listen_stop(sk);
268 	} else if (dccp_need_reset(old_state)) {
269 		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
270 		sk->sk_err = ECONNRESET;
271 	} else if (old_state == DCCP_REQUESTING)
272 		sk->sk_err = ECONNRESET;
273 
274 	dccp_clear_xmit_timers(sk);
275 
276 	__skb_queue_purge(&sk->sk_receive_queue);
277 	__skb_queue_purge(&sk->sk_write_queue);
278 	if (sk->sk_send_head != NULL) {
279 		__kfree_skb(sk->sk_send_head);
280 		sk->sk_send_head = NULL;
281 	}
282 
283 	inet->inet_dport = 0;
284 
285 	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
286 		inet_reset_saddr(sk);
287 
288 	sk->sk_shutdown = 0;
289 	sock_reset_flag(sk, SOCK_DONE);
290 
291 	icsk->icsk_backoff = 0;
292 	inet_csk_delack_init(sk);
293 	__sk_dst_reset(sk);
294 
295 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
296 
297 	sk->sk_error_report(sk);
298 	return err;
299 }
300 
301 EXPORT_SYMBOL_GPL(dccp_disconnect);
302 
303 /*
304  *	Wait for a DCCP event.
305  *
306  *	Note that we don't need to lock the socket, as the upper poll layers
307  *	take care of normal races (between the test and the event) and we don't
308  *	go look at any of the socket buffers directly.
309  */
310 unsigned int dccp_poll(struct file *file, struct socket *sock,
311 		       poll_table *wait)
312 {
313 	unsigned int mask;
314 	struct sock *sk = sock->sk;
315 
316 	sock_poll_wait(file, sk_sleep(sk), wait);
317 	if (sk->sk_state == DCCP_LISTEN)
318 		return inet_csk_listen_poll(sk);
319 
320 	/* Socket is not locked. We are protected from async events
321 	   by poll logic and correct handling of state changes
322 	   made by another threads is impossible in any case.
323 	 */
324 
325 	mask = 0;
326 	if (sk->sk_err)
327 		mask = POLLERR;
328 
329 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
330 		mask |= POLLHUP;
331 	if (sk->sk_shutdown & RCV_SHUTDOWN)
332 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
333 
334 	/* Connected? */
335 	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
336 		if (atomic_read(&sk->sk_rmem_alloc) > 0)
337 			mask |= POLLIN | POLLRDNORM;
338 
339 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
340 			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
341 				mask |= POLLOUT | POLLWRNORM;
342 			} else {  /* send SIGIO later */
343 				set_bit(SOCK_ASYNC_NOSPACE,
344 					&sk->sk_socket->flags);
345 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
346 
347 				/* Race breaker. If space is freed after
348 				 * wspace test but before the flags are set,
349 				 * IO signal will be lost.
350 				 */
351 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
352 					mask |= POLLOUT | POLLWRNORM;
353 			}
354 		}
355 	}
356 	return mask;
357 }
358 
359 EXPORT_SYMBOL_GPL(dccp_poll);
360 
361 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
362 {
363 	int rc = -ENOTCONN;
364 
365 	lock_sock(sk);
366 
367 	if (sk->sk_state == DCCP_LISTEN)
368 		goto out;
369 
370 	switch (cmd) {
371 	case SIOCINQ: {
372 		struct sk_buff *skb;
373 		unsigned long amount = 0;
374 
375 		skb = skb_peek(&sk->sk_receive_queue);
376 		if (skb != NULL) {
377 			/*
378 			 * We will only return the amount of this packet since
379 			 * that is all that will be read.
380 			 */
381 			amount = skb->len;
382 		}
383 		rc = put_user(amount, (int __user *)arg);
384 	}
385 		break;
386 	default:
387 		rc = -ENOIOCTLCMD;
388 		break;
389 	}
390 out:
391 	release_sock(sk);
392 	return rc;
393 }
394 
395 EXPORT_SYMBOL_GPL(dccp_ioctl);
396 
397 static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
398 				   char __user *optval, unsigned int optlen)
399 {
400 	struct dccp_sock *dp = dccp_sk(sk);
401 	struct dccp_service_list *sl = NULL;
402 
403 	if (service == DCCP_SERVICE_INVALID_VALUE ||
404 	    optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
405 		return -EINVAL;
406 
407 	if (optlen > sizeof(service)) {
408 		sl = kmalloc(optlen, GFP_KERNEL);
409 		if (sl == NULL)
410 			return -ENOMEM;
411 
412 		sl->dccpsl_nr = optlen / sizeof(u32) - 1;
413 		if (copy_from_user(sl->dccpsl_list,
414 				   optval + sizeof(service),
415 				   optlen - sizeof(service)) ||
416 		    dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
417 			kfree(sl);
418 			return -EFAULT;
419 		}
420 	}
421 
422 	lock_sock(sk);
423 	dp->dccps_service = service;
424 
425 	kfree(dp->dccps_service_list);
426 
427 	dp->dccps_service_list = sl;
428 	release_sock(sk);
429 	return 0;
430 }
431 
432 static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
433 {
434 	u8 *list, len;
435 	int i, rc;
436 
437 	if (cscov < 0 || cscov > 15)
438 		return -EINVAL;
439 	/*
440 	 * Populate a list of permissible values, in the range cscov...15. This
441 	 * is necessary since feature negotiation of single values only works if
442 	 * both sides incidentally choose the same value. Since the list starts
443 	 * lowest-value first, negotiation will pick the smallest shared value.
444 	 */
445 	if (cscov == 0)
446 		return 0;
447 	len = 16 - cscov;
448 
449 	list = kmalloc(len, GFP_KERNEL);
450 	if (list == NULL)
451 		return -ENOBUFS;
452 
453 	for (i = 0; i < len; i++)
454 		list[i] = cscov++;
455 
456 	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
457 
458 	if (rc == 0) {
459 		if (rx)
460 			dccp_sk(sk)->dccps_pcrlen = cscov;
461 		else
462 			dccp_sk(sk)->dccps_pcslen = cscov;
463 	}
464 	kfree(list);
465 	return rc;
466 }
467 
468 static int dccp_setsockopt_ccid(struct sock *sk, int type,
469 				char __user *optval, unsigned int optlen)
470 {
471 	u8 *val;
472 	int rc = 0;
473 
474 	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
475 		return -EINVAL;
476 
477 	val = memdup_user(optval, optlen);
478 	if (IS_ERR(val))
479 		return PTR_ERR(val);
480 
481 	lock_sock(sk);
482 	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
483 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
484 
485 	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
486 		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
487 	release_sock(sk);
488 
489 	kfree(val);
490 	return rc;
491 }
492 
493 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
494 		char __user *optval, unsigned int optlen)
495 {
496 	struct dccp_sock *dp = dccp_sk(sk);
497 	int val, err = 0;
498 
499 	switch (optname) {
500 	case DCCP_SOCKOPT_PACKET_SIZE:
501 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
502 		return 0;
503 	case DCCP_SOCKOPT_CHANGE_L:
504 	case DCCP_SOCKOPT_CHANGE_R:
505 		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
506 		return 0;
507 	case DCCP_SOCKOPT_CCID:
508 	case DCCP_SOCKOPT_RX_CCID:
509 	case DCCP_SOCKOPT_TX_CCID:
510 		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
511 	}
512 
513 	if (optlen < (int)sizeof(int))
514 		return -EINVAL;
515 
516 	if (get_user(val, (int __user *)optval))
517 		return -EFAULT;
518 
519 	if (optname == DCCP_SOCKOPT_SERVICE)
520 		return dccp_setsockopt_service(sk, val, optval, optlen);
521 
522 	lock_sock(sk);
523 	switch (optname) {
524 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
525 		if (dp->dccps_role != DCCP_ROLE_SERVER)
526 			err = -EOPNOTSUPP;
527 		else
528 			dp->dccps_server_timewait = (val != 0);
529 		break;
530 	case DCCP_SOCKOPT_SEND_CSCOV:
531 		err = dccp_setsockopt_cscov(sk, val, false);
532 		break;
533 	case DCCP_SOCKOPT_RECV_CSCOV:
534 		err = dccp_setsockopt_cscov(sk, val, true);
535 		break;
536 	case DCCP_SOCKOPT_QPOLICY_ID:
537 		if (sk->sk_state != DCCP_CLOSED)
538 			err = -EISCONN;
539 		else if (val < 0 || val >= DCCPQ_POLICY_MAX)
540 			err = -EINVAL;
541 		else
542 			dp->dccps_qpolicy = val;
543 		break;
544 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
545 		if (val < 0)
546 			err = -EINVAL;
547 		else
548 			dp->dccps_tx_qlen = val;
549 		break;
550 	default:
551 		err = -ENOPROTOOPT;
552 		break;
553 	}
554 	release_sock(sk);
555 
556 	return err;
557 }
558 
559 int dccp_setsockopt(struct sock *sk, int level, int optname,
560 		    char __user *optval, unsigned int optlen)
561 {
562 	if (level != SOL_DCCP)
563 		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
564 							     optname, optval,
565 							     optlen);
566 	return do_dccp_setsockopt(sk, level, optname, optval, optlen);
567 }
568 
569 EXPORT_SYMBOL_GPL(dccp_setsockopt);
570 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of dccp_setsockopt(): SOL_DCCP options share the native
 * worker, all other levels go through inet_csk_compat_setsockopt().
 */
int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	if (level == SOL_DCCP)
		return do_dccp_setsockopt(sk, level, optname, optval, optlen);

	return inet_csk_compat_setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
#endif
583 
584 static int dccp_getsockopt_service(struct sock *sk, int len,
585 				   __be32 __user *optval,
586 				   int __user *optlen)
587 {
588 	const struct dccp_sock *dp = dccp_sk(sk);
589 	const struct dccp_service_list *sl;
590 	int err = -ENOENT, slen = 0, total_len = sizeof(u32);
591 
592 	lock_sock(sk);
593 	if ((sl = dp->dccps_service_list) != NULL) {
594 		slen = sl->dccpsl_nr * sizeof(u32);
595 		total_len += slen;
596 	}
597 
598 	err = -EINVAL;
599 	if (total_len > len)
600 		goto out;
601 
602 	err = 0;
603 	if (put_user(total_len, optlen) ||
604 	    put_user(dp->dccps_service, optval) ||
605 	    (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
606 		err = -EFAULT;
607 out:
608 	release_sock(sk);
609 	return err;
610 }
611 
612 static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
613 		    char __user *optval, int __user *optlen)
614 {
615 	struct dccp_sock *dp;
616 	int val, len;
617 
618 	if (get_user(len, optlen))
619 		return -EFAULT;
620 
621 	if (len < (int)sizeof(int))
622 		return -EINVAL;
623 
624 	dp = dccp_sk(sk);
625 
626 	switch (optname) {
627 	case DCCP_SOCKOPT_PACKET_SIZE:
628 		DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
629 		return 0;
630 	case DCCP_SOCKOPT_SERVICE:
631 		return dccp_getsockopt_service(sk, len,
632 					       (__be32 __user *)optval, optlen);
633 	case DCCP_SOCKOPT_GET_CUR_MPS:
634 		val = dp->dccps_mss_cache;
635 		break;
636 	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
637 		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
638 	case DCCP_SOCKOPT_TX_CCID:
639 		val = ccid_get_current_tx_ccid(dp);
640 		if (val < 0)
641 			return -ENOPROTOOPT;
642 		break;
643 	case DCCP_SOCKOPT_RX_CCID:
644 		val = ccid_get_current_rx_ccid(dp);
645 		if (val < 0)
646 			return -ENOPROTOOPT;
647 		break;
648 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
649 		val = dp->dccps_server_timewait;
650 		break;
651 	case DCCP_SOCKOPT_SEND_CSCOV:
652 		val = dp->dccps_pcslen;
653 		break;
654 	case DCCP_SOCKOPT_RECV_CSCOV:
655 		val = dp->dccps_pcrlen;
656 		break;
657 	case DCCP_SOCKOPT_QPOLICY_ID:
658 		val = dp->dccps_qpolicy;
659 		break;
660 	case DCCP_SOCKOPT_QPOLICY_TXQLEN:
661 		val = dp->dccps_tx_qlen;
662 		break;
663 	case 128 ... 191:
664 		return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
665 					     len, (u32 __user *)optval, optlen);
666 	case 192 ... 255:
667 		return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
668 					     len, (u32 __user *)optval, optlen);
669 	default:
670 		return -ENOPROTOOPT;
671 	}
672 
673 	len = sizeof(val);
674 	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
675 		return -EFAULT;
676 
677 	return 0;
678 }
679 
680 int dccp_getsockopt(struct sock *sk, int level, int optname,
681 		    char __user *optval, int __user *optlen)
682 {
683 	if (level != SOL_DCCP)
684 		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
685 							     optname, optval,
686 							     optlen);
687 	return do_dccp_getsockopt(sk, level, optname, optval, optlen);
688 }
689 
690 EXPORT_SYMBOL_GPL(dccp_getsockopt);
691 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of dccp_getsockopt(): SOL_DCCP options share the native
 * worker, all other levels go through inet_csk_compat_getsockopt().
 */
int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	if (level == SOL_DCCP)
		return do_dccp_getsockopt(sk, level, optname, optval, optlen);

	return inet_csk_compat_getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
#endif
704 
705 static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
706 {
707 	struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
708 
709 	/*
710 	 * Assign an (opaque) qpolicy priority value to skb->priority.
711 	 *
712 	 * We are overloading this skb field for use with the qpolicy subystem.
713 	 * The skb->priority is normally used for the SO_PRIORITY option, which
714 	 * is initialised from sk_priority. Since the assignment of sk_priority
715 	 * to skb->priority happens later (on layer 3), we overload this field
716 	 * for use with queueing priorities as long as the skb is on layer 4.
717 	 * The default priority value (if nothing is set) is 0.
718 	 */
719 	skb->priority = 0;
720 
721 	for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
722 
723 		if (!CMSG_OK(msg, cmsg))
724 			return -EINVAL;
725 
726 		if (cmsg->cmsg_level != SOL_DCCP)
727 			continue;
728 
729 		switch (cmsg->cmsg_type) {
730 		case DCCP_SCM_PRIORITY:
731 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
732 				return -EINVAL;
733 			skb->priority = *(__u32 *)CMSG_DATA(cmsg);
734 			break;
735 		default:
736 			return -EINVAL;
737 		}
738 	}
739 	return 0;
740 }
741 
/**
 * dccp_sendmsg  -  Queue one datagram for transmission
 * @iocb: not used by this function
 * @sk:   sending socket
 * @msg:  message header with payload iovec, flags and control messages
 * @len:  payload length; must not exceed the cached MPS (dccps_mss_cache)
 *
 * Returns @len on success or a negative errno: -EMSGSIZE if the payload is
 * larger than dccps_mss_cache, -EAGAIN if the TX queue is full, or the error
 * reported by the connect wait, the skb allocation, the payload copy or the
 * control-message parser.
 */
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len)
{
	const struct dccp_sock *dp = dccp_sk(sk);
	const int flags = msg->msg_flags;
	const int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int rc, size;
	long timeo;

	/* NOTE(review): dccps_mss_cache is read before the socket is locked. */
	if (len > dp->dccps_mss_cache)
		return -EMSGSIZE;

	lock_sock(sk);

	if (dccp_qpolicy_full(sk)) {
		rc = -EAGAIN;
		goto out_release;
	}

	timeo = sock_sndtimeo(sk, noblock);

	/*
	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
	 * so that the trick in dccp_rcv_request_sent_state_process works.
	 */
	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_release;

	/*
	 * The socket lock is dropped around the allocation and re-taken
	 * afterwards.
	 * NOTE(review): the socket state is not re-checked after the lock is
	 * re-acquired; confirm that a concurrent state change is harmless.
	 */
	size = sk->sk_prot->max_header + len;
	release_sock(sk);
	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
	lock_sock(sk);
	if (skb == NULL)
		goto out_release;

	/* Leave headroom for the headers, then copy in the payload. */
	skb_reserve(skb, sk->sk_prot->max_header);
	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (rc != 0)
		goto out_discard;

	rc = dccp_msghdr_parse(msg, skb);
	if (rc != 0)
		goto out_discard;

	dccp_qpolicy_push(sk, skb);
	/*
	 * The xmit_timer is set if the TX CCID is rate-based and will expire
	 * when congestion control permits to release further packets into the
	 * network. Window-based CCIDs do not use this timer.
	 */
	if (!timer_pending(&dp->dccps_xmit_timer))
		dccp_write_xmit(sk);
out_release:
	release_sock(sk);
	/* rc is 0 on the success path, so the payload length is returned. */
	return rc ? : len;
out_discard:
	kfree_skb(skb);
	goto out_release;
}

EXPORT_SYMBOL_GPL(dccp_sendmsg);
806 
/**
 * dccp_recvmsg  -  Receive one datagram
 * @iocb:     not used by this function
 * @sk:       receiving socket
 * @msg:      message header describing the destination iovec
 * @len:      size of the destination buffer
 * @nonblock: non-zero to return instead of waiting for data
 * @flags:    MSG_* flags; MSG_PEEK and MSG_TRUNC are evaluated here
 * @addr_len: not used by this function
 *
 * Returns the number of bytes copied (the full packet length if MSG_TRUNC
 * was passed in @flags), 0 at end of connection, or a negative errno.
 * Note that @len, although declared size_t, doubles as the result variable
 * and also carries the negative error codes.
 */
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		 size_t len, int nonblock, int flags, int *addr_len)
{
	const struct dccp_hdr *dh;
	long timeo;

	lock_sock(sk);

	if (sk->sk_state == DCCP_LISTEN) {
		len = -ENOTCONN;
		goto out;
	}

	timeo = sock_rcvtimeo(sk, nonblock);

	do {
		/* Only the head of the receive queue is ever looked at. */
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		if (skb == NULL)
			goto verify_sock_status;

		dh = dccp_hdr(skb);

		switch (dh->dccph_type) {
		case DCCP_PKT_DATA:
		case DCCP_PKT_DATAACK:
			goto found_ok_skb;

		case DCCP_PKT_CLOSE:
		case DCCP_PKT_CLOSEREQ:
			/* A peek must not advance the closing handshake. */
			if (!(flags & MSG_PEEK))
				dccp_finish_passive_close(sk);
			/* fall through */
		case DCCP_PKT_RESET:
			dccp_pr_debug("found fin (%s) ok!\n",
				      dccp_packet_name(dh->dccph_type));
			len = 0;
			goto found_fin_ok;
		default:
			/* Other packet types carry nothing to deliver: drop. */
			dccp_pr_debug("packet_type=%s\n",
				      dccp_packet_name(dh->dccph_type));
			sk_eat_skb(sk, skb, 0);
		}
verify_sock_status:
		/* Nothing to deliver: decide whether to give up or to wait. */
		if (sock_flag(sk, SOCK_DONE)) {
			len = 0;
			break;
		}

		if (sk->sk_err) {
			len = sock_error(sk);
			break;
		}

		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			len = 0;
			break;
		}

		if (sk->sk_state == DCCP_CLOSED) {
			if (!sock_flag(sk, SOCK_DONE)) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				len = -ENOTCONN;
				break;
			}
			len = 0;
			break;
		}

		/* Non-blocking, or the receive timeout has run out. */
		if (!timeo) {
			len = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			len = sock_intr_errno(timeo);
			break;
		}

		sk_wait_data(sk, &timeo);
		continue;
	found_ok_skb:
		/* Deliver at most one packet; flag truncation to the caller. */
		if (len > skb->len)
			len = skb->len;
		else if (len < skb->len)
			msg->msg_flags |= MSG_TRUNC;

		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
			/* Exception. Bailout! */
			len = -EFAULT;
			break;
		}
		/* With MSG_TRUNC the full packet length is reported. */
		if (flags & MSG_TRUNC)
			len = skb->len;
	found_fin_ok:
		/* Consume the packet unless the caller was only peeking. */
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb, 0);
		break;
	} while (1);
out:
	release_sock(sk);
	return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);
914 
915 int inet_dccp_listen(struct socket *sock, int backlog)
916 {
917 	struct sock *sk = sock->sk;
918 	unsigned char old_state;
919 	int err;
920 
921 	lock_sock(sk);
922 
923 	err = -EINVAL;
924 	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
925 		goto out;
926 
927 	old_state = sk->sk_state;
928 	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
929 		goto out;
930 
931 	/* Really, if the socket is already in listen state
932 	 * we can only allow the backlog to be adjusted.
933 	 */
934 	if (old_state != DCCP_LISTEN) {
935 		/*
936 		 * FIXME: here it probably should be sk->sk_prot->listen_start
937 		 * see tcp_listen_start
938 		 */
939 		err = dccp_listen_start(sk, backlog);
940 		if (err)
941 			goto out;
942 	}
943 	sk->sk_max_ack_backlog = backlog;
944 	err = 0;
945 
946 out:
947 	release_sock(sk);
948 	return err;
949 }
950 
951 EXPORT_SYMBOL_GPL(inet_dccp_listen);
952 
953 static void dccp_terminate_connection(struct sock *sk)
954 {
955 	u8 next_state = DCCP_CLOSED;
956 
957 	switch (sk->sk_state) {
958 	case DCCP_PASSIVE_CLOSE:
959 	case DCCP_PASSIVE_CLOSEREQ:
960 		dccp_finish_passive_close(sk);
961 		break;
962 	case DCCP_PARTOPEN:
963 		dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
964 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
965 		/* fall through */
966 	case DCCP_OPEN:
967 		dccp_send_close(sk, 1);
968 
969 		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
970 		    !dccp_sk(sk)->dccps_server_timewait)
971 			next_state = DCCP_ACTIVE_CLOSEREQ;
972 		else
973 			next_state = DCCP_CLOSING;
974 		/* fall through */
975 	default:
976 		dccp_set_state(sk, next_state);
977 	}
978 }
979 
/**
 * dccp_close  -  close() implementation for DCCP sockets
 * @sk:      socket being closed by its owner
 * @timeout: how long to wait for queued data and for the close to finish
 *
 * Discards unread data, starts either an abortive or a normal connection
 * termination, orphans the socket and destroys it immediately if it has
 * already reached CLOSED.
 */
void dccp_close(struct sock *sk, long timeout)
{
	struct dccp_sock *dp = dccp_sk(sk);
	struct sk_buff *skb;
	u32 data_was_unread = 0;
	int state;

	lock_sock(sk);

	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == DCCP_LISTEN) {
		dccp_set_state(sk, DCCP_CLOSED);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	sk_stop_timer(sk, &dp->dccps_xmit_timer);

	/*
	 * We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		data_was_unread += skb->len;
		__kfree_skb(skb);
	}

	if (data_was_unread) {
		/* Unread data was tossed, send an appropriate Reset Code */
		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
		dccp_set_state(sk, DCCP_CLOSED);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
	} else if (sk->sk_state != DCCP_CLOSED) {
		/*
		 * Normal connection termination. May need to wait if there are
		 * still packets in the TX queue that are delayed by the CCID.
		 */
		dccp_flush_write_queue(sk, &timeout);
		dccp_terminate_connection(sk);
	}

	/*
	 * Flush write queue. This may be necessary in several cases:
	 * - we have been closed by the peer but still have application data;
	 * - abortive termination (unread data or zero linger time),
	 * - normal termination but queue could not be flushed within time limit
	 */
	__skb_queue_purge(&sk->sk_write_queue);

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/*
	 * Remember the state seen under the socket lock; it is compared
	 * below with the state found after the lock was dropped. The extra
	 * reference taken here is released by the sock_put() at the end.
	 */
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/*
	 * It is the last release_sock in its life. It will remove backlog.
	 */
	release_sock(sk);
	/*
	 * Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
		goto out;

	if (sk->sk_state == DCCP_CLOSED)
		inet_csk_destroy_sock(sk);

	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}

EXPORT_SYMBOL_GPL(dccp_close);
1074 
/**
 * dccp_shutdown  -  shutdown() hook for DCCP sockets
 * @sk:  socket the call was made on (not touched)
 * @how: direction(s) requested by the caller
 *
 * Placeholder: the request is only logged in debug builds, no socket state
 * is changed.
 */
void dccp_shutdown(struct sock *sk, int how)
{
	dccp_pr_debug("called shutdown(%x)\n", how);
}

EXPORT_SYMBOL_GPL(dccp_shutdown);
1081 
1082 static inline int dccp_mib_init(void)
1083 {
1084 	return snmp_mib_init((void __percpu **)dccp_statistics,
1085 			     sizeof(struct dccp_mib),
1086 			     __alignof__(struct dccp_mib));
1087 }
1088 
1089 static inline void dccp_mib_exit(void)
1090 {
1091 	snmp_mib_free((void __percpu **)dccp_statistics);
1092 }
1093 
/*
 * Read-only module parameter: when non-zero, dccp_init() sizes the
 * established hash table from this value instead of from totalram_pages.
 */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * Debug switch, writable at runtime (mode 0644).
 * NOTE(review): the variable is declared int but registered as a bool
 * parameter; changing the type also requires updating its declaration
 * outside this file.
 */
int dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif
1105 
/*
 * Module initialisation: set up the orphan counter, the bind-bucket slab
 * cache, the established and bind hash tables, the MIB, the ack-vector and
 * sysctl support and the built-in CCIDs, in that order.
 *
 * Returns 0 on success. On failure everything set up so far is released in
 * reverse order through the label ladder at the end and a negative errno
 * is returned.
 */
static int __init dccp_init(void)
{
	unsigned long goal;
	int ehash_order, bhash_order, i;
	int rc;

	/* The DCCP control block must fit into the skb's cb[] area. */
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));
	rc = percpu_counter_init(&dccp_orphan_count, 0);
	if (rc)
		goto out_fail;
	/* Error code for the allocation failures that follow. */
	rc = -ENOBUFS;
	inet_hashinfo_init(&dccp_hashinfo);
	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!dccp_hashinfo.bind_bucket_cachep)
		goto out_free_percpu;

	/*
	 * Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	if (totalram_pages >= (128 * 1024))
		goal = totalram_pages >> (21 - PAGE_SHIFT);
	else
		goal = totalram_pages >> (23 - PAGE_SHIFT);

	/* The thash_entries module parameter overrides the computed goal. */
	if (thash_entries)
		goal = (thash_entries *
			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
	/* Smallest page order whose page count reaches the goal. */
	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
		;
	/*
	 * Try to allocate the established table, falling back to smaller
	 * orders on failure. Note that the loop condition stops before
	 * order 0 is retried.
	 */
	do {
		unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
					sizeof(struct inet_ehash_bucket);

		/* Round the bucket count down to a power of two. */
		while (hash_size & (hash_size - 1))
			hash_size--;
		dccp_hashinfo.ehash_mask = hash_size - 1;
		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
	} while (!dccp_hashinfo.ehash && --ehash_order > 0);

	if (!dccp_hashinfo.ehash) {
		DCCP_CRIT("Failed to allocate DCCP established hash table");
		goto out_free_bind_bucket_cachep;
	}

	for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) {
		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
	}

	if (inet_ehash_locks_alloc(&dccp_hashinfo))
			goto out_free_dccp_ehash;

	/* The bind table starts from the order used for the ehash table. */
	bhash_order = ehash_order;

	do {
		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
					sizeof(struct inet_bind_hashbucket);
		/* Skip orders giving more than 64K buckets, unless at order 0. */
		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
		    bhash_order > 0)
			continue;
		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
			__get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);

	if (!dccp_hashinfo.bhash) {
		DCCP_CRIT("Failed to allocate DCCP bind hash table");
		goto out_free_dccp_locks;
	}

	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
	}

	rc = dccp_mib_init();
	if (rc)
		goto out_free_dccp_bhash;

	rc = dccp_ackvec_init();
	if (rc)
		goto out_free_dccp_mib;

	rc = dccp_sysctl_init();
	if (rc)
		goto out_ackvec_exit;

	rc = ccid_initialize_builtins();
	if (rc)
		goto out_sysctl_exit;

	dccp_timestamping_init();

	return 0;

	/* Error unwinding: each label undoes one step, in reverse order. */
out_sysctl_exit:
	dccp_sysctl_exit();
out_ackvec_exit:
	dccp_ackvec_exit();
out_free_dccp_mib:
	dccp_mib_exit();
out_free_dccp_bhash:
	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
	inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind_bucket_cachep:
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_percpu:
	percpu_counter_destroy(&dccp_orphan_count);
out_fail:
	/* Leave no stale pointers behind in the global hashinfo. */
	dccp_hashinfo.bhash = NULL;
	dccp_hashinfo.ehash = NULL;
	dccp_hashinfo.bind_bucket_cachep = NULL;
	return rc;
}
1230 
/*
 * Module exit: release everything dccp_init() set up.
 *
 * The page orders of the two hash tables are not stored anywhere; they are
 * recomputed here with get_order() from the table sizes.
 * NOTE(review): dccp_init() rounds the ehash bucket count down to a power
 * of two, so the order recomputed here may differ from the order that was
 * allocated if the bucket size does not divide the page size evenly; verify.
 */
static void __exit dccp_fini(void)
{
	ccid_cleanup_builtins();
	dccp_mib_exit();
	free_pages((unsigned long)dccp_hashinfo.bhash,
		   get_order(dccp_hashinfo.bhash_size *
			     sizeof(struct inet_bind_hashbucket)));
	free_pages((unsigned long)dccp_hashinfo.ehash,
		   get_order((dccp_hashinfo.ehash_mask + 1) *
			     sizeof(struct inet_ehash_bucket)));
	inet_ehash_locks_free(&dccp_hashinfo);
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
	dccp_ackvec_exit();
	dccp_sysctl_exit();
	percpu_counter_destroy(&dccp_orphan_count);
}
1247 
/* Module entry and exit points */
module_init(dccp_init);
module_exit(dccp_fini);

/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
1254