1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Support for INET connection oriented protocols. 7 * 8 * Authors: See the TCP sources 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or(at your option) any later version. 14 */ 15 16 #include <linux/config.h> 17 #include <linux/module.h> 18 #include <linux/jhash.h> 19 20 #include <net/inet_connection_sock.h> 21 #include <net/inet_hashtables.h> 22 #include <net/inet_timewait_sock.h> 23 #include <net/ip.h> 24 #include <net/route.h> 25 #include <net/tcp_states.h> 26 #include <net/xfrm.h> 27 28 #ifdef INET_CSK_DEBUG 29 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; 30 EXPORT_SYMBOL(inet_csk_timer_bug_msg); 31 #endif 32 33 /* 34 * This array holds the first and last local port number. 35 * For high-usage systems, use sysctl to change this to 36 * 32768-61000 37 */ 38 int sysctl_local_port_range[2] = { 1024, 4999 }; 39 40 int inet_csk_bind_conflict(const struct sock *sk, 41 const struct inet_bind_bucket *tb) 42 { 43 const u32 sk_rcv_saddr = inet_rcv_saddr(sk); 44 struct sock *sk2; 45 struct hlist_node *node; 46 int reuse = sk->sk_reuse; 47 48 sk_for_each_bound(sk2, node, &tb->owners) { 49 if (sk != sk2 && 50 !inet_v6_ipv6only(sk2) && 51 (!sk->sk_bound_dev_if || 52 !sk2->sk_bound_dev_if || 53 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 54 if (!reuse || !sk2->sk_reuse || 55 sk2->sk_state == TCP_LISTEN) { 56 const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 57 if (!sk2_rcv_saddr || !sk_rcv_saddr || 58 sk2_rcv_saddr == sk_rcv_saddr) 59 break; 60 } 61 } 62 } 63 return node != NULL; 64 } 65 66 EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); 67 68 /* Obtain a reference to a local port for the given sock, 69 * if snum is zero it means select any available local port. 70 */ 71 int inet_csk_get_port(struct inet_hashinfo *hashinfo, 72 struct sock *sk, unsigned short snum, 73 int (*bind_conflict)(const struct sock *sk, 74 const struct inet_bind_bucket *tb)) 75 { 76 struct inet_bind_hashbucket *head; 77 struct hlist_node *node; 78 struct inet_bind_bucket *tb; 79 int ret; 80 81 local_bh_disable(); 82 if (!snum) { 83 int low = sysctl_local_port_range[0]; 84 int high = sysctl_local_port_range[1]; 85 int remaining = (high - low) + 1; 86 int rover = net_random() % (high - low) + low; 87 88 do { 89 head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; 90 spin_lock(&head->lock); 91 inet_bind_bucket_for_each(tb, node, &head->chain) 92 if (tb->port == rover) 93 goto next; 94 break; 95 next: 96 spin_unlock(&head->lock); 97 if (++rover > high) 98 rover = low; 99 } while (--remaining > 0); 100 101 /* Exhausted local port range during search? It is not 102 * possible for us to be holding one of the bind hash 103 * locks if this test triggers, because if 'remaining' 104 * drops to zero, we broke out of the do/while loop at 105 * the top level, not from the 'break;' statement. 106 */ 107 ret = 1; 108 if (remaining <= 0) 109 goto fail; 110 111 /* OK, here is the one we will use. HEAD is 112 * non-NULL and we hold it's mutex. 113 */ 114 snum = rover; 115 } else { 116 head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; 117 spin_lock(&head->lock); 118 inet_bind_bucket_for_each(tb, node, &head->chain) 119 if (tb->port == snum) 120 goto tb_found; 121 } 122 tb = NULL; 123 goto tb_not_found; 124 tb_found: 125 if (!hlist_empty(&tb->owners)) { 126 if (sk->sk_reuse > 1) 127 goto success; 128 if (tb->fastreuse > 0 && 129 sk->sk_reuse && sk->sk_state != TCP_LISTEN) { 130 goto success; 131 } else { 132 ret = 1; 133 if (bind_conflict(sk, tb)) 134 goto fail_unlock; 135 } 136 } 137 tb_not_found: 138 ret = 1; 139 if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) 140 goto fail_unlock; 141 if (hlist_empty(&tb->owners)) { 142 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) 143 tb->fastreuse = 1; 144 else 145 tb->fastreuse = 0; 146 } else if (tb->fastreuse && 147 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) 148 tb->fastreuse = 0; 149 success: 150 if (!inet_csk(sk)->icsk_bind_hash) 151 inet_bind_hash(sk, tb, snum); 152 BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); 153 ret = 0; 154 155 fail_unlock: 156 spin_unlock(&head->lock); 157 fail: 158 local_bh_enable(); 159 return ret; 160 } 161 162 EXPORT_SYMBOL_GPL(inet_csk_get_port); 163 164 /* 165 * Wait for an incoming connection, avoid race conditions. This must be called 166 * with the socket locked. 167 */ 168 static int inet_csk_wait_for_connect(struct sock *sk, long timeo) 169 { 170 struct inet_connection_sock *icsk = inet_csk(sk); 171 DEFINE_WAIT(wait); 172 int err; 173 174 /* 175 * True wake-one mechanism for incoming connections: only 176 * one process gets woken up, not the 'whole herd'. 177 * Since we do not 'race & poll' for established sockets 178 * anymore, the common case will execute the loop only once. 179 * 180 * Subtle issue: "add_wait_queue_exclusive()" will be added 181 * after any current non-exclusive waiters, and we know that 182 * it will always _stay_ after any new non-exclusive waiters 183 * because all non-exclusive waiters are added at the 184 * beginning of the wait-queue. As such, it's ok to "drop" 185 * our exclusiveness temporarily when we get woken up without 186 * having to remove and re-insert us on the wait queue. 187 */ 188 for (;;) { 189 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 190 TASK_INTERRUPTIBLE); 191 release_sock(sk); 192 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 193 timeo = schedule_timeout(timeo); 194 lock_sock(sk); 195 err = 0; 196 if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) 197 break; 198 err = -EINVAL; 199 if (sk->sk_state != TCP_LISTEN) 200 break; 201 err = sock_intr_errno(timeo); 202 if (signal_pending(current)) 203 break; 204 err = -EAGAIN; 205 if (!timeo) 206 break; 207 } 208 finish_wait(sk->sk_sleep, &wait); 209 return err; 210 } 211 212 /* 213 * This will accept the next outstanding connection. 214 */ 215 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 216 { 217 struct inet_connection_sock *icsk = inet_csk(sk); 218 struct sock *newsk; 219 int error; 220 221 lock_sock(sk); 222 223 /* We need to make sure that this socket is listening, 224 * and that it has something pending. 225 */ 226 error = -EINVAL; 227 if (sk->sk_state != TCP_LISTEN) 228 goto out_err; 229 230 /* Find already established connection */ 231 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { 232 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 233 234 /* If this is a non blocking socket don't sleep */ 235 error = -EAGAIN; 236 if (!timeo) 237 goto out_err; 238 239 error = inet_csk_wait_for_connect(sk, timeo); 240 if (error) 241 goto out_err; 242 } 243 244 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); 245 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); 246 out: 247 release_sock(sk); 248 return newsk; 249 out_err: 250 newsk = NULL; 251 *err = error; 252 goto out; 253 } 254 255 EXPORT_SYMBOL(inet_csk_accept); 256 257 /* 258 * Using different timers for retransmit, delayed acks and probes 259 * We may wish use just one timer maintaining a list of expire jiffies 260 * to optimize. 261 */ 262 void inet_csk_init_xmit_timers(struct sock *sk, 263 void (*retransmit_handler)(unsigned long), 264 void (*delack_handler)(unsigned long), 265 void (*keepalive_handler)(unsigned long)) 266 { 267 struct inet_connection_sock *icsk = inet_csk(sk); 268 269 init_timer(&icsk->icsk_retransmit_timer); 270 init_timer(&icsk->icsk_delack_timer); 271 init_timer(&sk->sk_timer); 272 273 icsk->icsk_retransmit_timer.function = retransmit_handler; 274 icsk->icsk_delack_timer.function = delack_handler; 275 sk->sk_timer.function = keepalive_handler; 276 277 icsk->icsk_retransmit_timer.data = 278 icsk->icsk_delack_timer.data = 279 sk->sk_timer.data = (unsigned long)sk; 280 281 icsk->icsk_pending = icsk->icsk_ack.pending = 0; 282 } 283 284 EXPORT_SYMBOL(inet_csk_init_xmit_timers); 285 286 void inet_csk_clear_xmit_timers(struct sock *sk) 287 { 288 struct inet_connection_sock *icsk = inet_csk(sk); 289 290 icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; 291 292 sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 293 sk_stop_timer(sk, &icsk->icsk_delack_timer); 294 sk_stop_timer(sk, &sk->sk_timer); 295 } 296 297 EXPORT_SYMBOL(inet_csk_clear_xmit_timers); 298 299 void inet_csk_delete_keepalive_timer(struct sock *sk) 300 { 301 sk_stop_timer(sk, &sk->sk_timer); 302 } 303 304 EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); 305 306 void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) 307 { 308 sk_reset_timer(sk, &sk->sk_timer, jiffies + len); 309 } 310 311 EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 312 313 struct dst_entry* inet_csk_route_req(struct sock *sk, 314 const struct request_sock *req) 315 { 316 struct rtable *rt; 317 const struct inet_request_sock *ireq = inet_rsk(req); 318 struct ip_options *opt = inet_rsk(req)->opt; 319 struct flowi fl = { .oif = sk->sk_bound_dev_if, 320 .nl_u = { .ip4_u = 321 { .daddr = ((opt && opt->srr) ? 322 opt->faddr : 323 ireq->rmt_addr), 324 .saddr = ireq->loc_addr, 325 .tos = RT_CONN_FLAGS(sk) } }, 326 .proto = sk->sk_protocol, 327 .uli_u = { .ports = 328 { .sport = inet_sk(sk)->sport, 329 .dport = ireq->rmt_port } } }; 330 331 if (ip_route_output_flow(&rt, &fl, sk, 0)) { 332 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 333 return NULL; 334 } 335 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { 336 ip_rt_put(rt); 337 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); 338 return NULL; 339 } 340 return &rt->u.dst; 341 } 342 343 EXPORT_SYMBOL_GPL(inet_csk_route_req); 344 345 static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, 346 const u32 rnd, const u16 synq_hsize) 347 { 348 return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); 349 } 350 351 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 352 #define AF_INET_FAMILY(fam) ((fam) == AF_INET) 353 #else 354 #define AF_INET_FAMILY(fam) 1 355 #endif 356 357 struct request_sock *inet_csk_search_req(const struct sock *sk, 358 struct request_sock ***prevp, 359 const __u16 rport, const __u32 raddr, 360 const __u32 laddr) 361 { 362 const struct inet_connection_sock *icsk = inet_csk(sk); 363 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 364 struct request_sock *req, **prev; 365 366 for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, 367 lopt->nr_table_entries)]; 368 (req = *prev) != NULL; 369 prev = &req->dl_next) { 370 const struct inet_request_sock *ireq = inet_rsk(req); 371 372 if (ireq->rmt_port == rport && 373 ireq->rmt_addr == raddr && 374 ireq->loc_addr == laddr && 375 AF_INET_FAMILY(req->rsk_ops->family)) { 376 BUG_TRAP(!req->sk); 377 *prevp = prev; 378 break; 379 } 380 } 381 382 return req; 383 } 384 385 EXPORT_SYMBOL_GPL(inet_csk_search_req); 386 387 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 388 unsigned long timeout) 389 { 390 struct inet_connection_sock *icsk = inet_csk(sk); 391 struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; 392 const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, 393 lopt->hash_rnd, lopt->nr_table_entries); 394 395 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); 396 inet_csk_reqsk_queue_added(sk, timeout); 397 } 398 399 /* Only thing we need from tcp.h */ 400 extern int sysctl_tcp_synack_retries; 401 402 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); 403 404 void inet_csk_reqsk_queue_prune(struct sock *parent, 405 const unsigned long interval, 406 const unsigned long timeout, 407 const unsigned long max_rto) 408 { 409 struct inet_connection_sock *icsk = inet_csk(parent); 410 struct request_sock_queue *queue = &icsk->icsk_accept_queue; 411 struct listen_sock *lopt = queue->listen_opt; 412 int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; 413 int thresh = max_retries; 414 unsigned long now = jiffies; 415 struct request_sock **reqp, *req; 416 int i, budget; 417 418 if (lopt == NULL || lopt->qlen == 0) 419 return; 420 421 /* Normally all the openreqs are young and become mature 422 * (i.e. converted to established socket) for first timeout. 423 * If synack was not acknowledged for 3 seconds, it means 424 * one of the following things: synack was lost, ack was lost, 425 * rtt is high or nobody planned to ack (i.e. synflood). 426 * When server is a bit loaded, queue is populated with old 427 * open requests, reducing effective size of queue. 428 * When server is well loaded, queue size reduces to zero 429 * after several minutes of work. It is not synflood, 430 * it is normal operation. The solution is pruning 431 * too old entries overriding normal timeout, when 432 * situation becomes dangerous. 433 * 434 * Essentially, we reserve half of room for young 435 * embrions; and abort old ones without pity, if old 436 * ones are about to clog our table. 437 */ 438 if (lopt->qlen>>(lopt->max_qlen_log-1)) { 439 int young = (lopt->qlen_young<<1); 440 441 while (thresh > 2) { 442 if (lopt->qlen < young) 443 break; 444 thresh--; 445 young <<= 1; 446 } 447 } 448 449 if (queue->rskq_defer_accept) 450 max_retries = queue->rskq_defer_accept; 451 452 budget = 2 * (lopt->nr_table_entries / (timeout / interval)); 453 i = lopt->clock_hand; 454 455 do { 456 reqp=&lopt->syn_table[i]; 457 while ((req = *reqp) != NULL) { 458 if (time_after_eq(now, req->expires)) { 459 if ((req->retrans < thresh || 460 (inet_rsk(req)->acked && req->retrans < max_retries)) 461 && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { 462 unsigned long timeo; 463 464 if (req->retrans++ == 0) 465 lopt->qlen_young--; 466 timeo = min((timeout << req->retrans), max_rto); 467 req->expires = now + timeo; 468 reqp = &req->dl_next; 469 continue; 470 } 471 472 /* Drop this request */ 473 inet_csk_reqsk_queue_unlink(parent, req, reqp); 474 reqsk_queue_removed(queue, req); 475 reqsk_free(req); 476 continue; 477 } 478 reqp = &req->dl_next; 479 } 480 481 i = (i + 1) & (lopt->nr_table_entries - 1); 482 483 } while (--budget > 0); 484 485 lopt->clock_hand = i; 486 487 if (lopt->qlen) 488 inet_csk_reset_keepalive_timer(parent, interval); 489 } 490 491 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 492 493 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, 494 const gfp_t priority) 495 { 496 struct sock *newsk = sk_clone(sk, priority); 497 498 if (newsk != NULL) { 499 struct inet_connection_sock *newicsk = inet_csk(newsk); 500 501 newsk->sk_state = TCP_SYN_RECV; 502 newicsk->icsk_bind_hash = NULL; 503 504 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; 505 newsk->sk_write_space = sk_stream_write_space; 506 507 newicsk->icsk_retransmits = 0; 508 newicsk->icsk_backoff = 0; 509 newicsk->icsk_probes_out = 0; 510 511 /* Deinitialize accept_queue to trap illegal accesses. */ 512 memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); 513 } 514 return newsk; 515 } 516 517 EXPORT_SYMBOL_GPL(inet_csk_clone); 518 519 /* 520 * At this point, there should be no process reference to this 521 * socket, and thus no user references at all. Therefore we 522 * can assume the socket waitqueue is inactive and nobody will 523 * try to jump onto it. 524 */ 525 void inet_csk_destroy_sock(struct sock *sk) 526 { 527 BUG_TRAP(sk->sk_state == TCP_CLOSE); 528 BUG_TRAP(sock_flag(sk, SOCK_DEAD)); 529 530 /* It cannot be in hash table! */ 531 BUG_TRAP(sk_unhashed(sk)); 532 533 /* If it has not 0 inet_sk(sk)->num, it must be bound */ 534 BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); 535 536 sk->sk_prot->destroy(sk); 537 538 sk_stream_kill_queues(sk); 539 540 xfrm_sk_free_policy(sk); 541 542 sk_refcnt_debug_release(sk); 543 544 atomic_dec(sk->sk_prot->orphan_count); 545 sock_put(sk); 546 } 547 548 EXPORT_SYMBOL(inet_csk_destroy_sock); 549 550 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 551 { 552 struct inet_sock *inet = inet_sk(sk); 553 struct inet_connection_sock *icsk = inet_csk(sk); 554 int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); 555 556 if (rc != 0) 557 return rc; 558 559 sk->sk_max_ack_backlog = 0; 560 sk->sk_ack_backlog = 0; 561 inet_csk_delack_init(sk); 562 563 /* There is race window here: we announce ourselves listening, 564 * but this transition is still not validated by get_port(). 565 * It is OK, because this socket enters to hash table only 566 * after validation is complete. 567 */ 568 sk->sk_state = TCP_LISTEN; 569 if (!sk->sk_prot->get_port(sk, inet->num)) { 570 inet->sport = htons(inet->num); 571 572 sk_dst_reset(sk); 573 sk->sk_prot->hash(sk); 574 575 return 0; 576 } 577 578 sk->sk_state = TCP_CLOSE; 579 __reqsk_queue_destroy(&icsk->icsk_accept_queue); 580 return -EADDRINUSE; 581 } 582 583 EXPORT_SYMBOL_GPL(inet_csk_listen_start); 584 585 /* 586 * This routine closes sockets which have been at least partially 587 * opened, but not yet accepted. 588 */ 589 void inet_csk_listen_stop(struct sock *sk) 590 { 591 struct inet_connection_sock *icsk = inet_csk(sk); 592 struct request_sock *acc_req; 593 struct request_sock *req; 594 595 inet_csk_delete_keepalive_timer(sk); 596 597 /* make all the listen_opt local to us */ 598 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); 599 600 /* Following specs, it would be better either to send FIN 601 * (and enter FIN-WAIT-1, it is normal close) 602 * or to send active reset (abort). 603 * Certainly, it is pretty dangerous while synflood, but it is 604 * bad justification for our negligence 8) 605 * To be honest, we are not able to make either 606 * of the variants now. --ANK 607 */ 608 reqsk_queue_destroy(&icsk->icsk_accept_queue); 609 610 while ((req = acc_req) != NULL) { 611 struct sock *child = req->sk; 612 613 acc_req = req->dl_next; 614 615 local_bh_disable(); 616 bh_lock_sock(child); 617 BUG_TRAP(!sock_owned_by_user(child)); 618 sock_hold(child); 619 620 sk->sk_prot->disconnect(child, O_NONBLOCK); 621 622 sock_orphan(child); 623 624 atomic_inc(sk->sk_prot->orphan_count); 625 626 inet_csk_destroy_sock(child); 627 628 bh_unlock_sock(child); 629 local_bh_enable(); 630 sock_put(child); 631 632 sk_acceptq_removed(sk); 633 __reqsk_free(req); 634 } 635 BUG_TRAP(!sk->sk_ack_backlog); 636 } 637 638 EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 639 640 void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) 641 { 642 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 643 const struct inet_sock *inet = inet_sk(sk); 644 645 sin->sin_family = AF_INET; 646 sin->sin_addr.s_addr = inet->daddr; 647 sin->sin_port = inet->dport; 648 } 649 650 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); 651