tcp_output.c (9e9fd65d1fa51d919d54d731be0e66492b5b6c5a) | tcp_output.c (46d3ceabd8d98ed0ad10f20c595ca784e34786c5) |
---|---|
1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 36 unchanged lines hidden (view full) --- 45/* People can turn this off for buggy TCP's found in printers etc. */ 46int sysctl_tcp_retrans_collapse __read_mostly = 1; 47 48/* People can turn this on to work with those rare, broken TCPs that 49 * interpret the window field as a signed quantity. 50 */ 51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 52 | 1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 36 unchanged lines hidden (view full) --- 45/* People can turn this off for buggy TCP's found in printers etc. */ 46int sysctl_tcp_retrans_collapse __read_mostly = 1; 47 48/* People can turn this on to work with those rare, broken TCPs that 49 * interpret the window field as a signed quantity. 50 */ 51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 52 |
| | 53/* Default TSQ limit of two TSO segments */ 54int sysctl_tcp_limit_output_bytes __read_mostly = 131072; 55 |
53/* This limits the percentage of the congestion window which we 54 * will allow a single TSO frame to consume. Building TSO frames 55 * which are too large can cause TCP streams to be bursty. 56 */ 57int sysctl_tcp_tso_win_divisor __read_mostly = 3; 58 59int sysctl_tcp_mtu_probing __read_mostly = 0; 60int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; 61 62/* By default, RFC2861 behavior. */ 63int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 64 65int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 66EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 67 | 56/* This limits the percentage of the congestion window which we 57 * will allow a single TSO frame to consume. Building TSO frames 58 * which are too large can cause TCP streams to be bursty. 59 */ 60int sysctl_tcp_tso_win_divisor __read_mostly = 3; 61 62int sysctl_tcp_mtu_probing __read_mostly = 0; 63int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; 64 65/* By default, RFC2861 behavior. */ 66int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 67 68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 70 |
| | 71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 72 int push_one, gfp_t gfp); |
68 69/* Account for new data that has been sent to the network. */ 70static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 71{ 72 struct tcp_sock *tp = tcp_sk(sk); 73 unsigned int prior_packets = tp->packets_out; 74 75 tcp_advance_send_head(sk, skb); --- 702 unchanged lines hidden (view full) --- 778 TCPOLEN_SACK_PERBLOCK); 779 size += TCPOLEN_SACK_BASE_ALIGNED + 780 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; 781 } 782 783 return size; 784} 785 | 73 74/* Account for new data that has been sent to the network. */ 75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 76{ 77 struct tcp_sock *tp = tcp_sk(sk); 78 unsigned int prior_packets = tp->packets_out; 79 80 tcp_advance_send_head(sk, skb); --- 702 unchanged lines hidden (view full) --- 783 TCPOLEN_SACK_PERBLOCK); 784 size += TCPOLEN_SACK_BASE_ALIGNED + 785 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; 786 } 787 788 return size; 789} 790 |
| | 791 792/* TCP SMALL QUEUES (TSQ) 793 * 794 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) 795 * to reduce RTT and bufferbloat. 796 * We do this using a special skb destructor (tcp_wfree). 797 * 798 * It's important tcp_wfree() can be replaced by sock_wfree() in the event skb 799 * needs to be reallocated in a driver. 800 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc 801 * 802 * Since transmit from skb destructor is forbidden, we use a tasklet 803 * to process all sockets that eventually need to send more skbs. 804 * We use one tasklet per cpu, with its own queue of sockets. 805 */ 806struct tsq_tasklet { 807 struct tasklet_struct tasklet; 808 struct list_head head; /* queue of tcp sockets */ 809}; 810static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); 811 812/* 813 * One tasklet per cpu tries to send more skbs. 814 * We run in tasklet context but need to disable irqs when 815 * transferring tsq->head because tcp_wfree() might 816 * interrupt us (non NAPI drivers) 817 */ 818static void tcp_tasklet_func(unsigned long data) 819{ 820 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data; 821 LIST_HEAD(list); 822 unsigned long flags; 823 struct list_head *q, *n; 824 struct tcp_sock *tp; 825 struct sock *sk; 826 827 local_irq_save(flags); 828 list_splice_init(&tsq->head, &list); 829 local_irq_restore(flags); 830 831 list_for_each_safe(q, n, &list) { 832 tp = list_entry(q, struct tcp_sock, tsq_node); 833 list_del(&tp->tsq_node); 834 835 sk = (struct sock *)tp; 836 bh_lock_sock(sk); 837 838 if (!sock_owned_by_user(sk)) { 839 if ((1 << sk->sk_state) & 840 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | 841 TCPF_CLOSING | TCPF_CLOSE_WAIT)) 842 tcp_write_xmit(sk, 843 tcp_current_mss(sk), 844 0, 0, 845 GFP_ATOMIC); 846 } else { 847 /* defer the work to tcp_release_cb() */ 848 set_bit(TSQ_OWNED, &tp->tsq_flags); 849 } 850 bh_unlock_sock(sk); 851 852 clear_bit(TSQ_QUEUED, &tp->tsq_flags); 853 sk_free(sk); 854 } 855} 856 857/** 858 * tcp_release_cb - tcp release_sock() callback 859 * @sk: socket 860 * 861 * called from release_sock() to perform protocol dependent 862 * actions before socket release. 863 */ 864void tcp_release_cb(struct sock *sk) 865{ 866 struct tcp_sock *tp = tcp_sk(sk); 867 868 if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) { 869 if ((1 << sk->sk_state) & 870 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | 871 TCPF_CLOSING | TCPF_CLOSE_WAIT)) 872 tcp_write_xmit(sk, 873 tcp_current_mss(sk), 874 0, 0, 875 GFP_ATOMIC); 876 } 877} 878EXPORT_SYMBOL(tcp_release_cb); 879 880void __init tcp_tasklet_init(void) 881{ 882 int i; 883 884 for_each_possible_cpu(i) { 885 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); 886 887 INIT_LIST_HEAD(&tsq->head); 888 tasklet_init(&tsq->tasklet, 889 tcp_tasklet_func, 890 (unsigned long)tsq); 891 } 892} 893 894/* 895 * Write buffer destructor automatically called from kfree_skb. 896 * We can't xmit new skbs from this context, as we might already 897 * hold qdisc lock. 898 */ 899void tcp_wfree(struct sk_buff *skb) 900{ 901 struct sock *sk = skb->sk; 902 struct tcp_sock *tp = tcp_sk(sk); 903 904 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) && 905 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { 906 unsigned long flags; 907 struct tsq_tasklet *tsq; 908 909 /* Keep a ref on socket. 910 * This last ref will be released in tcp_tasklet_func() 911 */ 912 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); 913 914 /* queue this socket to tasklet queue */ 915 local_irq_save(flags); 916 tsq = &__get_cpu_var(tsq_tasklet); 917 list_add(&tp->tsq_node, &tsq->head); 918 tasklet_schedule(&tsq->tasklet); 919 local_irq_restore(flags); 920 } else { 921 sock_wfree(skb); 922 } 923} 924 |
786/* This routine actually transmits TCP packets queued in by 787 * tcp_do_sendmsg(). This is used by both the initial 788 * transmission and possible later retransmissions. 789 * All SKB's seen here are completely headerless. It is our 790 * job to build the TCP header, and pass the packet down to 791 * IP so it can do the same plus pass the packet off to the 792 * device. 793 * --- 45 unchanged lines hidden (view full) --- 839 if (tcp_packets_in_flight(tp) == 0) { 840 tcp_ca_event(sk, CA_EVENT_TX_START); 841 skb->ooo_okay = 1; 842 } else 843 skb->ooo_okay = 0; 844 845 skb_push(skb, tcp_header_size); 846 skb_reset_transport_header(skb); | 925/* This routine actually transmits TCP packets queued in by 926 * tcp_do_sendmsg(). This is used by both the initial 927 * transmission and possible later retransmissions. 928 * All SKB's seen here are completely headerless. It is our 929 * job to build the TCP header, and pass the packet down to 930 * IP so it can do the same plus pass the packet off to the 931 * device. 932 * --- 45 unchanged lines hidden (view full) --- 978 if (tcp_packets_in_flight(tp) == 0) { 979 tcp_ca_event(sk, CA_EVENT_TX_START); 980 skb->ooo_okay = 1; 981 } else 982 skb->ooo_okay = 0; 983 984 skb_push(skb, tcp_header_size); 985 skb_reset_transport_header(skb); |
847 skb_set_owner_w(skb, sk); | |
848 | 986 |
| | 987 skb_orphan(skb); 988 skb->sk = sk; 989 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? 990 tcp_wfree : sock_wfree; 991 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 992 |
849 /* Build TCP header and checksum it. */ 850 th = tcp_hdr(skb); 851 th->source = inet->inet_sport; 852 th->dest = inet->inet_dport; 853 th->seq = htonl(tcb->seq); 854 th->ack_seq = htonl(tp->rcv_nxt); 855 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 856 tcb->tcp_flags); --- 918 unchanged lines hidden (view full) --- 1775 } else if (result > 0) { 1776 sent_pkts = 1; 1777 } 1778 } 1779 1780 while ((skb = tcp_send_head(sk))) { 1781 unsigned int limit; 1782 | 993 /* Build TCP header and checksum it. */ 994 th = tcp_hdr(skb); 995 th->source = inet->inet_sport; 996 th->dest = inet->inet_dport; 997 th->seq = htonl(tcb->seq); 998 th->ack_seq = htonl(tp->rcv_nxt); 999 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 1000 tcb->tcp_flags); --- 918 unchanged lines hidden (view full) --- 1919 } else if (result > 0) { 1920 sent_pkts = 1; 1921 } 1922 } 1923 1924 while ((skb = tcp_send_head(sk))) { 1925 unsigned int limit; 1926 |
| | 1927 |
1783 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1784 BUG_ON(!tso_segs); 1785 1786 cwnd_quota = tcp_cwnd_test(tp, skb); 1787 if (!cwnd_quota) 1788 break; 1789 1790 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) --- 4 unchanged lines hidden (view full) --- 1795 (tcp_skb_is_last(sk, skb) ? 1796 nonagle : TCP_NAGLE_PUSH)))) 1797 break; 1798 } else { 1799 if (!push_one && tcp_tso_should_defer(sk, skb)) 1800 break; 1801 } 1802 | 1928 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1929 BUG_ON(!tso_segs); 1930 1931 cwnd_quota = tcp_cwnd_test(tp, skb); 1932 if (!cwnd_quota) 1933 break; 1934 1935 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) --- 4 unchanged lines hidden (view full) --- 1940 (tcp_skb_is_last(sk, skb) ? 1941 nonagle : TCP_NAGLE_PUSH)))) 1942 break; 1943 } else { 1944 if (!push_one && tcp_tso_should_defer(sk, skb)) 1945 break; 1946 } 1947 |
| | 1948 /* TSQ : sk_wmem_alloc accounts skb truesize, 1949 * including skb overhead. But that's OK. 1950 */ 1951 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { 1952 set_bit(TSQ_THROTTLED, &tp->tsq_flags); 1953 break; 1954 } |
1803 limit = mss_now; 1804 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1805 limit = tcp_mss_split_point(sk, skb, mss_now, 1806 cwnd_quota); 1807 1808 if (skb->len > limit && 1809 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 1810 break; --- 626 unchanged lines hidden (view full) --- 2437 2438 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2439 TCP_ECN_send_synack(tcp_sk(sk), skb); 2440 } 2441 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2442 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2443} 2444 | 1955 limit = mss_now; 1956 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1957 limit = tcp_mss_split_point(sk, skb, mss_now, 1958 cwnd_quota); 1959 1960 if (skb->len > limit && 1961 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 1962 break; --- 626 unchanged lines hidden (view full) --- 2589 2590 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2591 TCP_ECN_send_synack(tcp_sk(sk), skb); 2592 } 2593 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2594 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2595} 2596 |
2445/* Prepare a SYN-ACK. */ | 2597/** 2598 * tcp_make_synack - Prepare a SYN-ACK. 2599 * sk: listener socket 2600 * dst: dst entry attached to the SYNACK 2601 * req: request_sock pointer 2602 * rvp: request_values pointer 2603 * 2604 * Allocate one skb and build a SYNACK packet. 2605 * @dst is consumed : Caller should not use it again. 2606 */ |
2446struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2447 struct request_sock *req, 2448 struct request_values *rvp) 2449{ 2450 struct tcp_out_options opts; 2451 struct tcp_extend_values *xvp = tcp_xv(rvp); 2452 struct inet_request_sock *ireq = inet_rsk(req); 2453 struct tcp_sock *tp = tcp_sk(sk); 2454 const struct tcp_cookie_values *cvp = tp->cookie_values; 2455 struct tcphdr *th; 2456 struct sk_buff *skb; 2457 struct tcp_md5sig_key *md5; 2458 int tcp_header_size; 2459 int mss; 2460 int s_data_desired = 0; 2461 2462 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2463 s_data_desired = cvp->s_data_desired; | 2607struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2608 struct request_sock *req, 2609 struct request_values *rvp) 2610{ 2611 struct tcp_out_options opts; 2612 struct tcp_extend_values *xvp = tcp_xv(rvp); 2613 struct inet_request_sock *ireq = inet_rsk(req); 2614 struct tcp_sock *tp = tcp_sk(sk); 2615 const struct tcp_cookie_values *cvp = tp->cookie_values; 2616 struct tcphdr *th; 2617 struct sk_buff *skb; 2618 struct tcp_md5sig_key *md5; 2619 int tcp_header_size; 2620 int mss; 2621 int s_data_desired = 0; 2622 2623 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2624 s_data_desired = cvp->s_data_desired; |
2464 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); 2465 if (skb == NULL) | 2625 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); 2626 if (unlikely(!skb)) { 2627 dst_release(dst); |
2466 return NULL; | 2628 return NULL; |
2467 | 2629 } |
2468 /* Reserve space for headers. */ 2469 skb_reserve(skb, MAX_TCP_HEADER); 2470 | 2630 /* Reserve space for headers. */ 2631 skb_reserve(skb, MAX_TCP_HEADER); 2632 |
2471 skb_dst_set(skb, dst_clone(dst)); | 2633 skb_dst_set(skb, dst); |
2472 2473 mss = dst_metric_advmss(dst); 2474 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2475 mss = tp->rx_opt.user_mss; 2476 2477 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 2478 __u8 rcv_wscale; 2479 /* Set this up on the first call only */ --- 425 unchanged lines hidden --- | 2634 2635 mss = dst_metric_advmss(dst); 2636 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2637 mss = tp->rx_opt.user_mss; 2638 2639 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 2640 __u8 rcv_wscale; 2641 /* Set this up on the first call only */ --- 425 unchanged lines hidden --- |
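
A note on the new `sysctl_tcp_limit_output_bytes` default introduced above: the value 131072 matches the "two TSO segments" wording in the comment, i.e. two maximal 64 KB TSO/GSO frames. The standalone snippet below only illustrates that arithmetic, under the assumption that one TSO super-packet is at most 64 KB in this kernel generation; it is not kernel code.

```c
#include <stdio.h>

int main(void)
{
	/* Assumption: one TSO/GSO super-packet is bounded by 64 KB, so
	 * "two TSO segments" works out to the 131072-byte default.
	 */
	const unsigned int max_tso_frame = 64 * 1024;
	const unsigned int tcp_limit_output_bytes = 2 * max_tso_frame;

	printf("default tcp_limit_output_bytes = %u\n", tcp_limit_output_bytes);
	return 0;
}
```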
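The TSQ hunks above (the destructor selection in tcp_transmit_skb(), tcp_wfree(), tcp_tasklet_func(), tcp_release_cb() and the throttle test in tcp_write_xmit()) cooperate through three bits in tp->tsq_flags. The userspace sketch below walks through that handshake: the TSQ_* names mirror the patch, but the test_and_set()/test_and_clear() helpers are simplified, non-atomic stand-ins, not the kernel bit API.

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the tsq_flags bits used by the patch. */
enum { TSQ_THROTTLED = 1 << 0, TSQ_QUEUED = 1 << 1, TSQ_OWNED = 1 << 2 };

static bool test_and_clear(unsigned long *flags, unsigned long bit)
{
	bool was_set = (*flags & bit) != 0;

	*flags &= ~bit;
	return was_set;
}

static bool test_and_set(unsigned long *flags, unsigned long bit)
{
	bool was_set = (*flags & bit) != 0;

	*flags |= bit;
	return was_set;
}

int main(void)
{
	unsigned long tsq_flags = 0;

	/* 1. tcp_write_xmit() finds sk_wmem_alloc above the limit: it stops
	 *    sending and marks the flow as throttled.
	 */
	tsq_flags |= TSQ_THROTTLED;

	/* 2. tcp_wfree() runs when the qdisc/driver frees a sent skb: a
	 *    throttled flow is queued exactly once on the per-cpu tasklet.
	 */
	if (test_and_clear(&tsq_flags, TSQ_THROTTLED) &&
	    !test_and_set(&tsq_flags, TSQ_QUEUED))
		printf("queue socket on per-cpu list, tasklet_schedule()\n");

	/* 3. tcp_tasklet_func() either transmits directly or, when the socket
	 *    is owned by user context, sets TSQ_OWNED so tcp_release_cb()
	 *    finishes the job later; TSQ_QUEUED is cleared either way.
	 */
	test_and_set(&tsq_flags, TSQ_OWNED);
	test_and_clear(&tsq_flags, TSQ_QUEUED);

	if (test_and_clear(&tsq_flags, TSQ_OWNED))
		printf("tcp_release_cb() would call tcp_write_xmit() here\n");

	return 0;
}
```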
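The throttle test added to tcp_write_xmit() compares sk_wmem_alloc, which counts skb->truesize (payload plus skb and allocation overhead) for every packet still owned by the qdisc or driver, against the sysctl limit. The toy model below shows that accounting; the sizes and helper names are made up for illustration and are not the kernel implementation.

```c
#include <stdio.h>

static unsigned int sk_wmem_alloc;			/* bytes still in tx queues */
static const unsigned int limit_output_bytes = 131072;	/* sysctl default */

/* tcp_transmit_skb() adds truesize when a packet is handed down the stack. */
static void transmit(unsigned int truesize)
{
	sk_wmem_alloc += truesize;
}

/* tcp_wfree()/sock_wfree() subtract it when the qdisc or driver frees the skb. */
static void skb_freed(unsigned int truesize)
{
	sk_wmem_alloc -= truesize;
}

int main(void)
{
	/* truesize is a bit larger than the 64 KB payload because of skb
	 * overhead; the patch comment notes that this is acceptable here.
	 */
	transmit(66816);	/* first TSO frame (illustrative size) */
	transmit(66816);	/* second one */

	if (sk_wmem_alloc >= limit_output_bytes)
		printf("throttled: set TSQ_THROTTLED, stop tcp_write_xmit()\n");

	skb_freed(66816);	/* driver completes the first frame */
	if (sk_wmem_alloc < limit_output_bytes)
		printf("room again: tcp_wfree() schedules the tasklet\n");

	return 0;
}
```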
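The tcp_make_synack() hunk above changes the dst handling from dst_clone() to consuming the caller's reference: on success the reference moves into the skb via skb_dst_set(), and on allocation failure it is dropped with dst_release(), so the caller must not use dst again in either case. Below is a minimal userspace sketch of that "consumed reference" convention; the struct and helper names are stand-ins, not the kernel API.

```c
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for a refcounted route entry and a packet that may own one. */
struct dst { int refcnt; };
struct pkt { struct dst *dst; };

static void dst_release(struct dst *d)
{
	if (--d->refcnt == 0)
		printf("dst reference dropped, entry can be freed\n");
}

/* The callee consumes the dst reference in every outcome, mirroring the
 * new tcp_make_synack() contract: attach it on success, release it on
 * allocation failure.
 */
static struct pkt *make_synack(struct dst *d, int fail_alloc)
{
	struct pkt *p = fail_alloc ? NULL : malloc(sizeof(*p));

	if (!p) {
		dst_release(d);
		return NULL;
	}
	p->dst = d;		/* reference ownership moves into the packet */
	return p;
}

int main(void)
{
	struct dst d = { .refcnt = 1 };

	if (!make_synack(&d, 1))
		printf("allocation failed; caller must not touch d again\n");
	/* No dst_release() here: the callee already consumed the reference. */
	return 0;
}
```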