tcp_output.c: 9e9fd65d1fa51d919d54d731be0e66492b5b6c5a (old) → 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 (new, introduces TCP Small Queues)
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors: Ross Biro

--- 36 unchanged lines hidden ---

45/* People can turn this off for buggy TCPs found in printers etc. */
46int sysctl_tcp_retrans_collapse __read_mostly = 1;
47
48/* People can turn this on to work with those rare, broken TCPs that
49 * interpret the window field as a signed quantity.
50 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52
53/* Default TSQ limit of two TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55
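
The default of 131072 bytes is two maximally-sized TSO frames: a single TSO frame is capped at 64 KB (GSO_MAX_SIZE in kernels of this vintage), so 2 * 65536 = 131072, hence "two TSO segments". The knob is exposed as net.ipv4.tcp_limit_output_bytes. A quick, illustrative sanity check of that arithmetic:

/* Illustrative only: why the TSQ default is "two TSO segments". */
#include <assert.h>

int main(void)
{
	unsigned int gso_max_size = 65536;	/* 64 KB cap per TSO frame */

	assert(2 * gso_max_size == 131072);	/* the sysctl default above */
	return 0;
}
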
56/* This limits the percentage of the congestion window which we
57 * will allow a single TSO frame to consume. Building TSO frames
58 * which are too large can cause TCP streams to be bursty.
59 */
60int sysctl_tcp_tso_win_divisor __read_mostly = 3;
61
62int sysctl_tcp_mtu_probing __read_mostly = 0;
63int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
64
65/* By default, RFC2861 behavior. */
66int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
67
68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
70
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp);
73
74/* Account for new data that has been sent to the network. */
75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
76{
77 struct tcp_sock *tp = tcp_sk(sk);
78 unsigned int prior_packets = tp->packets_out;
79
80 tcp_advance_send_head(sk, skb);

--- 702 unchanged lines hidden ---

783 TCPOLEN_SACK_PERBLOCK);
784 size += TCPOLEN_SACK_BASE_ALIGNED +
785 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
786 }
787
788 return size;
789}
790
791
792/* TCP SMALL QUEUES (TSQ)
793 *
794 * TSQ's goal is to keep a small number of skbs per tcp flow in tx queues (qdisc+dev)
795 * to reduce RTT and bufferbloat.
796 * We do this using a special skb destructor (tcp_wfree).
797 *
798 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event an skb
799 * needs to be reallocated in a driver.
800 * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
801 *
802 * Since transmit from skb destructor is forbidden, we use a tasklet
803 * to process all sockets that eventually need to send more skbs.
804 * We use one tasklet per cpu, with its own queue of sockets.
805 */
806struct tsq_tasklet {
807 struct tasklet_struct tasklet;
808 struct list_head head; /* queue of tcp sockets */
809};
810static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
811
812/*
813 * One tasklet per cpu tries to send more skbs.
814 * We run in tasklet context but need to disable irqs when
815 * transferring tsq->head because tcp_wfree() might
816 * interrupt us (non NAPI drivers)
817 */
818static void tcp_tasklet_func(unsigned long data)
819{
820 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
821 LIST_HEAD(list);
822 unsigned long flags;
823 struct list_head *q, *n;
824 struct tcp_sock *tp;
825 struct sock *sk;
826
827 local_irq_save(flags);
828 list_splice_init(&tsq->head, &list);
829 local_irq_restore(flags);
830
831 list_for_each_safe(q, n, &list) {
832 tp = list_entry(q, struct tcp_sock, tsq_node);
833 list_del(&tp->tsq_node);
834
835 sk = (struct sock *)tp;
836 bh_lock_sock(sk);
837
838 if (!sock_owned_by_user(sk)) {
839 if ((1 << sk->sk_state) &
840 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
841 TCPF_CLOSING | TCPF_CLOSE_WAIT))
842 tcp_write_xmit(sk,
843 tcp_current_mss(sk),
844 0, 0,
845 GFP_ATOMIC);
846 } else {
847 /* defer the work to tcp_release_cb() */
848 set_bit(TSQ_OWNED, &tp->tsq_flags);
849 }
850 bh_unlock_sock(sk);
851
852 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
853 sk_free(sk);
854 }
855}
856
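
The drain above follows a classic pattern: splice the shared list onto a private one under protection, then walk the private copy with nothing held, so tcp_wfree() can keep appending concurrently. A userspace sketch of that pattern, with hypothetical names and a pthread mutex standing in for local_irq_save()/local_irq_restore():

/* Splice-then-drain sketch (hypothetical userspace analogue). */
#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int val; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_head;	/* filled by producers */

static void drain(void)
{
	struct node *list;

	pthread_mutex_lock(&lock);	/* local_irq_save() */
	list = shared_head;		/* list_splice_init(&tsq->head, ...) */
	shared_head = NULL;
	pthread_mutex_unlock(&lock);	/* local_irq_restore() */

	for (struct node *n = list; n; n = n->next)	/* no lock held */
		printf("processing socket %d\n", n->val);
}

int main(void)
{
	struct node b = { NULL, 2 }, a = { &b, 1 };

	shared_head = &a;
	drain();
	return 0;
}
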
857/**
858 * tcp_release_cb - tcp release_sock() callback
859 * @sk: socket
860 *
861 * called from release_sock() to perform protocol dependent
862 * actions before socket release.
863 */
864void tcp_release_cb(struct sock *sk)
865{
866 struct tcp_sock *tp = tcp_sk(sk);
867
868 if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
869 if ((1 << sk->sk_state) &
870 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
871 TCPF_CLOSING | TCPF_CLOSE_WAIT))
872 tcp_write_xmit(sk,
873 tcp_current_mss(sk),
874 0, 0,
875 GFP_ATOMIC);
876 }
877}
878EXPORT_SYMBOL(tcp_release_cb);
879
880void __init tcp_tasklet_init(void)
881{
882 int i;
883
884 for_each_possible_cpu(i) {
885 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
886
887 INIT_LIST_HEAD(&tsq->head);
888 tasklet_init(&tsq->tasklet,
889 tcp_tasklet_func,
890 (unsigned long)tsq);
891 }
892}
893
894/*
895 * Write buffer destructor automatically called from kfree_skb.
896 * We can't xmit new skbs from this context, as we might already
897 * hold qdisc lock.
898 */
899void tcp_wfree(struct sk_buff *skb)
900{
901 struct sock *sk = skb->sk;
902 struct tcp_sock *tp = tcp_sk(sk);
903
904 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
905 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
906 unsigned long flags;
907 struct tsq_tasklet *tsq;
908
909 /* Keep a ref on socket.
910 * This last ref will be released in tcp_tasklet_func()
911 */
912 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
913
914 /* queue this socket to tasklet queue */
915 local_irq_save(flags);
916 tsq = &__get_cpu_var(tsq_tasklet);
917 list_add(&tp->tsq_node, &tsq->head);
918 tasklet_schedule(&tsq->tasklet);
919 local_irq_restore(flags);
920 } else {
921 sock_wfree(skb);
922 }
923}
924
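
The subtle line in tcp_wfree() above is atomic_sub(skb->truesize - 1, ...): subtracting the full truesize (as sock_wfree() would) could drop the last reference and free the socket, so one unit is deliberately left behind as an implicit reference until sk_free() runs in tcp_tasklet_func(). A minimal sketch of that bookkeeping, with plain ints standing in for the kernel's atomics:

/* Hypothetical stand-in for the tcp_wfree() refcount trick. */
#include <stdio.h>

int main(void)
{
	int wmem_alloc = 4096;		/* sk->sk_wmem_alloc stand-in */
	int truesize = 4096;		/* skb->truesize stand-in */

	wmem_alloc -= truesize - 1;	/* tcp_wfree(): keep one unit */
	printf("after tcp_wfree: %d\n", wmem_alloc);	/* 1: socket pinned */

	wmem_alloc -= 1;		/* sk_free() in tcp_tasklet_func() */
	printf("after sk_free:  %d\n", wmem_alloc);	/* 0: can be freed */
	return 0;
}
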
925/* This routine actually transmits TCP packets queued in by
926 * tcp_do_sendmsg(). This is used by both the initial
927 * transmission and possible later retransmissions.
928 * All SKB's seen here are completely headerless. It is our
929 * job to build the TCP header, and pass the packet down to
930 * IP so it can do the same plus pass the packet off to the
931 * device.
932 *

--- 45 unchanged lines hidden ---

978 if (tcp_packets_in_flight(tp) == 0) {
979 tcp_ca_event(sk, CA_EVENT_TX_START);
980 skb->ooo_okay = 1;
981 } else
982 skb->ooo_okay = 0;
983
984 skb_push(skb, tcp_header_size);
985 skb_reset_transport_header(skb);
847 skb_set_owner_w(skb, sk);
848
986
987 skb_orphan(skb);
988 skb->sk = sk;
989 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
990 tcp_wfree : sock_wfree;
991 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
992
993 /* Build TCP header and checksum it. */
994 th = tcp_hdr(skb);
995 th->source = inet->inet_sport;
996 th->dest = inet->inet_dport;
997 th->seq = htonl(tcb->seq);
998 th->ack_seq = htonl(tp->rcv_nxt);
999 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1000 tcb->tcp_flags);

--- 918 unchanged lines hidden ---

1919 } else if (result > 0) {
1920 sent_pkts = 1;
1921 }
1922 }
1923
1924 while ((skb = tcp_send_head(sk))) {
1925 unsigned int limit;
1926
1927
1928 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1929 BUG_ON(!tso_segs);
1930
1931 cwnd_quota = tcp_cwnd_test(tp, skb);
1932 if (!cwnd_quota)
1933 break;
1934
1935 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))

--- 4 unchanged lines hidden ---

1940 (tcp_skb_is_last(sk, skb) ?
1941 nonagle : TCP_NAGLE_PUSH))))
1942 break;
1943 } else {
1944 if (!push_one && tcp_tso_should_defer(sk, skb))
1945 break;
1946 }
1947
1948 /* TSQ : sk_wmem_alloc accounts for skb truesize,
1949 * including skb overhead. But that's OK.
1950 */
1951 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
1952 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1953 break;
1954 }
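
This check is the producer half of the TSQ handshake: when too many bytes sit below the socket, tcp_write_xmit() sets TSQ_THROTTLED and stops; tcp_wfree() above is the consumer half, re-arming transmission at most once per throttle episode via the THROTTLED/QUEUED pair. A single-threaded sketch of the handshake (the kernel uses atomic test_and_set_bit()/test_and_clear_bit() on tp->tsq_flags):

/* Single-threaded sketch of the TSQ throttle/re-arm handshake. */
#include <stdbool.h>
#include <stdio.h>

enum { TSQ_THROTTLED = 1 << 0, TSQ_QUEUED = 1 << 1 };

static unsigned long tsq_flags;

/* tcp_write_xmit() side: too many bytes queued below the socket. */
static void throttle(void)
{
	tsq_flags |= TSQ_THROTTLED;
}

/* tcp_wfree() side: an skb was freed; re-arm at most once. */
static bool on_skb_freed(void)
{
	bool was_throttled = tsq_flags & TSQ_THROTTLED;

	tsq_flags &= ~TSQ_THROTTLED;		/* test_and_clear_bit() */
	if (was_throttled && !(tsq_flags & TSQ_QUEUED)) {
		tsq_flags |= TSQ_QUEUED;	/* test_and_set_bit() */
		return true;			/* schedule the tasklet */
	}
	return false;
}

int main(void)
{
	throttle();
	printf("first freed skb schedules tasklet: %d\n", on_skb_freed());
	printf("next freed skb is a no-op:         %d\n", on_skb_freed());
	return 0;
}
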
1955 limit = mss_now;
1956 if (tso_segs > 1 && !tcp_urg_mode(tp))
1957 limit = tcp_mss_split_point(sk, skb, mss_now,
1958 cwnd_quota);
1959
1960 if (skb->len > limit &&
1961 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
1962 break;

--- 626 unchanged lines hidden ---

2589
2590 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2591 TCP_ECN_send_synack(tcp_sk(sk), skb);
2592 }
2593 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2594 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2595}
2596
2445/* Prepare a SYN-ACK. */
2597/**
2598 * tcp_make_synack - Prepare a SYN-ACK.
2599 * @sk: listener socket
2600 * @dst: dst entry attached to the SYNACK
2601 * @req: request_sock pointer
2602 * @rvp: request_values pointer
2603 *
2604 * Allocate one skb and build a SYNACK packet.
2605 * @dst is consumed: the caller should not use it again.
2606 */
2607struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2608 struct request_sock *req,
2609 struct request_values *rvp)
2610{
2611 struct tcp_out_options opts;
2612 struct tcp_extend_values *xvp = tcp_xv(rvp);
2613 struct inet_request_sock *ireq = inet_rsk(req);
2614 struct tcp_sock *tp = tcp_sk(sk);
2615 const struct tcp_cookie_values *cvp = tp->cookie_values;
2616 struct tcphdr *th;
2617 struct sk_buff *skb;
2618 struct tcp_md5sig_key *md5;
2619 int tcp_header_size;
2620 int mss;
2621 int s_data_desired = 0;
2622
2623 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2624 s_data_desired = cvp->s_data_desired;
2464 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2465 if (skb == NULL)
2625 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
2626 if (unlikely(!skb)) {
2627 dst_release(dst);
2466 return NULL;
2628 return NULL;
2467
2629 }
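
The new error path is the behavioural change here: tcp_make_synack() now consumes @dst on every path (note the dst_release() on allocation failure, and the skb_dst_set() below that no longer takes an extra dst_clone() reference), so callers must not touch dst afterwards. A userspace analogue of that ownership contract, with hypothetical names:

/* "Consumed argument" contract sketch (hypothetical names). */
#include <stdio.h>
#include <stdlib.h>

struct dst { int refcnt; };

static void dst_release(struct dst *d)
{
	if (--d->refcnt == 0)
		printf("dst freed\n");
}

struct pkt { struct dst *d; };

/* Consumes @d on every path; the caller must not use it again. */
static struct pkt *make_synack(struct dst *d, int simulate_oom)
{
	struct pkt *p = simulate_oom ? NULL : malloc(sizeof(*p));

	if (!p) {
		dst_release(d);		/* failure path still drops the ref */
		return NULL;
	}
	p->d = d;			/* success path transfers ownership */
	return p;
}

int main(void)
{
	struct dst d = { .refcnt = 1 };

	if (!make_synack(&d, 1))
		printf("alloc failed; dst already released by callee\n");
	return 0;
}
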
2630 /* Reserve space for headers. */
2631 skb_reserve(skb, MAX_TCP_HEADER);
2632
2471 skb_dst_set(skb, dst_clone(dst));
2633 skb_dst_set(skb, dst);
2634
2635 mss = dst_metric_advmss(dst);
2636 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2637 mss = tp->rx_opt.user_mss;
2638
2639 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2640 __u8 rcv_wscale;
2641 /* Set this up on the first call only */

--- 425 unchanged lines hidden ---