tcp.c (edfbeecd92b0c4a648ed96a7e255bfc9a1bc4642) tcp.c (76a9ebe811fb3d0605cb084f1ae6be5610541865)
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors: Ross Biro

--- 1281 unchanged lines hidden (view full) ---

1290
1291 process_backlog = true;
1292 skb->ip_summed = CHECKSUM_PARTIAL;
1293
1294 skb_entail(sk, skb);
1295 copy = size_goal;
1296
1297 /* All packets are restored as if they have
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors: Ross Biro

--- 1281 unchanged lines hidden (view full) ---

1290
1291 process_backlog = true;
1292 skb->ip_summed = CHECKSUM_PARTIAL;
1293
1294 skb_entail(sk, skb);
1295 copy = size_goal;
1296
1297 /* All packets are restored as if they have
1298 * already been sent. skb_mstamp isn't set to
1298 * already been sent. skb_mstamp_ns isn't set to
1299 * avoid wrong rtt estimation.
1300 */
1301 if (tp->repair)
1302 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1303 }
1304
1305 /* Try to append data to the end of skb. */
1306 if (copy > msg_data_left(msg))

--- 441 unchanged lines hidden (view full) ---

1748 struct tcp_zerocopy_receive *zc)
1749{
1750 unsigned long address = (unsigned long)zc->address;
1751 const skb_frag_t *frags = NULL;
1752 u32 length = 0, seq, offset;
1753 struct vm_area_struct *vma;
1754 struct sk_buff *skb = NULL;
1755 struct tcp_sock *tp;
1299 * avoid wrong rtt estimation.
1300 */
1301 if (tp->repair)
1302 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1303 }
1304
1305 /* Try to append data to the end of skb. */
1306 if (copy > msg_data_left(msg))

--- 441 unchanged lines hidden (view full) ---

1748 struct tcp_zerocopy_receive *zc)
1749{
1750 unsigned long address = (unsigned long)zc->address;
1751 const skb_frag_t *frags = NULL;
1752 u32 length = 0, seq, offset;
1753 struct vm_area_struct *vma;
1754 struct sk_buff *skb = NULL;
1755 struct tcp_sock *tp;
1756 int inq;
1756 int ret;
1757
1758 if (address & (PAGE_SIZE - 1) || address != zc->address)
1759 return -EINVAL;
1760
1761 if (sk->sk_state == TCP_LISTEN)
1762 return -ENOTCONN;
1763

--- 4 unchanged lines hidden (view full) ---

1768 ret = -EINVAL;
1769 vma = find_vma(current->mm, address);
1770 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
1771 goto out;
1772 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1773
1774 tp = tcp_sk(sk);
1775 seq = tp->copied_seq;
1757 int ret;
1758
1759 if (address & (PAGE_SIZE - 1) || address != zc->address)
1760 return -EINVAL;
1761
1762 if (sk->sk_state == TCP_LISTEN)
1763 return -ENOTCONN;
1764

--- 4 unchanged lines hidden (view full) ---

1769 ret = -EINVAL;
1770 vma = find_vma(current->mm, address);
1771 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
1772 goto out;
1773 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1774
1775 tp = tcp_sk(sk);
1776 seq = tp->copied_seq;
1776 zc->length = min_t(u32, zc->length, tcp_inq(sk));
1777 inq = tcp_inq(sk);
1778 zc->length = min_t(u32, zc->length, inq);
1777 zc->length &= ~(PAGE_SIZE - 1);
1779 zc->length &= ~(PAGE_SIZE - 1);
1778
1779 zap_page_range(vma, address, zc->length);
1780
1781 zc->recv_skip_hint = 0;
1780 if (zc->length) {
1781 zap_page_range(vma, address, zc->length);
1782 zc->recv_skip_hint = 0;
1783 } else {
1784 zc->recv_skip_hint = inq;
1785 }
1782 ret = 0;
1783 while (length + PAGE_SIZE <= zc->length) {
1784 if (zc->recv_skip_hint < PAGE_SIZE) {
1785 if (skb) {
1786 skb = skb->next;
1787 offset = seq - TCP_SKB_CB(skb)->seq;
1788 } else {
1789 skb = tcp_recv_skb(sk, seq, &offset);

--- 6 unchanged lines hidden (view full) ---

1796 frags = skb_shinfo(skb)->frags;
1797 while (offset) {
1798 if (frags->size > offset)
1799 goto out;
1800 offset -= frags->size;
1801 frags++;
1802 }
1803 }
1786 ret = 0;
1787 while (length + PAGE_SIZE <= zc->length) {
1788 if (zc->recv_skip_hint < PAGE_SIZE) {
1789 if (skb) {
1790 skb = skb->next;
1791 offset = seq - TCP_SKB_CB(skb)->seq;
1792 } else {
1793 skb = tcp_recv_skb(sk, seq, &offset);

--- 6 unchanged lines hidden (view full) ---

1800 frags = skb_shinfo(skb)->frags;
1801 while (offset) {
1802 if (frags->size > offset)
1803 goto out;
1804 offset -= frags->size;
1805 frags++;
1806 }
1807 }
1804 if (frags->size != PAGE_SIZE || frags->page_offset)
1808 if (frags->size != PAGE_SIZE || frags->page_offset) {
1809 int remaining = zc->recv_skip_hint;
1810
1811 while (remaining && (frags->size != PAGE_SIZE ||
1812 frags->page_offset)) {
1813 remaining -= frags->size;
1814 frags++;
1815 }
1816 zc->recv_skip_hint -= remaining;
1805 break;
1817 break;
1818 }
1806 ret = vm_insert_page(vma, address + length,
1807 skb_frag_page(frags));
1808 if (ret)
1809 break;
1810 length += PAGE_SIZE;
1811 seq += PAGE_SIZE;
1812 zc->recv_skip_hint -= PAGE_SIZE;
1813 frags++;

--- 584 unchanged lines hidden (view full) ---

2398
2399 sk_stream_wait_close(sk, timeout);
2400
2401adjudge_to_death:
2402 state = sk->sk_state;
2403 sock_hold(sk);
2404 sock_orphan(sk);
2405
1819 ret = vm_insert_page(vma, address + length,
1820 skb_frag_page(frags));
1821 if (ret)
1822 break;
1823 length += PAGE_SIZE;
1824 seq += PAGE_SIZE;
1825 zc->recv_skip_hint -= PAGE_SIZE;
1826 frags++;

--- 584 unchanged lines hidden (view full) ---

2411
2412 sk_stream_wait_close(sk, timeout);
2413
2414adjudge_to_death:
2415 state = sk->sk_state;
2416 sock_hold(sk);
2417 sock_orphan(sk);
2418
2406 /* It is the last release_sock in its life. It will remove backlog. */
2407 release_sock(sk);
2408
2409
2410 /* Now socket is owned by kernel and we acquire BH lock
2411 * to finish close. No need to check for user refs.
2412 */
2413 local_bh_disable();
2414 bh_lock_sock(sk);
2419 local_bh_disable();
2420 bh_lock_sock(sk);
2415 WARN_ON(sock_owned_by_user(sk));
2421 /* remove backlog if any, without releasing ownership. */
2422 __release_sock(sk);
2416
2417 percpu_counter_inc(sk->sk_prot->orphan_count);
2418
2419 /* Have we already been destroyed by a softirq or backlog? */
2420 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2421 goto out;
2422
2423 /* This is a (useful) BSD violating of the RFC. There is a

--- 52 unchanged lines hidden (view full) ---

2476 reqsk_fastopen_remove(sk, req, false);
2477 inet_csk_destroy_sock(sk);
2478 }
2479 /* Otherwise, socket is reprieved until protocol close. */
2480
2481out:
2482 bh_unlock_sock(sk);
2483 local_bh_enable();
2423
2424 percpu_counter_inc(sk->sk_prot->orphan_count);
2425
2426 /* Have we already been destroyed by a softirq or backlog? */
2427 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2428 goto out;
2429
2430 /* This is a (useful) BSD violating of the RFC. There is a

--- 52 unchanged lines hidden (view full) ---

2483 reqsk_fastopen_remove(sk, req, false);
2484 inet_csk_destroy_sock(sk);
2485 }
2486 /* Otherwise, socket is reprieved until protocol close. */
2487
2488out:
2489 bh_unlock_sock(sk);
2490 local_bh_enable();
2491 release_sock(sk);
2484 sock_put(sk);
2485}
2486EXPORT_SYMBOL(tcp_close);
2487
2488/* These states need RST on ABORT according to RFC793 */
2489
2490static inline bool tcp_need_reset(int state)
2491{

--- 98 unchanged lines hidden (view full) ---

2590 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2591 __sk_dst_reset(sk);
2592 dst_release(sk->sk_rx_dst);
2593 sk->sk_rx_dst = NULL;
2594 tcp_saved_syn_free(tp);
2595 tp->compressed_ack = 0;
2596 tp->bytes_sent = 0;
2597 tp->bytes_retrans = 0;
2492 sock_put(sk);
2493}
2494EXPORT_SYMBOL(tcp_close);
2495
2496/* These states need RST on ABORT according to RFC793 */
2497
2498static inline bool tcp_need_reset(int state)
2499{

--- 98 unchanged lines hidden (view full) ---

2598 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2599 __sk_dst_reset(sk);
2600 dst_release(sk->sk_rx_dst);
2601 sk->sk_rx_dst = NULL;
2602 tcp_saved_syn_free(tp);
2603 tp->compressed_ack = 0;
2604 tp->bytes_sent = 0;
2605 tp->bytes_retrans = 0;
2606 tp->duplicate_sack[0].start_seq = 0;
2607 tp->duplicate_sack[0].end_seq = 0;
2598 tp->dsack_dups = 0;
2599 tp->reord_seen = 0;
2600
2601 /* Clean up fastopen related fields */
2602 tcp_free_fastopen_req(tp);
2603 inet->defer_connect = 0;
2604
2605 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

--- 490 unchanged lines hidden (view full) ---

3096 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3097}
3098
3099/* Return information about state of tcp endpoint in API format. */
3100void tcp_get_info(struct sock *sk, struct tcp_info *info)
3101{
3102 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3103 const struct inet_connection_sock *icsk = inet_csk(sk);
2608 tp->dsack_dups = 0;
2609 tp->reord_seen = 0;
2610
2611 /* Clean up fastopen related fields */
2612 tcp_free_fastopen_req(tp);
2613 inet->defer_connect = 0;
2614
2615 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

--- 490 unchanged lines hidden (view full) ---

3106 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3107}
3108
3109/* Return information about state of tcp endpoint in API format. */
3110void tcp_get_info(struct sock *sk, struct tcp_info *info)
3111{
3112 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
3113 const struct inet_connection_sock *icsk = inet_csk(sk);
3114 unsigned long rate;
3104 u32 now;
3105 u64 rate64;
3106 bool slow;
3115 u32 now;
3116 u64 rate64;
3117 bool slow;
3107 u32 rate;
3108
3109 memset(info, 0, sizeof(*info));
3110 if (sk->sk_type != SOCK_STREAM)
3111 return;
3112
3113 info->tcpi_state = inet_sk_state_load(sk);
3114
3115 /* Report meaningful fields for all TCP states, including listeners */
3116 rate = READ_ONCE(sk->sk_pacing_rate);
3118
3119 memset(info, 0, sizeof(*info));
3120 if (sk->sk_type != SOCK_STREAM)
3121 return;
3122
3123 info->tcpi_state = inet_sk_state_load(sk);
3124
3125 /* Report meaningful fields for all TCP states, including listeners */
3126 rate = READ_ONCE(sk->sk_pacing_rate);
3117 rate64 = rate != ~0U ? rate : ~0ULL;
3127 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3118 info->tcpi_pacing_rate = rate64;
3119
3120 rate = READ_ONCE(sk->sk_max_pacing_rate);
3128 info->tcpi_pacing_rate = rate64;
3129
3130 rate = READ_ONCE(sk->sk_max_pacing_rate);
3121 rate64 = rate != ~0U ? rate : ~0ULL;
3131 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3122 info->tcpi_max_pacing_rate = rate64;
3123
3124 info->tcpi_reordering = tp->reordering;
3125 info->tcpi_snd_cwnd = tp->snd_cwnd;
3126
3127 if (info->tcpi_state == TCP_LISTEN) {
3128 /* listeners aliased fields :
3129 * tcpi_unacked -> Number of children ready for accept()

--- 109 unchanged lines hidden (view full) ---

3239 0;
3240}
3241
3242struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3243{
3244 const struct tcp_sock *tp = tcp_sk(sk);
3245 struct sk_buff *stats;
3246 struct tcp_info info;
3132 info->tcpi_max_pacing_rate = rate64;
3133
3134 info->tcpi_reordering = tp->reordering;
3135 info->tcpi_snd_cwnd = tp->snd_cwnd;
3136
3137 if (info->tcpi_state == TCP_LISTEN) {
3138 /* listeners aliased fields :
3139 * tcpi_unacked -> Number of children ready for accept()

--- 109 unchanged lines hidden (view full) ---

3249 0;
3250}
3251
3252struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3253{
3254 const struct tcp_sock *tp = tcp_sk(sk);
3255 struct sk_buff *stats;
3256 struct tcp_info info;
3257 unsigned long rate;
3247 u64 rate64;
3258 u64 rate64;
3248 u32 rate;
3249
3250 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3251 if (!stats)
3252 return NULL;
3253
3254 tcp_get_info_chrono_stats(tp, &info);
3255 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3256 info.tcpi_busy_time, TCP_NLA_PAD);
3257 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3258 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3259 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3260 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3261 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3262 tp->data_segs_out, TCP_NLA_PAD);
3263 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3264 tp->total_retrans, TCP_NLA_PAD);
3265
3266 rate = READ_ONCE(sk->sk_pacing_rate);
3259
3260 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3261 if (!stats)
3262 return NULL;
3263
3264 tcp_get_info_chrono_stats(tp, &info);
3265 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3266 info.tcpi_busy_time, TCP_NLA_PAD);
3267 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3268 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3269 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3270 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3271 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3272 tp->data_segs_out, TCP_NLA_PAD);
3273 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3274 tp->total_retrans, TCP_NLA_PAD);
3275
3276 rate = READ_ONCE(sk->sk_pacing_rate);
3267 rate64 = rate != ~0U ? rate : ~0ULL;
3277 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3268 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3269
3270 rate64 = tcp_compute_delivery_rate(tp);
3271 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3272
3273 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3274 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3275 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));

--- 613 unchanged lines hidden (view full) ---

3889 max_wshare = min(4UL*1024*1024, limit);
3890 max_rshare = min(6UL*1024*1024, limit);
3891
3892 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3893 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
3894 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3895
3896 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3278 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3279
3280 rate64 = tcp_compute_delivery_rate(tp);
3281 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3282
3283 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3284 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3285 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));

--- 613 unchanged lines hidden (view full) ---

3899 max_wshare = min(4UL*1024*1024, limit);
3900 max_rshare = min(6UL*1024*1024, limit);
3901
3902 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3903 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
3904 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3905
3906 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3897 init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
3898 init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
3907 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
3908 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
3899
3900 pr_info("Hash tables configured (established %u bind %u)\n",
3901 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3902
3903 tcp_v4_init();
3904 tcp_metrics_init();
3905 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3906 tcp_tasklet_init();
3907}
3909
3910 pr_info("Hash tables configured (established %u bind %u)\n",
3911 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3912
3913 tcp_v4_init();
3914 tcp_metrics_init();
3915 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3916 tcp_tasklet_init();
3917}