tcp.c (old: edfbeecd92b0c4a648ed96a7e255bfc9a1bc4642) | tcp.c (new: 76a9ebe811fb3d0605cb084f1ae6be5610541865) |
---|---|
1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 1281 unchanged lines hidden --- 1290 1291 process_backlog = true; 1292 skb->ip_summed = CHECKSUM_PARTIAL; 1293 1294 skb_entail(sk, skb); 1295 copy = size_goal; 1296 1297 /* All packets are restored as if they have | 1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 1281 unchanged lines hidden --- 1290 1291 process_backlog = true; 1292 skb->ip_summed = CHECKSUM_PARTIAL; 1293 1294 skb_entail(sk, skb); 1295 copy = size_goal; 1296 1297 /* All packets are restored as if they have |
1298 * already been sent. skb_mstamp isn't set to | 1298 * already been sent. skb_mstamp_ns isn't set to |
1299 * avoid wrong rtt estimation. 1300 */ 1301 if (tp->repair) 1302 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; 1303 } 1304 1305 /* Try to append data to the end of skb. */ 1306 if (copy > msg_data_left(msg)) --- 441 unchanged lines hidden --- 1748 struct tcp_zerocopy_receive *zc) 1749{ 1750 unsigned long address = (unsigned long)zc->address; 1751 const skb_frag_t *frags = NULL; 1752 u32 length = 0, seq, offset; 1753 struct vm_area_struct *vma; 1754 struct sk_buff *skb = NULL; 1755 struct tcp_sock *tp; | 1299 * avoid wrong rtt estimation. 1300 */ 1301 if (tp->repair) 1302 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; 1303 } 1304 1305 /* Try to append data to the end of skb. */ 1306 if (copy > msg_data_left(msg)) --- 441 unchanged lines hidden --- 1748 struct tcp_zerocopy_receive *zc) 1749{ 1750 unsigned long address = (unsigned long)zc->address; 1751 const skb_frag_t *frags = NULL; 1752 u32 length = 0, seq, offset; 1753 struct vm_area_struct *vma; 1754 struct sk_buff *skb = NULL; 1755 struct tcp_sock *tp; |
| 1756 int inq; |
1756 int ret; 1757 1758 if (address & (PAGE_SIZE - 1) || address != zc->address) 1759 return -EINVAL; 1760 1761 if (sk->sk_state == TCP_LISTEN) 1762 return -ENOTCONN; 1763 --- 4 unchanged lines hidden --- 1768 ret = -EINVAL; 1769 vma = find_vma(current->mm, address); 1770 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) 1771 goto out; 1772 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); 1773 1774 tp = tcp_sk(sk); 1775 seq = tp->copied_seq; | 1757 int ret; 1758 1759 if (address & (PAGE_SIZE - 1) || address != zc->address) 1760 return -EINVAL; 1761 1762 if (sk->sk_state == TCP_LISTEN) 1763 return -ENOTCONN; 1764 --- 4 unchanged lines hidden --- 1769 ret = -EINVAL; 1770 vma = find_vma(current->mm, address); 1771 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) 1772 goto out; 1773 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); 1774 1775 tp = tcp_sk(sk); 1776 seq = tp->copied_seq; |
1776 zc->length = min_t(u32, zc->length, tcp_inq(sk)); | 1777 inq = tcp_inq(sk); 1778 zc->length = min_t(u32, zc->length, inq); |
1777 zc->length &= ~(PAGE_SIZE - 1); | 1779 zc->length &= ~(PAGE_SIZE - 1); |
1778 1779 zap_page_range(vma, address, zc->length); 1780 1781 zc->recv_skip_hint = 0; | 1780 if (zc->length) { 1781 zap_page_range(vma, address, zc->length); 1782 zc->recv_skip_hint = 0; 1783 } else { 1784 zc->recv_skip_hint = inq; 1785 } |
1782 ret = 0; 1783 while (length + PAGE_SIZE <= zc->length) { 1784 if (zc->recv_skip_hint < PAGE_SIZE) { 1785 if (skb) { 1786 skb = skb->next; 1787 offset = seq - TCP_SKB_CB(skb)->seq; 1788 } else { 1789 skb = tcp_recv_skb(sk, seq, &offset); --- 6 unchanged lines hidden --- 1796 frags = skb_shinfo(skb)->frags; 1797 while (offset) { 1798 if (frags->size > offset) 1799 goto out; 1800 offset -= frags->size; 1801 frags++; 1802 } 1803 } | 1786 ret = 0; 1787 while (length + PAGE_SIZE <= zc->length) { 1788 if (zc->recv_skip_hint < PAGE_SIZE) { 1789 if (skb) { 1790 skb = skb->next; 1791 offset = seq - TCP_SKB_CB(skb)->seq; 1792 } else { 1793 skb = tcp_recv_skb(sk, seq, &offset); --- 6 unchanged lines hidden --- 1800 frags = skb_shinfo(skb)->frags; 1801 while (offset) { 1802 if (frags->size > offset) 1803 goto out; 1804 offset -= frags->size; 1805 frags++; 1806 } 1807 } |
1804 if (frags->size != PAGE_SIZE || frags->page_offset) | 1808 if (frags->size != PAGE_SIZE || frags->page_offset) { 1809 int remaining = zc->recv_skip_hint; 1810 1811 while (remaining && (frags->size != PAGE_SIZE || 1812 frags->page_offset)) { 1813 remaining -= frags->size; 1814 frags++; 1815 } 1816 zc->recv_skip_hint -= remaining; |
1805 break; | 1817 break; |
| 1818 } |
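Note on the tcp_zerocopy_receive() hunks above: the new side reports the pending byte count (tcp_inq()) through recv_skip_hint when no whole page can be mapped, and walks over misaligned frags so the hint tells user space exactly how many bytes must still be read through the normal copy path. A minimal user-space sketch follows; struct zc_args mirrors the uapi struct tcp_zerocopy_receive of this era (address, length, recv_skip_hint), while zc_read() and its fallback handling are our illustration, not kernel code.

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    #ifndef TCP_ZEROCOPY_RECEIVE
    #define TCP_ZEROCOPY_RECEIVE 35         /* uapi value, if libc headers lack it */
    #endif

    struct zc_args {                        /* mirrors struct tcp_zerocopy_receive */
        unsigned long long address;         /* in: page-aligned, mmap()ed on the socket */
        unsigned int length;                /* in: bytes wanted; out: bytes mapped */
        unsigned int recv_skip_hint;        /* out: bytes to read via recv() instead */
    };

    /* 'map' must come from mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0)
     * on the TCP socket itself, so the kernel's tcp_vm_ops check passes. */
    static long zc_read(int fd, void *map, unsigned int map_len,
                        char *tail, unsigned int tail_len)
    {
        struct zc_args zc = {
            .address = (unsigned long long)(unsigned long)map,
            .length  = map_len,
        };
        socklen_t optlen = sizeof(zc);
        long total;

        if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &optlen))
            return -1;                      /* fall back to plain recv() */

        total = zc.length;                  /* bytes now visible through 'map' */
        if (zc.recv_skip_hint) {            /* sub-page tail: copy it normally */
            long r = recv(fd, tail,
                          zc.recv_skip_hint < tail_len ? zc.recv_skip_hint : tail_len,
                          MSG_DONTWAIT);
            if (r > 0)
                total += r;
        }
        return total;
    }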
1806 ret = vm_insert_page(vma, address + length, 1807 skb_frag_page(frags)); 1808 if (ret) 1809 break; 1810 length += PAGE_SIZE; 1811 seq += PAGE_SIZE; 1812 zc->recv_skip_hint -= PAGE_SIZE; 1813 frags++; --- 584 unchanged lines hidden --- 2398 2399 sk_stream_wait_close(sk, timeout); 2400 2401adjudge_to_death: 2402 state = sk->sk_state; 2403 sock_hold(sk); 2404 sock_orphan(sk); 2405 | 1819 ret = vm_insert_page(vma, address + length, 1820 skb_frag_page(frags)); 1821 if (ret) 1822 break; 1823 length += PAGE_SIZE; 1824 seq += PAGE_SIZE; 1825 zc->recv_skip_hint -= PAGE_SIZE; 1826 frags++; --- 584 unchanged lines hidden --- 2411 2412 sk_stream_wait_close(sk, timeout); 2413 2414adjudge_to_death: 2415 state = sk->sk_state; 2416 sock_hold(sk); 2417 sock_orphan(sk); 2418 |
2406 /* It is the last release_sock in its life. It will remove backlog. */ 2407 release_sock(sk); 2408 2409 2410 /* Now socket is owned by kernel and we acquire BH lock 2411 * to finish close. No need to check for user refs. 2412 */ | |
2413 local_bh_disable(); 2414 bh_lock_sock(sk); | 2419 local_bh_disable(); 2420 bh_lock_sock(sk); |
2415 WARN_ON(sock_owned_by_user(sk)); | 2421 /* remove backlog if any, without releasing ownership. */ 2422 __release_sock(sk); |
2416 2417 percpu_counter_inc(sk->sk_prot->orphan_count); 2418 2419 /* Have we already been destroyed by a softirq or backlog? */ 2420 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) 2421 goto out; 2422 2423 /* This is a (useful) BSD violating of the RFC. There is a --- 52 unchanged lines hidden --- 2476 reqsk_fastopen_remove(sk, req, false); 2477 inet_csk_destroy_sock(sk); 2478 } 2479 /* Otherwise, socket is reprieved until protocol close. */ 2480 2481out: 2482 bh_unlock_sock(sk); 2483 local_bh_enable(); | 2423 2424 percpu_counter_inc(sk->sk_prot->orphan_count); 2425 2426 /* Have we already been destroyed by a softirq or backlog? */ 2427 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) 2428 goto out; 2429 2430 /* This is a (useful) BSD violating of the RFC. There is a --- 52 unchanged lines hidden --- 2483 reqsk_fastopen_remove(sk, req, false); 2484 inet_csk_destroy_sock(sk); 2485 } 2486 /* Otherwise, socket is reprieved until protocol close. */ 2487 2488out: 2489 bh_unlock_sock(sk); 2490 local_bh_enable(); |
| 2491 release_sock(sk); |
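Note on the tcp_close() reordering above: the old side dropped socket ownership with release_sock() and only then took the BH lock (the removed comment block and the WARN_ON documented that window); the new side keeps ownership, drains the backlog under the BH lock with __release_sock(), and releases ownership only after the close logic has run. Condensed, with the unchanged orphan/close logic elided:

    /* old ordering */
    release_sock(sk);           /* drops ownership, processes backlog */
    local_bh_disable();
    bh_lock_sock(sk);
    /* ... orphan + close logic ... */
    bh_unlock_sock(sk);
    local_bh_enable();

    /* new ordering */
    local_bh_disable();
    bh_lock_sock(sk);
    __release_sock(sk);         /* drain backlog without giving up ownership */
    /* ... orphan + close logic ... */
    bh_unlock_sock(sk);
    local_bh_enable();
    release_sock(sk);           /* ownership finally dropped here */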
2484 sock_put(sk); 2485} 2486EXPORT_SYMBOL(tcp_close); 2487 2488/* These states need RST on ABORT according to RFC793 */ 2489 2490static inline bool tcp_need_reset(int state) 2491{ --- 98 unchanged lines hidden --- 2590 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2591 __sk_dst_reset(sk); 2592 dst_release(sk->sk_rx_dst); 2593 sk->sk_rx_dst = NULL; 2594 tcp_saved_syn_free(tp); 2595 tp->compressed_ack = 0; 2596 tp->bytes_sent = 0; 2597 tp->bytes_retrans = 0; | 2492 sock_put(sk); 2493} 2494EXPORT_SYMBOL(tcp_close); 2495 2496/* These states need RST on ABORT according to RFC793 */ 2497 2498static inline bool tcp_need_reset(int state) 2499{ --- 98 unchanged lines hidden --- 2598 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2599 __sk_dst_reset(sk); 2600 dst_release(sk->sk_rx_dst); 2601 sk->sk_rx_dst = NULL; 2602 tcp_saved_syn_free(tp); 2603 tp->compressed_ack = 0; 2604 tp->bytes_sent = 0; 2605 tp->bytes_retrans = 0; |
| 2606 tp->duplicate_sack[0].start_seq = 0; 2607 tp->duplicate_sack[0].end_seq = 0; |
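The two lines added above make tcp_disconnect() clear the cached duplicate-SACK block along with the rest of the per-connection state, so a reused socket cannot report stale D-SACK information. tcp_disconnect() is reachable from user space, for example by connecting the socket to an AF_UNSPEC address; a minimal sketch (the helper name is ours):

    #include <string.h>
    #include <sys/socket.h>

    /* Disconnect a connected TCP socket so it can be reused; this path
     * ends up in tcp_disconnect(), which resets per-connection state. */
    int tcp_disconnect_fd(int fd)
    {
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;
        return connect(fd, &sa, sizeof(sa));
    }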
2598 tp->dsack_dups = 0; 2599 tp->reord_seen = 0; 2600 2601 /* Clean up fastopen related fields */ 2602 tcp_free_fastopen_req(tp); 2603 inet->defer_connect = 0; 2604 2605 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); --- 490 unchanged lines hidden --- 3096 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; 3097} 3098 3099/* Return information about state of tcp endpoint in API format. */ 3100void tcp_get_info(struct sock *sk, struct tcp_info *info) 3101{ 3102 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 3103 const struct inet_connection_sock *icsk = inet_csk(sk); | 2608 tp->dsack_dups = 0; 2609 tp->reord_seen = 0; 2610 2611 /* Clean up fastopen related fields */ 2612 tcp_free_fastopen_req(tp); 2613 inet->defer_connect = 0; 2614 2615 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); --- 490 unchanged lines hidden --- 3106 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; 3107} 3108 3109/* Return information about state of tcp endpoint in API format. */ 3110void tcp_get_info(struct sock *sk, struct tcp_info *info) 3111{ 3112 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 3113 const struct inet_connection_sock *icsk = inet_csk(sk); |
| 3114 unsigned long rate; |
3104 u32 now; 3105 u64 rate64; 3106 bool slow; | 3115 u32 now; 3116 u64 rate64; 3117 bool slow; |
3107 u32 rate; | |
3108 3109 memset(info, 0, sizeof(*info)); 3110 if (sk->sk_type != SOCK_STREAM) 3111 return; 3112 3113 info->tcpi_state = inet_sk_state_load(sk); 3114 3115 /* Report meaningful fields for all TCP states, including listeners */ 3116 rate = READ_ONCE(sk->sk_pacing_rate); | 3118 3119 memset(info, 0, sizeof(*info)); 3120 if (sk->sk_type != SOCK_STREAM) 3121 return; 3122 3123 info->tcpi_state = inet_sk_state_load(sk); 3124 3125 /* Report meaningful fields for all TCP states, including listeners */ 3126 rate = READ_ONCE(sk->sk_pacing_rate); |
3117 rate64 = rate != ~0U ? rate : ~0ULL; | 3127 rate64 = (rate != ~0UL) ? rate : ~0ULL; |
3118 info->tcpi_pacing_rate = rate64; 3119 3120 rate = READ_ONCE(sk->sk_max_pacing_rate); | 3128 info->tcpi_pacing_rate = rate64; 3129 3130 rate = READ_ONCE(sk->sk_max_pacing_rate); |
3121 rate64 = rate != ~0U ? rate : ~0ULL; | 3131 rate64 = (rate != ~0UL) ? rate : ~0ULL; |
3122 info->tcpi_max_pacing_rate = rate64; 3123 3124 info->tcpi_reordering = tp->reordering; 3125 info->tcpi_snd_cwnd = tp->snd_cwnd; 3126 3127 if (info->tcpi_state == TCP_LISTEN) { 3128 /* listeners aliased fields : 3129 * tcpi_unacked -> Number of children ready for accept() --- 109 unchanged lines hidden --- 3239 0; 3240} 3241 3242struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) 3243{ 3244 const struct tcp_sock *tp = tcp_sk(sk); 3245 struct sk_buff *stats; 3246 struct tcp_info info; | 3132 info->tcpi_max_pacing_rate = rate64; 3133 3134 info->tcpi_reordering = tp->reordering; 3135 info->tcpi_snd_cwnd = tp->snd_cwnd; 3136 3137 if (info->tcpi_state == TCP_LISTEN) { 3138 /* listeners aliased fields : 3139 * tcpi_unacked -> Number of children ready for accept() --- 109 unchanged lines hidden --- 3249 0; 3250} 3251 3252struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) 3253{ 3254 const struct tcp_sock *tp = tcp_sk(sk); 3255 struct sk_buff *stats; 3256 struct tcp_info info; |
| 3257 unsigned long rate; |
3247 u64 rate64; | 3258 u64 rate64; |
3248 u32 rate; | |
3249 3250 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); 3251 if (!stats) 3252 return NULL; 3253 3254 tcp_get_info_chrono_stats(tp, &info); 3255 nla_put_u64_64bit(stats, TCP_NLA_BUSY, 3256 info.tcpi_busy_time, TCP_NLA_PAD); 3257 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, 3258 info.tcpi_rwnd_limited, TCP_NLA_PAD); 3259 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, 3260 info.tcpi_sndbuf_limited, TCP_NLA_PAD); 3261 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, 3262 tp->data_segs_out, TCP_NLA_PAD); 3263 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, 3264 tp->total_retrans, TCP_NLA_PAD); 3265 3266 rate = READ_ONCE(sk->sk_pacing_rate); | 3259 3260 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); 3261 if (!stats) 3262 return NULL; 3263 3264 tcp_get_info_chrono_stats(tp, &info); 3265 nla_put_u64_64bit(stats, TCP_NLA_BUSY, 3266 info.tcpi_busy_time, TCP_NLA_PAD); 3267 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, 3268 info.tcpi_rwnd_limited, TCP_NLA_PAD); 3269 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, 3270 info.tcpi_sndbuf_limited, TCP_NLA_PAD); 3271 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, 3272 tp->data_segs_out, TCP_NLA_PAD); 3273 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, 3274 tp->total_retrans, TCP_NLA_PAD); 3275 3276 rate = READ_ONCE(sk->sk_pacing_rate); |
3267 rate64 = rate != ~0U ? rate : ~0ULL; | 3277 rate64 = (rate != ~0UL) ? rate : ~0ULL; |
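Note on the rate handling in tcp_get_info() and tcp_get_timestamping_opt_stats() above: this commit widens sk_pacing_rate and sk_max_pacing_rate from u32 to unsigned long, so the locals follow suit and the "unlimited" sentinel is compared as ~0UL rather than ~0U. The point of the widening is that pacing rates at or above 2^32 bytes per second no longer truncate. A stand-alone demonstration of the truncation a u32 local causes (assumes a 64-bit build, where unsigned long is 64 bits):

    #include <stdio.h>

    int main(void)
    {
        unsigned long pacing = 6UL * 1000 * 1000 * 1000;    /* 6 GB/s, > 2^32 */
        unsigned int as_u32 = pacing;                       /* old-style u32 local */

        printf("unsigned long: %lu\n", pacing);             /* 6000000000 */
        printf("truncated u32: %u\n", as_u32);              /* 1705032704 */
        return 0;
    }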
3268 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); 3269 3270 rate64 = tcp_compute_delivery_rate(tp); 3271 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); 3272 3273 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); 3274 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); 3275 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); --- 613 unchanged lines hidden --- 3889 max_wshare = min(4UL*1024*1024, limit); 3890 max_rshare = min(6UL*1024*1024, limit); 3891 3892 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3893 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; 3894 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3895 3896 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; | 3278 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); 3279 3280 rate64 = tcp_compute_delivery_rate(tp); 3281 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); 3282 3283 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); 3284 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); 3285 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); --- 613 unchanged lines hidden --- 3899 max_wshare = min(4UL*1024*1024, limit); 3900 max_rshare = min(6UL*1024*1024, limit); 3901 3902 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3903 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; 3904 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3905 3906 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; |
3897 init_net.ipv4.sysctl_tcp_rmem[1] = 87380; 3898 init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); | 3907 init_net.ipv4.sysctl_tcp_rmem[1] = 131072; 3908 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); |
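The hunk above raises the receive-buffer defaults from the historical 87380 bytes to 131072: tcp_rmem[1] is the initial per-socket receive buffer and tcp_rmem[2] the autotuning ceiling (still bounded by max_rshare). The resulting triple is visible on a running kernel as net.ipv4.tcp_rmem; a quick check reading the procfs file directly:

    #include <stdio.h>

    int main(void)
    {
        char buf[64];
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_rmem", "r");   /* "min default max" */

        if (!f)
            return 1;
        if (fgets(buf, sizeof(buf), f))
            printf("tcp_rmem: %s", buf);    /* e.g. "4096 131072 6291456" */
        fclose(f);
        return 0;
    }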
3899 3900 pr_info("Hash tables configured (established %u bind %u)\n", 3901 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3902 3903 tcp_v4_init(); 3904 tcp_metrics_init(); 3905 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); 3906 tcp_tasklet_init(); 3907} | 3909 3910 pr_info("Hash tables configured (established %u bind %u)\n", 3911 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3912 3913 tcp_v4_init(); 3914 tcp_metrics_init(); 3915 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); 3916 tcp_tasklet_init(); 3917} |