From 385e20706facd376f27863bd55b7cc7720d3f27b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:00 -0700 Subject: tcp: use tp->tcp_mstamp in output path Idea is to later convert tp->tcp_mstamp to a full u64 counter using usec resolution, so that we can later have fine grained TCP TS clock (RFC 7323), regardless of HZ value. We try to refresh tp->tcp_mstamp only when necessary. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 86934bcf685a..ec7c5473c788 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -339,7 +339,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(tcp_send_head(sk)); if (!start_ts) - skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp); + tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; @@ -561,6 +561,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); event = icsk->icsk_pending; switch (event) { -- cgit v1.2.3 From d635fbe27ebee0f4b845abe5e9620c9400785a5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:03 -0700 Subject: tcp: use tcp_jiffies32 to feed tp->lsndtime Use tcp_jiffies32 instead of tcp_time_stamp to feed tp->lsndtime. tcp_time_stamp will soon be a litle bit more expensive than simply reading 'jiffies'. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ec7c5473c788..5f6f219a431e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -63,7 +63,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -73,7 +73,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ - if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; -- cgit v1.2.3 From 70eabf0e1b8fe11519f793416655266605f700b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:07 -0700 Subject: tcp: use tcp_jiffies32 for rcv_tstamp and lrcvtime Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5f6f219a431e..9e0616cb8c17 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -451,7 +451,7 @@ void tcp_retransmit_timer(struct sock *sk) tp->snd_una, tp->snd_nxt); } #endif - if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { tcp_write_err(sk); goto out; } -- cgit v1.2.3 From c74df29a8d119a09ccc5e50265e3383c76278f3d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:08 -0700 Subject: tcp: use tcp_jiffies32 to feed probe_timestamp Use tcp_jiffies32 instead of tcp_time_stamp, since tcp_time_stamp will soon be only used for TCP TS option. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 9e0616cb8c17..6629f47aa7f0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -115,7 +115,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; - icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); -- cgit v1.2.3 From 9a568de4818dea9a05af141046bd3e589245ab83 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2017 14:00:14 -0700 Subject: tcp: switch TCP TS option (RFC 7323) to 1ms clock TCP Timestamps option is defined in RFC 7323 Traditionally on linux, it has been tied to the internal 'jiffies' variable, because it had been a cheap and good enough generator. For TCP flows on the Internet, 1 ms resolution would be much better than 4ms or 10ms (HZ=250 or HZ=100 respectively) For TCP flows in the DC, Google has used usec resolution for more than two years with great success [1] Receive size autotuning (DRS) is indeed more precise and converges faster to optimal window size. This patch converts tp->tcp_mstamp to a plain u64 value storing a 1 usec TCP clock. This choice will allow us to upstream the 1 usec TS option as discussed in IETF 97. [1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6629f47aa7f0..27a667bce806 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -153,8 +153,8 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int timeout, bool syn_set) { - unsigned int linear_backoff_thresh, start_ts; unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -172,7 +172,7 @@ static bool retransmits_timed_out(struct sock *sk, timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; } - return (tcp_time_stamp - start_ts) >= timeout; + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); } /* A write timeout has occurred. Process the after effects. */ @@ -341,7 +341,7 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; @@ -561,7 +561,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } - skb_mstamp_get(&tcp_sk(sk)->tcp_mstamp); + tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { -- cgit v1.2.3 From 4ab688793e086ef6d1744a0f803fe9770a1ae5d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 21 May 2017 10:39:00 -0700 Subject: tcp: fix tcp_probe_timer() for TCP_USER_TIMEOUT TCP_USER_TIMEOUT is still converted to jiffies value in icsk_user_timeout So we need to make a conversion for the cases HZ != 1000 Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 27a667bce806..c4a35ba7f8ed 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -341,7 +341,8 @@ static void tcp_probe_timer(struct sock *sk) if (!start_ts) tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) + (s32)(tcp_time_stamp(tp) - start_ts) > + jiffies_to_msecs(icsk->icsk_user_timeout)) goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; -- cgit v1.2.3 From ce682ef6e3e019f98cafbdc7058668e0ea8f4a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 May 2017 12:38:35 -0700 Subject: tcp: fix TCP_SYNCNT flakes After the mentioned commit, some of our packetdrill tests became flaky. TCP_SYNCNT socket option can limit the number of SYN retransmits. retransmits_timed_out() has to compare times computations based on local_clock() while timers are based on jiffies. With NTP adjustments and roundings we can observe 999 ms delay for 1000 ms timers. We end up sending one extra SYN packet. Gimmick added in commit 6fa12c850314 ("Revert Backoff [v3]: Calculate TCP's connection close threshold as a time value") makes no real sense for TCP_SYN_SENT sockets where no RTO backoff can happen at all. Lets use a simpler logic for TCP_SYN_SENT sockets and remove @syn_set parameter from retransmits_timed_out() Fixes: 9a568de4818d ("tcp: switch TCP TS option (RFC 7323) to 1ms clock") Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net/ipv4/tcp_timer.c') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c4a35ba7f8ed..c0feeeef962a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -139,21 +139,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. - * @syn_set: true if the SYN Bit was set. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. - * + * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, - unsigned int timeout, - bool syn_set) + unsigned int timeout) { - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + const unsigned int rto_base = TCP_RTO_MIN; unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) @@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + bool expired, do_reset; int retry_until; - bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { @@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk) sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; - syn_set = true; + expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts after @@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until, 0, 0); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } + expired = retransmits_timed_out(sk, retry_until, + icsk->icsk_user_timeout); } - - if (retransmits_timed_out(sk, retry_until, - syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -540,7 +536,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; -- cgit v1.2.3