diff options
author | Gerrit Renker <gerrit@erg.abdn.ac.uk> | 2008-09-09 13:27:22 +0200 |
---|---|---|
committer | Gerrit Renker <gerrit@erg.abdn.ac.uk> | 2008-09-09 13:27:22 +0200 |
commit | 410e27a49bb98bc7fa3ff5fc05cc313817b9f253 (patch) | |
tree | 88bb1fcf84f9ebfa4299c9a8dcd9e6330b358446 /net/dccp/ccids | |
parent | 0a68a20cc3eafa73bb54097c28b921147d7d3685 (diff) | |
download | linux-3.10-410e27a49bb98bc7fa3ff5fc05cc313817b9f253.tar.gz linux-3.10-410e27a49bb98bc7fa3ff5fc05cc313817b9f253.tar.bz2 linux-3.10-410e27a49bb98bc7fa3ff5fc05cc313817b9f253.zip |
This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp"
as it accentally contained the wrong set of patches. These will be
submitted separately.
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Diffstat (limited to 'net/dccp/ccids')
-rw-r--r-- | net/dccp/ccids/Kconfig | 30 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.c | 622 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.h | 63 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.c | 762 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.h | 153 | ||||
-rw-r--r-- | net/dccp/ccids/lib/loss_interval.c | 30 | ||||
-rw-r--r-- | net/dccp/ccids/lib/loss_interval.h | 4 | ||||
-rw-r--r-- | net/dccp/ccids/lib/packet_history.c | 282 | ||||
-rw-r--r-- | net/dccp/ccids/lib/packet_history.h | 78 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc.h | 16 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc_equation.c | 29 |
11 files changed, 1116 insertions, 953 deletions
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index fb168be2cb4..12275943eab 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,8 +1,10 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on EXPERIMENTAL config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like)" + tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" def_tristate IP_DCCP + select IP_DCCP_ACKVEC ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, Multiplicative Decrease (AIMD) congestion control with behavior @@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly)" + tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" def_tristate IP_DCCP select IP_DCCP_TFRC_LIB ---help--- @@ -62,9 +64,9 @@ config IP_DCCP_CCID3 If in doubt, say M. -if IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" + depends on IP_DCCP_CCID3 ---help--- Enable CCID3-specific debugging messages. @@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -choice - prompt "Select method for measuring the packet size s" - default IP_DCCP_CCID3_MEASURE_S_AS_MPS - -config IP_DCCP_CCID3_MEASURE_S_AS_MPS - bool "Always use MPS in place of s" - ---help--- - This use is recommended as it is consistent with the initialisation - of X and suggested when s varies (rfc3448bis, (1) in section 4.1). -config IP_DCCP_CCID3_MEASURE_S_AS_AVG - bool "Use moving average" - ---help--- - An alternative way of tracking s, also supported by rfc3448bis. - This used to be the default for CCID-3 in previous kernels. -config IP_DCCP_CCID3_MEASURE_S_AS_MAX - bool "Track the maximum payload length" - ---help--- - An experimental method based on tracking the maximum packet size. -endchoice - config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 + depends on IP_DCCP_CCID3 && EXPERIMENTAL ---help--- Use higher lower bound for nofeedback timer expiration. @@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO The purpose of the nofeedback timer is to slow DCCP down when there is serious network congestion: experimenting with larger values should therefore not be performed on WANs. -endif # IP_DCCP_CCID3 config IP_DCCP_TFRC_LIB tristate diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa713227c66..9a430734530 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ -#include "../feat.h" + #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -34,8 +34,51 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) + +static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) +{ + int len = 0; + int pipe = 0; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; + + /* there is data in the chain */ + if (seqp != hctx->ccid2hctx_seqt) { + seqp = seqp->ccid2s_prev; + len++; + if (!seqp->ccid2s_acked) + pipe++; + + while (seqp != hctx->ccid2hctx_seqt) { + struct ccid2_seq *prev = seqp->ccid2s_prev; + + len++; + if (!prev->ccid2s_acked) + pipe++; + + /* packets are sent sequentially */ + BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, + prev->ccid2s_seq ) >= 0); + BUG_ON(time_before(seqp->ccid2s_sent, + prev->ccid2s_sent)); + + seqp = prev; + } + } + + BUG_ON(pipe != hctx->ccid2hctx_pipe); + ccid2_pr_debug("len of chain=%d\n", len); + + do { + seqp = seqp->ccid2s_prev; + len++; + } while (seqp != hctx->ccid2hctx_seqh); + + ccid2_pr_debug("total len=%d\n", len); + BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); +} #else #define ccid2_pr_debug(format, a...) +#define ccid2_hc_tx_check_sanity(hctx) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) @@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) + if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / + sizeof(struct ccid2_seq*))) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->seqbufc == 0) - hctx->seqh = hctx->seqt = seqp; + if (hctx->ccid2hctx_seqbufc == 0) + hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->seqh; + hctx->ccid2hctx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hctx->ccid2hctx_seqh; - hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; + hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->seqbuf[hctx->seqbufc] = seqp; - hctx->seqbufc++; + hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; + hctx->ccid2hctx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + return 0; + + return 1; /* XXX CCID should dequeue when ready instead of polling */ } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > DCCPF_ACK_RATIO_MAX) - val = DCCPF_ACK_RATIO_MAX; + if (val > 0xFFFF) /* RFC 4340, 11.3 */ + val = 0xFFFF; if (val == dp->dccps_l_ack_ratio) return; @@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } +static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) +{ + ccid2_pr_debug("change SRTT to %ld\n", val); + hctx->ccid2hctx_srtt = val; +} + +static void ccid2_start_rto_timer(struct sock *sk); + static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); + long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); + ccid2_hc_tx_check_sanity(hctx); + /* back-off timer */ - hctx->rto <<= 1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; + hctx->ccid2hctx_rto <<= 1; + + s = hctx->ccid2hctx_rto / HZ; + if (s > 60) + hctx->ccid2hctx_rto = 60 * HZ; + + ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - hctx->ssthresh = hctx->cwnd / 2; - if (hctx->ssthresh < 2) - hctx->ssthresh = 2; - hctx->cwnd = 1; - hctx->pipe = 0; + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; + if (hctx->ccid2hctx_ssthresh < 2) + hctx->ccid2hctx_ssthresh = 2; + hctx->ccid2hctx_cwnd = 1; + hctx->ccid2hctx_pipe = 0; /* clear state about stuff we sent */ - hctx->seqt = hctx->seqh; - hctx->packets_acked = 0; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; + hctx->ccid2hctx_packets_acked = 0; /* clear ack ratio state. */ - hctx->rpseq = 0; - hctx->rpdupack = -1; + hctx->ccid2hctx_rpseq = 0; + hctx->ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - /* restart backed-off timer */ - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + ccid2_hc_tx_check_sanity(hctx); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid2_start_rto_timer(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); + + BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + hctx->ccid2hctx_rto); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - hctx->pipe++; + hctx->ccid2hctx_pipe++; - hctx->seqh->ccid2s_seq = dp->dccps_gss; - hctx->seqh->ccid2s_acked = 0; - hctx->seqh->ccid2s_sent = jiffies; + hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; + hctx->ccid2hctx_seqh->ccid2s_acked = 0; + hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; - next = hctx->seqh->ccid2s_next; + next = hctx->ccid2hctx_seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->seqt) { + if (next == hctx->ccid2hctx_seqt) { if (ccid2_hc_tx_alloc_seq(hctx)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->seqh->ccid2s_next; - BUG_ON(next == hctx->seqt); + next = hctx->ccid2hctx_seqh->ccid2s_next; + BUG_ON(next == hctx->ccid2hctx_seqt); } - hctx->seqh = next; + hctx->ccid2hctx_seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, + hctx->ccid2hctx_pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->arsent++; + hctx->ccid2hctx_arsent++; /* We had an ack loss in this window... */ - if (hctx->ackloss) { - if (hctx->arsent >= hctx->cwnd) { - hctx->arsent = 0; - hctx->ackloss = 0; + if (hctx->ccid2hctx_ackloss) { + if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_arsent = 0; + hctx->ccid2hctx_ackloss = 0; } } else { /* No acks lost up to now... */ @@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->cwnd * hctx->cwnd / denom; + denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; - if (hctx->arsent >= denom) { + if (hctx->ccid2hctx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->arsent = 0; + hctx->ccid2hctx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->arsent = 0; /* or maybe set it to cwnd*/ + hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif /* setup RTO timer */ - if (!timer_pending(&hctx->rtotimer)) - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + if (!timer_pending(&hctx->ccid2hctx_rtotimer)) + ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->seqt; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; - while (seqp != hctx->seqh) { + while (seqp != hctx->ccid2hctx_seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); @@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); + ccid2_hc_tx_check_sanity(hctx); #endif } -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. +/* XXX Lame code duplication! + * returns -1 if none was found. + * else returns the next offset to use in the function call. */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, + unsigned char **vec, unsigned char *veclen) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hctx->srtt == 0) { - /* First measurement m */ - hctx->srtt = m << 3; - hctx->mdev = m << 1; - - hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); - hctx->rttvar = hctx->mdev_max; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hctx->srtt >> 3); - hctx->srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hctx->mdev >> 2); + const struct dccp_hdr *dh = dccp_hdr(skb); + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + unsigned char opt, len; + unsigned char *value; + + BUG_ON(offset < 0); + options += offset; + opt_ptr = options; + if (opt_ptr >= opt_end) + return -1; + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). + * Remove the type and len fields, leaving + * just the value size */ - if (m > 0) - m >>= 3; - } else { - m -= (hctx->mdev >> 2); - } - hctx->mdev += m; + len -= 2; + value = opt_ptr; + opt_ptr += len; - if (hctx->mdev > hctx->mdev_max) { - hctx->mdev_max = hctx->mdev; - if (hctx->mdev_max > hctx->rttvar) - hctx->rttvar = hctx->mdev_max; + if (opt_ptr > opt_end) + goto out_invalid_option; } - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe, AWL is probably - * too low as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { - if (hctx->mdev_max < hctx->rttvar) - hctx->rttvar -= (hctx->rttvar - - hctx->mdev_max) >> 2; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - hctx->mdev_max = TCP_RTO_MIN; + switch (opt) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + *vec = value; + *veclen = len; + return offset + (opt_ptr - options); } } - /* - * Set RTO from SRTT and RTTVAR - * Clock granularity is ignored since the minimum error for RTTVAR is - * clamped to 50msec (corresponding to HZ=20). This leads to a minimum - * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP - * does not retransmit data, DCCP does not require TCP's recommended - * minimum timeout of one second". - */ - hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; + return -1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; +out_invalid_option: + DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); + return -1; } -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->cwnd < hctx->ssthresh) { - if (*maxincr > 0 && ++hctx->packets_acked == 2) { - hctx->cwnd += 1; - *maxincr -= 1; - hctx->packets_acked = 0; - } - } else if (++hctx->packets_acked >= hctx->cwnd) { - hctx->cwnd += 1; - hctx->packets_acked = 0; - } - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps - need to be resolved at some time. - */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); + ccid2_pr_debug("deleted RTO timer\n"); } -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +static inline void ccid2_new_ack(struct sock *sk, + struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { + hctx->ccid2hctx_cwnd += 1; + *maxincr -= 1; + hctx->ccid2hctx_packets_acked = 0; + } + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_cwnd += 1; + hctx->ccid2hctx_packets_acked = 0; } - hctx->last_cong = jiffies; + /* update RTO */ + if (hctx->ccid2hctx_srtt == -1 || + time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { + unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; + int s; + + /* first measurement */ + if (hctx->ccid2hctx_srtt == -1) { + ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", + r, jiffies, + (unsigned long long)seqp->ccid2s_seq); + ccid2_change_srtt(hctx, r); + hctx->ccid2hctx_rttvar = r >> 1; + } else { + /* RTTVAR */ + long tmp = hctx->ccid2hctx_srtt - r; + long srtt; + + if (tmp < 0) + tmp *= -1; + + tmp >>= 2; + hctx->ccid2hctx_rttvar *= 3; + hctx->ccid2hctx_rttvar >>= 2; + hctx->ccid2hctx_rttvar += tmp; + + /* SRTT */ + srtt = hctx->ccid2hctx_srtt; + srtt *= 7; + srtt >>= 3; + tmp = r >> 3; + srtt += tmp; + ccid2_change_srtt(hctx, srtt); + } + s = hctx->ccid2hctx_rttvar << 2; + /* clock granularity is 1 when based on jiffies */ + if (!s) + s = 1; + hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; + + /* must be at least a second */ + s = hctx->ccid2hctx_rto / HZ; + /* DCCP doesn't require this [but I like it cuz my code sux] */ +#if 1 + if (s < 1) + hctx->ccid2hctx_rto = HZ; +#endif + /* max 60 seconds */ + if (s > 60) + hctx->ccid2hctx_rto = HZ * 60; - hctx->cwnd = hctx->cwnd / 2 ? : 1U; - hctx->ssthresh = max(hctx->cwnd, 2U); + hctx->ccid2hctx_lastrtt = jiffies; - /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) - ccid2_change_l_ack_ratio(sk, hctx->cwnd); + ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", + hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, + hctx->ccid2hctx_rto, HZ, r); + } + + /* we got a new ack, so re-start RTO timer */ + ccid2_hc_tx_kill_rto_timer(sk); + ccid2_start_rto_timer(sk); } -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); + if (hctx->ccid2hctx_pipe == 0) + DCCP_BUG("pipe == 0"); + else + hctx->ccid2hctx_pipe--; + + if (hctx->ccid2hctx_pipe == 0) + ccid2_hc_tx_kill_rto_timer(sk); +} + +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); + return; } - return 0; + + hctx->ccid2hctx_last_cong = jiffies; + + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); - struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; + unsigned char *vector; + unsigned char veclen; + int offset = 0; int done = 0; unsigned int maxincr = 0; + ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. */ /* need to bootstrap */ - if (hctx->rpdupack == -1) { - hctx->rpdupack = 0; - hctx->rpseq = seqno; + if (hctx->ccid2hctx_rpdupack == -1) { + hctx->ccid2hctx_rpdupack = 0; + hctx->ccid2hctx_rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) - hctx->rpseq = seqno; + if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) + hctx->ccid2hctx_rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->rpseq)) { - hctx->rpdupack++; + else if (after48(seqno, hctx->ccid2hctx_rpseq)) { + hctx->ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->rpdupack >= NUMDUPACK) { - hctx->rpdupack = -1; /* XXX lame */ - hctx->rpseq = 0; + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { + hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ + hctx->ccid2hctx_rpseq = 0; ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } @@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) + /* still didn't send out new data packets */ + if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) return; - /* still didn't send out new data packets */ - if (hctx->seqh == hctx->seqt) - goto done; + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + case DCCP_PKT_DATAACK: + break; + default: + return; + } ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->high_ack)) - hctx->high_ack = ackno; + if (after48(ackno, hctx->ccid2hctx_high_ack)) + hctx->ccid2hctx_high_ack = ackno; - seqp = hctx->seqt; + seqp = hctx->ccid2hctx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->cwnd < hctx->ssthresh) + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - list_for_each_entry(avp, &hctx->av_chunks, node) { + while ((offset = ccid2_ackvector(sk, skb, offset, + &vector, &veclen)) != -1) { /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); + while (veclen--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl = SUB48(ackno, rl); - ccid2_pr_debug("ackvec %llu |%u,%u|\n", + ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); + (unsigned long long)ackno_end_rl); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. */ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); + const u8 state = *vector & + DCCP_ACKVEC_STATE_MASK; /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) + if (state == + DCCP_ACKVEC_STATE_ECN_MARKED) { ccid2_congestion_event(sk, seqp); - else + } else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); + vector++; } if (done) break; @@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->seqt; - while (before48(seqp->ccid2s_seq, hctx->high_ack)) { + seqp = hctx->ccid2hctx_seqt; + while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } @@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } - hctx->seqt = last_acked; + hctx->ccid2hctx_seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->seqt != hctx->seqh) { - if (!hctx->seqt->ccid2s_acked) + while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { + if (!hctx->ccid2hctx_seqt->ccid2s_acked) break; - hctx->seqt = hctx->seqt->ccid2s_next; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; } - /* restart RTO timer if not all outstanding data has been acked */ - if (hctx->pipe == 0) - sk_stop_timer(sk, &hctx->rtotimer); - else - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - dccp_ackvec_parsed_cleanup(&hctx->av_chunks); + ccid2_hc_tx_check_sanity(hctx); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ssthresh = ~0U; + hctx->ccid2hctx_ssthresh = ~0U; - /* Use larger initial windows (RFC 3390, rfc2581bis) */ - hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + /* + * RFC 4341, 5: "The cwnd parameter is initialized to at most four + * packets for new connections, following the rules from [RFC3390]". + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. + */ + hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; @@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->rto = DCCP_TIMEOUT_INIT; - hctx->rpdupack = -1; - hctx->last_cong = jiffies; - setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - INIT_LIST_HEAD(&hctx->av_chunks); + hctx->ccid2hctx_rto = 3 * HZ; + ccid2_change_srtt(hctx, -1); + hctx->ccid2hctx_rttvar = -1; + hctx->ccid2hctx_rpdupack = -1; + hctx->ccid2hctx_last_cong = jiffies; + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); + + ccid2_hc_tx_check_sanity(hctx); return 0; } @@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); int i; - sk_stop_timer(sk, &hctx->rtotimer); + ccid2_hc_tx_kill_rto_timer(sk); - for (i = 0; i < hctx->seqbufc; i++) - kfree(hctx->seqbuf[i]); - hctx->seqbufc = 0; + for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) + kfree(hctx->ccid2hctx_seqbuf[i]); + hctx->ccid2hctx_seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: - hcrx->data++; - if (hcrx->data >= dp->dccps_r_ack_ratio) { + hcrx->ccid2hcrx_data++; + if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { dccp_send_ack(sk); - hcrx->data = 0; + hcrx->ccid2hcrx_data = 0; } break; } } static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_owner = THIS_MODULE, + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 8b7a2dee2f6..2c94ca02901 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,49 +42,34 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @srtt: smoothed RTT estimate, scaled by 2^3 - * @mdev: smoothed RTT variation, scaled by 2^2 - * @mdev_max: maximum of @mdev during one flight - * @rttvar: moving average/maximum of @mdev_max - * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @rtt_seq: to decay RTTVAR at most once per flight - * @rpseq: last consecutive seqno - * @rpdupack: dupacks since rpseq - * @av_chunks: list of Ack Vectors received on current skb - */ + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) + * @ccid2hctx_lastrtt -time RTT was last measured + * @ccid2hctx_rpseq - last consecutive seqno + * @ccid2hctx_rpdupack - dupacks since rpseq +*/ struct ccid2_hc_tx_sock { - u32 cwnd; - u32 ssthresh; - u32 pipe; - u32 packets_acked; - struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; - int seqbufc; - struct ccid2_seq *seqh; - struct ccid2_seq *seqt; - /* RTT measurement: variables/principles are the same as in TCP */ - u32 srtt, - mdev, - mdev_max, - rttvar, - rto; - u64 rtt_seq:48; - struct timer_list rtotimer; - u64 rpseq; - int rpdupack; - unsigned long last_cong; - u64 high_ack; - struct list_head av_chunks; + u32 ccid2hctx_cwnd; + u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; + u32 ccid2hctx_packets_acked; + struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; + int ccid2hctx_seqbufc; + struct ccid2_seq *ccid2hctx_seqh; + struct ccid2_seq *ccid2hctx_seqt; + long ccid2hctx_rto; + long ccid2hctx_srtt; + long ccid2hctx_rttvar; + unsigned long ccid2hctx_lastrtt; + struct timer_list ccid2hctx_rtotimer; + u64 ccid2hctx_rpseq; + int ccid2hctx_rpdupack; + unsigned long ccid2hctx_last_cong; + u64 ccid2hctx_high_ack; }; -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) -{ - return (hctx->pipe >= hctx->cwnd); -} - struct ccid2_hc_rx_sock { - int data; + int ccid2hcrx_data; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 06cfdad84a6..3b8bd7ca676 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,41 +49,75 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ -/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ -static int do_osc_prev = true; +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} /* * Compute the initial sending rate X_init in the manner of RFC 3390: * - * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const u32 mps = dccp_sk(sk)->dccps_mss_cache, - w_init = clamp(4380U, 2 * mps, 4 * mps); + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u32 w_init = clamp_t(__u32, 4380U, + 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); } -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X - * This respects the granularity of X (64 * bytes/second) and enforces the - * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. +/* + * Recalculate t_ipi and delta (should be called whenever X changes) */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { - if (unlikely(hctx->x <= hctx->s)) - hctx->x = hctx->s; - hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_x); + + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); + + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count); + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); - return delta / hctx->rtt; + return delta / hctx->ccid3hctx_rtt; } /** @@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - u64 min_rate = 2 * hctx->x_recv; - const u64 old_x = hctx->x; + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; + const __u64 old_x = hctx->ccid3hctx_x; ktime_t now = stamp ? *stamp : ktime_get_real(); /* @@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hctx->x_recv); + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); } - if (hctx->p > 0) { + if (hctx->ccid3hctx_p > 0) { - hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, + min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); - } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { + } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) + - (s64)hctx->ccid3hctx_rtt >= 0) { - hctx->x = min(2 * hctx->x, min_rate); - hctx->x = max(hctx->x, - scaled_div(((u64)hctx->s) << 6, hctx->rtt)); - hctx->t_ld = now; + hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = now; } - if (hctx->x != old_x) { + if (hctx->ccid3hctx_x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned)(old_x >> 6), - (unsigned)(hctx->x >> 6), hctx->x_calc, - (unsigned)(hctx->x_recv >> 6)); + (unsigned)(hctx->ccid3hctx_x >> 6), + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); ccid3_update_send_interval(hctx); } } /* - * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) - * @new_len: DCCP payload size in bytes (not used by all methods) + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes */ -static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { -#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) - return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); -#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) - return max(ccid3_hc_tx_sk(sk)->s, new_len); -#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ - return dccp_sk(sk)->dccps_mss_cache; -#endif + const u16 old_s = hctx->ccid3hctx_s; + + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); + + if (hctx->ccid3hctx_s != old_s) + ccid3_update_send_interval(hctx); } /* @@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count), - quarter_rtts = (4 * delta) / hctx->rtt; + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; if (quarter_rtts > 0) { - hctx->t_last_win_count = now; - hctx->last_win_count += min(quarter_rtts, 5U); - hctx->last_win_count &= 0xF; /* mod 16 */ + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ } } @@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto restart_timer; } - ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, - hctx->feedback ? "" : "out"); + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) goto out; - /* Reset feedback state to "no feedback received" */ - hctx->feedback = false; - /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. */ - if (hctx->t_rto == 0 || hctx->p == 0) { + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx->ccid3hctx_p == 0) { /* halve send rate directly */ - hctx->x /= 2; + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); ccid3_update_send_interval(hctx); - } else { /* * Modify the cached value of X_recv @@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->p && !hctx->x_calc); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - if (hctx->x_calc > (hctx->x_recv >> 5)) - hctx->x_recv /= 2; + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); else { - hctx->x_recv = hctx->x_calc; - hctx->x_recv <<= 4; + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hctx->x); + (unsigned long long)hctx->ccid3hctx_x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); restart_timer: - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @skb: next packet candidate to send on @sk - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. +/* + * returns + * > 0: delay (in msecs) that should pass before actually sending + * = 0: can send immediately + * < 0: error condition; do not send packet */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - if (hctx->s == 0) { - sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hctx->last_win_count = 0; - hctx->t_last_win_count = now; + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; /* Set t_0 for initial packet */ - hctx->t_nom = now; + hctx->ccid3hctx_t_nom = now; + + hctx->ccid3hctx_s = skb->len; /* * Use initial RTT sample when available: recommended by erratum @@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hctx->rtt = dp->dccps_syn_rtt; - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; } else { /* * Sender does not have RTT sample: @@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ - hctx->rtt = DCCP_FALLBACK_RTT; - hctx->x = dp->dccps_mss_cache; - hctx->x <<= 6; + hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_x <<= 6; } - - /* Compute t_ipi = s / X */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); - /* Seed value for Oscillation Prevention (sec. 4.5) */ - hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); - - } else { - delay = ktime_us_delta(hctx->t_nom, now); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] @@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; + if (delay - (s64)hctx->ccid3hctx_delta >= 1000) + return (u32)delay / 1000L; ccid3_hc_tx_update_win_count(hctx, now); + break; + case TFRC_SSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + return -EINVAL; } /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; /* set the nominal send time for the next following packet */ - hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); - return CCID_PACKET_SEND_AT_ONCE; + hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, + unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - /* Changes to s will become effective the next time X is computed */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); + ccid3_hc_tx_update_s(hctx, len); - if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; + struct ccid3_options_received *opt_recv; ktime_t now; unsigned long t_nfb; - u32 r_sample; + u32 pinv, r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) + /* ... and only in the established state */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + now = ktime_get_real(); + + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); + } - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = scaled_div(1, pinv); + /* + * Validate new RTT sample and update moving average + */ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (!hctx->feedback) { - hctx->feedback = true; + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - if (hctx->t_rto == 0) { + if (hctx->ccid3hctx_t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; ccid3_update_send_interval(hctx); goto done_computing_x; - } else if (hctx->p == 0) { + } else if (hctx->ccid3hctx_p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ @@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->p > 0) - hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hctx->rtt, r_sample, - hctx->s, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6)); - /* - * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to - * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This - * can be useful if few connections share a link, avoiding that buffer - * fill levels (RTT) oscillate as a result of frequent adjustments to X. - * A useful presentation with background information is in - * Joerg Widmer, "Equation-Based Congestion Control", - * MSc Thesis, University of Mannheim, Germany, 2000 - * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). - */ - if (do_osc_prev) { - r_sample = tfrc_scaled_sqrt(r_sample); - /* - * The modulation can work in both ways: increase/decrease t_ipi - * according to long-term increases/decreases of the RTT. The - * former is a useful measure, since it works against queue - * build-up. The latter temporarily increases the sending rate, - * so that buffers fill up more quickly. This in turn causes - * the RTT to increase, so that either later reduction becomes - * necessary or the RTT stays at a very high level. Decreasing - * t_ipi is therefore not supported. - * Furthermore, during the initial slow-start phase the RTT - * naturally increases, where using the algorithm would cause - * delays. Hence it is disabled during the initial slow-start. - */ - if (r_sample > hctx->r_sqmean && hctx->p > 0) - hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, - hctx->r_sqmean); - hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); - /* update R_sqmean _after_ computing the modulation factor */ - hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); - } + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->no_feedback_timer); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible @@ -453,66 +488,95 @@ done_computing_x: * This can help avoid triggering the nofeedback timer too * often ('spinning') on LANs with small RTTs. */ - hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, + unsigned char *value) { + int rc = 0; + const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + struct ccid3_options_received *opt_recv; __be32 opt_val; - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); + opt_recv = &hctx->ccid3hctx_options_received; - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hctx->x_recv = opt_val; - hctx->x_recv <<= 6; + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; } else { - /* Update the fixpoint Loss Event Rate fraction */ - hctx->p = tfrc_invert_loss_event_rate(opt_val); - + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; + } else { + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_receive_rate = ntohl(opt_val); + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; } - return 0; + + return rc; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->hist = NULL; - setup_timer(&hctx->no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + hctx->ccid3hctx_hist = NULL; + setup_timer(&hctx->ccid3hctx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + return 0; } @@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->hist); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; + struct ccid3_hc_tx_sock *hctx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hctx = ccid3_hc_tx_sk(sk); + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; + const struct ccid3_hc_tx_sock *hctx; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) + if (len < sizeof(hctx->ccid3hctx_tfrc)) return -EINVAL; - tfrc.tfrctx_x = hctx->x; - tfrc.tfrctx_x_recv = hctx->x_recv; - tfrc.tfrctx_x_calc = hctx->x_calc; - tfrc.tfrctx_rtt = hctx->rtt; - tfrc.tfrctx_p = hctx->p; - tfrc.tfrctx_rto = hctx->t_rto; - tfrc.tfrctx_ipi = hctx->t_ipi; - len = sizeof(tfrc); - val = &tfrc; + len = sizeof(hctx->ccid3hctx_tfrc); + val = &hctx->ccid3hctx_tfrc; break; default: return -ENOPROTOOPT; @@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + ktime_t now; + s64 delta = 0; + + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; + + now = ktime_get_real(); switch (fbtype) { case CCID3_FBACK_INITIAL: - hcrx->x_recv = 0; - hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ + hcrx->ccid3hcrx_x_recv = 0; + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { - /* - * rfc3448bis-06, 6.3.1: First packet(s) lost or marked - * FIXME: in rfc3448bis the receiver returns X_recv=0 - * here as it normally would in the first feedback packet. - * However this is not possible yet, since the code still - * uses RFC 3448, i.e. - * If (p > 0) - * Calculate X_calc using the TCP throughput equation. - * X = max(min(X_calc, 2*X_recv), s/t_mbi); - * would bring X down to s/t_mbi. That is why we return - * X_recv according to rfc3448bis-06 for the moment. - */ - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist); - - hcrx->x_recv = scaled_div32(s, 2 * rtt); - break; - } /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * always check whether at least RTT time units were covered. + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. */ - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - break; + if (hcrx->ccid3hcrx_x_recv > 0) + break; + /* fall through */ case CCID3_FBACK_PERIODIC: - /* - * Step (2) of rfc3448bis-06, 6.2: - * - if no data packets have been received, just restart timer - * - if data packets have been received, re-compute X_recv - */ - if (hcrx->hist.bytes_recvd == 0) - goto prepare_for_next_time; - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; default: return; } - ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); - dccp_sk(sk)->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); + hcrx->ccid3hcrx_tstamp_last_feedback = now; + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->ccid3hcrx_bytes_recv = 0; -prepare_for_next_time: - tfrc_rx_hist_restart_byte_counter(&hcrx->hist); - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->feedback = fbtype; + dp->dccps_hc_rx_insert_options = 1; + dccp_send_ack(sk); } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; + hcrx = ccid3_hc_rx_sk(sk); + if (dccp_packet_without_ack(skb)) return 0; - x_recv = htonl(hcrx->x_recv); - pinv = htonl(hcrx->p_inverse); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || @@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; + u32 x_recv, p, delta; u64 fval; - /* - * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p - * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p - * is about 20.64%. This yields an interval length of 4.84 (rounded up). - */ - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) - return 5; + if (hcrx->ccid3hcrx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + } - x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - if (x_recv == 0) - goto failed; + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; + } + } - fval = scaled_div32(scaled_div(s, rtt), x_recv); + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - if (p > 0) - return scaled_div(1, p); -failed: - return UINT_MAX; + return p == 0 ? ~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hcrx->ccid3hcrx_s = payload; + /* + * Not necessary to update ccid3hcrx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } + + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + return; /* done receiving */ + + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); + hcrx->ccid3hcrx_bytes_recv += payload; + } + /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); + if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) + return; /* done receiving */ + /* - * Feedback for first non-empty data packet (RFC 3448, 6.3) + * Handle data packets: RTT sampling and monitoring p */ - else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); + if (unlikely(!is_data_packet)) + goto update_records; + + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. + */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; + } + /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; + +update_records: + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); + +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - tfrc_lh_init(&hcrx->li_hist); - return tfrc_rx_hist_init(&hcrx->hist, sk); + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - tfrc_rx_hist_purge(&hcrx->hist); - tfrc_lh_cleanup(&hcrx->li_hist); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { + const struct ccid3_hc_rx_sock *hcrx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hcrx = ccid3_hc_rx_sk(sk); + info->tcpi_ca_state = hcrx->ccid3hcrx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; struct tfrc_rx_info rx_info; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; - rx_info.tfrcrx_x_recv = hcrx->x_recv; - rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx->ccid3hcrx_pinv); len = sizeof(rx_info); val = &rx_info; break; @@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = { .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; -module_param(do_osc_prev, bool, 0644); -MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); @@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - struct timespec tp; - - /* - * Without a fine-grained clock resolution, RTTs/X_recv are not sampled - * correctly and feedback is sent either too early or too late. - */ - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { - printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" - " resolution - check your clocksource.\n", __func__, - tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); - return -ESOCKTNOSUPPORT; - } return ccid_register(&ccid3); } module_init(ccid3_module_init); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index af6e1bf937d..49ca32bd7e7 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,22 +47,11 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ -#define TFRC_T_MBI (64 * USEC_PER_SEC) +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -/* - * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. -#endif +/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ +#define TFRC_T_MBI 64 enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, @@ -70,43 +59,62 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * - * @x - Current sending rate in 64 * bytes per second - * @x_recv - Receive rate in 64 * bytes per second - * @x_calc - Calculated rate in bytes per second - * @rtt - Estimate of current round trip time in usecs - * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) - * @p - Current loss event rate (0-1) scaled by 1000000 - * @s - Packet size in bytes - * @t_rto - Nofeedback Timer setting in usecs - * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @feedback - Whether feedback has been received or not - * @last_win_count - Last window counter sent - * @t_last_win_count - Timestamp of earliest packet with - * last_win_count value sent - * @no_feedback_timer - Handle to no feedback timer - * @t_ld - Time last doubled during slow start - * @t_nom - Nominal send time of next packet - * @hist - Packet history + * @ccid3hctx_x - Current sending rate in 64 * bytes per second + * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second + * @ccid3hctx_x_calc - Calculated rate in bytes per second + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_s - Packet size in bytes + * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs + * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs + * @ccid3hctx_hist - Packet history + * @ccid3hctx_options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - u64 x; - u64 x_recv; - u32 x_calc; - u32 rtt; - u16 r_sqmean; - u32 p; - u32 t_rto; - u32 t_ipi; - u16 s; - bool feedback:1; - u8 last_win_count; - ktime_t t_last_win_count; - struct timer_list no_feedback_timer; - ktime_t t_ld; - ktime_t t_nom; - struct tfrc_tx_hist_entry *hist; + struct tfrc_tx_info ccid3hctx_tfrc; +#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x +#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv +#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc +#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt +#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p +#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto +#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi + u16 ccid3hctx_s; + enum ccid3_hc_tx_states ccid3hctx_state:8; + u8 ccid3hctx_last_win_count; + ktime_t ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + ktime_t ccid3hctx_t_ld; + ktime_t ccid3hctx_t_nom; + u32 ccid3hctx_delta; + struct tfrc_tx_hist_entry *ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) @@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) return hctx; } - -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @last_counter - Tracks window counter (RFC 4342, 8.1) - * @feedback - The type of the feedback last sent - * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @tstamp_last_feedback - Time at which last feedback was sent - * @hist - Packet history (loss detection + RTT sampling) - * @li_hist - Loss Interval database - * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) + * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT + * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent + * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) + * @ccid3hcrx_li_hist - Loss Interval database + * @ccid3hcrx_s - Received packet size in bytes + * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { - u8 last_counter:4; - enum ccid3_fback_type feedback:4; - u32 x_recv; - ktime_t tstamp_last_feedback; - struct tfrc_rx_hist hist; - struct tfrc_loss_hist li_hist; -#define p_inverse li_hist.i_mean + u8 ccid3hcrx_last_counter:4; + enum ccid3_hc_rx_states ccid3hcrx_state:8; + u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; + ktime_t ccid3hcrx_tstamp_last_feedback; + struct tfrc_rx_hist ccid3hcrx_hist; + struct tfrc_loss_hist ccid3hcrx_li_hist; + u16 ccid3hcrx_s; +#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b1ae8f8259e..5b3ce0688c5 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * This updates I_mean as the sequence numbers increase. As a consequence, the - * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) - * decreases, and thus there is no need to send renewed feedback. + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ -void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; s64 len; if (cur == NULL) /* not initialised */ - return; - - /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ - if (!dccp_data_packet(skb)) - return; + return 0; len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return; + return 0; if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) /* @@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_is_closed = 1; if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return; + return 0; cur->li_length = len; tfrc_lh_calc_i_mean(lh); + + return (lh->i_mean < old_i_mean); } +EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, @@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @sk: Used by @calc_first_li in caller-specific way (subtyping) * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. */ -bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return false; + return 0; new = tfrc_lh_demand_next(lh); if (unlikely(new == NULL)) { DCCP_CRIT("Cannot allocate/add loss record."); - return false; + return 0; } new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; @@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, tfrc_lh_calc_i_mean(lh); } - return true; + return 1; } EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index d08a226db43..246018a3b26 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) struct tfrc_rx_hist; -extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); -extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index cce9f03bda3..6cc108afdc3 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -40,6 +40,18 @@ #include "packet_history.h" #include "../../dccp.h" +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ @@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void) } } +static struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + + return head; +} + int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, + const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet->stamp); + /* + * Garbage-collect older (irrelevant) entries: + */ + tfrc_tx_hist_purge(&packet->next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + + /* * Receiver History Routines */ @@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); - -static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - struct tfrc_rx_hist_entry *tmp = h->ring[a]; - - h->ring[a] = h->ring[b]; - h->ring[b] = tmp; -} - static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), - tfrc_rx_hist_index(h, b)); -} + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; -/** - * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling - * This is called after loss detection has finished, when the history entry - * with the index of `loss_count' holds the highest-received sequence number. - * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). - */ -static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) -{ - __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); - h->loss_count = h->loss_start = 0; + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; } /* @@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; - if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ h->loss_count = 1; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); + } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) @@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); @@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); @@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h) if (dccp_loss_free(s2, s3, n3)) { /* no gap between S2 and S3: entire hole is filled */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; } else { /* gap between S2 and S3 */ h->loss_start = tfrc_rx_hist_index(h, 2); @@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) } /** - * tfrc_rx_congestion_event - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. @@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * records accordingly, the caller should not perform any more RX history * operations when loss_count is greater than 0 after calling this function. */ -bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *), struct sock *sk) +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { - bool new_event = false; - - if (tfrc_rx_hist_duplicate(h, skb)) - return 0; + int is_new_loss = 0; if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); - tfrc_rx_hist_sample_rtt(h, skb); - tfrc_rx_hist_add_packet(h, skb, ndp); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, /* * Update Loss Interval database and recycle RX records */ - new_event = tfrc_lh_interval_add(lh, h, first_li, sk); + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); __three_after_loss(h); } - - /* - * Update moving-average of `s' and the sum of received payload bytes. - */ - if (dccp_data_packet(skb)) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - - h->packet_size = tfrc_ewma(h->packet_size, payload, 9); - h->bytes_recvd += payload; - } - - /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ - if (!new_event) - tfrc_lh_update_i_mean(lh, skb); - - return new_event; + return is_new_loss; } -EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); +EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); -/* Compute the sending rate X_recv measured between feedback intervals */ -u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { - u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; - s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); - - WARN_ON(delta <= 0); - /* - * Ensure that the sampling interval for X_recv is at least one RTT, - * by extending the sampling interval backwards in time, over the last - * R_(m-1) seconds, as per rfc3448bis-06, 6.2. - * To reduce noise (e.g. when the RTT changes often), this is only - * done when delta is smaller than RTT/2. - */ - if (last_x_recv > 0 && delta < last_rtt/2) { - tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", - (long)delta, (unsigned)last_rtt); + int i; - delta = (bytes ? delta : 0) + last_rtt; - bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; } - if (unlikely(bytes == 0)) { - DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); - return last_x_recv; + h->loss_count = h->loss_start = 0; + return 0; + +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; } - return scaled_div32(bytes, delta); + return -ENOBUFS; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); +EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { @@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); -static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) { - int i; - - memset(h, 0, sizeof(*h)); - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) { - tfrc_rx_hist_purge(h); - return -ENOBUFS; - } - } - return 0; + return h->ring[0]; } -int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) { - if (tfrc_rx_hist_alloc(h)) - return -ENOBUFS; - /* - * Initialise first entry with GSR to start loss detection as early as - * possible. Code using this must not use any other fields. The entry - * will be overwritten once the CCID updates its received packets. - */ - tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; - return 0; + return h->ring[h->rtt_sample_prev]; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss - * is pending and uses the following history entries (via rtt_sample_prev): - * - h->ring[0] contains the most recent history entry prior to @skb; - * - h->ring[1] is an unused `dummy' entry when the current difference is 0; + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. */ -void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - struct tfrc_rx_hist_entry *last = h->ring[0]; - u32 sample, delta_v; - - /* - * When not to sample: - * - on non-data packets - * (RFC 4342, 8.1: CCVal only fully defined for data packets); - * - when no data packets have been received yet - * (FIXME: using sampled packet size as indicator here); - * - as long as there are gaps in the sequence space (pending loss). - */ - if (!dccp_data_packet(skb) || h->packet_size == 0 || - tfrc_rx_hist_loss_pending(h)) - return; + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } - h->rtt_sample_prev = 0; /* reset previous candidate */ + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); - if (delta_v == 0) { /* less than RTT/4 difference */ - h->rtt_sample_prev = 1; - return; + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; } - sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); - if (delta_v <= 4) /* between RTT/4 and RTT */ - sample *= 4 / delta_v; - else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) - /* - * Optimisation: CCVal difference is greater than 1 RTT, yet the - * sample is less than the local RTT estimate; which means that - * the RTT estimate is too high. - * To avoid noise, it is not done if the sample is below RTT/2. - */ - return; + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: - /* Use a lower weight than usual to increase responsiveness */ - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); + return sample; } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 555e65cd73a..461cc91cce8 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,28 +40,12 @@ #include <linux/slab.h> #include "tfrc.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} +struct tfrc_tx_hist_entry; extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); +extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, + const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) @@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - * @rtt_estimate: Receiver RTT estimate - * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) - * @bytes_recvd: Number of bytes received since @bytes_start - * @bytes_start: Start time for counting @bytes_recvd */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; - /* Receiver RTT sampling */ #define rtt_sample_prev loss_start - u32 rtt_estimate; - /* Receiver sampling of application payload lengths */ - u32 packet_size, - bytes_recvd; - ktime_t bytes_start; }; /** @@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) return h->loss_count > 0; } -/* - * Accessor functions to retrieve parameters sampled by the RX history - */ -static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) -{ - if (h->packet_size == 0) { - DCCP_WARN("No sample for s, using fallback\n"); - return TCP_MIN_RCVMSS; - } - return h->packet_size; - -} -static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) -{ - if (h->rtt_estimate == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - return DCCP_FALLBACK_RTT; - } - return h->rtt_estimate; -} - -static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) -{ - h->bytes_recvd = 0; - h->bytes_start = ktime_get_real(); -} - -extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); - - extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); struct tfrc_loss_hist; -extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), - struct sock *sk); -extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, - const struct sk_buff *skb); -extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ede12f53de5..ed9857527ac 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -48,21 +48,6 @@ static inline u32 scaled_div32(u64 a, u64 b) } /** - * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 - * Uses scaling to improve accuracy of the integer approximation of sqrt(). The - * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for - * clamped RTT samples (dccp_sample_rtt). - * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the - * scaling factor is neutralised. For this purpose, it avoids returning zero. - */ -static inline u16 tfrc_scaled_sqrt(const u32 sample) -{ - const unsigned long non_zero_sample = sample ? : 1; - - return int_sqrt(non_zero_sample << 10); -} - -/** * tfrc_ewma - Exponentially weighted moving average * @weight: Weight to be used as damping factor, in units of 1/10 */ @@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 38239c4d5e1..2f20a29cffe 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - /* - * In the congestion-avoidance phase p decays towards 0 - * when there are no further losses, so this case is - * natural. Truncating to p_min = 0.01% means that the - * maximum achievable throughput is limited to about - * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. - * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. - */ - tfrc_pr_debug("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); + DCCP_WARN("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; @@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) result = scaled_div(s, R); return scaled_div32(result, f); } + EXPORT_SYMBOL_GPL(tfrc_calc_x); /** @@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } -EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} -EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); |