diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/ip_input.c | 14 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 30 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 8 | ||||
-rw-r--r-- | net/ipv4/udp.c | 171 |
6 files changed, 162 insertions, 81 deletions
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 324e7e0fdb2a..97069399d864 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -329,6 +329,7 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; /* * Initialise the virtual path cache for the packet. It describes @@ -340,6 +341,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -358,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; + rt = (struct rtable*)skb->dst; + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + return dst_input(skb); drop: @@ -414,7 +423,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto inhdr_error; len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl*4)) + if (skb->len < len) { + IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 534650cad3a8..d6427d918512 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -160,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2cf9a898ce50..d6e488668171 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1573,14 +1573,12 @@ void tcp_close(struct sock *sk, long timeout) sk_stream_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 051f0f815f17..7641b2761a14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -2642,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. @@ -2671,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } @@ -2697,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2710,10 +2707,19 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e70a6840cb64..0faacf9c419d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -2035,7 +2037,7 @@ void tcp_send_fin(struct sock *sk) /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cec0f2cc49b7..144970704c2c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +/* + * Note about this hash function : + * Typical use is probably daddr = 0, only dport is going to vary hash + */ +static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) +{ + addr ^= addr >> 16; + addr ^= addr >> 8; + return port ^ addr; +} + +static inline int __udp_lib_port_inuse(unsigned int hash, int port, + __be32 daddr, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; + struct inet_sock *inet; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + if (sk->sk_hash != hash) + continue; + inet = inet_sk(sk); + if (inet->num != port) + continue; + if (inet->rcv_saddr == daddr) return 1; + } return 0; } @@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_node *node; struct hlist_head *head; struct sock *sk2; + unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + if (! __udp_lib_port_inuse(hash, result, + inet_sk(sk)->rcv_saddr, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && + if (sk2->sk_hash == hash && sk2 != sk && + inet_sk(sk2)->num == snum && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && @@ -201,9 +228,9 @@ gotit: goto fail; } inet_sk(sk)->num = snum; - sk->sk_hash = snum; + sk->sk_hash = hash; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + unsigned int hash, hashwild; + int score, best = -1; + + hash = hash_port_and_addr(ntohs(dport), daddr); + hashwild = hash_port_and_addr(ntohs(dport), 0); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + +lookup: + + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 9) { - result = sk; - break; - } else if (score > badness) { - result = sk; - badness = score; - } + if (sk->sk_hash != hash || ipv6_only_sock(sk) || + inet->num != dport) + continue; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + goto found; + } else if (score > best) { + result = sk; + best = score; } } + + if (hash != hashwild) { + hash = hashwild; + goto lookup; + } +found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) +static inline struct sock *udp_v4_mcast_next( + struct sock *sk, + unsigned int hnum, __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) { struct hlist_node *node; struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || + inet->num != loc_port || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk; + struct sock *sk, *skw, *sknext; int dif; + unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr); + unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0); - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + read_lock(&udp_hash_lock); + + sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); + skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); + + sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { + hash = hashwild; + sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source, + saddr, dif); + } + if (sk) { do { struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, - uh->source, saddr, dif); + sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest, + daddr, uh->source, saddr, dif); + if (!sknext && hash != hashwild) { + hash = hashwild; + sknext = udp_v4_mcast_next(skw, hash, uh->dest, + daddr, uh->source, saddr, dif); + } if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ + /* + * we should probably re-process + * instead of dropping packets here. + */ kfree_skb(skb1); } sk = sknext; |