From 4eaa0e3c869acd5dbc7c2e3818a9ae9cbf221d27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Apr 2010 16:13:29 -0700 Subject: fib: suppress lockdep-RCU false positive in FIB trie. Followup of commit 634a4b20 Allow tnode_get_child_rcu() to be called either under rcu_read_lock() protection or with RTNL held. Signed-off-by: Eric Dumazet Signed-off-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 59a838795e3..c98f115fb0f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -209,7 +209,9 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { struct node *ret = tnode_get_child(tn, i); - return rcu_dereference(ret); + return rcu_dereference_check(ret, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); } static inline int tnode_child_length(const struct tnode *tn) -- cgit v1.2.3 From e30b38c298b55e09456d3ccbc1df2f3e2e8dc6e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Apr 2010 09:13:03 +0000 Subject: ip: Fix ip_dev_loopback_xmit() Eric Paris got following trace with a linux-next kernel [ 14.203970] BUG: using smp_processor_id() in preemptible [00000000] code: avahi-daemon/2093 [ 14.204025] caller is netif_rx+0xfa/0x110 [ 14.204035] Call Trace: [ 14.204064] [] debug_smp_processor_id+0x105/0x110 [ 14.204070] [] netif_rx+0xfa/0x110 [ 14.204090] [] ip_dev_loopback_xmit+0x71/0xa0 [ 14.204095] [] ip_mc_output+0x192/0x2c0 [ 14.204099] [] ip_local_out+0x20/0x30 [ 14.204105] [] ip_push_pending_frames+0x28d/0x3d0 [ 14.204119] [] udp_push_pending_frames+0x14c/0x400 [ 14.204125] [] udp_sendmsg+0x39c/0x790 [ 14.204137] [] inet_sendmsg+0x45/0x80 [ 14.204149] [] sock_sendmsg+0xf1/0x110 [ 14.204189] [] sys_sendmsg+0x20c/0x380 [ 14.204233] [] system_call_fastpath+0x16/0x1b While current linux-2.6 kernel doesnt emit this warning, bug is latent and might cause unexpected failures. ip_dev_loopback_xmit() runs in process context, preemption enabled, so must call netif_rx_ni() instead of netif_rx(), to make sure that we process pending software interrupt. Same change for ip6_dev_loopback_xmit() Reported-by: Eric Paris Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c65f18e0936..d1bcc9f21d4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -120,7 +120,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb) newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(newskb)); - netif_rx(newskb); + netif_rx_ni(newskb); return 0; } -- cgit v1.2.3 From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 6 +++--- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c5376c72550..5ca7290c2e6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -548,7 +548,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) { DEFINE_WAIT(wait); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. @@ -561,9 +561,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo) lock_sock(sk); if (signal_pending(current) || !timeo) break; - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return timeo; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8da6429269d..e0a3e3537b1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -234,7 +234,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. */ for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) @@ -253,7 +253,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) if (!timeo) break; } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); return err; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0f8caf64caa..77208334a61 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -378,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); -- cgit v1.2.3 From 0eae88f31ca2b88911ce843452054139e028771f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 19:06:52 -0700 Subject: net: Fix various endianness glitches Sparse can help us find endianness bugs, but we need to make some cleanups to be able to more easily spot real bugs. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 8 ++++---- net/ipv4/ipmr.c | 10 +++++----- net/ipv4/route.c | 29 ++++++++++++++--------------- net/ipv4/tcp.c | 15 ++++++++------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/udp.c | 8 ++++---- 7 files changed, 39 insertions(+), 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ca7290c2e6..9f52880fae1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1323,8 +1323,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto out_unlock; - id = ntohl(*(u32 *)&iph->id); - flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); + id = ntohl(*(__be32 *)&iph->id); + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); id >>= 16; for (p = *head; p; p = p->next) { @@ -1337,8 +1337,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if ((iph->protocol ^ iph2->protocol) | (iph->tos ^ iph2->tos) | - (iph->saddr ^ iph2->saddr) | - (iph->daddr ^ iph2->daddr)) { + ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | + ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7d8a2bcecb7..a2df5012a1d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1772,10 +1772,10 @@ int ip_mr_input(struct sk_buff *skb) vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { - int err = ipmr_cache_unresolved(mrt, vif, skb); + int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); - return err; + return err2; } read_unlock(&mrt_lock); kfree_skb(skb); @@ -2227,9 +2227,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) const struct ipmr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; - seq_printf(seq, "%08lX %08lX %-3hd", - (unsigned long) mfc->mfc_mcastgrp, - (unsigned long) mfc->mfc_origin, + seq_printf(seq, "%08X %08X %-3hd", + (__force u32) mfc->mfc_mcastgrp, + (__force u32) mfc->mfc_origin, mfc->mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cb562fdd9b9..a947428ef0a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -258,10 +258,9 @@ static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); (__raw_get_cpu_var(rt_cache_stat).field++) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, - int genid) + int genid) { - return jhash_3words((__force u32)(__be32)(daddr), - (__force u32)(__be32)(saddr), + return jhash_3words((__force u32)daddr, (__force u32)saddr, idx, genid) & rt_hash_mask; } @@ -378,12 +377,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) struct rtable *r = v; int len; - seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" - "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", + seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", r->u.dst.dev ? r->u.dst.dev->name : "*", - (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + (__force u32)r->rt_dst, + (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (unsigned long)r->rt_src, + r->u.dst.__use, 0, (__force u32)r->rt_src, (dst_metric(&r->u.dst, RTAX_ADVMSS) ? (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), dst_metric(&r->u.dst, RTAX_WINDOW), @@ -685,18 +685,17 @@ static inline bool rt_caching(const struct net *net) static inline bool compare_hash_inputs(const struct flowi *fl1, const struct flowi *fl2) { - return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->iif ^ fl2->iif)) == 0); } static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | - (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | + return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | + ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->nl_u.ip4_u.tos ^ - *(u16 *)&fl2->nl_u.ip4_u.tos) | + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | (fl1->oif ^ fl2->oif) | (fl1->iif ^ fl2->iif)) == 0; } @@ -2319,8 +2318,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.dst.rt_next)) { - if (((rth->fl.fl4_dst ^ daddr) | - (rth->fl.fl4_src ^ saddr) | + if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | + ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 77208334a61..6689c61cab4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2721,7 +2721,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct tcphdr *th2; unsigned int len; unsigned int thlen; - unsigned int flags; + __be32 flags; unsigned int mss = 1; unsigned int hlen; unsigned int off; @@ -2771,10 +2771,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) found: flush = NAPI_GRO_CB(p)->flush; - flush |= flags & TCP_FLAG_CWR; - flush |= (flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); - flush |= th->ack_seq ^ th2->ack_seq; + flush |= (__force int)(flags & TCP_FLAG_CWR); + flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); @@ -2795,8 +2795,9 @@ found: out_check_final: flush = len < mss; - flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | - TCP_FLAG_SYN | TCP_FLAG_FIN); + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | + TCP_FLAG_RST | TCP_FLAG_SYN | + TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = head; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad08392a738..4d6717d1e61 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1286,8 +1286,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_release; /* Secret recipe starts with IP addresses */ - *mess++ ^= daddr; - *mess++ ^= saddr; + *mess++ ^= (__force u32)daddr; + *mess++ ^= (__force u32)saddr; /* plus variable length Initiator Cookie */ c = (u8 *)mess; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2b7d71fb843..429ad9286ef 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -861,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { - th->urg_ptr = 0xFFFF; + th->urg_ptr = htons(0xFFFF); th->urg = 1; } } @@ -2485,7 +2485,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, *tail-- ^= TCP_SKB_CB(skb)->seq + 1; /* recommended */ - *tail-- ^= ((th->dest << 16) | th->source); + *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ sha_transform((__u32 *)&xvp->cookie_bakery[0], diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 666b963496f..1e18f9cc924 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -307,13 +307,13 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, unsigned int port) { - return jhash_1word(saddr, net_hash_mix(net)) ^ port; + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = - udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); + udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); @@ -466,14 +466,14 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, hslot2, slot2); if (!result) { - hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); + hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) goto begin; result = udp4_lib_lookup2(net, saddr, sport, - INADDR_ANY, hnum, dif, + htonl(INADDR_ANY), hnum, dif, hslot2, slot2); } rcu_read_unlock(); -- cgit v1.2.3 From aa2ea0586d9dbe56a334d835a43b45e8c2104e77 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 22 Apr 2010 07:00:24 +0000 Subject: tcp: fix outsegs stat for TSO segments Account for TSO segments of an skb in TCP_MIB_OUTSEGS counter. Without doing this, the counter can be off by orders of magnitude from the actual number of segments sent. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 429ad9286ef..5db3a2c6cb3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -888,7 +888,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, + tcp_skb_pcount(skb)); err = icsk->icsk_af_ops->queue_xmit(skb); if (likely(err <= 0)) @@ -2503,7 +2504,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ -- cgit v1.2.3 From fda48a0d7a8412cedacda46a9c0bf8ef9cd13559 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Apr 2010 09:26:15 +0000 Subject: tcp: bind() fix when many ports are bound Port autoselection done by kernel only works when number of bound sockets is under a threshold (typically 30000). When this threshold is over, we must check if there is a conflict before exiting first loop in inet_csk_get_port() Change inet_csk_bind_conflict() to forbid two reuse-enabled sockets to bind on same (address,port) tuple (with a non ANY address) Same change for inet6_csk_bind_conflict() Reported-by: Gaspar Chilingarov Signed-off-by: Eric Dumazet Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8da6429269d..14825eb0977 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -70,13 +70,17 @@ int inet_csk_bind_conflict(const struct sock *sk, (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr || sk2_rcv_saddr == sk_rcv_saddr) break; - } + } else if (reuse && sk2->sk_reuse && + sk2_rcv_saddr && + sk2_rcv_saddr == sk_rcv_saddr) + break; } } return node != NULL; @@ -120,9 +124,11 @@ again: smallest_size = tb->num_owners; smallest_rover = rover; if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { - spin_unlock(&head->lock); - snum = smallest_rover; - goto have_snum; + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; + } } } goto next; -- cgit v1.2.3 From 3d0c9c4eb2dbdcc461be4084abd87a9a9e70f713 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 26 Apr 2010 16:02:04 +0200 Subject: net: fib_rules: mark arguments to fib_rules_register const and __net_initdata fib_rules_register() duplicates the template passed to it without modification, mark the argument as const. Additionally the templates are only needed when instantiating a new namespace, so mark them as __net_initdata, which means they can be discarded when CONFIG_NET_NS=n. Signed-off-by: Patrick McHardy --- net/ipv4/fib_rules.c | 2 +- net/ipv4/ipmr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 3ec84fea5b7..8ab62a56701 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -245,7 +245,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops) rt_cache_flush(ops->fro_net, -1); } -static struct fib_rules_ops fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { .family = FIB_RULES_IPV4, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a2df5012a1d..7d3e382aed6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -216,7 +216,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static struct fib_rules_ops ipmr_rules_ops_template = { +static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { .family = FIB_RULES_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), -- cgit v1.2.3 From 25239cee7e8732dbdc9f5d324f1c22a3bdec1d1f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 26 Apr 2010 16:02:05 +0200 Subject: net: rtnetlink: decouple rtnetlink address families from real address families Decouple rtnetlink address families from real address families in socket.h to be able to add rtnetlink interfaces to code that is not a real address family without increasing AF_MAX/NPROTO. This will be used to add support for multicast route dumping from all tables as the proc interface can't be extended to support anything but the main table without breaking compatibility. This partialy undoes the patch to introduce independant families for routing rules and converts ipmr routing rules to a new rtnetlink family. Similar to that patch, values up to 127 are reserved for real address families, values above that may be used arbitrarily. Signed-off-by: Patrick McHardy --- net/ipv4/fib_rules.c | 2 +- net/ipv4/ipmr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8ab62a56701..76daeb5ff56 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -246,7 +246,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops) } static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { - .family = FIB_RULES_IPV4, + .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7d3e382aed6..41e8fc0ce8b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -217,7 +217,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, } static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { - .family = FIB_RULES_IPMR, + .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), .action = ipmr_rule_action, -- cgit v1.2.3 From cb6a4e461fb427689920472bd7335f926d521747 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 26 Apr 2010 16:02:08 +0200 Subject: net: ipmr: add support for dumping routing tables over netlink The ipmr /proc interface (ip_mr_cache) can't be extended to dump routes from any tables but the main table in a backwards compatible fashion since the output format ends in a variable amount of output interfaces. Introduce a new netlink interface to dump multicast routes from all tables, similar to the netlink interface for regular routes. Signed-off-by: Patrick McHardy --- net/ipv4/ipmr.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 41e8fc0ce8b..eddfd12f55b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -128,8 +128,8 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); -static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm); +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -831,7 +831,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); - if (ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { nlh->nlmsg_len = (skb_tail_pointer(skb) - (u8 *)nlh); } else { @@ -1904,9 +1904,8 @@ drop: } #endif -static int -ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, - struct rtmsg *rtm) +static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mfc_cache *c, struct rtmsg *rtm) { int ct; struct rtnexthop *nhp; @@ -1994,11 +1993,93 @@ int ipmr_get_route(struct net *net, if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; - err = ipmr_fill_mroute(mrt, skb, cache, rtm); + err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); return err; } +static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 pid, u32 seq, struct mfc_cache *c) +{ + struct nlmsghdr *nlh; + struct rtmsg *rtm; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI); + if (nlh == NULL) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = RTNL_FAMILY_IPMR; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = 0; + rtm->rtm_table = mrt->id; + NLA_PUT_U32(skb, RTA_TABLE, mrt->id); + rtm->rtm_type = RTN_MULTICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = 0; + + NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin); + NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp); + + if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mfc_cache *mfc; + unsigned int t = 0, s_t; + unsigned int h = 0, s_h; + unsigned int e = 0, s_e; + + s_t = cb->args[0]; + s_h = cb->args[1]; + s_e = cb->args[2]; + + read_lock(&mrt_lock); + ipmr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + mfc) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} + #ifdef CONFIG_PROC_FS /* * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif @@ -2355,6 +2436,7 @@ int __init ip_mr_init(void) goto add_proto_fail; } #endif + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute); return 0; #ifdef CONFIG_IP_PIMSM_V2 -- cgit v1.2.3 From 6c37e5de456987f5bc80879afde05aa120784095 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Mon, 26 Apr 2010 18:33:27 +0000 Subject: TCP: avoid to send keepalive probes if receiving data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 1122 says the following: ... Keep-alive packets MUST only be sent when no data or acknowledgement packets have been received for the connection within an interval. ... The acknowledgement packet is reseting the keepalive timer but the data packet isn't. This patch fixes it by checking the timestamp of the last received data packet too when the keepalive timer expires. Signed-off-by: Flavio Leitner Signed-off-by: Eric Dumazet Acked-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_timer.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6689c61cab4..8ce29747ad9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2298,7 +2298,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; + u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c732be00606..440a5c6004f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -517,7 +517,7 @@ static void tcp_keepalive_timer (unsigned long data) struct sock *sk = (struct sock *) data; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __u32 elapsed; + u32 elapsed; /* Only process if socket is not in use. */ bh_lock_sock(sk); @@ -554,7 +554,7 @@ static void tcp_keepalive_timer (unsigned long data) if (tp->packets_out || tcp_send_head(sk)) goto resched; - elapsed = tcp_time_stamp - tp->rcv_tstamp; + elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { if (icsk->icsk_probes_out >= keepalive_probes(tp)) { -- cgit v1.2.3 From c58dc01babfd58ec9e71a6ce080150dc27755d88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Apr 2010 15:05:31 -0700 Subject: net: Make RFS socket operations not be inet specific. Idea from Eric Dumazet. As for placement inside of struct sock, I tried to choose a place that otherwise has a 32-bit hole on 64-bit systems. Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- net/ipv4/af_inet.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9f52880fae1..c6c43bcd1c6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -419,7 +419,7 @@ int inet_release(struct socket *sock) if (sk) { long timeout; - inet_rps_reset_flow(sk); + sock_rps_reset_flow(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); @@ -722,7 +722,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (!inet_sk(sk)->inet_num && inet_autobind(sk)) @@ -737,7 +737,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, { struct sock *sk = sock->sk; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (!inet_sk(sk)->inet_num && inet_autobind(sk)) @@ -755,7 +755,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, int addr_len = 0; int err; - inet_rps_record_flow(sk); + sock_rps_record_flow(sk); err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d6717d1e61..771f8146a2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1672,7 +1672,7 @@ process: skb->dev = NULL; - inet_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb->rxhash); bh_lock_sock_nested(sk); ret = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1e18f9cc924..fa3d2874db4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1217,7 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags) sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; - inet_rps_save_rxhash(sk, 0); + sock_rps_save_rxhash(sk, 0); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -1262,7 +1262,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) int rc; if (inet_sk(sk)->inet_daddr) - inet_rps_save_rxhash(sk, skb->rxhash); + sock_rps_save_rxhash(sk, skb->rxhash); rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) { -- cgit v1.2.3 From c377411f2494a931ff7facdbb3a6839b1266bcf6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Apr 2010 15:13:20 -0700 Subject: net: sk_add_backlog() take rmem_alloc into account Current socket backlog limit is not enough to really stop DDOS attacks, because user thread spend many time to process a full backlog each round, and user might crazy spin on socket lock. We should add backlog size and receive_queue size (aka rmem_alloc) to pace writers, and let user run without being slow down too much. Introduce a sk_rcvqueues_full() helper, to avoid taking socket lock in stress situations. Under huge stress from a multiqueue/RPS enabled NIC, a single flow udp receiver can now process ~200.000 pps (instead of ~100 pps before the patch) on a 8 core machine. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fa3d2874db4..63eb56b2d87 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1372,6 +1372,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; } + + if (sk_rcvqueues_full(sk, skb)) + goto drop; + rc = 0; bh_lock_sock(sk); -- cgit v1.2.3 From 8d238b25b1ec22a73b1c2206f111df2faaff8285 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 28 Apr 2010 11:25:59 -0700 Subject: Revert "tcp: bind() fix when many ports are bound" This reverts two commits: fda48a0d7a8412cedacda46a9c0bf8ef9cd13559 tcp: bind() fix when many ports are bound and a follow-on fix for it: 6443bb1fc2050ca2b6585a3fa77f7833b55329ed ipv6: Fix inet6_csk_bind_conflict() It causes problems with binding listening sockets when time-wait sockets from a previous instance still are alive. It's too late to keep fiddling with this so late in the -rc series, and we'll deal with it in net-next-2.6 instead. Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 14825eb0977..8da6429269d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -70,17 +70,13 @@ int inet_csk_bind_conflict(const struct sock *sk, (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { + const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr || sk2_rcv_saddr == sk_rcv_saddr) break; - } else if (reuse && sk2->sk_reuse && - sk2_rcv_saddr && - sk2_rcv_saddr == sk_rcv_saddr) - break; + } } } return node != NULL; @@ -124,11 +120,9 @@ again: smallest_size = tb->num_owners; smallest_rover = rover; if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { - spin_unlock(&head->lock); - snum = smallest_rover; - goto have_snum; - } + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; } } goto next; -- cgit v1.2.3 From 4b0b72f7dd617b13abd1b04c947e15873e011a24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 14:35:48 -0700 Subject: net: speedup udp receive path Since commit 95766fff ([UDP]: Add memory accounting.), each received packet needs one extra sock_lock()/sock_release() pair. This added latency because of possible backlog handling. Then later, ticket spinlocks added yet another latency source in case of DDOS. This patch introduces lock_sock_bh() and unlock_sock_bh() synchronization primitives, avoiding one atomic operation and backlog processing. skb_free_datagram_locked() uses them instead of full blown lock_sock()/release_sock(). skb is orphaned inside locked section for proper socket memory reclaim, and finally freed outside of it. UDP receive path now take the socket spinlock only once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 63eb56b2d87..1f86965ba7d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1062,10 +1062,10 @@ static unsigned int first_packet_length(struct sock *sk) spin_unlock_bh(&rcvq->lock); if (!skb_queue_empty(&list_kill)) { - lock_sock(sk); + lock_sock_bh(sk); __skb_queue_purge(&list_kill); sk_mem_reclaim_partial(sk); - release_sock(sk); + unlock_sock_bh(sk); } return res; } @@ -1196,10 +1196,10 @@ out: return err; csum_copy_err: - lock_sock(sk); + lock_sock_bh(sk); if (!skb_kill_datagram(sk, skb, flags)) UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - release_sock(sk); + unlock_sock_bh(sk); if (noblock) return -EAGAIN; @@ -1624,9 +1624,9 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { - lock_sock(sk); + lock_sock_bh(sk); udp_flush_pending_frames(sk); - release_sock(sk); + unlock_sock_bh(sk); } /* -- cgit v1.2.3 From f84af32cbca70a3c6d30463dc08c7984af11c277 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Apr 2010 15:31:51 -0700 Subject: net: ip_queue_rcv_skb() helper When queueing a skb to socket, we can immediately release its dst if target socket do not use IP_CMSG_PKTINFO. tcp_data_queue() can drop dst too. This to benefit from a hot cache line and avoid the receiver, possibly on another cpu, to dirty this cache line himself. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 16 ++++++++++++++++ net/ipv4/raw.c | 2 +- net/ipv4/tcp_input.c | 1 + net/ipv4/udp.c | 2 +- 4 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b0aa0546a3b..ce231780a2b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -954,6 +954,22 @@ e_inval: return -EINVAL; } +/** + * ip_queue_rcv_skb - Queue an skb into sock receive queue + * @sk: socket + * @skb: buffer + * + * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option + * is not set, we drop skb dst entry now, while dst cache line is hot. + */ +int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO)) + skb_dst_drop(skb); + return sock_queue_rcv_skb(sk, skb); +} +EXPORT_SYMBOL(ip_queue_rcv_skb); + int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cc6f097fbd5..52ef5af78a4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -290,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ - if (sock_queue_rcv_skb(sk, skb) < 0) { + if (ip_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ae3ec15fb63..e82162c211b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4367,6 +4367,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; + skb_dst_drop(skb); __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1f86965ba7d..4560b291180 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1264,7 +1264,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) sock_rps_save_rxhash(sk, skb->rxhash); - rc = sock_queue_rcv_skb(sk, skb); + rc = ip_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); -- cgit v1.2.3 From 3ee943728fff536edaf8f59faa58aaa1aa7366e3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 8 May 2010 01:57:52 -0700 Subject: ipv4: remove ip_rt_secret timer (v4) A while back there was a discussion regarding the rt_secret_interval timer. Given that we've had the ability to do emergency route cache rebuilds for awhile now, based on a statistical analysis of the various hash chain lengths in the cache, the use of the flush timer is somewhat redundant. This patch removes the rt_secret_interval sysctl, allowing us to rely solely on the statistical analysis mechanism to determine the need for route cache flushes. Signed-off-by: Neil Horman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 108 +++++-------------------------------------------------- 1 file changed, 8 insertions(+), 100 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a947428ef0a..dea3f926425 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; -static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static int rt_chain_length_max __read_mostly = 20; static struct delayed_work expires_work; @@ -918,32 +917,11 @@ void rt_cache_flush_batch(void) rt_do_flush(!in_softirq()); } -/* - * We change rt_genid and let gc do the cleanup - */ -static void rt_secret_rebuild(unsigned long __net) -{ - struct net *net = (struct net *)__net; - rt_cache_invalidate(net); - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - -static void rt_secret_rebuild_oneshot(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); - rt_cache_invalidate(net); - if (ip_rt_secret_interval) - mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); -} - static void rt_emergency_hash_rebuild(struct net *net) { - if (net_ratelimit()) { + if (net_ratelimit()) printk(KERN_WARNING "Route hash chain too long!\n"); - printk(KERN_WARNING "Adjust your secret_interval!\n"); - } - - rt_secret_rebuild_oneshot(net); + rt_cache_invalidate(net); } /* @@ -3101,48 +3079,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static void rt_secret_reschedule(int old) -{ - struct net *net; - int new = ip_rt_secret_interval; - int diff = new - old; - - if (!diff) - return; - - rtnl_lock(); - for_each_net(net) { - int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); - long time; - - if (!new) - continue; - - if (deleted) { - time = net->ipv4.rt_secret_timer.expires - jiffies; - - if (time <= 0 || (time += diff) <= 0) - time = 0; - } else - time = new; - - mod_timer(&net->ipv4.rt_secret_timer, jiffies + time); - } - rtnl_unlock(); -} - -static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); - - rt_secret_reschedule(old); - - return ret; -} - static ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", @@ -3251,13 +3187,6 @@ static ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "secret_interval", - .data = &ip_rt_secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ipv4_sysctl_rt_secret_interval, - }, { } }; @@ -3336,34 +3265,15 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { }; #endif - -static __net_init int rt_secret_timer_init(struct net *net) +static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, - (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - - net->ipv4.rt_secret_timer.function = rt_secret_rebuild; - net->ipv4.rt_secret_timer.data = (unsigned long)net; - init_timer_deferrable(&net->ipv4.rt_secret_timer); - - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires = - jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + get_random_bytes(&net->ipv4.rt_genid, + sizeof(net->ipv4.rt_genid)); return 0; } -static __net_exit void rt_secret_timer_exit(struct net *net) -{ - del_timer_sync(&net->ipv4.rt_secret_timer); -} - -static __net_initdata struct pernet_operations rt_secret_timer_ops = { - .init = rt_secret_timer_init, - .exit = rt_secret_timer_exit, +static __net_initdata struct pernet_operations rt_genid_ops = { + .init = rt_genid_init, }; @@ -3424,9 +3334,6 @@ int __init ip_rt_init(void) schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (register_pernet_subsys(&rt_secret_timer_ops)) - printk(KERN_ERR "Unable to setup rt_secret_timer\n"); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM @@ -3438,6 +3345,7 @@ int __init ip_rt_init(void) #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif + register_pernet_subsys(&rt_genid_ops); return rc; } -- cgit v1.2.3