diff options
Diffstat (limited to 'net/ipv4')
36 files changed, 821 insertions, 648 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 551ce564b03..b4c0969137c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -355,6 +355,8 @@ lookup_protocol: inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet->nodefrag = 0; + if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) @@ -1100,7 +1102,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (err) return err; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); new_saddr = rt->rt_src; @@ -1166,7 +1168,7 @@ int inet_sk_rebuild_header(struct sock *sk) err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); } if (!err) - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); else { /* Routing failed... */ sk->sk_route_caps = 0; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f094b75810d..cf78f41830c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -427,7 +427,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (ip_route_output_key(net, &rt, &fl) < 0) return 1; - if (rt->u.dst.dev != dev) { + if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } @@ -532,7 +532,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct in_device *out_dev; int imi, omi = -1; - if (rt->u.dst.dev == dev) + if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) @@ -545,10 +545,10 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, /* place to check for proxy_arp for routes */ - if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { + out_dev = __in_dev_get_rcu(rt->dst.dev); + if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); - in_dev_put(out_dev); - } + return (omi != imi && omi != -1); } @@ -576,7 +576,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ - if (rt->u.dst.dev != dev) + if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ @@ -741,7 +741,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, static int arp_process(struct sk_buff *skb) { struct net_device *dev = skb->dev; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; @@ -890,7 +890,6 @@ static int arp_process(struct sk_buff *skb) arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); - in_dev_put(in_dev); return 0; } goto out; @@ -936,8 +935,6 @@ static int arp_process(struct sk_buff *skb) } out: - if (in_dev) - in_dev_put(in_dev); consume_skb(skb); return 0; } @@ -1045,7 +1042,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, struct rtable * rt; if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; @@ -1152,7 +1149,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, struct rtable * rt; if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index fb2465811b4..fe3daa7f07a 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -69,7 +69,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; - sk_dst_set(sk, &rt->u.dst); + sk_dst_set(sk, &rt->dst); return(0); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 382bc768ed5..da14c49284f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1081,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ + case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: /* Send gratuitous ARP to notify of link change */ if (IN_DEV_ARP_NOTIFY(in_dev)) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4f0ed458c88..e830f7a123b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -284,7 +284,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (no_addr) goto last_resort; if (rpf == 1) - goto e_inval; + goto e_rpf; fl.oif = dev->ifindex; ret = 0; @@ -299,7 +299,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, last_resort: if (rpf) - goto e_inval; + goto e_rpf; *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; @@ -308,6 +308,8 @@ e_inval_res: fib_res_put(&res); e_inval: return -EINVAL; +e_rpf: + return -EXDEV; } static inline __be32 sk_extract_addr(struct sockaddr *addr) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d65e9215bcd..7569b21a3a2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -271,7 +271,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout) static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { - struct dst_entry *dst = &rt->u.dst; + struct dst_entry *dst = &rt->dst; int rc = 1; if (type > NR_ICMP_TYPES) @@ -327,7 +327,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net((*rt)->u.dst.dev)); + sk = icmp_sk(dev_net((*rt)->dst.dev)); if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, @@ -359,7 +359,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); struct sock *sk; struct inet_sock *inet; __be32 daddr; @@ -427,7 +427,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!rt) goto out; - net = dev_net(rt->u.dst.dev); + net = dev_net(rt->dst.dev); /* * Find the original header. It is expected to be valid, of course. @@ -596,9 +596,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->u.dst.dev); + RT_TOS(tos), rt2->dst.dev); - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); skb_in->_skb_refdst = orefdst; /* restore old refdst */ } @@ -610,7 +610,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) XFRM_LOOKUP_ICMP); switch (err) { case 0: - dst_release(&rt->u.dst); + dst_release(&rt->dst); rt = rt2; break; case -EPERM: @@ -629,7 +629,7 @@ route_done: /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->u.dst); + room = dst_mtu(&rt->dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; @@ -925,6 +925,7 @@ static void icmp_address(struct sk_buff *skb) /* * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain * loudly if an inconsistency is found. + * called with rcu_read_lock() */ static void icmp_address_reply(struct sk_buff *skb) @@ -935,12 +936,12 @@ static void icmp_address_reply(struct sk_buff *skb) struct in_ifaddr *ifa; if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) - goto out; + return; - in_dev = in_dev_get(dev); + in_dev = __in_dev_get_rcu(dev); if (!in_dev) - goto out; - rcu_read_lock(); + return; + if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) { @@ -958,9 +959,6 @@ static void icmp_address_reply(struct sk_buff *skb) mp, dev->name, &rt->rt_src); } } - rcu_read_unlock(); - in_dev_put(in_dev); -out:; } static void icmp_discard(struct sk_buff *skb) @@ -974,7 +972,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5fff865a4fa..b5580d42299 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = rt->rt_src; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(pip, &rt->u.dst, NULL); + ip_select_ident(pip, &rt->dst, NULL); ((u8*)&pip[1])[0] = IPOPT_RA; ((u8*)&pip[1])[1] = 4; ((u8*)&pip[1])[2] = 0; @@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; @@ -916,18 +916,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, read_unlock(&in_dev->mc_list_lock); } +/* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = in_dev_get(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); int len = skb->len; if (in_dev == NULL) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) - goto drop_ref; + goto drop; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -937,7 +938,7 @@ int igmp_rcv(struct sk_buff *skb) case CHECKSUM_NONE: skb->csum = 0; if (__skb_checksum_complete(skb)) - goto drop_ref; + goto drop; } ih = igmp_hdr(skb); @@ -957,7 +958,6 @@ int igmp_rcv(struct sk_buff *skb) break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 - in_dev_put(in_dev); return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: @@ -971,8 +971,6 @@ int igmp_rcv(struct sk_buff *skb) break; } -drop_ref: - in_dev_put(in_dev); drop: kfree_skb(skb); return 0; @@ -1427,7 +1425,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) } if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); } if (dev) { @@ -1646,8 +1644,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { - dpsf = (struct ip_sf_list *) - kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 70eb3507c40..57c9e4d7b80 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -383,7 +383,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, goto no_route; if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto route_err; - return &rt->u.dst; + return &rt->dst; route_err: ip_rt_put(rt); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 6bcfe52a9c8..9ffa24b9a80 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -51,8 +51,8 @@ * lookups performed with disabled BHs. * * Serialisation issues. - * 1. Nodes may appear in the tree only with the pool write lock held. - * 2. Nodes may disappear from the tree only with the pool write lock held + * 1. Nodes may appear in the tree only with the pool lock held. + * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Nodes appears and disappears from unused node list only under * "inet_peer_unused_lock". @@ -64,23 +64,31 @@ * usually under some other lock to prevent node disappearing * dtime: unused node list lock * v4daddr: unchangeable - * ip_id_count: idlock + * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height -static struct inet_peer peer_fake_node = { - .avl_left = &peer_fake_node, - .avl_right = &peer_fake_node, + +#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +static const struct inet_peer peer_fake_node = { + .avl_left = peer_avl_empty, + .avl_right = peer_avl_empty, .avl_height = 0 }; -#define peer_avl_empty (&peer_fake_node) -static struct inet_peer *peer_root = peer_avl_empty; -static DEFINE_RWLOCK(peer_pool_lock); + +static struct { + struct inet_peer *root; + spinlock_t lock; + int total; +} peers = { + .root = peer_avl_empty, + .lock = __SPIN_LOCK_UNLOCKED(peers.lock), + .total = 0, +}; #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ -static int peer_total; /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more * aggressively at this stage */ @@ -89,8 +97,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min int inet_peer_gc_mintime __read_mostly = 10 * HZ; int inet_peer_gc_maxtime __read_mostly = 120 * HZ; -static LIST_HEAD(unused_peers); -static DEFINE_SPINLOCK(inet_peer_unused_lock); +static struct { + struct list_head list; + spinlock_t lock; +} unused_peers = { + .list = LIST_HEAD_INIT(unused_peers.list), + .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), +}; static void peer_check_expire(unsigned long dummy); static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); @@ -116,7 +129,7 @@ void __init inet_initpeers(void) peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* All the timers, started at system startup tend @@ -131,38 +144,69 @@ void __init inet_initpeers(void) /* Called with or without local BH being disabled. */ static void unlink_from_unused(struct inet_peer *p) { - spin_lock_bh(&inet_peer_unused_lock); - list_del_init(&p->unused); - spin_unlock_bh(&inet_peer_unused_lock); + if (!list_empty(&p->unused)) { + spin_lock_bh(&unused_peers.lock); + list_del_init(&p->unused); + spin_unlock_bh(&unused_peers.lock); + } } /* * Called with local BH disabled and the pool lock held. - * _stack is known to be NULL or not at compile time, - * so compiler will optimize the if (_stack) tests. */ #define lookup(_daddr, _stack) \ ({ \ struct inet_peer *u, **v; \ - if (_stack != NULL) { \ - stackptr = _stack; \ - *stackptr++ = &peer_root; \ - } \ - for (u = peer_root; u != peer_avl_empty; ) { \ + \ + stackptr = _stack; \ + *stackptr++ = &peers.root; \ + for (u = peers.root; u != peer_avl_empty; ) { \ if (_daddr == u->v4daddr) \ break; \ if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ - if (_stack != NULL) \ - *stackptr++ = v; \ + *stackptr++ = v; \ u = *v; \ } \ u; \ }) -/* Called with local BH disabled and the pool write lock held. */ +/* + * Called with rcu_read_lock_bh() + * Because we hold no lock against a writer, its quite possible we fall + * in an endless loop. + * But every pointer we follow is guaranteed to be valid thanks to RCU. + * We exit from this function if number of links exceeds PEER_MAXDEPTH + */ +static struct inet_peer *lookup_rcu_bh(__be32 daddr) +{ + struct inet_peer *u = rcu_dereference_bh(peers.root); + int count = 0; + + while (u != peer_avl_empty) { + if (daddr == u->v4daddr) { + /* Before taking a reference, check if this entry was + * deleted, unlink_from_pool() sets refcnt=-1 to make + * distinction between an unused entry (refcnt=0) and + * a freed one. + */ + if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) + u = NULL; + return u; + } + if ((__force __u32)daddr < (__force __u32)u->v4daddr) + u = rcu_dereference_bh(u->avl_left); + else + u = rcu_dereference_bh(u->avl_right); + if (unlikely(++count == PEER_MAXDEPTH)) + break; + } + return NULL; +} + +/* Called with local BH disabled and the pool lock held. */ #define lookup_rightempty(start) \ ({ \ struct inet_peer *u, **v; \ @@ -176,9 +220,10 @@ static void unlink_from_unused(struct inet_peer *p) u; \ }) -/* Called with local BH disabled and the pool write lock held. +/* Called with local BH disabled and the pool lock held. * Variable names are the proof of operation correctness. - * Look into mm/map_avl.c for more detail description of the ideas. */ + * Look into mm/map_avl.c for more detail description of the ideas. + */ static void peer_avl_rebalance(struct inet_peer **stack[], struct inet_peer ***stackend) { @@ -254,15 +299,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[], } } -/* Called with local BH disabled and the pool write lock held. */ +/* Called with local BH disabled and the pool lock held. */ #define link_to_pool(n) \ do { \ n->avl_height = 1; \ n->avl_left = peer_avl_empty; \ n->avl_right = peer_avl_empty; \ + smp_wmb(); /* lockless readers can catch us now */ \ **--stackptr = n; \ peer_avl_rebalance(stack, stackptr); \ -} while(0) +} while (0) + +static void inetpeer_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); +} /* May be called with local BH enabled. */ static void unlink_from_pool(struct inet_peer *p) @@ -271,13 +322,14 @@ static void unlink_from_pool(struct inet_peer *p) do_free = 0; - write_lock_bh(&peer_pool_lock); + spin_lock_bh(&peers.lock); /* Check the reference counter. It was artificially incremented by 1 - * in cleanup() function to prevent sudden disappearing. If the - * reference count is still 1 then the node is referenced only as `p' - * here and from the pool. So under the exclusive pool lock it's safe - * to remove the node and free it later. */ - if (atomic_read(&p->refcnt) == 1) { + * in cleanup() function to prevent sudden disappearing. If we can + * atomically (because of lockless readers) take this last reference, + * it's safe to remove the node and free it later. + * We use refcnt=-1 to alert lockless readers this entry is deleted. + */ + if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { struct inet_peer **stack[PEER_MAXDEPTH]; struct inet_peer ***stackptr, ***delp; if (lookup(p->v4daddr, stack) != p) @@ -303,20 +355,21 @@ static void unlink_from_pool(struct inet_peer *p) delp[1] = &t->avl_left; /* was &p->avl_left */ } peer_avl_rebalance(stack, stackptr); - peer_total--; + peers.total--; do_free = 1; } - write_unlock_bh(&peer_pool_lock); + spin_unlock_bh(&peers.lock); if (do_free) - kmem_cache_free(peer_cachep, p); + call_rcu_bh(&p->rcu, inetpeer_free_rcu); else /* The node is used again. Decrease the reference counter * back. The loop "cleanup -> unlink_from_unused * -> unlink_from_pool -> putpeer -> link_to_unused * -> cleanup (for the same node)" * doesn't really exist because the entry will have a - * recent deletion time and will not be cleaned again soon. */ + * recent deletion time and will not be cleaned again soon. + */ inet_putpeer(p); } @@ -326,16 +379,16 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *p = NULL; /* Remove the first entry from the list of unused nodes. */ - spin_lock_bh(&inet_peer_unused_lock); - if (!list_empty(&unused_peers)) { + spin_lock_bh(&unused_peers.lock); + if (!list_empty(&unused_peers.list)) { __u32 delta; - p = list_first_entry(&unused_peers, struct inet_peer, unused); + p = list_first_entry(&unused_peers.list, struct inet_peer, unused); delta = (__u32)jiffies - p->dtime; if (delta < ttl) { /* Do not prune fresh entries. */ - spin_unlock_bh(&inet_peer_unused_lock); + spin_unlock_bh(&unused_peers.lock); return -1; } @@ -345,7 +398,7 @@ static int cleanup_once(unsigned long ttl) * before unlink_from_pool() call. */ atomic_inc(&p->refcnt); } - spin_unlock_bh(&inet_peer_unused_lock); + spin_unlock_bh(&unused_peers.lock); if (p == NULL) /* It means that the total number of USED entries has @@ -360,62 +413,56 @@ static int cleanup_once(unsigned long ttl) /* Called with or without local BH being disabled. */ struct inet_peer *inet_getpeer(__be32 daddr, int create) { - struct inet_peer *p, *n; + struct inet_peer *p; struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; - /* Look up for the address quickly. */ - read_lock_bh(&peer_pool_lock); - p = lookup(daddr, NULL); - if (p != peer_avl_empty) - atomic_inc(&p->refcnt); - read_unlock_bh(&peer_pool_lock); + /* Look up for the address quickly, lockless. + * Because of a concurrent writer, we might not find an existing entry. + */ + rcu_read_lock_bh(); + p = lookup_rcu_bh(daddr); + rcu_read_unlock_bh(); + + if (p) { + /* The existing node has been found. + * Remove the entry from unused list if it was there. + */ + unlink_from_unused(p); + return p; + } + /* retry an exact lookup, taking the lock before. + * At least, nodes should be hot in our cache. + */ + spin_lock_bh(&peers.lock); + p = lookup(daddr, stack); if (p != peer_avl_empty) { - /* The existing node has been found. */ + atomic_inc(&p->refcnt); + spin_unlock_bh(&peers.lock); /* Remove the entry from unused list if it was there. */ unlink_from_unused(p); return p; } + p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; + if (p) { + p->v4daddr = daddr; + atomic_set(&p->refcnt, 1); + atomic_set(&p->rid, 0); + atomic_set(&p->ip_id_count, secure_ip_id(daddr)); + p->tcp_ts_stamp = 0; + INIT_LIST_HEAD(&p->unused); + + + /* Link the node. */ + link_to_pool(p); + peers.total++; + } + spin_unlock_bh(&peers.lock); - if (!create) - return NULL; - - /* Allocate the space outside the locked region. */ - n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); - if (n == NULL) - return NULL; - n->v4daddr = daddr; - atomic_set(&n->refcnt, 1); - atomic_set(&n->rid, 0); - atomic_set(&n->ip_id_count, secure_ip_id(daddr)); - n->tcp_ts_stamp = 0; - - write_lock_bh(&peer_pool_lock); - /* Check if an entry has suddenly appeared. */ - p = lookup(daddr, stack); - if (p != peer_avl_empty) - goto out_free; - - /* Link the node. */ - link_to_pool(n); - INIT_LIST_HEAD(&n->unused); - peer_total++; - write_unlock_bh(&peer_pool_lock); - - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) /* Remove one less-recently-used entry. */ cleanup_once(0); - return n; - -out_free: - /* The appropriate node is already in the pool. */ - atomic_inc(&p->refcnt); - write_unlock_bh(&peer_pool_lock); - /* Remove the entry from unused list if it was there. */ - unlink_from_unused(p); - /* Free preallocated the preallocated node. */ - kmem_cache_free(peer_cachep, n); return p; } @@ -425,12 +472,12 @@ static void peer_check_expire(unsigned long dummy) unsigned long now = jiffies; int ttl; - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) ttl = inet_peer_minttl; else ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * - peer_total / inet_peer_threshold * HZ; + peers.total / inet_peer_threshold * HZ; while (!cleanup_once(ttl)) { if (jiffies != now) break; @@ -439,22 +486,25 @@ static void peer_check_expire(unsigned long dummy) /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime * interval depending on the total number of entries (more entries, * less interval). */ - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; else peer_periodic_timer.expires = jiffies + inet_peer_gc_maxtime - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - peer_total / inet_peer_threshold * HZ; + peers.total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } void inet_putpeer(struct inet_peer *p) { - spin_lock_bh(&inet_peer_unused_lock); - if (atomic_dec_and_test(&p->refcnt)) { - list_add_tail(&p->unused, &unused_peers); + local_bh_disable(); + + if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { + list_add_tail(&p->unused, &unused_peers.list); p->dtime = (__u32)jiffies; + spin_unlock(&unused_peers.lock); } - spin_unlock_bh(&inet_peer_unused_lock); + + local_bh_enable(); } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 56cdf68a074..99461f09320 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && + if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(dst_mtu(&rt->dst))); goto drop; } /* We are about to mangle packet. Copy it! */ - if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len)) goto drop; iph = ip_hdr(skb); @@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, - rt->u.dst.dev, ip_forward_finish); + rt->dst.dev, ip_forward_finish); sr_failed: /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 75347ea70ea..858d34648ee 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -124,11 +124,8 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) } /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct netns_frags *nf, - struct sk_buff *skb, int *work) +static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) { - if (work) - *work -= skb->truesize; atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -309,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp, NULL); + frag_kfree_skb(qp->q.net, fp); fp = xp; } while (fp); @@ -446,7 +443,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it, NULL); + frag_kfree_skb(qp->q.net, free_it); } } @@ -556,7 +553,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -566,8 +562,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &qp->q.net->mem); } + atomic_sub(head->truesize, &qp->q.net->mem); head->next = NULL; head->dev = dev; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 32618e11076..749e54889e8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -745,7 +745,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev goto tx_error; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); @@ -755,7 +755,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev df = tiph->frag_off; if (df) - mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; + mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -803,7 +803,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { @@ -830,7 +830,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -853,7 +853,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; #endif else - iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); + iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); } ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; @@ -915,7 +915,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) .proto = IPPROTO_GRE }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1174,7 +1174,7 @@ static int ipgre_open(struct net_device *dev) struct rtable *rt; if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (__in_dev_get_rtnl(dev) == NULL) return -EADDRNOTAVAIL; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d930dc5e4d8..db47a5a00ed 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,7 +146,7 @@ #include <linux/netlink.h> /* - * Process Router Attention IP option + * Process Router Attention IP option (RFC 2113) */ int ip_call_ra_chain(struct sk_buff *skb) { @@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb) struct sock *last = NULL; struct net_device *dev = skb->dev; - read_lock(&ip_ra_lock); - for (ra = ip_ra_chain; ra; ra = ra->next) { + for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb) sk->sk_bound_dev_if == dev->ifindex) && net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { - read_unlock(&ip_ra_lock); + if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) return 1; - } } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb) if (last) { raw_rcv(last, skb); - read_unlock(&ip_ra_lock); return 1; } - read_unlock(&ip_ra_lock); return 0; } @@ -298,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb) } if (unlikely(opt->srr)) { - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); + if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - in_dev_put(in_dev); goto drop; } - - in_dev_put(in_dev); } if (ip_options_rcv_srr(skb)) @@ -340,6 +333,9 @@ static int ip_rcv_finish(struct sk_buff *skb) else if (err == -ENETUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); goto drop; } } @@ -360,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 041d41df122..7d1f4b4481a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -151,15 +151,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->u.dst)) + if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; @@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->u.dst.dev; + struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. @@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb) if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); } - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -372,11 +372,11 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; @@ -387,7 +387,7 @@ packet_routed: ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, + ip_select_ident_more(iph, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); skb->priority = sk->sk_priority; @@ -452,7 +452,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) struct rtable *rt = skb_rtable(skb); int err = 0; - dev = rt->u.dst.dev; + dev = rt->dst.dev; /* * Point into the IP datagram header. @@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -586,7 +586,7 @@ slow_path: * we need to make room for the encapsulating header */ pad = nf_bridge_pad(skb); - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, pad); mtu -= pad; /* @@ -833,13 +833,13 @@ int ip_append_data(struct sock *sk, */ *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : - dst_mtu(rt->u.dst.path); - inet->cork.dst = &rt->u.dst; + rt->dst.dev->mtu : + dst_mtu(rt->dst.path); + inet->cork.dst = &rt->dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->u.dst.header_len) != 0) { + if ((exthdrlen = rt->dst.header_len) != 0) { length += exthdrlen; transhdrlen += exthdrlen; } @@ -852,7 +852,7 @@ int ip_append_data(struct sock *sk, exthdrlen = 0; mtu = inet->cork.fragsize; } - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; @@ -869,7 +869,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features & NETIF_F_V4_CSUM && + rt->dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -878,7 +878,7 @@ int ip_append_data(struct sock *sk, inet->cork.length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -926,7 +926,7 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; @@ -937,7 +937,7 @@ alloc_new_skb: * the last. */ if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; + alloclen += rt->dst.trailer_len; if (transhdrlen) { skb = sock_alloc_send_skb(sk, @@ -1010,7 +1010,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1105,10 +1105,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); mtu = inet->cork.fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1125,7 +1125,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } @@ -1277,8 +1277,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc >= IP_PMTUDISC_DO || - (skb->len <= dst_mtu(&rt->u.dst) && - ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->dst) && + ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1287,7 +1287,7 @@ int ip_push_pending_frames(struct sock *sk) if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else - ttl = ip_select_ttl(inet, &rt->u.dst); + ttl = ip_select_ttl(inet, &rt->dst); iph = (struct iphdr *)skb->data; iph->version = 4; @@ -1298,7 +1298,7 @@ int ip_push_pending_frames(struct sock *sk) } iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1311,7 +1311,7 @@ int ip_push_pending_frames(struct sock *sk) * on dst refcount */ inet->cork.dst = NULL; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ce231780a2b..6c40a8c46e7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -239,7 +239,16 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) sent to multicast group to reach destination designated router. */ struct ip_ra_chain *ip_ra_chain; -DEFINE_RWLOCK(ip_ra_lock); +static DEFINE_SPINLOCK(ip_ra_lock); + + +static void ip_ra_destroy_rcu(struct rcu_head *head) +{ + struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); + + sock_put(ra->saved_sk); + kfree(ra); +} int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) @@ -251,35 +260,42 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - write_lock_bh(&ip_ra_lock); + spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } - *rap = ra->next; - write_unlock_bh(&ip_ra_lock); + /* dont let ip_call_ra_chain() use sk again */ + ra->sk = NULL; + rcu_assign_pointer(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); - sock_put(sk); - kfree(ra); + /* + * Delay sock_put(sk) and kfree(ra) after one rcu grace + * period. This guarantee ip_call_ra_chain() dont need + * to mess with socket refcounts. + */ + ra->saved_sk = sk; + call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } if (new_ra == NULL) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; new_ra->next = ra; - *rap = new_ra; + rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return 0; } @@ -449,7 +465,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | - (1<<IP_MINTTL))) || + (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || optname == IP_MULTICAST_TTL || optname == IP_MULTICAST_ALL || optname == IP_MULTICAST_LOOP || @@ -572,6 +588,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->hdrincl = val ? 1 : 0; break; + case IP_NODEFRAG: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->nodefrag = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) goto e_inval; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b9d84e800cf..3a6e1ec5e9a 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -665,6 +665,13 @@ ic_dhcp_init_options(u8 *options) memcpy(e, ic_req_params, sizeof(ic_req_params)); e += sizeof(ic_req_params); + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } if (*vendor_class_identifier) { printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", vendor_class_identifier); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7fd63671103..ec036731a70 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -435,7 +435,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); @@ -446,7 +446,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) df |= old_iph->frag_off & htons(IP_DF); if (df) { - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { stats->collisions++; @@ -503,7 +503,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -552,7 +552,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) .proto = IPPROTO_IPIP }; struct rtable *rt; if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 757f25eb9b4..539592294f4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1553,9 +1553,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - dev = rt->u.dst.dev; + dev = rt->dst.dev; - if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not allow to send ICMP, so that packets will disappear to blackhole. @@ -1566,7 +1566,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; + encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1577,7 +1577,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->bytes_out += skb->len; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 07de855e217..d88a46c54fd 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* Drop old route. */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ @@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); + RT_TOS(iph->tos), rt->dst.dev) != 0) { + dst_release(&rt->dst); return -1; } - dst_release(&rt->u.dst); + dst_release(&rt->dst); refdst_drop(orefdst); } @@ -212,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); skb->ip_summed = CHECKSUM_NONE; - csum = __skb_checksum_complete_head(skb, dataoff + len); - if (!csum) - skb->ip_summed = CHECKSUM_UNNECESSARY; + return __skb_checksum_complete_head(skb, dataoff + len); } return csum; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 1ac01b12862..16c0ba0a272 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -758,7 +758,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) * about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1005,8 +1005,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vmalloc_node(num_counters * sizeof(struct xt_counters), - numa_node_id()); + counters = vmalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1159,7 +1158,7 @@ static int do_add_counters(struct net *net, const void __user *user, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index a4e5fc5df4b..d2c1311cb28 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -42,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_RWLOCK(queue_lock); +static DEFINE_SPINLOCK(queue_lock); static int peer_pid __read_mostly; static unsigned int copy_range __read_mostly; static unsigned int queue_total; @@ -72,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range) break; case IPQ_COPY_PACKET: - copy_mode = mode; + if (range > 0xFFFF) + range = 0xFFFF; copy_range = range; - if (copy_range > 0xFFFF) - copy_range = 0xFFFF; + copy_mode = mode; break; default: @@ -101,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id) { struct nf_queue_entry *entry = NULL, *i; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); list_for_each_entry(i, &queue_list, list) { if ((unsigned long)i == id) { @@ -115,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id) queue_total--; } - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return entry; } @@ -136,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data) static void ipq_flush(ipq_cmpfn cmpfn, unsigned long data) { - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); __ipq_flush(cmpfn, data); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); } static struct sk_buff * @@ -152,9 +152,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) struct nlmsghdr *nlh; struct timeval tv; - read_lock_bh(&queue_lock); - - switch (copy_mode) { + switch (ACCESS_ONCE(copy_mode)) { case IPQ_COPY_META: case IPQ_COPY_NONE: size = NLMSG_SPACE(sizeof(*pmsg)); @@ -162,26 +160,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) case IPQ_COPY_PACKET: if (entry->skb->ip_summed == CHECKSUM_PARTIAL && - (*errp = skb_checksum_help(entry->skb))) { - read_unlock_bh(&queue_lock); + (*errp = skb_checksum_help(entry->skb))) return NULL; - } - if (copy_range == 0 || copy_range > entry->skb->len) + + data_len = ACCESS_ONCE(copy_range); + if (data_len == 0 || data_len > entry->skb->len) data_len = entry->skb->len; - else - data_len = copy_range; size = NLMSG_SPACE(sizeof(*pmsg) + data_len); break; default: *errp = -EINVAL; - read_unlock_bh(&queue_lock); return NULL; } - read_unlock_bh(&queue_lock); - skb = alloc_skb(size, GFP_ATOMIC); if (!skb) goto nlmsg_failure; @@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) if (nskb == NULL) return status; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if (!peer_pid) goto err_out_free_nskb; @@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) __ipq_enqueue_entry(entry); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; } @@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range) { int status; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); status = __ipq_set_mode(mode, range); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; } @@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb) if (security_netlink_recv(skb, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if (peer_pid) { if (peer_pid != pid) { - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); RCV_SKB_FAIL(-EBUSY); } } else { @@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb) peer_pid = pid; } - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); status = ipq_receive_peer(NLMSG_DATA(nlh), type, nlmsglen - NLMSG_LENGTH(0)); @@ -497,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this, struct netlink_notify *n = ptr; if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) __ipq_reset(); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); } return NOTIFY_DONE; } @@ -527,7 +520,7 @@ static ctl_table ipq_table[] = { #ifdef CONFIG_PROC_FS static int ip_queue_show(struct seq_file *m, void *v) { - read_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); seq_printf(m, "Peer PID : %d\n" @@ -545,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v) queue_dropped, queue_user_dropped); - read_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return 0; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4b6c5ca610f..b38c11810c6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -928,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -1352,7 +1352,7 @@ do_add_counters(struct net *net, const void __user *user, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f91c94b9a79..64d0875f519 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -53,12 +53,13 @@ struct clusterip_config { #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ + struct rcu_head rcu; }; static LIST_HEAD(clusterip_configs); /* clusterip_lock protects the clusterip_configs list */ -static DEFINE_RWLOCK(clusterip_lock); +static DEFINE_SPINLOCK(clusterip_lock); #ifdef CONFIG_PROC_FS static const struct file_operations clusterip_proc_fops; @@ -71,11 +72,17 @@ clusterip_config_get(struct clusterip_config *c) atomic_inc(&c->refcount); } + +static void clusterip_config_rcu_free(struct rcu_head *head) +{ + kfree(container_of(head, struct clusterip_config, rcu)); +} + static inline void clusterip_config_put(struct clusterip_config *c) { if (atomic_dec_and_test(&c->refcount)) - kfree(c); + call_rcu_bh(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. If last @@ -84,10 +91,11 @@ clusterip_config_put(struct clusterip_config *c) static inline void clusterip_config_entry_put(struct clusterip_config *c) { - write_lock_bh(&clusterip_lock); - if (atomic_dec_and_test(&c->entries)) { - list_del(&c->list); - write_unlock_bh(&clusterip_lock); + local_bh_disable(); + if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { + list_del_rcu(&c->list); + spin_unlock(&clusterip_lock); + local_bh_enable(); dev_mc_del(c->dev, c->clustermac); dev_put(c->dev); @@ -100,7 +108,7 @@ clusterip_config_entry_put(struct clusterip_config *c) #endif return; } - write_unlock_bh(&clusterip_lock); + local_bh_enable(); } static struct clusterip_config * @@ -108,7 +116,7 @@ __clusterip_config_find(__be32 clusterip) { struct clusterip_config *c; - list_for_each_entry(c, &clusterip_configs, list) { + list_for_each_entry_rcu(c, &clusterip_configs, list) { if (c->clusterip == clusterip) return c; } @@ -121,16 +129,15 @@ clusterip_config_find_get(__be32 clusterip, int entry) { struct clusterip_config *c; - read_lock_bh(&clusterip_lock); + rcu_read_lock_bh(); c = __clusterip_config_find(clusterip); - if (!c) { - read_unlock_bh(&clusterip_lock); - return NULL; + if (c) { + if (unlikely(!atomic_inc_not_zero(&c->refcount))) + c = NULL; + else if (entry) + atomic_inc(&c->entries); } - atomic_inc(&c->refcount); - if (entry) - atomic_inc(&c->entries); - read_unlock_bh(&clusterip_lock); + rcu_read_unlock_bh(); return c; } @@ -181,9 +188,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, } #endif - write_lock_bh(&clusterip_lock); - list_add(&c->list, &clusterip_configs); - write_unlock_bh(&clusterip_lock); + spin_lock_bh(&clusterip_lock); + list_add_rcu(&c->list, &clusterip_configs); + spin_unlock_bh(&clusterip_lock); return c; } @@ -733,6 +740,9 @@ static void __exit clusterip_tg_exit(void) #endif nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); + + /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ + rcu_barrier_bh(); } module_init(clusterip_tg_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index cb763ae9ed9..eab8de32f20 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -66,6 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && inet->nodefrag) + return NF_ACCEPT; + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 4f8bddb760c..c7719b283ad 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -742,7 +742,7 @@ static int __init nf_nat_init(void) spin_unlock_bh(&nf_nat_lock); /* Initialize fake conntrack so that NAT will skip it */ - nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index beb25819c9c..6723c682250 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum, return NF_ACCEPT; /* Don't try to NAT if this packet is not conntracked */ - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return NF_ACCEPT; nat = nfct_nat(ct); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3dc9914c1dc..e320ca6b3ef 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), + SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c7a1639388..009a7b2aa1e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -314,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) } static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, - struct rtable *rt, + struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -323,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, struct sk_buff *skb; unsigned int iphlen; int err; + struct rtable *rt = *rtp; - if (length > rt->u.dst.dev->mtu) { + if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, - rt->u.dst.dev->mtu); + rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, &rt->dst); + *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); @@ -373,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -382,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb_transport_header(skb))->type); err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, - rt->u.dst.dev, dst_output); + rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -576,7 +578,7 @@ back_from_confirm: if (inet->hdrincl) err = raw_send_hdrinc(sk, msg->msg_iov, len, - rt, msg->msg_flags); + &rt, msg->msg_flags); else { if (!ipc.addr) @@ -604,7 +606,7 @@ out: return len; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 560acc677ce..03430de4616 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -253,8 +253,7 @@ static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) \ - (__raw_get_cpu_var(rt_cache_stat).field++) +#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, int genid) @@ -287,10 +286,10 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { - if (dev_net(r->u.dst.dev) == seq_file_net(seq) && + if (dev_net(r->dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; - r = rcu_dereference_bh(r->u.dst.rt_next); + r = rcu_dereference_bh(r->dst.rt_next); } rcu_read_unlock_bh(); } @@ -302,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->u.dst.rt_next; + r = r->dst.rt_next; while (!r) { rcu_read_unlock_bh(); do { @@ -320,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->u.dst.dev) != seq_file_net(seq)) + if (dev_net(r->dst.dev) != seq_file_net(seq)) continue; if (r->rt_genid == st->genid) break; @@ -378,19 +377,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->u.dst.dev ? r->u.dst.dev->name : "*", + r->dst.dev ? r->dst.dev->name : "*", (__force u32)r->rt_dst, (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (__force u32)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + (dst_metric(&r->dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->dst, RTAX_WINDOW), + (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + + dst_metric(&r->dst, RTAX_RTTVAR)), r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == + r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, + r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, r->rt_spec_dst, &len); @@ -609,13 +608,13 @@ static inline int ip_rt_proc_init(void) static inline void rt_free(struct rtable *rt) { - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline void rt_drop(struct rtable *rt) { ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline int rt_fast_clean(struct rtable *rth) @@ -623,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth) /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.dst.rt_next; + rth->fl.iif && rth->dst.rt_next; } static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; + rth->dst.expires; } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -637,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t unsigned long age; int ret = 0; - if (atomic_read(&rth->u.dst.__refcnt)) + if (atomic_read(&rth->dst.__refcnt)) goto out; ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) + if (rth->dst.expires && + time_after_eq(jiffies, rth->dst.expires)) goto out; - age = jiffies - rth->u.dst.lastuse; + age = jiffies - rth->dst.lastuse; ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) @@ -661,7 +660,7 @@ out: return ret; */ static inline u32 rt_score(struct rtable *rt) { - u32 score = jiffies - rt->u.dst.lastuse; + u32 score = jiffies - rt->dst.lastuse; score = ~score & ~(3<<30); @@ -701,12 +700,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); + return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); } static inline int rt_is_expired(struct rtable *rth) { - return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); + return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } /* @@ -735,7 +734,7 @@ static void rt_do_flush(int process_context) rth = rt_hash_table[i].chain; /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->u.dst.rt_next) + for (tail = rth; tail; tail = tail->dst.rt_next) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -744,9 +743,9 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; for (p = *prev; p; p = next) { - next = p->u.dst.rt_next; + next = p->dst.rt_next; if (!rt_is_expired(p)) { - prev = &p->u.dst.rt_next; + prev = &p->dst.rt_next; } else { *prev = next; rt_free(p); @@ -761,7 +760,7 @@ static void rt_do_flush(int process_context) spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->u.dst.rt_next; + next = rth->dst.rt_next; rt_free(rth); } } @@ -792,7 +791,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->u.dst.rt_next; + aux = aux->dst.rt_next; } return ONE; } @@ -832,18 +831,18 @@ static void rt_check_expire(void) length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - prefetch(rth->u.dst.rt_next); + prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } - if (rth->u.dst.expires) { + if (rth->dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->dst.expires)) { nofree: tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; /* * We only count entries on * a chain with equal hash inputs once @@ -859,7 +858,7 @@ nofree: goto nofree; /* Cleanup aged off entries. */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); @@ -1000,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops) if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; continue; } - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); goal--; } @@ -1069,7 +1068,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->u.dst.rt_next; + rth = rth->dst.rt_next; } return length >> FRACT_BITS; } @@ -1091,7 +1090,7 @@ restart: candp = NULL; now = jiffies; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(dev_net(rt->dst.dev))) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The @@ -1109,7 +1108,7 @@ restart: */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { if (net_ratelimit()) printk(KERN_WARNING @@ -1128,19 +1127,19 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* Put it first */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; /* * Since lookup is lockfree, the deletion * must be visible to another weakly ordered CPU before * the insertion at the start of the hash chain. */ - rcu_assign_pointer(rth->u.dst.rt_next, + rcu_assign_pointer(rth->dst.rt_next, rt_hash_table[hash].chain); /* * Since lookup is lockfree, the update writes @@ -1148,18 +1147,18 @@ restart: */ rcu_assign_pointer(rt_hash_table[hash].chain, rth); - dst_use(&rth->u.dst, now); + dst_use(&rth->dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); if (rp) *rp = rth; else - skb_dst_set(skb, &rth->u.dst); + skb_dst_set(skb, &rth->dst); return 0; } - if (!atomic_read(&rth->u.dst.__refcnt)) { + if (!atomic_read(&rth->dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { @@ -1171,7 +1170,7 @@ restart: chain_length++; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; } if (cand) { @@ -1182,17 +1181,17 @@ restart: * only 2 entries per bucket. We will see. */ if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.dst.rt_next; + *candp = cand->dst.rt_next; rt_free(cand); } } else { if (chain_length > rt_chain_length_max && slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", - rt->u.dst.dev->name, num); + rt->dst.dev->name, num); } rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1207,7 +1206,7 @@ restart: route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1238,14 +1237,14 @@ restart: } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; + rt->dst.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 - if (rt->u.dst.rt_next) { + if (rt->dst.rt_next) { struct rtable *trt; printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst); - for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) + for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) printk(" . %pI4", &trt->rt_dst); printk("\n"); } @@ -1263,7 +1262,7 @@ skip_hashing: if (rp) *rp = rt; else - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); return 0; } @@ -1335,20 +1334,21 @@ static void rt_del(unsigned hash, struct rtable *rt) ip_rt_put(rt); while ((aux = *rthp) != NULL) { if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->u.dst.rt_next; + *rthp = aux->dst.rt_next; rt_free(aux); continue; } - rthp = &aux->u.dst.rt_next; + rthp = &aux->dst.rt_next; } spin_unlock_bh(rt_hash_lock_addr(hash)); } +/* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int i, k; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rth, **rthp; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; @@ -1384,7 +1384,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rthp=&rt_hash_table[hash].chain; - rcu_read_lock(); while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; @@ -1393,44 +1392,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || rt_is_expired(rth) || - !net_eq(dev_net(rth->u.dst.dev), net)) { - rthp = &rth->u.dst.rt_next; + !net_eq(dev_net(rth->dst.dev), net)) { + rthp = &rth->dst.rt_next; continue; } if (rth->rt_dst != daddr || rth->rt_src != saddr || - rth->u.dst.error || + rth->dst.error || rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) + rth->dst.dev != dev) break; - dst_hold(&rth->u.dst); - rcu_read_unlock(); + dst_hold(&rth->dst); rt = dst_alloc(&ipv4_dst_ops); if (rt == NULL) { ip_rt_put(rth); - in_dev_put(in_dev); return; } /* Copy all the information. */ *rt = *rth; - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.__use = 1; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.child = NULL; + if (rt->dst.dev) + dev_hold(rt->dst.dev); if (rt->idev) in_dev_hold(rt->idev); - rt->u.dst.obsolete = -1; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; + rt->dst.obsolete = -1; + rt->dst.lastuse = jiffies; + rt->dst.path = &rt->dst; + rt->dst.neighbour = NULL; + rt->dst.hh = NULL; #ifdef CONFIG_XFRM - rt->u.dst.xfrm = NULL; + rt->dst.xfrm = NULL; #endif rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; @@ -1439,23 +1436,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->rt_gateway = new_gw; /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); + dst_confirm(&rth->dst); if (rt->peer) atomic_inc(&rt->peer->refcnt); - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); ip_rt_put(rth); rt_drop(rt); goto do_next; } - netevent.old = &rth->u.dst; - netevent.new = &rt->u.dst; + netevent.old = &rth->dst; + netevent.new = &rt->dst; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); @@ -1464,12 +1461,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ip_rt_put(rt); goto do_next; } - rcu_read_unlock(); do_next: ; } } - in_dev_put(in_dev); return; reject_redirect: @@ -1480,7 +1475,7 @@ reject_redirect: &old_gw, dev->name, &new_gw, &saddr, &daddr); #endif - in_dev_put(in_dev); + ; } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1493,8 +1488,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->u.dst.expires && - time_after_eq(jiffies, rt->u.dst.expires))) { + (rt->dst.expires && + time_after_eq(jiffies, rt->dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1532,7 +1527,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) int log_martians; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; @@ -1543,30 +1538,30 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) + rt->dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; + if (rt->dst.rate_tokens >= ip_rt_redirect_number) { + rt->dst.rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->u.dst.rate_tokens == 0 || + if (rt->dst.rate_tokens == 0 || time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { + (rt->dst.rate_last + + (ip_rt_redirect_load << rt->dst.rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + rt->dst.rate_last = jiffies; + ++rt->dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->u.dst.rate_tokens == ip_rt_redirect_number && + rt->dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1581,7 +1576,7 @@ static int ip_error(struct sk_buff *skb) unsigned long now; int code; - switch (rt->u.dst.error) { + switch (rt->dst.error) { case EINVAL: default: goto out; @@ -1590,7 +1585,7 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IP_INC_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INNOROUTES); break; case EACCES: @@ -1599,12 +1594,12 @@ static int ip_error(struct sk_buff *skb) } now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; + rt->dst.rate_tokens += now - rt->dst.rate_last; + if (rt->dst.rate_tokens > ip_rt_error_burst) + rt->dst.rate_tokens = ip_rt_error_burst; + rt->dst.rate_last = now; + if (rt->dst.rate_tokens >= ip_rt_error_cost) { + rt->dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); } @@ -1649,7 +1644,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { unsigned short mtu = new_mtu; if (rth->fl.fl4_dst != daddr || @@ -1658,8 +1653,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->rt_src != iph->saddr || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - dst_metric_locked(&rth->u.dst, RTAX_MTU) || - !net_eq(dev_net(rth->u.dst.dev), net) || + dst_metric_locked(&rth->dst, RTAX_MTU) || + !net_eq(dev_net(rth->dst.dev), net) || rt_is_expired(rth)) continue; @@ -1667,22 +1662,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= dst_mtu(&rth->u.dst) && + old_mtu >= dst_mtu(&rth->dst) && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= dst_mtu(&rth->u.dst)) { - if (mtu < dst_mtu(&rth->u.dst)) { - dst_confirm(&rth->u.dst); + if (mtu <= dst_mtu(&rth->dst)) { + if (mtu < dst_mtu(&rth->dst)) { + dst_confirm(&rth->dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= + rth->dst.metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, + rth->dst.metrics[RTAX_MTU-1] = mtu; + dst_set_expires(&rth->dst, ip_rt_mtu_expires); } est_mtu = mtu; @@ -1755,7 +1750,7 @@ static void ipv4_link_failure(struct sk_buff *skb) rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct sk_buff *skb) @@ -1783,11 +1778,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { + else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } @@ -1795,10 +1790,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) #ifdef CONFIG_NET_CLS_ROUTE static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif @@ -1810,30 +1805,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); + memcpy(rt->dst.metrics, fi->fib_metrics, + sizeof(rt->dst.metrics)); if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && + rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; + if (dst_metric_locked(&rt->dst, RTAX_MTU) && rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + rt->dst.dev->mtu > 576) + rt->dst.metrics[RTAX_MTU-1] = 576; } #ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; + + if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) + rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (dst_mtu(&rt->dst) > IP_MAX_MTU) + rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) + rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40, ip_rt_min_advmss); - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; + if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) + rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1844,14 +1839,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) rt->rt_type = res->type; } +/* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned hash; + unsigned int hash; struct rtable *rth; __be32 spec_dst; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; + int err; /* Primary sanity checks. */ @@ -1866,21 +1863,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!ipv4_is_local_multicast(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag, 0) < 0) - goto e_inval; - + } else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, + &itag, 0); + if (err < 0) + goto e_err; + } rth = dst_alloc(&ipv4_dst_ops); if (!rth) goto e_nobufs; - rth->u.dst.output = ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output = ip_rt_bug; + rth->dst.obsolete = -1; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -1888,13 +1887,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = init_net.loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; @@ -1902,27 +1901,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); - in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); e_nobufs: - in_dev_put(in_dev); return -ENOBUFS; - e_inval: - in_dev_put(in_dev); return -EINVAL; +e_err: + return err; } @@ -1956,22 +1953,22 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +/* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { - struct rtable *rth; int err; struct in_device *out_dev; - unsigned flags = 0; + unsigned int flags = 0; __be32 spec_dst; u32 itag; /* get a working reference to the output device */ - out_dev = in_dev_get(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input" \ @@ -1986,7 +1983,6 @@ static int __mkroute_input(struct sk_buff *skb, ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - err = -EINVAL; goto cleanup; } @@ -2020,12 +2016,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2035,16 +2031,16 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = daddr; rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = (out_dev)->dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; - rth->u.dst.obsolete = -1; - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); + rth->dst.obsolete = -1; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rt_set_nexthop(rth, res, itag); @@ -2053,8 +2049,6 @@ static int __mkroute_input(struct sk_buff *skb, *result = rth; err = 0; cleanup: - /* release the working reference to the output device */ - in_dev_put(out_dev); return err; } @@ -2080,7 +2074,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, - rt_genid(dev_net(rth->u.dst.dev))); + rt_genid(dev_net(rth->dst.dev))); return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } @@ -2098,7 +2092,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = saddr, @@ -2158,13 +2152,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, + err = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, dev, &spec_dst, &itag, skb->mark); - if (result < 0) - goto martian_source; - if (result) + if (err < 0) + goto martian_source_keep_err; + if (err) flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; @@ -2177,7 +2170,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); done: - in_dev_put(in_dev); if (free_res) fib_res_put(&res); out: return err; @@ -2192,7 +2184,7 @@ brd_input: err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag, skb->mark); if (err < 0) - goto martian_source; + goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; } @@ -2205,14 +2197,14 @@ local_input: if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output= ip_rt_bug; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2220,20 +2212,20 @@ local_input: rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = net->loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; @@ -2273,8 +2265,10 @@ e_nobufs: goto done; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto done; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2284,32 +2278,34 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned hash; int iif = dev->ifindex; struct net *net; + int res; net = dev_net(dev); + rcu_read_lock(); + if (!rt_caching(net)) goto skip_cache; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { - dst_use_noref(&rth->u.dst, jiffies); - skb_dst_set_noref(skb, &rth->u.dst); + dst_use_noref(&rth->dst, jiffies); + skb_dst_set_noref(skb, &rth->dst); } else { - dst_use(&rth->u.dst, jiffies); - skb_dst_set(skb, &rth->u.dst); + dst_use(&rth->dst, jiffies); + skb_dst_set(skb, &rth->dst); } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2317,7 +2313,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, } RT_CACHE_STAT_INC(in_hlist_search); } - rcu_read_unlock(); skip_cache: /* Multicast recognition logic is moved from route cache to here. @@ -2332,12 +2327,11 @@ skip_cache: route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { - struct in_device *in_dev; + struct in_device *in_dev = __in_dev_get_rcu(dev); - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { + if (in_dev) { int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2345,15 +2339,18 @@ skip_cache: IN_DEV_MFORWARD(in_dev)) #endif ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); rcu_read_unlock(); - return ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + return res; } } rcu_read_unlock(); return -EINVAL; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; } EXPORT_SYMBOL(ip_route_input_common); @@ -2415,12 +2412,12 @@ static int __mkroute_output(struct rtable **result, goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = oldflp->fl4_dst; rth->fl.fl4_tos = tos; @@ -2432,35 +2429,35 @@ static int __mkroute_output(struct rtable **result, rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ - rth->u.dst.dev = dev_out; + rth->dst.dev = dev_out; dev_hold(dev_out); rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; - rth->u.dst.output=ip_output; - rth->u.dst.obsolete = -1; + rth->dst.output=ip_output; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; + rth->dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif @@ -2715,7 +2712,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->u.dst.rt_next)) { + rth = rcu_dereference_bh(rth->dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && @@ -2723,9 +2720,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; @@ -2762,15 +2759,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_alloc(&ipv4_dst_blackhole_ops); if (rt) { - struct dst_entry *new = &rt->u.dst; + struct dst_entry *new = &rt->dst; atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); @@ -2794,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_free(new); } - dst_release(&(*rp)->u.dst); + dst_release(&(*rp)->dst); *rp = rt; return (rt ? 0 : -ENOMEM); } @@ -2864,11 +2861,11 @@ static int rt_fill_info(struct net *net, r->rtm_src_len = 32; NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); #ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); + if (rt->dst.tclassid) + NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt->fl.iif) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); @@ -2878,12 +2875,13 @@ static int rt_fill_info(struct net *net, if (rt->rt_dst != rt->rt_gateway) NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) goto nla_put_failure; - error = rt->u.dst.error; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; + error = rt->dst.error; + expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; if (rt->peer) { + inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; @@ -2914,7 +2912,7 @@ static int rt_fill_info(struct net *net, NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); } - if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, + if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, expires, error) < 0) goto nla_put_failure; @@ -2979,8 +2977,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void local_bh_enable(); rt = skb_rtable(skb); - if (err == 0 && rt->u.dst.error) - err = -rt->u.dst.error; + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { struct flowi fl = { .nl_u = { @@ -2998,7 +2996,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -3034,12 +3032,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; rcu_read_lock_bh(); for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) + rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { + if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) continue; - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9f6b22206c5..51b5662545d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -138,23 +138,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, } /* - * This table has to be sorted and terminated with (__u16)-1. - * XXX generate a better table. - * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + * MSS Values are taken from the 2009 paper + * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: + * - values 1440 to 1460 accounted for 80% of observed mss values + * - values outside the 536-1460 range are rare (<0.2%). + * + * Table must be sorted. */ static __u16 const msstab[] = { - 64 - 1, - 256 - 1, - 512 - 1, - 536 - 1, - 1024 - 1, - 1440 - 1, - 1460 - 1, - 4312 - 1, - (__u16)-1 + 64, + 512, + 536, + 1024, + 1440, + 1460, + 4312, + 8960, }; -/* The number doesn't include the -1 terminator */ -#define NUM_MSS (ARRAY_SIZE(msstab) - 1) /* * Generate a syncookie. mssp points to the mss, which is returned @@ -169,10 +169,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) tcp_synq_overflow(sk); - /* XXX sort msstab[] by probability? Binary search? */ - for (mssind = 0; mss > msstab[mssind + 1]; mssind++) - ; - *mssp = msstab[mssind] + 1; + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + *mssp = msstab[mssind]; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); @@ -202,7 +202,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) jiffies / (HZ * 60), COUNTER_TRIES); - return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, @@ -230,23 +230,36 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * The lowest 4 bits are for snd_wscale * The next 4 lsb are for rcv_wscale * The next lsb is for sack_ok + * + * return false if we decode an option that should not be. */ -void cookie_check_timestamp(struct tcp_options_received *tcp_opt) +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt) { /* echoed timestamp, 9 lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr & TSMASK; + if (!tcp_opt->saw_tstamp) { + tcp_clear_options(tcp_opt); + return true; + } + + if (!sysctl_tcp_timestamps) + return false; + tcp_opt->snd_wscale = options & 0xf; options >>= 4; tcp_opt->rcv_wscale = options & 0xf; tcp_opt->sack_ok = (options >> 4) & 0x1; - if (tcp_opt->sack_ok) - tcp_sack_reset(tcp_opt); + if (tcp_opt->sack_ok && !sysctl_tcp_sack) + return false; - if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) + if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) { tcp_opt->wscale_ok = 1; + return sysctl_tcp_window_scaling != 0; + } + return true; } EXPORT_SYMBOL(cookie_check_timestamp); @@ -266,7 +279,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct rtable *rt; __u8 rcv_wscale; - if (!sysctl_tcp_syncookies || !th->ack) + if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk) || @@ -281,8 +294,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, &hash_location, 0); - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); + if (!cookie_check_timestamp(&tcp_opt)) + goto out; ret = NULL; req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ @@ -354,15 +367,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(&rt->u.dst, RTAX_INITRWND)); + dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ret = get_cookie_sock(sk, skb, req, &rt->u.dst); + ret = get_cookie_sock(sk, skb, req, &rt->dst); out: return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6596b4feedd..779d40c3b96 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -511,7 +511,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } @@ -527,7 +527,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; - tcb->flags = TCPCB_FLAG_ACK; + tcb->flags = TCPHDR_ACK; tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); @@ -815,7 +815,7 @@ new_segment: skb_shinfo(skb)->gso_segs = 0; if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; copied += copy; poffset += copy; @@ -1061,7 +1061,7 @@ new_segment: } if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -2999,6 +2999,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const unsigned head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; sg_init_table(&sg, 1); @@ -3013,6 +3014,10 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, return 1; } + skb_walk_frags(skb, frag_iter) + if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) + return 1; + return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 548d575e6cc..04334661fa2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3286,7 +3286,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { + if (!(scb->flags & TCPHDR_SYN)) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fe193e53af4..2e41e6f9296 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -204,10 +204,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * TIME-WAIT * and initialize rx_opt.ts_recent from it, * when trying new connection. */ - if (peer != NULL && - (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } } } @@ -237,7 +239,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, @@ -793,19 +795,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -#ifdef CONFIG_SYN_COOKIES -static void syn_flood_warning(struct sk_buff *skb) +static void syn_flood_warning(const struct sk_buff *skb) { - static unsigned long warntime; + const char *msg; - if (time_after(jiffies, (warntime + HZ * 60))) { - warntime = jiffies; - printk(KERN_INFO - "possible SYN flooding on port %d. Sending cookies.\n", - ntohs(tcp_hdr(skb)->dest)); - } -} +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) + msg = "Sending cookies"; + else #endif + msg = "Dropping request"; + + pr_info("TCP: Possible SYN flooding on port %d. %s.\n", + ntohs(tcp_hdr(skb)->dest), msg); +} /* * Save and compile IPv4 options into the request_sock if needed. @@ -1243,6 +1246,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * evidently real one. */ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if (net_ratelimit()) + syn_flood_warning(skb); #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1328,7 +1333,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { #ifdef CONFIG_SYN_COOKIES - syn_flood_warning(skb); req->cookie_ts = tmp_opt.tstamp_ok; #endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); @@ -1349,6 +1353,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { + inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { @@ -1504,7 +1509,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) } #ifdef CONFIG_SYN_COOKIES - if (!th->rst && !th->syn && th->ack) + if (!th->syn) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); #endif return sk; @@ -1978,6 +1983,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; } +/* + * Get next listener socket follow cur. If cur is NULL, get first socket + * starting from bucket given in st->bucket; when st->bucket is zero the + * very first socket in the hash table is returned. + */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct inet_connection_sock *icsk; @@ -1988,14 +1998,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); if (!sk) { - st->bucket = 0; - ilb = &tcp_hashinfo.listening_hash[0]; + ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); sk = sk_nulls_head(&ilb->head); + st->offset = 0; goto get_sk; } ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; @@ -2010,6 +2021,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) } req = req->dl_next; } + st->offset = 0; if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) break; get_req: @@ -2045,6 +2057,7 @@ start_req: read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); + st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) { ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); @@ -2058,7 +2071,12 @@ out: static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { - void *rc = listening_get_next(seq, NULL); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + st->offset = 0; + rc = listening_get_next(seq, NULL); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2073,13 +2091,18 @@ static inline int empty_bucket(struct tcp_iter_state *st) hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); } +/* + * Get first established socket starting from bucket given in st->bucket. + * If st->bucket is zero, the very first socket in the hash is returned. + */ static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2124,6 +2147,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; @@ -2140,6 +2164,7 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ + st->offset = 0; while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; @@ -2167,7 +2192,11 @@ out: static void *established_get_idx(struct seq_file *seq, loff_t pos) { - void *rc = established_get_first(seq); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); @@ -2192,24 +2221,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) return rc; } +static void *tcp_seek_last_pos(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + int offset = st->offset; + int orig_num = st->num; + void *rc = NULL; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + if (st->bucket >= INET_LHTABLE_SIZE) + break; + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_next(seq, NULL); + while (offset-- && rc) + rc = listening_get_next(seq, rc); + if (rc) + break; + st->bucket = 0; + /* Fallthrough */ + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + st->state = TCP_SEQ_STATE_ESTABLISHED; + if (st->bucket > tcp_hashinfo.ehash_mask) + break; + rc = established_get_first(seq); + while (offset-- && rc) + rc = established_get_next(seq, rc); + } + + st->num = orig_num; + + return rc; +} + static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; + void *rc; + + if (*pos && *pos == st->last_pos) { + rc = tcp_seek_last_pos(seq); + if (rc) + goto out; + } + st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; - return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + st->bucket = 0; + st->offset = 0; + rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + st->last_pos = *pos; + return rc; } static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct tcp_iter_state *st = seq->private; void *rc = NULL; - struct tcp_iter_state *st; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } - st = seq->private; switch (st->state) { case TCP_SEQ_STATE_OPENREQ: @@ -2217,6 +2294,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + st->offset = 0; rc = established_get_first(seq); } break; @@ -2227,6 +2306,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } out: ++*pos; + st->last_pos = *pos; return rc; } @@ -2265,6 +2345,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; + s->last_pos = 0; return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b4ed957f201..51d316dbb05 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -294,9 +294,9 @@ static u16 tcp_select_window(struct sock *sk) /* Packet ECN state for a SYN-ACK */ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; } /* Packet ECN state for a SYN. */ @@ -306,7 +306,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn == 1) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } } @@ -361,7 +361,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb_shinfo(skb)->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; - if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } @@ -820,7 +820,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + if (unlikely(tcb->flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, @@ -843,7 +843,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + if (unlikely(tcb->flags & TCPHDR_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ @@ -866,7 +866,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); - if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) + if (likely((tcb->flags & TCPHDR_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG @@ -880,7 +880,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); - if (likely(tcb->flags & TCPCB_FLAG_ACK)) + if (likely(tcb->flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) @@ -1023,7 +1023,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; @@ -1328,8 +1328,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - tcp_skb_pcount(skb) == 1) + if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) return 1; in_flight = tcp_packets_in_flight(tp); @@ -1398,7 +1397,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) return 1; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) @@ -1487,7 +1486,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->flags = flags; /* This packet was never sent out yet, so no SACK bits. */ @@ -1518,7 +1517,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1644,7 +1643,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; - TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; nskb->ip_summed = skb->ip_summed; @@ -1669,7 +1668,7 @@ static int tcp_mtu_probe(struct sock *sk) sk_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & - ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -2020,7 +2019,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!sysctl_tcp_retrans_collapse) return; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) + if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) return; tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -2112,7 +2111,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * since it is cheap to do so and saves bytes on the network. */ if (skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { /* Reuse, even though it does some unnecessary work */ @@ -2301,7 +2300,7 @@ void tcp_send_fin(struct sock *sk) mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; } else { @@ -2318,7 +2317,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, - TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2343,7 +2342,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), - TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) @@ -2363,11 +2362,11 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2381,7 +2380,7 @@ int tcp_send_synack(struct sock *sk) skb = nskb; } - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -2460,7 +2459,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, * not even correctly set) */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, - TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + TCPHDR_SYN | TCPHDR_ACK); if (OPTION_COOKIE_EXTENSION & opts.options) { if (s_data_desired) { @@ -2592,7 +2591,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tp->snd_nxt = tp->write_seq; - tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); TCP_ECN_send_syn(sk, buff); /* Send it off. */ @@ -2698,7 +2697,7 @@ void tcp_send_ack(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; @@ -2732,7 +2731,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -2762,13 +2761,13 @@ int tcp_write_wakeup(struct sock *sk) if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; if (tcp_fragment(sk, skb, seg_size, mss)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eec4ff456e3..32e0bef60d0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) - sk_dst_set(sk, dst_clone(&rt->u.dst)); + sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) @@ -978,7 +978,7 @@ out: return err; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 1705476670e..349327092c9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, fl.fl4_src = saddr->a4; err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->u.dst; + dst = &rt->dst; if (err) dst = ERR_PTR(err); return dst; |