Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                      |   8
-rw-r--r--  net/ipv4/Makefile                                     |   1
-rw-r--r--  net/ipv4/af_inet.c                                    |  15
-rw-r--r--  net/ipv4/ah4.c                                        |  50
-rw-r--r--  net/ipv4/arp.c                                        |  42
-rw-r--r--  net/ipv4/cipso_ipv4.c                                 | 115
-rw-r--r--  net/ipv4/devinet.c                                    |  27
-rw-r--r--  net/ipv4/esp4.c                                       |  66
-rw-r--r--  net/ipv4/fib_frontend.c                               |  26
-rw-r--r--  net/ipv4/fib_hash.c                                   |  25
-rw-r--r--  net/ipv4/fib_rules.c                                  |  16
-rw-r--r--  net/ipv4/fib_semantics.c                              |   6
-rw-r--r--  net/ipv4/fib_trie.c                                   | 145
-rw-r--r--  net/ipv4/icmp.c                                       |  55
-rw-r--r--  net/ipv4/igmp.c                                       |  52
-rw-r--r--  net/ipv4/inet_connection_sock.c                       |  22
-rw-r--r--  net/ipv4/inet_diag.c                                  |  21
-rw-r--r--  net/ipv4/inet_hashtables.c                            |  13
-rw-r--r--  net/ipv4/inet_lro.c                                   | 600
-rw-r--r--  net/ipv4/inet_timewait_sock.c                         |   4
-rw-r--r--  net/ipv4/ip_forward.c                                 |   2
-rw-r--r--  net/ipv4/ip_fragment.c                                |   2
-rw-r--r--  net/ipv4/ip_gre.c                                     |  18
-rw-r--r--  net/ipv4/ip_input.c                                   |   3
-rw-r--r--  net/ipv4/ip_output.c                                  |   6
-rw-r--r--  net/ipv4/ip_sockglue.c                                |   8
-rw-r--r--  net/ipv4/ipcomp.c                                     |  34
-rw-r--r--  net/ipv4/ipconfig.c                                   |  25
-rw-r--r--  net/ipv4/ipip.c                                       |   6
-rw-r--r--  net/ipv4/ipmr.c                                       |  59
-rw-r--r--  net/ipv4/ipvs/ip_vs_app.c                             |   5
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c                            |   5
-rw-r--r--  net/ipv4/ipvs/ip_vs_core.c                            |   2
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c                             |  29
-rw-r--r--  net/ipv4/ipvs/ip_vs_lblcr.c                           |   5
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c                            |  10
-rw-r--r--  net/ipv4/netfilter/ip_queue.c                         |  38
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c                    |   5
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c                   |   3
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c                         |   3
-rw-r--r--  net/ipv4/netfilter/ipt_recent.c                       |  20
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c        |  30
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c |  53
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c          |  40
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                      |  22
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c                 |   4
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c                |   4
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c                 |   4
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c                 |   4
-rw-r--r--  net/ipv4/proc.c                                       | 135
-rw-r--r--  net/ipv4/raw.c                                        |  28
-rw-r--r--  net/ipv4/route.c                                      |  82
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                            |  75
-rw-r--r--  net/ipv4/tcp.c                                        |  14
-rw-r--r--  net/ipv4/tcp_bic.c                                    |   2
-rw-r--r--  net/ipv4/tcp_cubic.c                                  |   2
-rw-r--r--  net/ipv4/tcp_diag.c                                   |   8
-rw-r--r--  net/ipv4/tcp_input.c                                  | 897
-rw-r--r--  net/ipv4/tcp_ipv4.c                                   |   6
-rw-r--r--  net/ipv4/tcp_minisocks.c                              |   9
-rw-r--r--  net/ipv4/tcp_output.c                                 | 181
-rw-r--r--  net/ipv4/tcp_probe.c                                  |   7
-rw-r--r--  net/ipv4/tcp_timer.c                                  |   2
-rw-r--r--  net/ipv4/udp.c                                        |  89
-rw-r--r--  net/ipv4/udp_impl.h                                   |   2
-rw-r--r--  net/ipv4/udplite.c                                    |   3
-rw-r--r--  net/ipv4/xfrm4_input.c                                |   7
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c                            |  23
-rw-r--r--  net/ipv4/xfrm4_mode_transport.c                       |   9
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c                          |  15
-rw-r--r--  net/ipv4/xfrm4_output.c                               |  43
-rw-r--r--  net/ipv4/xfrm4_policy.c                               |   2
-rw-r--r--  net/ipv4/xfrm4_tunnel.c                               |   8
73 files changed, 2078 insertions, 1329 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index fb790977425..d894f616c3d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -394,6 +394,14 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
+config INET_LRO
+ tristate "Large Receive Offload (ipv4/tcp)"
+
+ ---help---
+ Support for Large Receive Offload (ipv4/tcp).
+
+ If unsure, say Y.
+
config INET_DIAG
tristate "INET: socket monitoring interface"
default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index fbf1674e0c2..a02c36d0a13 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
+obj-$(CONFIG_INET_LRO) += inet_lro.o
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e68103475cc..621b128897d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -241,7 +241,7 @@ EXPORT_SYMBOL(build_ehash_secret);
* Create an inet socket.
*/
-static int inet_create(struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
struct list_head *p;
@@ -253,6 +253,9 @@ static int inet_create(struct socket *sock, int protocol)
int try_loading_module = 0;
int err;
+ if (net != &init_net)
+ return -EAFNOSUPPORT;
+
if (sock->type != SOCK_RAW &&
sock->type != SOCK_DGRAM &&
!inet_ehash_secret)
@@ -320,7 +323,7 @@ lookup_protocol:
BUG_TRAP(answer_prot->slab != NULL);
err = -ENOBUFS;
- sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, 1);
if (sk == NULL)
goto out;
@@ -939,7 +942,7 @@ static struct inet_protosw inetsw_array[] =
}
};
-#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
void inet_register_protosw(struct inet_protosw *p)
{
@@ -1299,6 +1302,10 @@ static int __init init_ipv4_mibs(void)
sizeof(struct icmp_mib),
__alignof__(struct icmp_mib)) < 0)
goto err_icmp_mib;
+ if (snmp_mib_init((void **)icmpmsg_statistics,
+ sizeof(struct icmpmsg_mib),
+ __alignof__(struct icmpmsg_mib)) < 0)
+ goto err_icmpmsg_mib;
if (snmp_mib_init((void **)tcp_statistics,
sizeof(struct tcp_mib),
__alignof__(struct tcp_mib)) < 0)
@@ -1321,6 +1328,8 @@ err_udplite_mib:
err_udp_mib:
snmp_mib_free((void **)tcp_statistics);
err_tcp_mib:
+ snmp_mib_free((void **)icmpmsg_statistics);
+err_icmpmsg_mib:
snmp_mib_free((void **)icmp_statistics);
err_icmp_mib:
snmp_mib_free((void **)ip_statistics);
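
The INETSW_ARRAY_LEN hunk above replaces an open-coded sizeof division with the kernel's ARRAY_SIZE() macro. A standalone sketch of the idiom (simplified; the kernel version adds a compile-time check that the argument really is an array):

    #include <stdio.h>

    /* ARRAY_SIZE() yields the element count at compile time by dividing
     * the array's total size by the size of one element. Unlike the old
     * INETSW_ARRAY_LEN it never names the element type, so it cannot
     * drift out of sync with the array declaration. */
    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    static const int inet_protocols[] = { 6, 17, 1 };  /* TCP, UDP, ICMP */

    int main(void)
    {
            printf("%zu\n", ARRAY_SIZE(inet_protocols));  /* prints 3 */
            return 0;
    }
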
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 39f6211f149..4e8e3b079f5 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -5,6 +5,7 @@
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
+#include <linux/spinlock.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <asm/scatterlist.h>
@@ -65,6 +66,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
char buf[60];
} tmp_iph;
+ skb_push(skb, -skb_network_offset(skb));
top_iph = ip_hdr(skb);
iph = &tmp_iph.iph;
@@ -80,28 +82,30 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
goto error;
}
- ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
- ah->nexthdr = top_iph->protocol;
+ ah = ip_auth_hdr(skb);
+ ah->nexthdr = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_AH;
top_iph->tos = 0;
top_iph->tot_len = htons(skb->len);
top_iph->frag_off = 0;
top_iph->ttl = 0;
- top_iph->protocol = IPPROTO_AH;
top_iph->check = 0;
ahp = x->data;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len) >> 2) - 2;
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
- ah->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
+ ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq);
+
+ spin_lock_bh(&x->lock);
err = ah_mac_digest(ahp, skb, ah->auth_data);
+ memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
+ spin_unlock_bh(&x->lock);
+
if (err)
goto error;
- memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
top_iph->ttl = iph->ttl;
@@ -111,8 +115,6 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
}
- ip_send_check(top_iph);
-
err = 0;
error:
@@ -123,21 +125,23 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ah_hlen;
int ihl;
+ int nexthdr;
int err = -EINVAL;
struct iphdr *iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
char work_buf[60];
- if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+ if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
- ah = (struct ip_auth_hdr*)skb->data;
+ ah = (struct ip_auth_hdr *)skb->data;
ahp = x->data;
+ nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len))
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
goto out;
if (!pskb_may_pull(skb, ah_hlen))
@@ -151,7 +155,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb->ip_summed = CHECKSUM_NONE;
- ah = (struct ip_auth_hdr*)skb->data;
+ ah = (struct ip_auth_hdr *)skb->data;
iph = ip_hdr(skb);
ihl = skb->data - skb_network_header(skb);
@@ -180,13 +184,12 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
}
}
- ((struct iphdr*)work_buf)->protocol = ah->nexthdr;
skb->network_header += ah_hlen;
memcpy(skb_network_header(skb), work_buf, ihl);
skb->transport_header = skb->network_header;
__skb_pull(skb, ah_hlen + ihl);
- return 0;
+ return nexthdr;
out:
return err;
@@ -219,10 +222,6 @@ static int ah_init_state(struct xfrm_state *x)
if (!x->aalg)
goto error;
- /* null auth can use a zero length key */
- if (x->aalg->alg_key_len > 512)
- goto error;
-
if (x->encap)
goto error;
@@ -230,14 +229,13 @@ static int ah_init_state(struct xfrm_state *x)
if (ahp == NULL)
return -ENOMEM;
- ahp->key = x->aalg->alg_key;
- ahp->key_len = (x->aalg->alg_key_len+7)/8;
tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
goto error;
ahp->tfm = tfm;
- if (crypto_hash_setkey(tfm, ahp->key, ahp->key_len))
+ if (crypto_hash_setkey(tfm, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
goto error;
/*
@@ -266,7 +264,8 @@ static int ah_init_state(struct xfrm_state *x)
if (!ahp->work_icv)
goto error;
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
@@ -302,6 +301,7 @@ static struct xfrm_type ah_type =
.description = "AH4",
.owner = THIS_MODULE,
.proto = IPPROTO_AH,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = ah_init_state,
.destructor = ah_destroy,
.input = ah_input,
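
Two patterns in this ah4.c conversion recur in esp4.c below: the output path now takes its sequence number from XFRM_SKB_CB(skb)->seq, generated centrally once the type sets XFRM_TYPE_REPLAY_PROT, and the per-SA ICV scratch buffer is serialized under x->lock. A hedged sketch of the locking half, with hypothetical types:

    /* The per-SA scratch buffer (work_icv) is shared by every packet on
     * the security association, so computing the digest into it and
     * copying it out must form one critical section. BH-safe because the
     * same state is also touched from softirq context. Hypothetical
     * names, mirroring the ah_output() hunk above. */
    static int demo_output_icv(struct demo_state *x, struct demo_pkt *pkt)
    {
            int err;

            spin_lock_bh(&x->lock);
            err = demo_mac_digest(x, pkt, x->work_icv); /* fills scratch */
            memcpy(pkt->auth_data, x->work_icv, x->icv_trunc_len);
            spin_unlock_bh(&x->lock);

            return err;
    }
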
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 9ab9d534fba..36d6798947b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -103,6 +103,7 @@
#include <linux/sysctl.h>
#endif
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
@@ -252,7 +253,7 @@ static int arp_constructor(struct neighbour *neigh)
neigh->parms = neigh_parms_clone(parms);
rcu_read_unlock();
- if (dev->hard_header == NULL) {
+ if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
neigh->output = neigh->ops->queue_xmit;
@@ -309,10 +310,12 @@ static int arp_constructor(struct neighbour *neigh)
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
- if (dev->hard_header_cache)
+
+ if (dev->header_ops->cache)
neigh->ops = &arp_hh_ops;
else
neigh->ops = &arp_generic_ops;
+
if (neigh->nud_state&NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
@@ -590,8 +593,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
/*
* Fill the device header for the ARP frame
*/
- if (dev->hard_header &&
- dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
goto out;
/*
@@ -931,6 +933,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
{
struct arphdr *arp;
+ if (dev->nd_net != &init_net)
+ goto freeskb;
+
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
(2 * dev->addr_len) +
@@ -977,7 +982,7 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
if (mask && mask != htonl(0xFFFFFFFF))
return -EINVAL;
if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
+ dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data);
if (!dev)
return -ENODEV;
}
@@ -1165,7 +1170,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1201,6 +1206,9 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo
{
struct net_device *dev = ptr;
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&arp_tbl, dev);
@@ -1370,24 +1378,8 @@ static const struct seq_operations arp_seq_ops = {
static int arp_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &arp_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
}
static const struct file_operations arp_seq_fops = {
@@ -1400,7 +1392,7 @@ static const struct file_operations arp_seq_fops = {
static int __init arp_proc_init(void)
{
- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
+ if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops))
return -ENOMEM;
return 0;
}
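
The arp_seq_open() rewrite, repeated almost verbatim in fib_hash.c, fib_trie.c, and igmp.c below, replaces a hand-rolled allocate/seq_open/error-unwind sequence with seq_open_private(), which kzalloc()s an iterator state of the given size and stores it in seq->private. A sketch of a converted open handler (demo names hypothetical):

    #include <linux/seq_file.h>

    struct demo_iter_state { loff_t pos; };          /* per-open iterator */

    extern const struct seq_operations demo_seq_ops; /* start/next/stop/show */

    static int demo_seq_open(struct inode *inode, struct file *file)
    {
            /* One call now covers allocation, zeroing, seq_open(), and
             * the error unwind the old code spelled out by hand. */
            return seq_open_private(file, &demo_seq_ops,
                                    sizeof(struct demo_iter_state));
    }
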
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index ab56a052ce3..805a78e6ed5 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1831,68 +1831,75 @@ socket_setattr_failure:
}
/**
- * cipso_v4_sock_getattr - Get the security attributes from a sock
- * @sk: the sock
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
* @secattr: the security attributes
*
* Description:
- * Query @sk to see if there is a CIPSO option attached to the sock and if
- * there is return the CIPSO security attributes in @secattr. This function
- * requires that @sk be locked, or privately held, but it does not do any
- * locking itself. Returns zero on success and negative values on failure.
+ * Inspect @cipso and return the security attributes in @secattr. Returns zero
+ * on success and negative values on failure.
*
*/
-int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+static int cipso_v4_getattr(const unsigned char *cipso,
+ struct netlbl_lsm_secattr *secattr)
{
int ret_val = -ENOMSG;
- struct inet_sock *sk_inet;
- unsigned char *cipso_ptr;
u32 doi;
struct cipso_v4_doi *doi_def;
- sk_inet = inet_sk(sk);
- if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0)
- return -ENOMSG;
- cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso -
- sizeof(struct iphdr);
- ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr);
- if (ret_val == 0)
- return ret_val;
+ if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+ return 0;
- doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2]));
+ doi = ntohl(get_unaligned((__be32 *)&cipso[2]));
rcu_read_lock();
doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL) {
- rcu_read_unlock();
- return -ENOMSG;
- }
-
+ if (doi_def == NULL)
+ goto getattr_return;
/* XXX - This code assumes only one tag per CIPSO option which isn't
* really a good assumption to make but since we only support the MAC
* tags right now it is a safe assumption. */
- switch (cipso_ptr[6]) {
+ switch (cipso[6]) {
case CIPSO_V4_TAG_RBITMAP:
- ret_val = cipso_v4_parsetag_rbm(doi_def,
- &cipso_ptr[6],
- secattr);
+ ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr);
break;
case CIPSO_V4_TAG_ENUM:
- ret_val = cipso_v4_parsetag_enum(doi_def,
- &cipso_ptr[6],
- secattr);
+ ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr);
break;
case CIPSO_V4_TAG_RANGE:
- ret_val = cipso_v4_parsetag_rng(doi_def,
- &cipso_ptr[6],
- secattr);
+ ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr);
break;
}
- rcu_read_unlock();
+getattr_return:
+ rcu_read_unlock();
return ret_val;
}
/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ struct ip_options *opt;
+
+ opt = inet_sk(sk)->opt;
+ if (opt == NULL || opt->cipso == 0)
+ return -ENOMSG;
+
+ return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr),
+ secattr);
+}
+
+/**
* cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
* @skb: the packet
* @secattr: the security attributes
@@ -1905,45 +1912,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
struct netlbl_lsm_secattr *secattr)
{
- int ret_val = -ENOMSG;
- unsigned char *cipso_ptr;
- u32 doi;
- struct cipso_v4_doi *doi_def;
-
- cipso_ptr = CIPSO_V4_OPTPTR(skb);
- if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0)
- return 0;
-
- doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2]));
- rcu_read_lock();
- doi_def = cipso_v4_doi_search(doi);
- if (doi_def == NULL)
- goto skbuff_getattr_return;
-
- /* XXX - This code assumes only one tag per CIPSO option which isn't
- * really a good assumption to make but since we only support the MAC
- * tags right now it is a safe assumption. */
- switch (cipso_ptr[6]) {
- case CIPSO_V4_TAG_RBITMAP:
- ret_val = cipso_v4_parsetag_rbm(doi_def,
- &cipso_ptr[6],
- secattr);
- break;
- case CIPSO_V4_TAG_ENUM:
- ret_val = cipso_v4_parsetag_enum(doi_def,
- &cipso_ptr[6],
- secattr);
- break;
- case CIPSO_V4_TAG_RANGE:
- ret_val = cipso_v4_parsetag_rng(doi_def,
- &cipso_ptr[6],
- secattr);
- break;
- }
-
-skbuff_getattr_return:
- rcu_read_unlock();
- return ret_val;
+ return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr);
}
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5dbe5803b7d..55d199e4ae2 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -203,8 +203,6 @@ static void inetdev_destroy(struct in_device *in_dev)
ASSERT_RTNL();
dev = in_dev->dev;
- if (dev == &loopback_dev)
- return;
in_dev->dead = 1;
@@ -420,7 +418,7 @@ struct in_device *inetdev_by_index(int ifindex)
struct net_device *dev;
struct in_device *in_dev = NULL;
read_lock(&dev_base_lock);
- dev = __dev_get_by_index(ifindex);
+ dev = __dev_get_by_index(&init_net, ifindex);
if (dev)
in_dev = in_dev_get(dev);
read_unlock(&dev_base_lock);
@@ -506,7 +504,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh)
goto errout;
}
- dev = __dev_get_by_index(ifm->ifa_index);
+ dev = __dev_get_by_index(&init_net, ifm->ifa_index);
if (dev == NULL) {
err = -ENODEV;
goto errout;
@@ -628,7 +626,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
*colon = 0;
#ifdef CONFIG_KMOD
- dev_load(ifr.ifr_name);
+ dev_load(&init_net, ifr.ifr_name);
#endif
switch (cmd) {
@@ -669,7 +667,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
rtnl_lock();
ret = -ENODEV;
- if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL)
goto done;
if (colon)
@@ -909,7 +907,7 @@ no_in_dev:
*/
read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(dev) {
+ for_each_netdev(&init_net, dev) {
if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
continue;
@@ -988,7 +986,7 @@ __be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local,
read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(dev) {
+ for_each_netdev(&init_net, dev) {
if ((in_dev = __in_dev_get_rcu(dev))) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
@@ -1051,6 +1049,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
struct net_device *dev = ptr;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
ASSERT_RTNL();
if (!in_dev) {
@@ -1058,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
in_dev = inetdev_init(dev);
if (!in_dev)
return notifier_from_errno(-ENOMEM);
- if (dev == &loopback_dev) {
+ if (dev->flags & IFF_LOOPBACK) {
IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
}
@@ -1074,7 +1075,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
case NETDEV_UP:
if (dev->mtu < 68)
break;
- if (dev == &loopback_dev) {
+ if (dev->flags & IFF_LOOPBACK) {
struct in_ifaddr *ifa;
if ((ifa = inet_alloc_ifa()) != NULL) {
ifa->ifa_local =
@@ -1182,7 +1183,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
s_ip_idx = ip_idx = cb->args[1];
idx = 0;
- for_each_netdev(dev) {
+ for_each_netdev(&init_net, dev) {
if (idx < s_idx)
goto cont;
if (idx > s_idx)
@@ -1241,7 +1242,7 @@ static void devinet_copy_dflt_conf(int i)
struct net_device *dev;
read_lock(&dev_base_lock);
- for_each_netdev(dev) {
+ for_each_netdev(&init_net, dev) {
struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -1330,7 +1331,7 @@ void inet_forward_change(void)
IPV4_DEVCONF_DFLT(FORWARDING) = on;
read_lock(&dev_base_lock);
- for_each_netdev(dev) {
+ for_each_netdev(&init_net, dev) {
struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
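
Note the recurring shape of the devinet.c changes: every device lookup and iteration gains an explicit &init_net argument, and loopback is detected by flag rather than by comparing against the global loopback_dev, since with network namespaces each namespace gets its own loopback device. Minimal sketch of the flag test:

    /* Per-namespace loopback: test the interface flag rather than
     * pointer identity against one global struct net_device. */
    static bool demo_is_loopback(const struct net_device *dev)
    {
            return dev->flags & IFF_LOOPBACK;
    }
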
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 98767a4f118..6b1a31a74cf 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
#include <linux/random.h>
+#include <linux/spinlock.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
@@ -15,7 +16,6 @@
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
- struct iphdr *top_iph;
struct ip_esp_hdr *esph;
struct crypto_blkcipher *tfm;
struct blkcipher_desc desc;
@@ -27,9 +27,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int alen;
int nfrags;
- /* Strip IP+ESP header. */
- __skb_pull(skb, skb_transport_offset(skb));
- /* Now skb is pure payload to encrypt */
+ /* skb is pure payload to encrypt */
err = -ENOMEM;
@@ -59,12 +57,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
tail[clen - skb->len - 2] = (clen - skb->len) - 2;
pskb_put(skb, trailer, clen - skb->len);
- __skb_push(skb, skb->data - skb_network_header(skb));
- top_iph = ip_hdr(skb);
- esph = (struct ip_esp_hdr *)(skb_network_header(skb) +
- top_iph->ihl * 4);
- top_iph->tot_len = htons(skb->len + alen);
- *(skb_tail_pointer(trailer) - 1) = top_iph->protocol;
+ skb_push(skb, -skb_network_offset(skb));
+ esph = ip_esp_hdr(skb);
+ *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb);
+ *skb_mac_header(skb) = IPPROTO_ESP;
+
+ spin_lock_bh(&x->lock);
/* this is non-NULL only with UDP Encapsulation */
if (x->encap) {
@@ -75,7 +73,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
uh = (struct udphdr *)esph;
uh->source = encap->encap_sport;
uh->dest = encap->encap_dport;
- uh->len = htons(skb->len + alen - top_iph->ihl*4);
+ uh->len = htons(skb->len + alen - skb_transport_offset(skb));
uh->check = 0;
switch (encap->encap_type) {
@@ -90,13 +88,11 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
break;
}
- top_iph->protocol = IPPROTO_UDP;
- } else
- top_iph->protocol = IPPROTO_ESP;
+ *skb_mac_header(skb) = IPPROTO_UDP;
+ }
esph->spi = x->id.spi;
- esph->seq_no = htonl(++x->replay.oseq);
- xfrm_aevent_doreplay(x);
+ esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq);
if (esp->conf.ivlen) {
if (unlikely(!esp->conf.ivinitted)) {
@@ -112,7 +108,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
if (!sg)
- goto error;
+ goto unlock;
}
skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
err = crypto_blkcipher_encrypt(&desc, sg, sg, clen);
@@ -121,7 +117,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
} while (0);
if (unlikely(err))
- goto error;
+ goto unlock;
if (esp->conf.ivlen) {
memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen);
@@ -134,7 +130,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen);
}
- ip_send_check(top_iph);
+unlock:
+ spin_unlock_bh(&x->lock);
error:
return err;
@@ -155,7 +152,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct sk_buff *trailer;
int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4);
int alen = esp->auth.icv_trunc_len;
- int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
+ int elen = skb->len - sizeof(*esph) - esp->conf.ivlen - alen;
int nfrags;
int ihl;
u8 nexthdr[2];
@@ -163,7 +160,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
int padlen;
int err;
- if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
+ if (!pskb_may_pull(skb, sizeof(*esph)))
goto out;
if (elen <= 0 || (elen & (blksize-1)))
@@ -191,7 +188,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
skb->ip_summed = CHECKSUM_NONE;
- esph = (struct ip_esp_hdr*)skb->data;
+ esph = (struct ip_esp_hdr *)skb->data;
/* Get ivec. This can be wrong, check against another impls. */
if (esp->conf.ivlen)
@@ -204,7 +201,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (!sg)
goto out;
}
- skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + esp->conf.ivlen, elen);
err = crypto_blkcipher_decrypt(&desc, sg, sg, elen);
if (unlikely(sg != &esp->sgbuf[0]))
kfree(sg);
@@ -256,17 +253,15 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
* as per draft-ietf-ipsec-udp-encaps-06,
* section 3.1.2
*/
- if (x->props.mode == XFRM_MODE_TRANSPORT ||
- x->props.mode == XFRM_MODE_BEET)
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- iph->protocol = nexthdr[1];
pskb_trim(skb, skb->len - alen - padlen - 2);
__skb_pull(skb, sizeof(*esph) + esp->conf.ivlen);
skb_set_transport_header(skb, -ihl);
- return 0;
+ return nexthdr[1];
out:
return -EINVAL;
@@ -343,11 +338,6 @@ static int esp_init_state(struct xfrm_state *x)
struct crypto_blkcipher *tfm;
u32 align;
- /* null auth and encryption can have zero length keys */
- if (x->aalg) {
- if (x->aalg->alg_key_len > 512)
- goto error;
- }
if (x->ealg == NULL)
goto error;
@@ -359,15 +349,14 @@ static int esp_init_state(struct xfrm_state *x)
struct xfrm_algo_desc *aalg_desc;
struct crypto_hash *hash;
- esp->auth.key = x->aalg->alg_key;
- esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
hash = crypto_alloc_hash(x->aalg->alg_name, 0,
CRYPTO_ALG_ASYNC);
if (IS_ERR(hash))
goto error;
esp->auth.tfm = hash;
- if (crypto_hash_setkey(hash, esp->auth.key, esp->auth.key_len))
+ if (crypto_hash_setkey(hash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
goto error;
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
@@ -389,8 +378,7 @@ static int esp_init_state(struct xfrm_state *x)
if (!esp->auth.work_icv)
goto error;
}
- esp->conf.key = x->ealg->alg_key;
- esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
+
tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
goto error;
@@ -403,7 +391,8 @@ static int esp_init_state(struct xfrm_state *x)
goto error;
esp->conf.ivinitted = 0;
}
- if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
+ if (crypto_blkcipher_setkey(tfm, x->ealg->alg_key,
+ (x->ealg->alg_key_len + 7) / 8))
goto error;
x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -443,6 +432,7 @@ static struct xfrm_type esp_type =
.description = "ESP4",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp_init_state,
.destructor = esp_destroy,
.get_mtu = esp4_get_mtu,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eff6bce453e..78b514ba141 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -49,6 +49,8 @@
#define FFprint(a...) printk(KERN_DEBUG a)
+static struct sock *fibnl;
+
#ifndef CONFIG_IP_MULTIPLE_TABLES
struct fib_table *ip_fib_local_table;
@@ -334,7 +336,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
colon = strchr(devname, ':');
if (colon)
*colon = 0;
- dev = __dev_get_by_name(devname);
+ dev = __dev_get_by_name(&init_net, devname);
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
@@ -487,7 +489,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
}
nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
- switch (attr->nla_type) {
+ switch (nla_type(attr)) {
case RTA_DST:
cfg->fc_dst = nla_get_be32(attr);
break;
@@ -784,17 +786,12 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
}
}
-static void nl_fib_input(struct sock *sk, int len)
+static void nl_fib_input(struct sk_buff *skb)
{
- struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh = NULL;
struct fib_result_nl *frn;
- u32 pid;
+ struct nlmsghdr *nlh;
struct fib_table *tb;
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL)
- return;
+ u32 pid;
nlh = nlmsg_hdr(skb);
if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
@@ -811,13 +808,13 @@ static void nl_fib_input(struct sock *sk, int len)
pid = NETLINK_CB(skb).pid; /* pid of sending process */
NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
+ netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT);
}
static void nl_fib_lookup_init(void)
{
- netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL,
- THIS_MODULE);
+ fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0,
+ nl_fib_input, NULL, THIS_MODULE);
}
static void fib_disable_ip(struct net_device *dev, int force)
@@ -860,6 +857,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct net_device *dev = ptr;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2);
return NOTIFY_DONE;
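
nl_fib_input illustrates the new netlink kernel-socket API: the input callback receives each skb directly instead of a (sock, len) pair it had to dequeue from, and netlink_kernel_create() now takes the owning struct net and returns the sock, which the caller keeps (here in the fibnl static) for netlink_unicast() replies. A condensed sketch; NETLINK_DEMO and the handler body are hypothetical:

    /* New-style netlink kernel socket: the netlink core dequeues each
     * message and hands the skb straight to the input callback, instead
     * of signalling (sock, len) and leaving the dequeue loop to us. */
    static struct sock *demo_nl;

    static void demo_nl_input(struct sk_buff *skb)
    {
            u32 pid = NETLINK_CB(skb).pid;       /* sending process */

            /* ... validate nlmsg_hdr(skb), rewrite payload in place ... */

            netlink_unicast(demo_nl, skb, pid, MSG_DONTWAIT);
    }

    static int __init demo_nl_init(void)
    {
            demo_nl = netlink_kernel_create(&init_net, NETLINK_DEMO, 0,
                                            demo_nl_input, NULL, THIS_MODULE);
            return demo_nl ? 0 : -ENOMEM;
    }
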
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 9ad1d9ff9ce..527a6e0af5b 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -35,6 +35,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -1038,24 +1039,8 @@ static const struct seq_operations fib_seq_ops = {
static int fib_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &fib_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &fib_seq_ops,
+ sizeof(struct fib_iter_state));
}
static const struct file_operations fib_seq_fops = {
@@ -1068,13 +1053,13 @@ static const struct file_operations fib_seq_fops = {
int __init fib_proc_init(void)
{
- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
+ if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops))
return -ENOMEM;
return 0;
}
void __init fib_proc_exit(void)
{
- proc_net_remove("route");
+ proc_net_remove(&init_net, "route");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 2a947840210..f16839c6a72 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -76,8 +76,6 @@ static struct fib4_rule local_rule = {
},
};
-static LIST_HEAD(fib4_rules);
-
#ifdef CONFIG_NET_CLS_ROUTE
u32 fib_rules_tclass(struct fib_result *res)
{
@@ -279,9 +277,9 @@ static u32 fib4_rule_default_pref(void)
struct list_head *pos;
struct fib_rule *rule;
- if (!list_empty(&fib4_rules)) {
- pos = fib4_rules.next;
- if (pos->next != &fib4_rules) {
+ if (!list_empty(&fib4_rules_ops.rules_list)) {
+ pos = fib4_rules_ops.rules_list.next;
+ if (pos->next != &fib4_rules_ops.rules_list) {
rule = list_entry(pos->next, struct fib_rule, list);
if (rule->pref)
return rule->pref - 1;
@@ -317,15 +315,15 @@ static struct fib_rules_ops fib4_rules_ops = {
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = fib4_rule_policy,
- .rules_list = &fib4_rules,
+ .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list),
.owner = THIS_MODULE,
};
void __init fib4_rules_init(void)
{
- list_add_tail(&local_rule.common.list, &fib4_rules);
- list_add_tail(&main_rule.common.list, &fib4_rules);
- list_add_tail(&default_rule.common.list, &fib4_rules);
+ list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list);
+ list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list);
+ list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list);
fib_rules_register(&fib4_rules_ops);
}
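
fib4_rules_ops now owns its rule list directly: LIST_HEAD_INIT(fib4_rules_ops.rules_list) statically points the embedded list_head at itself, which is how an empty list is represented, making the separate fib4_rules global unnecessary. Sketch of the idiom:

    #include <linux/list.h>

    /* An empty list is a head whose next and prev point at itself;
     * LIST_HEAD_INIT expands to { &(name), &(name) }, so the embedded
     * head is valid before any code runs. */
    struct demo_ops {
            struct list_head rules_list;
    };

    static struct demo_ops demo_ops = {
            .rules_list = LIST_HEAD_INIT(demo_ops.rules_list),
    };
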
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c434119deb5..1351a2617dc 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -533,7 +533,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
return -EINVAL;
if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
+ if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
return -ENODEV;
if (!(dev->flags&IFF_UP))
return -ENETDOWN;
@@ -743,7 +743,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
int remaining;
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla->nla_type;
+ int type = nla_type(nla);
if (type) {
if (type > RTAX_MAX)
@@ -799,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (nhs != 1 || nh->nh_gw)
goto err_inval;
nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+ nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
err = -ENODEV;
if (nh->nh_dev == NULL)
goto failure;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9ca786a6fd3..81a8285d6d6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -85,23 +86,14 @@
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
-#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
-#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
typedef unsigned int t_key;
#define T_TNODE 0
#define T_LEAF 1
#define NODE_TYPE_MASK 0x1UL
-#define NODE_PARENT(node) \
- ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
-
#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
-#define NODE_SET_PARENT(node, ptr) \
- rcu_assign_pointer((node)->parent, \
- ((unsigned long)(ptr)) | NODE_TYPE(node))
-
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
@@ -174,6 +166,19 @@ static void tnode_free(struct tnode *tn);
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct trie *trie_local = NULL, *trie_main = NULL;
+static inline struct tnode *node_parent(struct node *node)
+{
+ struct tnode *ret;
+
+ ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
+ return rcu_dereference(ret);
+}
+
+static inline void node_set_parent(struct node *node, struct tnode *ptr)
+{
+ rcu_assign_pointer(node->parent,
+ (unsigned long)ptr | NODE_TYPE(node));
+}
/* rcu_read_lock needs to be hold by caller from readside */
@@ -189,6 +194,11 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
+static inline t_key mask_pfx(t_key k, unsigned short l)
+{
+ return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
+}
+
static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
if (offset < KEYLENGTH)
@@ -446,7 +456,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
tn->full_children++;
if (n)
- NODE_SET_PARENT(n, tn);
+ node_set_parent(n, tn);
rcu_assign_pointer(tn->child[i], n);
}
@@ -481,7 +491,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
continue;
/* compress one level */
- NODE_SET_PARENT(n, NULL);
+ node_set_parent(n, NULL);
tnode_free(tn);
return n;
}
@@ -636,7 +646,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* compress one level */
- NODE_SET_PARENT(n, NULL);
+ node_set_parent(n, NULL);
tnode_free(tn);
return n;
}
@@ -673,7 +683,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode->pos == oldtnode->pos + oldtnode->bits &&
inode->bits > 1) {
struct tnode *left, *right;
- t_key m = TKEY_GET_MASK(inode->pos, 1);
+ t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
left = tnode_new(inode->key&(~m), inode->pos + 1,
inode->bits - 1);
@@ -961,24 +971,21 @@ fib_find_node(struct trie *t, u32 key)
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
- t_key cindex, key;
- struct tnode *tp = NULL;
-
- key = tn->key;
-
- while (tn != NULL && NODE_PARENT(tn) != NULL) {
+ t_key cindex, key = tn->key;
+ struct tnode *tp;
- tp = NODE_PARENT(tn);
+ while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize (t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
- if (!NODE_PARENT(tn))
+ tp = node_parent((struct node *) tn);
+ if (!tp)
break;
-
- tn = NODE_PARENT(tn);
+ tn = tp;
}
+
/* Handle last (top) tnode */
if (IS_TNODE(tn))
tn = (struct tnode*) resize(t, (struct tnode *)tn);
@@ -1031,7 +1038,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
pos = tn->pos + tn->bits;
n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
- BUG_ON(n && NODE_PARENT(n) != tn);
+ BUG_ON(n && node_parent(n) != tn);
} else
break;
}
@@ -1083,7 +1090,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- NODE_SET_PARENT(l, tp);
+ node_set_parent((struct node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
@@ -1114,7 +1121,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
goto err;
}
- NODE_SET_PARENT(tn, tp);
+ node_set_parent((struct node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
put_child(t, tn, missbit, (struct node *)l);
@@ -1364,7 +1371,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
bits = pn->bits;
if (!chopped_off)
- cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
+ cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
+ pos, bits);
n = tnode_get_child(pn, cindex);
@@ -1450,8 +1458,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
* to find a matching prefix.
*/
- node_prefix = MASK_PFX(cn->key, cn->pos);
- key_prefix = MASK_PFX(key, cn->pos);
+ node_prefix = mask_pfx(cn->key, cn->pos);
+ key_prefix = mask_pfx(key, cn->pos);
pref_mismatch = key_prefix^node_prefix;
mp = 0;
@@ -1495,12 +1503,13 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- if (NODE_PARENT(pn) == NULL)
+ struct tnode *parent = node_parent((struct node *) pn);
+ if (!parent)
goto failed;
/* Get Child's index */
- cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
- pn = NODE_PARENT(pn);
+ cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
+ pn = parent;
chopped_off = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1536,7 +1545,7 @@ static int trie_leaf_remove(struct trie *t, t_key key)
check_tnode(tn);
n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
- BUG_ON(n && NODE_PARENT(n) != tn);
+ BUG_ON(n && node_parent(n) != tn);
}
l = (struct leaf *) n;
@@ -1551,7 +1560,7 @@ static int trie_leaf_remove(struct trie *t, t_key key)
t->revision++;
t->size--;
- tp = NODE_PARENT(n);
+ tp = node_parent(n);
tnode_free((struct tnode *) n);
if (tp) {
@@ -1703,7 +1712,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
p = (struct tnode*) trie; /* Start */
} else
- p = (struct tnode *) NODE_PARENT(c);
+ p = node_parent(c);
while (p) {
int pos, last;
@@ -1740,7 +1749,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
up:
/* No more children go up one step */
c = (struct node *) p;
- p = (struct tnode *) NODE_PARENT(p);
+ p = node_parent(c);
}
return NULL; /* Ready. Root of trie */
}
@@ -2043,7 +2052,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = NODE_PARENT(tn);
+ p = node_parent((struct node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2317,7 +2326,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
return 0;
- if (!NODE_PARENT(n)) {
+ if (!node_parent(n)) {
if (iter->trie == trie_local)
seq_puts(seq, "<local>:\n");
else
@@ -2326,7 +2335,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
if (IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
+ __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
@@ -2370,25 +2379,8 @@ static const struct seq_operations fib_trie_seq_ops = {
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &fib_trie_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &fib_trie_seq_ops,
+ sizeof(struct fib_trie_iter));
}
static const struct file_operations fib_trie_fops = {
@@ -2491,25 +2483,8 @@ static const struct seq_operations fib_route_seq_ops = {
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &fib_route_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
- memset(s, 0, sizeof(*s));
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &fib_route_seq_ops,
+ sizeof(struct fib_trie_iter));
}
static const struct file_operations fib_route_fops = {
@@ -2522,30 +2497,30 @@ static const struct file_operations fib_route_fops = {
int __init fib_proc_init(void)
{
- if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
+ if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops))
goto out1;
- if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
+ if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops))
goto out2;
- if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
+ if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops))
goto out3;
return 0;
out3:
- proc_net_remove("fib_triestat");
+ proc_net_remove(&init_net, "fib_triestat");
out2:
- proc_net_remove("fib_trie");
+ proc_net_remove(&init_net, "fib_trie");
out1:
return -ENOMEM;
}
void __init fib_proc_exit(void)
{
- proc_net_remove("fib_trie");
- proc_net_remove("fib_triestat");
- proc_net_remove("route");
+ proc_net_remove(&init_net, "fib_trie");
+ proc_net_remove(&init_net, "fib_triestat");
+ proc_net_remove(&init_net, "route");
}
#endif /* CONFIG_PROC_FS */
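
The NODE_PARENT/NODE_SET_PARENT macros become node_parent()/node_set_parent() inline functions: the same pointer-tagging trick, where the low bit of the parent word records whether a node is a leaf, but with arguments the compiler can type-check. A standalone sketch of the tagging scheme (RCU accessors omitted), assuming at least 2-byte pointer alignment:

    #include <assert.h>

    #define T_LEAF          1UL
    #define NODE_TYPE_MASK  1UL

    struct node { unsigned long parent; };

    static inline struct node *node_parent(const struct node *n)
    {
            /* mask off the type bit to recover the pointer */
            return (struct node *)(n->parent & ~NODE_TYPE_MASK);
    }

    static inline void node_set_parent(struct node *n, struct node *p)
    {
            /* keep the type bit, replace only the pointer bits */
            n->parent = (unsigned long)p | (n->parent & NODE_TYPE_MASK);
    }

    int main(void)
    {
            struct node parent = { 0 }, leaf = { T_LEAF };

            node_set_parent(&leaf, &parent);
            assert(node_parent(&leaf) == &parent);
            assert(leaf.parent & T_LEAF);       /* type bit survived */
            return 0;
    }
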
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 02a899bec19..272c69e106e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -115,6 +115,7 @@ struct icmp_bxm {
* Statistics
*/
DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
+DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly;
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -214,8 +215,6 @@ int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly;
*/
struct icmp_control {
- int output_entry; /* Field for increment on output */
- int input_entry; /* Field for increment on input */
void (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
@@ -316,12 +315,10 @@ out:
/*
* Maintain the counters used in the SNMP statistics for outgoing ICMP
*/
-static void icmp_out_count(int type)
+void icmp_out_count(unsigned char type)
{
- if (type <= NR_ICMP_TYPES) {
- ICMP_INC_STATS(icmp_pointers[type].output_entry);
- ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
- }
+ ICMPMSGOUT_INC_STATS(type);
+ ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
}
/*
@@ -390,7 +387,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
return;
icmp_param->data.icmph.checksum = 0;
- icmp_out_count(icmp_param->data.icmph.type);
inet->tos = ip_hdr(skb)->tos;
daddr = ipc.addr = rt->rt_src;
@@ -517,7 +513,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct net_device *dev = NULL;
if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index(rt->fl.iif);
+ dev = dev_get_by_index(&init_net, rt->fl.iif);
if (dev) {
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -952,6 +948,7 @@ int icmp_rcv(struct sk_buff *skb)
icmph = icmp_hdr(skb);
+ ICMPMSGIN_INC_STATS_BH(icmph->type);
/*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
*
@@ -986,7 +983,6 @@ int icmp_rcv(struct sk_buff *skb)
}
}
- ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
icmp_pointers[icmph->type].handler(skb);
drop:
@@ -1002,109 +998,71 @@ error:
*/
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY] = {
- .output_entry = ICMP_MIB_OUTECHOREPS,
- .input_entry = ICMP_MIB_INECHOREPS,
.handler = icmp_discard,
},
[1] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[2] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_DEST_UNREACH] = {
- .output_entry = ICMP_MIB_OUTDESTUNREACHS,
- .input_entry = ICMP_MIB_INDESTUNREACHS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_SOURCE_QUENCH] = {
- .output_entry = ICMP_MIB_OUTSRCQUENCHS,
- .input_entry = ICMP_MIB_INSRCQUENCHS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_REDIRECT] = {
- .output_entry = ICMP_MIB_OUTREDIRECTS,
- .input_entry = ICMP_MIB_INREDIRECTS,
.handler = icmp_redirect,
.error = 1,
},
[6] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[7] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_ECHO] = {
- .output_entry = ICMP_MIB_OUTECHOS,
- .input_entry = ICMP_MIB_INECHOS,
.handler = icmp_echo,
},
[9] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[10] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_INERRORS,
.handler = icmp_discard,
.error = 1,
},
[ICMP_TIME_EXCEEDED] = {
- .output_entry = ICMP_MIB_OUTTIMEEXCDS,
- .input_entry = ICMP_MIB_INTIMEEXCDS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_PARAMETERPROB] = {
- .output_entry = ICMP_MIB_OUTPARMPROBS,
- .input_entry = ICMP_MIB_INPARMPROBS,
.handler = icmp_unreach,
.error = 1,
},
[ICMP_TIMESTAMP] = {
- .output_entry = ICMP_MIB_OUTTIMESTAMPS,
- .input_entry = ICMP_MIB_INTIMESTAMPS,
.handler = icmp_timestamp,
},
[ICMP_TIMESTAMPREPLY] = {
- .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
- .input_entry = ICMP_MIB_INTIMESTAMPREPS,
.handler = icmp_discard,
},
[ICMP_INFO_REQUEST] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_DUMMY,
.handler = icmp_discard,
},
[ICMP_INFO_REPLY] = {
- .output_entry = ICMP_MIB_DUMMY,
- .input_entry = ICMP_MIB_DUMMY,
.handler = icmp_discard,
},
[ICMP_ADDRESS] = {
- .output_entry = ICMP_MIB_OUTADDRMASKS,
- .input_entry = ICMP_MIB_INADDRMASKS,
.handler = icmp_address,
},
[ICMP_ADDRESSREPLY] = {
- .output_entry = ICMP_MIB_OUTADDRMASKREPS,
- .input_entry = ICMP_MIB_INADDRMASKREPS,
.handler = icmp_address_reply,
},
};
@@ -1146,4 +1104,5 @@ void __init icmp_init(struct net_proto_family *ops)
EXPORT_SYMBOL(icmp_err_convert);
EXPORT_SYMBOL(icmp_send);
EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(icmpmsg_statistics);
EXPORT_SYMBOL(xrlim_allow);
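
The icmp_control table loses its output_entry/input_entry fields because per-type counting moves to the new icmpmsg statistics: instead of one hand-maintained MIB enum entry per known ICMP type, a counter array is indexed by the type byte itself, so every type value gets counted. A deliberately simplified sketch of the idea (a single counter array, not the kernel's per-CPU SNMP layout):

    /* Simplified per-type ICMP message counters: index by the type byte
     * rather than mapping each known type to a dedicated MIB field. */
    #define DEMO_ICMPMSG_MAX 256

    static unsigned long demo_icmpmsg_out[DEMO_ICMPMSG_MAX];

    static void demo_icmpmsg_out_count(unsigned char type)
    {
            demo_icmpmsg_out[type]++;   /* any type 0..255, no range check needed */
    }
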
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a646409c2d0..7dbc282d4f9 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,7 @@
#include <linux/rtnetlink.h>
#include <linux/times.h>
+#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1694,8 +1695,8 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
- struct in_device *in_dev = pmc->interface;
struct ip_sf_list *psf;
+ in_dev = pmc->interface;
#endif
/* filter mode change */
@@ -1798,7 +1799,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
{
int err;
- if (iml->sflist == 0) {
+ if (iml->sflist == NULL) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
@@ -2166,7 +2167,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
return -EFAULT;
}
for (i=0; i<copycount; i++) {
- struct sockaddr_in *psin;
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
@@ -2291,7 +2291,7 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
- for_each_netdev(state->dev) {
+ for_each_netdev(&init_net, state->dev) {
struct in_device *in_dev;
in_dev = in_dev_get(state->dev);
if (!in_dev)
@@ -2410,23 +2410,8 @@ static const struct seq_operations igmp_mc_seq_ops = {
static int igmp_mc_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp_mc_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
- rc = seq_open(file, &igmp_mc_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &igmp_mc_seq_ops,
+ sizeof(struct igmp_mc_iter_state));
}
static const struct file_operations igmp_mc_seq_fops = {
@@ -2453,7 +2438,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
state->idev = NULL;
state->im = NULL;
- for_each_netdev(state->dev) {
+ for_each_netdev(&init_net, state->dev) {
struct in_device *idev;
idev = in_dev_get(state->dev);
if (unlikely(idev == NULL))
@@ -2584,23 +2569,8 @@ static const struct seq_operations igmp_mcf_seq_ops = {
static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct igmp_mcf_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
- rc = seq_open(file, &igmp_mcf_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &igmp_mcf_seq_ops,
+ sizeof(struct igmp_mcf_iter_state));
}
static const struct file_operations igmp_mcf_seq_fops = {
@@ -2613,8 +2583,8 @@ static const struct file_operations igmp_mcf_seq_fops = {
int __init igmp_mc_proc_init(void)
{
- proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
- proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops);
+ proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
return 0;
}
#endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fbe7714f21d..3cef12835c4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -33,6 +33,19 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
* This array holds the first and last local port number.
*/
int sysctl_local_port_range[2] = { 32768, 61000 };
+DEFINE_SEQLOCK(sysctl_port_range_lock);
+
+void inet_get_local_port_range(int *low, int *high)
+{
+ unsigned seq;
+ do {
+ seq = read_seqbegin(&sysctl_port_range_lock);
+
+ *low = sysctl_local_port_range[0];
+ *high = sysctl_local_port_range[1];
+ } while (read_seqretry(&sysctl_port_range_lock, seq));
+}
+EXPORT_SYMBOL(inet_get_local_port_range);
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb)
@@ -77,10 +90,11 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
local_bh_disable();
if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int remaining = (high - low) + 1;
- int rover = net_random() % (high - low) + low;
+ int remaining, rover, low, high;
+
+ inet_get_local_port_range(&low, &high);
+ remaining = high - low;
+ rover = net_random() % remaining + low;
do {
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
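
inet_get_local_port_range() is a textbook seqlock reader: it retries the two loads whenever read_seqretry() reports that a writer intervened, so callers always see a consistent (low, high) pair without taking a lock on the read side. The matching writer, presumably part of the sysctl_net_ipv4.c changes in this same series, would look like this sketch (function name hypothetical):

    /* Seqlock write side pairing with inet_get_local_port_range():
     * bump the sequence around the stores so concurrent readers of a
     * half-updated pair retry. Sketch only. */
    static void demo_set_local_port_range(int low, int high)
    {
            write_seqlock(&sysctl_port_range_lock);
            sysctl_local_port_range[0] = low;
            sysctl_local_port_range[1] = high;
            write_sequnlock(&sysctl_port_range_lock);
    }
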
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index def007ec1d6..7eb83ebed2e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -11,6 +11,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
@@ -112,7 +113,7 @@ static int inet_csk_diag_fill(struct sock *sk,
}
#endif
-#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
r->idiag_timer = 1;
@@ -190,7 +191,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
r->id.idiag_dst[0] = tw->tw_daddr;
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
+ r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
@@ -838,15 +839,11 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
static DEFINE_MUTEX(inet_diag_mutex);
-static void inet_diag_rcv(struct sock *sk, int len)
+static void inet_diag_rcv(struct sk_buff *skb)
{
- unsigned int qlen = 0;
-
- do {
- mutex_lock(&inet_diag_mutex);
- netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg);
- mutex_unlock(&inet_diag_mutex);
- } while (qlen);
+ mutex_lock(&inet_diag_mutex);
+ netlink_rcv_skb(skb, &inet_diag_rcv_msg);
+ mutex_unlock(&inet_diag_mutex);
}
static DEFINE_SPINLOCK(inet_diag_register_lock);
@@ -896,8 +893,8 @@ static int __init inet_diag_init(void)
if (!inet_diag_table)
goto out;
- idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
- NULL, THIS_MODULE);
+ idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
+ inet_diag_rcv, NULL, THIS_MODULE);
if (idiagnl == NULL)
goto out_free_table;
err = 0;
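
The EXPIRES_IN_MS change is a readability fix with a correctness bonus: the old macro's expansion ended in an unparenthesized / HZ, so uses inside larger expressions could misassociate, while DIV_ROUND_UP from linux/kernel.h is fully parenthesized and names the intent. A worked example:

    #include <assert.h>

    /* DIV_ROUND_UP(n, d) expands to (((n) + (d) - 1) / (d)): integer
     * division rounded up for non-negative n. */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            /* Any remainder counts as one more whole unit... */
            assert(DIV_ROUND_UP(999, 100) == 10);   /* 999/100 truncates to 9 */
            /* ...but exact multiples are not bumped. */
            assert(DIV_ROUND_UP(900, 100) == 9);
            return 0;
    }
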
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb662621c54..fac6398e436 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -279,19 +279,18 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
int ret;
if (!snum) {
- int low = sysctl_local_port_range[0];
- int high = sysctl_local_port_range[1];
- int range = high - low;
- int i;
- int port;
+ int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + inet_sk_port_offset(sk);
struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
+ inet_get_local_port_range(&low, &high);
+ remaining = high - low;
+
local_bh_disable();
- for (i = 1; i <= range; i++) {
- port = low + (i + offset) % range;
+ for (i = 1; i <= remaining; i++) {
+ port = low + (i + offset) % remaining;
head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
spin_lock(&head->lock);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
new file mode 100644
index 00000000000..4545b64e281
--- /dev/null
+++ b/net/ipv4/inet_lro.c
@@ -0,0 +1,600 @@
+/*
+ * linux/net/ipv4/inet_lro.c
+ *
+ * Large Receive Offload (ipv4 / tcp)
+ *
+ * (C) Copyright IBM Corp. 2007
+ *
+ * Authors:
+ * Jan-Bernd Themann <themann@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
+MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
+
+#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
+#define IP_HDR_LEN(iph) (iph->ihl << 2)
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+ (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+#define LRO_MAX_PG_HLEN 64
+
+#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
+
+/*
+ * Basic TCP checks to decide whether a packet is suitable for LRO
+ */
+
+static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
+ int len, struct net_lro_desc *lro_desc)
+{
+ /* check ip header: don't aggregate padded frames */
+ if (ntohs(iph->tot_len) != len)
+ return -1;
+
+ if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
+ return -1;
+
+ if (iph->ihl != IPH_LEN_WO_OPTIONS)
+ return -1;
+
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
+ || tcph->rst || tcph->syn || tcph->fin)
+ return -1;
+
+ if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+ return -1;
+
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS
+ && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ return -1;
+
+ /* check tcp options (only timestamp allowed) */
+ if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+ u32 *topt = (u32 *)(tcph + 1);
+
+ if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return -1;
+
+ /* timestamps should be in the right order */
+ topt++;
+ if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
+ ntohl(*topt)))
+ return -1;
+
+ /* timestamp reply should not be zero */
+ topt++;
+ if (*topt == 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
+{
+ struct iphdr *iph = lro_desc->iph;
+ struct tcphdr *tcph = lro_desc->tcph;
+ u32 *p;
+ __wsum tcp_hdr_csum;
+
+ tcph->ack_seq = lro_desc->tcp_ack;
+ tcph->window = lro_desc->tcp_window;
+
+ if (lro_desc->tcp_saw_tstamp) {
+ p = (u32 *)(tcph + 1);
+ *(p+2) = lro_desc->tcp_rcv_tsecr;
+ }
+
+ iph->tot_len = htons(lro_desc->ip_tot_len);
+
+ iph->check = 0;
+ iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl);
+
+ tcph->check = 0;
+ tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0);
+ lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
+ tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ lro_desc->ip_tot_len -
+ IP_HDR_LEN(iph), IPPROTO_TCP,
+ lro_desc->data_csum);
+}
+
+static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
+{
+ __wsum tcp_csum;
+ __wsum tcp_hdr_csum;
+ __wsum tcp_ps_hdr_csum;
+
+ tcp_csum = ~csum_unfold(tcph->check);
+ tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum);
+
+ tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ len + TCP_HDR_LEN(tcph),
+ IPPROTO_TCP, 0);
+
+ return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
+ tcp_ps_hdr_csum);
+}
+
+static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph,
+ u16 vlan_tag, struct vlan_group *vgrp)
+{
+ int nr_frags;
+ u32 *ptr;
+ u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ lro_desc->parent = skb;
+ lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
+ lro_desc->iph = iph;
+ lro_desc->tcph = tcph;
+ lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+ lro_desc->tcp_ack = ntohl(tcph->ack_seq);
+ lro_desc->tcp_window = tcph->window;
+
+ lro_desc->pkt_aggr_cnt = 1;
+ lro_desc->ip_tot_len = ntohs(iph->tot_len);
+
+ if (tcph->doff == 8) {
+ ptr = (u32 *)(tcph+1);
+ lro_desc->tcp_saw_tstamp = 1;
+ lro_desc->tcp_rcv_tsval = *(ptr+1);
+ lro_desc->tcp_rcv_tsecr = *(ptr+2);
+ }
+
+ lro_desc->mss = tcp_data_len;
+ lro_desc->vgrp = vgrp;
+ lro_desc->vlan_tag = vlan_tag;
+ lro_desc->active = 1;
+
+ lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
+ tcp_data_len);
+}
+
+static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
+{
+ memset(lro_desc, 0, sizeof(struct net_lro_desc));
+}
+
+static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
+ struct tcphdr *tcph, int tcp_data_len)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ u32 *topt;
+
+ lro_desc->pkt_aggr_cnt++;
+ lro_desc->ip_tot_len += tcp_data_len;
+ lro_desc->tcp_next_seq += tcp_data_len;
+ lro_desc->tcp_window = tcph->window;
+ lro_desc->tcp_ack = tcph->ack_seq;
+
+ /* don't update tcp_rcv_tsval, would not work with PAWS */
+ if (lro_desc->tcp_saw_tstamp) {
+ topt = (u32 *) (tcph + 1);
+ lro_desc->tcp_rcv_tsecr = *(topt + 2);
+ }
+
+ lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
+ lro_tcp_data_csum(iph, tcph,
+ tcp_data_len),
+ parent->len);
+
+ parent->len += tcp_data_len;
+ parent->data_len += tcp_data_len;
+ if (tcp_data_len > lro_desc->mss)
+ lro_desc->mss = tcp_data_len;
+}
+
+static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct sk_buff *parent = lro_desc->parent;
+ int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+ skb_pull(skb, (skb->len - tcp_data_len));
+ parent->truesize += skb->truesize;
+
+ if (lro_desc->last_skb)
+ lro_desc->last_skb->next = skb;
+ else
+ skb_shinfo(parent)->frag_list = skb;
+
+ lro_desc->last_skb = skb;
+}
+
+static void lro_add_frags(struct net_lro_desc *lro_desc,
+ int len, int hlen, int truesize,
+ struct skb_frag_struct *skb_frags,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct sk_buff *skb = lro_desc->parent;
+ int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+ lro_add_common(lro_desc, iph, tcph, tcp_data_len);
+
+ skb->truesize += truesize;
+
+ skb_frags[0].page_offset += hlen;
+ skb_frags[0].size -= hlen;
+
+ while (tcp_data_len > 0) {
+ *(lro_desc->next_frag) = *skb_frags;
+ tcp_data_len -= skb_frags->size;
+ lro_desc->next_frag++;
+ skb_frags++;
+ skb_shinfo(skb)->nr_frags++;
+ }
+}
+
+static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ if ((lro_desc->iph->saddr != iph->saddr)
+ || (lro_desc->iph->daddr != iph->daddr)
+ || (lro_desc->tcph->source != tcph->source)
+ || (lro_desc->tcph->dest != tcph->dest))
+ return -1;
+ return 0;
+}
+
+static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_arr,
+ struct iphdr *iph,
+ struct tcphdr *tcph)
+{
+ struct net_lro_desc *lro_desc = NULL;
+ struct net_lro_desc *tmp;
+ int max_desc = lro_mgr->max_desc;
+ int i;
+
+ for (i = 0; i < max_desc; i++) {
+ tmp = &lro_arr[i];
+ if (tmp->active)
+ if (!lro_check_tcp_conn(tmp, iph, tcph)) {
+ lro_desc = tmp;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < max_desc; i++) {
+ if (!lro_arr[i].active) {
+ lro_desc = &lro_arr[i];
+ goto out;
+ }
+ }
+
+ LRO_INC_STATS(lro_mgr, no_desc);
+out:
+ return lro_desc;
+}
+
+static void lro_flush(struct net_lro_mgr *lro_mgr,
+ struct net_lro_desc *lro_desc)
+{
+ if (lro_desc->pkt_aggr_cnt > 1)
+ lro_update_tcp_ip_header(lro_desc);
+
+ skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
+
+ if (lro_desc->vgrp) {
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ vlan_hwaccel_receive_skb(lro_desc->parent,
+ lro_desc->vgrp,
+ lro_desc->vlan_tag);
+ else
+ vlan_hwaccel_rx(lro_desc->parent,
+ lro_desc->vgrp,
+ lro_desc->vlan_tag);
+
+ } else {
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ netif_receive_skb(lro_desc->parent);
+ else
+ netif_rx(lro_desc->parent);
+ }
+
+ LRO_INC_STATS(lro_mgr, flushed);
+ lro_clear_desc(lro_desc);
+}
+
+static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
+ struct vlan_group *vgrp, u16 vlan_tag, void *priv)
+{
+ struct net_lro_desc *lro_desc;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ u64 flags;
+ int vlan_hdr_len = 0;
+
+ if (!lro_mgr->get_skb_header
+ || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
+ goto out;
+
+ if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+ goto out;
+
+ lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+ if (!lro_desc)
+ goto out;
+
+ if ((skb->protocol == htons(ETH_P_8021Q))
+ && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
+ vlan_hdr_len = VLAN_HLEN;
+
+ if (!lro_desc->active) { /* start new lro session */
+ if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
+ goto out;
+
+ skb->ip_summed = lro_mgr->ip_summed_aggr;
+ lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp);
+ LRO_INC_STATS(lro_mgr, aggregated);
+ return 0;
+ }
+
+ if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+ goto out2;
+
+ if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
+ goto out2;
+
+ lro_add_packet(lro_desc, skb, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+
+ if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
+ lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+ lro_flush(lro_mgr, lro_desc);
+
+ return 0;
+
+out2: /* send aggregated SKBs to stack */
+ lro_flush(lro_mgr, lro_desc);
+
+out: /* Original SKB has to be posted to stack */
+ skb->ip_summed = lro_mgr->ip_summed;
+ return 1;
+}
+
+
+static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
+ struct skb_frag_struct *frags,
+ int len, int true_size,
+ void *mac_hdr,
+ int hlen, __wsum sum,
+ u32 ip_summed)
+{
+ struct sk_buff *skb;
+ struct skb_frag_struct *skb_frags;
+ int data_len = len;
+ int hdr_len = min(len, hlen);
+
+ skb = netdev_alloc_skb(lro_mgr->dev, hlen);
+ if (!skb)
+ return NULL;
+
+ skb->len = len;
+ skb->data_len = len - hdr_len;
+ skb->truesize += true_size;
+ skb->tail += hdr_len;
+
+ memcpy(skb->data, mac_hdr, hdr_len);
+
+ skb_frags = skb_shinfo(skb)->frags;
+ while (data_len > 0) {
+ *skb_frags = *frags;
+ data_len -= frags->size;
+ skb_frags++;
+ frags++;
+ skb_shinfo(skb)->nr_frags++;
+ }
+
+ skb_shinfo(skb)->frags[0].page_offset += hdr_len;
+ skb_shinfo(skb)->frags[0].size -= hdr_len;
+
+ skb->ip_summed = ip_summed;
+ skb->csum = sum;
+ skb->protocol = eth_type_trans(skb, lro_mgr->dev);
+ return skb;
+}
+
+static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
+ struct skb_frag_struct *frags,
+ int len, int true_size,
+ struct vlan_group *vgrp,
+ u16 vlan_tag, void *priv, __wsum sum)
+{
+ struct net_lro_desc *lro_desc;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ struct sk_buff *skb;
+ u64 flags;
+ void *mac_hdr;
+ int mac_hdr_len;
+ int hdr_len = LRO_MAX_PG_HLEN;
+ int vlan_hdr_len = 0;
+
+ if (!lro_mgr->get_frag_header
+ || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
+ (void *)&tcph, &flags, priv)) {
+ mac_hdr = page_address(frags->page) + frags->page_offset;
+ goto out1;
+ }
+
+ if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
+ goto out1;
+
+ hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr);
+ mac_hdr_len = (int)((void *)(iph) - mac_hdr);
+
+ lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+ if (!lro_desc)
+ goto out1;
+
+ if (!lro_desc->active) { /* start new lro session */
+ if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL))
+ goto out1;
+
+ skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
+ hdr_len, 0, lro_mgr->ip_summed_aggr);
+ if (!skb)
+ goto out;
+
+ if ((skb->protocol == htons(ETH_P_8021Q))
+ && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features))
+ vlan_hdr_len = VLAN_HLEN;
+
+ iph = (void *)(skb->data + vlan_hdr_len);
+ tcph = (void *)((u8 *)skb->data + vlan_hdr_len
+ + IP_HDR_LEN(iph));
+
+ lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL);
+ LRO_INC_STATS(lro_mgr, aggregated);
+ return NULL;
+ }
+
+ if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
+ goto out2;
+
+ if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc))
+ goto out2;
+
+ lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph);
+ LRO_INC_STATS(lro_mgr, aggregated);
+
+ if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) ||
+ lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
+ lro_flush(lro_mgr, lro_desc);
+
+ return NULL;
+
+out2: /* send aggregated packets to the stack */
+ lro_flush(lro_mgr, lro_desc);
+
+out1: /* Original packet has to be posted to the stack */
+ skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr,
+ hdr_len, sum, lro_mgr->ip_summed);
+out:
+ return skb;
+}
+
+void lro_receive_skb(struct net_lro_mgr *lro_mgr,
+ struct sk_buff *skb,
+ void *priv)
+{
+ if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ netif_receive_skb(skb);
+ else
+ netif_rx(skb);
+ }
+}
+EXPORT_SYMBOL(lro_receive_skb);
+
+void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr,
+ struct sk_buff *skb,
+ struct vlan_group *vgrp,
+ u16 vlan_tag,
+ void *priv)
+{
+ if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) {
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
+ else
+ vlan_hwaccel_rx(skb, vgrp, vlan_tag);
+ }
+}
+EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb);
+
+void lro_receive_frags(struct net_lro_mgr *lro_mgr,
+ struct skb_frag_struct *frags,
+ int len, int true_size, void *priv, __wsum sum)
+{
+ struct sk_buff *skb;
+
+ skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0,
+ priv, sum);
+ if (!skb)
+ return;
+
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ netif_receive_skb(skb);
+ else
+ netif_rx(skb);
+}
+EXPORT_SYMBOL(lro_receive_frags);
+
+void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr,
+ struct skb_frag_struct *frags,
+ int len, int true_size,
+ struct vlan_group *vgrp,
+ u16 vlan_tag, void *priv, __wsum sum)
+{
+ struct sk_buff *skb;
+
+ skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp,
+ vlan_tag, priv, sum);
+ if (!skb)
+ return;
+
+ if (test_bit(LRO_F_NAPI, &lro_mgr->features))
+ vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag);
+ else
+ vlan_hwaccel_rx(skb, vgrp, vlan_tag);
+}
+EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags);
+
+void lro_flush_all(struct net_lro_mgr *lro_mgr)
+{
+ int i;
+ struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
+
+ for (i = 0; i < lro_mgr->max_desc; i++) {
+ if (lro_desc[i].active)
+ lro_flush(lro_mgr, &lro_desc[i]);
+ }
+}
+EXPORT_SYMBOL(lro_flush_all);
+
+void lro_flush_pkt(struct net_lro_mgr *lro_mgr,
+ struct iphdr *iph, struct tcphdr *tcph)
+{
+ struct net_lro_desc *lro_desc;
+
+ lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
+ if (lro_desc && lro_desc->active)
+ lro_flush(lro_mgr, lro_desc);
+}
+EXPORT_SYMBOL(lro_flush_pkt);
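
For context, a hypothetical driver-side setup for the API exported above. The net_lro_mgr fields follow their use in this file; my_get_skb_header(), MY_LRO_MAX_DESC and the tuning values are illustrative, not taken from the patch:

	#include <linux/inet_lro.h>
	#include <linux/if_ether.h>
	#include <linux/ip.h>
	#include <linux/tcp.h>

	#define MY_LRO_MAX_DESC 8

	static struct net_lro_desc my_lro_descs[MY_LRO_MAX_DESC];

	/* let the LRO core locate the IP/TCP headers; assumes skb->data
	 * already points at the IP header (as after eth_type_trans()) */
	static int my_get_skb_header(struct sk_buff *skb, void **iphdr,
				     void **tcph, u64 *hdr_flags, void *priv)
	{
		struct iphdr *iph = (struct iphdr *)skb->data;

		if (skb->protocol != htons(ETH_P_IP) ||
		    iph->protocol != IPPROTO_TCP)
			return -1;

		*iphdr = iph;
		*tcph = (u8 *)iph + iph->ihl * 4;
		*hdr_flags = LRO_IPV4 | LRO_TCP;
		return 0;
	}

	static void my_setup_lro(struct net_lro_mgr *mgr, struct net_device *dev)
	{
		mgr->dev = dev;
		mgr->features = LRO_F_NAPI;	/* hand off via netif_receive_skb */
		mgr->ip_summed = CHECKSUM_UNNECESSARY;
		mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
		mgr->max_desc = MY_LRO_MAX_DESC;
		mgr->max_aggr = 32;		/* flush after 32 segments */
		mgr->lro_arr = my_lro_descs;
		mgr->get_skb_header = my_get_skb_header;
	}

The receive path then feeds each packet through lro_receive_skb(mgr, skb, NULL) and calls lro_flush_all(mgr) at the end of every NAPI poll, so partially aggregated sessions are not held across polls.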
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2586df09b9b..4e189e28f30 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -8,7 +8,7 @@
* From code originally in TCP
*/
-
+#include <linux/kernel.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
@@ -292,7 +292,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
if (timeo >= timewait_len) {
slot = INET_TWDR_TWKILL_SLOTS - 1;
} else {
- slot = (timeo + twdr->period - 1) / twdr->period;
+ slot = DIV_ROUND_UP(timeo, twdr->period);
if (slot >= INET_TWDR_TWKILL_SLOTS)
slot = INET_TWDR_TWKILL_SLOTS - 1;
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 8c95cf09f87..afbf938836f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -105,7 +105,7 @@ int ip_forward(struct sk_buff *skb)
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
- if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)
ip_rt_send_redirect(skb);
skb->priority = rt_tos2priority(iph->tos);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 0231bdcb2ab..fabb86db763 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -292,7 +292,7 @@ static void ip_expire(unsigned long arg)
if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
struct sk_buff *head = qp->fragments;
/* Send an ICMP "Fragment Reassembly Timeout" message. */
- if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
+ if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) {
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
dev_put(head->dev);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5c14ed63e56..f151900efaf 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -262,7 +262,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
int i;
for (i=1; i<100; i++) {
sprintf(name, "gre%d", i);
- if (__dev_get_by_name(name) == NULL)
+ if (__dev_get_by_name(&init_net, name) == NULL)
break;
}
if (i==100)
@@ -684,7 +684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_error;
}
- if (dev->hard_header) {
+ if (dev->header_ops) {
gre_hlen = 0;
tiph = (struct iphdr*)skb->data;
} else {
@@ -1063,8 +1063,9 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
*/
-static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
- void *daddr, void *saddr, unsigned len)
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned len)
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1091,6 +1092,10 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned sh
return -t->hlen;
}
+static const struct header_ops ipgre_header_ops = {
+ .create = ipgre_header,
+};
+
static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1132,7 +1137,6 @@ static int ipgre_close(struct net_device *dev)
static void ipgre_tunnel_setup(struct net_device *dev)
{
- SET_MODULE_OWNER(dev);
dev->uninit = ipgre_tunnel_uninit;
dev->destructor = free_netdev;
dev->hard_start_xmit = ipgre_tunnel_xmit;
@@ -1188,7 +1192,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
if (!iph->saddr)
return -EINVAL;
dev->flags = IFF_BROADCAST;
- dev->hard_header = ipgre_header;
+ dev->header_ops = &ipgre_header_ops;
dev->open = ipgre_open;
dev->stop = ipgre_close;
}
@@ -1196,7 +1200,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
}
if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(tunnel->parms.link);
+ tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
if (tdev) {
hlen = tdev->hard_header_len;
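
ip_gre is one of several conversions in this series from dev->hard_header to the const struct header_ops table. Callers now go through dev_hard_header() (see the ipconfig hunk below); its behaviour is roughly the following sketch, assuming the 2.6.24-era helper in <linux/netdevice.h>, not quoting it verbatim:

	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	/* sketch of dev_hard_header() */
	static inline int dev_hard_header_sketch(struct sk_buff *skb,
						 struct net_device *dev,
						 unsigned short type,
						 const void *daddr,
						 const void *saddr,
						 unsigned len)
	{
		if (!dev->header_ops || !dev->header_ops->create)
			return 0;	/* no link-layer header to build */
		return dev->header_ops->create(skb, dev, type, daddr,
					       saddr, len);
	}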
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 97069399d86..41d8964591e 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -382,6 +382,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct iphdr *iph;
u32 len;
+ if (dev->nd_net != &init_net)
+ goto drop;
+
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0f1d7beacf7..699f06781fd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -169,7 +169,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
/* Be paranoid, rather than too clever. */
- if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
@@ -1261,6 +1261,10 @@ int ip_push_pending_frames(struct sock *sk)
skb->priority = sk->sk_priority;
skb->dst = dst_clone(&rt->u.dst);
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(((struct icmphdr *)
+ skb_transport_header(skb))->type);
+
/* Netfilter gets the whole, not yet fragmented skb. */
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, dst_output);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6b420aedcdc..f51f20e487c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -602,7 +602,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
dev_put(dev);
}
} else
- dev = __dev_get_by_index(mreq.imr_ifindex);
+ dev = __dev_get_by_index(&init_net, mreq.imr_ifindex);
err = -EADDRNOTAVAIL;
@@ -659,7 +659,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
}
msf = kmalloc(optlen, GFP_KERNEL);
- if (msf == 0) {
+ if (!msf) {
err = -ENOBUFS;
break;
}
@@ -816,7 +816,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
break;
}
gsf = kmalloc(optlen,GFP_KERNEL);
- if (gsf == 0) {
+ if (!gsf) {
err = -ENOBUFS;
break;
}
@@ -836,7 +836,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
msf = kmalloc(msize,GFP_KERNEL);
- if (msf == 0) {
+ if (!msf) {
err = -ENOBUFS;
goto mc_msf_out;
}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index e787044a851..0bfeb02a5f8 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -75,7 +75,6 @@ out:
static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
{
int err = -ENOMEM;
- struct iphdr *iph;
struct ip_comp_hdr *ipch;
if (skb_linearize_cow(skb))
@@ -84,12 +83,14 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
skb->ip_summed = CHECKSUM_NONE;
/* Remove ipcomp header and decompress original payload */
- iph = ip_hdr(skb);
ipch = (void *)skb->data;
- iph->protocol = ipch->nexthdr;
skb->transport_header = skb->network_header + sizeof(*ipch);
__skb_pull(skb, sizeof(*ipch));
err = ipcomp_decompress(x, skb);
+ if (err)
+ goto out;
+
+ err = ipch->nexthdr;
out:
return err;
@@ -98,10 +99,9 @@ out:
static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
{
struct ipcomp_data *ipcd = x->data;
- const int ihlen = ip_hdrlen(skb);
- const int plen = skb->len - ihlen;
+ const int plen = skb->len;
int dlen = IPCOMP_SCRATCH_SIZE;
- u8 *start = skb->data + ihlen;
+ u8 *start = skb->data;
const int cpu = get_cpu();
u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
@@ -118,7 +118,7 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
put_cpu();
- pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
+ pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
return 0;
out:
@@ -131,12 +131,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
int err;
struct ip_comp_hdr *ipch;
struct ipcomp_data *ipcd = x->data;
- int hdr_len = 0;
- struct iphdr *iph = ip_hdr(skb);
- iph->tot_len = htons(skb->len);
- hdr_len = iph->ihl * 4;
- if ((skb->len - hdr_len) < ipcd->threshold) {
+ if (skb->len < ipcd->threshold) {
/* Don't bother compressing */
goto out_ok;
}
@@ -145,25 +141,19 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
goto out_ok;
err = ipcomp_compress(x, skb);
- iph = ip_hdr(skb);
if (err) {
goto out_ok;
}
/* Install ipcomp header, convert into ipcomp datagram. */
- iph->tot_len = htons(skb->len);
- ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
- ipch->nexthdr = iph->protocol;
+ ipch = ip_comp_hdr(skb);
+ ipch->nexthdr = *skb_mac_header(skb);
ipch->flags = 0;
ipch->cpi = htons((u16 )ntohl(x->id.spi));
- iph->protocol = IPPROTO_COMP;
- ip_send_check(iph);
- return 0;
-
+ *skb_mac_header(skb) = IPPROTO_COMP;
out_ok:
- if (x->props.mode == XFRM_MODE_TUNNEL)
- ip_send_check(iph);
+ skb_push(skb, -skb_network_offset(skb));
return 0;
}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index c5b24707753..c5c107a0182 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -55,6 +55,7 @@
#include <linux/root_dev.h>
#include <linux/delay.h>
#include <linux/nfs_fs.h>
+#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ipconfig.h>
@@ -189,11 +190,15 @@ static int __init ic_open_devs(void)
rtnl_lock();
/* bring loopback device up first */
- if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
- printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
+ for_each_netdev(&init_net, dev) {
+ if (!(dev->flags & IFF_LOOPBACK))
+ continue;
+ if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+ }
- for_each_netdev(dev) {
- if (dev == &loopback_dev)
+ for_each_netdev(&init_net, dev) {
+ if (dev->flags & IFF_LOOPBACK)
continue;
if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
(!(dev->flags & IFF_LOOPBACK) &&
@@ -425,6 +430,9 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
unsigned char *sha, *tha; /* s for "source", t for "target" */
struct ic_device *d;
+ if (dev->nd_net != &init_net)
+ goto drop;
+
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
return NET_RX_DROP;
@@ -749,8 +757,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/* Chain packet down the line... */
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- if ((dev->hard_header &&
- dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
+ if (dev_hard_header(skb, dev, ntohs(skb->protocol),
+ dev->broadcast, dev->dev_addr, skb->len) < 0 ||
dev_queue_xmit(skb) < 0)
printk("E");
}
@@ -834,6 +842,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
struct ic_device *d;
int len, ext_len;
+ if (dev->nd_net != &init_net)
+ goto drop;
+
/* Perform verifications before taking the lock. */
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
@@ -1253,7 +1264,7 @@ static int __init ip_auto_config(void)
__be32 addr;
#ifdef CONFIG_PROC_FS
- proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
+ proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 396437242a1..5cd5bbe1379 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -225,7 +225,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
int i;
for (i=1; i<100; i++) {
sprintf(name, "tunl%d", i);
- if (__dev_get_by_name(name) == NULL)
+ if (__dev_get_by_name(&init_net, name) == NULL)
break;
}
if (i==100)
@@ -237,7 +237,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c
return NULL;
nt = netdev_priv(dev);
- SET_MODULE_OWNER(dev);
dev->init = ipip_tunnel_init;
nt->parms = *parms;
@@ -775,7 +774,6 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
static void ipip_tunnel_setup(struct net_device *dev)
{
- SET_MODULE_OWNER(dev);
dev->uninit = ipip_tunnel_uninit;
dev->hard_start_xmit = ipip_tunnel_xmit;
dev->get_stats = ipip_tunnel_get_stats;
@@ -822,7 +820,7 @@ static int ipip_tunnel_init(struct net_device *dev)
}
if (!tdev && tunnel->parms.link)
- tdev = __dev_get_by_index(tunnel->parms.link);
+ tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
if (tdev) {
dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7003cc1b7fe..37bb497d92a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -49,6 +49,7 @@
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -124,7 +125,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
struct net_device *dev;
- dev = __dev_get_by_name("tunl0");
+ dev = __dev_get_by_name(&init_net, "tunl0");
if (dev) {
int err;
@@ -148,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
dev = NULL;
- if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
+ if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
in_dev = __in_dev_get_rtnl(dev);
@@ -1082,13 +1083,18 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
+ struct net_device *dev = ptr;
struct vif_device *v;
int ct;
+
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
v=&vif_table[0];
for (ct=0;ct<maxvif;ct++,v++) {
- if (v->dev==ptr)
+ if (v->dev==dev)
vif_delete(ct);
}
return NOTIFY_DONE;
@@ -1708,26 +1714,8 @@ static const struct seq_operations ipmr_vif_seq_ops = {
static int ipmr_vif_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &ipmr_vif_seq_ops);
- if (rc)
- goto out_kfree;
-
- s->ct = 0;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-
+ return seq_open_private(file, &ipmr_vif_seq_ops,
+ sizeof(struct ipmr_vif_iter));
}
static const struct file_operations ipmr_vif_fops = {
@@ -1871,25 +1859,8 @@ static const struct seq_operations ipmr_mfc_seq_ops = {
static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &ipmr_mfc_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
-
+ return seq_open_private(file, &ipmr_mfc_seq_ops,
+ sizeof(struct ipmr_mfc_iter));
}
static const struct file_operations ipmr_mfc_fops = {
@@ -1922,7 +1893,7 @@ void __init ip_mr_init(void)
ipmr_expire_timer.function=ipmr_expire_process;
register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
- proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
- proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
+ proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops);
+ proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}
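
The two ipmr open handlers (and several below: ip_vs_ctl, ipt_recent, the nf_conntrack compat files) collapse identical allocate/seq_open/attach boilerplate into seq_open_private(). A condensed sketch of what that fs/seq_file.c helper does, mirroring the code deleted above rather than quoting the implementation:

	#include <linux/seq_file.h>
	#include <linux/slab.h>

	static int seq_open_private_sketch(struct file *file,
					   const struct seq_operations *ops,
					   int psize)
	{
		void *priv = kzalloc(psize, GFP_KERNEL);
		int rc;

		if (!priv)
			return -ENOMEM;

		rc = seq_open(file, ops);
		if (rc) {
			kfree(priv);
			return rc;
		}

		/* hang the zeroed iterator state off the seq_file */
		((struct seq_file *)file->private_data)->private = priv;
		return 0;
	}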
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index 8d6901d4e94..341474eefa5 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -25,6 +25,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <asm/system.h>
@@ -616,12 +617,12 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
int ip_vs_app_init(void)
{
/* we will replace it with proc_net_ipvs_create() soon */
- proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
+ proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
return 0;
}
void ip_vs_app_cleanup(void)
{
- proc_net_remove("ip_vs_app");
+ proc_net_remove(&init_net, "ip_vs_app");
}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d612a6a5d95..4b702f708d3 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -35,6 +35,7 @@
#include <linux/jhash.h>
#include <linux/random.h>
+#include <net/net_namespace.h>
#include <net/ip_vs.h>
@@ -922,7 +923,7 @@ int ip_vs_conn_init(void)
rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
- proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
+ proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
@@ -938,6 +939,6 @@ void ip_vs_conn_cleanup(void)
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- proc_net_remove("ip_vs_conn");
+ proc_net_remove(&init_net, "ip_vs_conn");
vfree(ip_vs_conn_tab);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index f005a2f929f..fbca2a2ff29 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -961,7 +961,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
* ... don't know why 1st test DOES NOT include 2nd (?)
*/
if (unlikely(skb->pkt_type != PACKET_HOST
- || skb->dev == &loopback_dev || skb->sk)) {
+ || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
skb->pkt_type,
ip_hdr(skb)->protocol,
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index f656d41d8d4..7345fc252a2 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>
+#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/sock.h>
@@ -1791,24 +1792,8 @@ static const struct seq_operations ip_vs_info_seq_ops = {
static int ip_vs_info_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);
-
- if (!s)
- goto out;
-
- rc = seq_open(file, &ip_vs_info_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &ip_vs_info_seq_ops,
+ sizeof(struct ip_vs_iter));
}
static const struct file_operations ip_vs_info_fops = {
@@ -2356,8 +2341,8 @@ int ip_vs_control_init(void)
return ret;
}
- proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
- proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
+ proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
+ proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
sysctl_header = register_sysctl_table(vs_root_table);
@@ -2390,8 +2375,8 @@ void ip_vs_control_cleanup(void)
cancel_work_sync(&defense_work.work);
ip_vs_kill_estimator(&ip_vs_stats);
unregister_sysctl_table(sysctl_header);
- proc_net_remove("ip_vs_stats");
- proc_net_remove("ip_vs");
+ proc_net_remove(&init_net, "ip_vs_stats");
+ proc_net_remove(&init_net, "ip_vs");
nf_unregister_sockopt(&ip_vs_sockopts);
LeaveFunction(2);
}
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 6225acac7a3..6a1fec416ea 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -50,6 +50,7 @@
#include <linux/sysctl.h>
/* for proc_net_create/proc_net_remove */
#include <linux/proc_fs.h>
+#include <net/net_namespace.h>
#include <net/ip_vs.h>
@@ -843,7 +844,7 @@ static int __init ip_vs_lblcr_init(void)
INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
sysctl_header = register_sysctl_table(lblcr_root_table);
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
- proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
+ proc_net_create(&init_net, "ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
#endif
return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}
@@ -852,7 +853,7 @@ static int __init ip_vs_lblcr_init(void)
static void __exit ip_vs_lblcr_cleanup(void)
{
#ifdef CONFIG_IP_VS_LBLCR_DEBUG
- proc_net_remove("ip_vs_lblcr");
+ proc_net_remove(&init_net, "ip_vs_lblcr");
#endif
unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 356f067484e..1960747f354 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -387,7 +387,7 @@ static int set_mcast_if(struct sock *sk, char *ifname)
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
- if ((dev = __dev_get_by_name(ifname)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -412,7 +412,7 @@ static int set_sync_mesg_maxlen(int sync_state)
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
@@ -423,7 +423,7 @@ static int set_sync_mesg_maxlen(int sync_state)
IP_VS_DBG(7, "setting the maximum length of sync sending "
"message %d.\n", sync_send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
return -ENODEV;
sync_recv_mesg_maxlen = dev->mtu -
@@ -451,7 +451,7 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(ifname)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -472,7 +472,7 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(ifname)) == NULL)
+ if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 702d94db19b..23cbfc7c80f 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -24,6 +24,7 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/mutex.h>
+#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/route.h>
@@ -249,10 +250,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
if (entry->info->indev && entry->skb->dev) {
pmsg->hw_type = entry->skb->dev->type;
- if (entry->skb->dev->hard_header_parse)
- pmsg->hw_addrlen =
- entry->skb->dev->hard_header_parse(entry->skb,
- pmsg->hw_addr);
+ pmsg->hw_addrlen = dev_parse_header(entry->skb,
+ pmsg->hw_addr);
}
if (data_len)
@@ -476,7 +475,7 @@ ipq_dev_drop(int ifindex)
#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
static inline void
-ipq_rcv_skb(struct sk_buff *skb)
+__ipq_rcv_skb(struct sk_buff *skb)
{
int status, type, pid, flags, nlmsglen, skblen;
struct nlmsghdr *nlh;
@@ -534,19 +533,10 @@ ipq_rcv_skb(struct sk_buff *skb)
}
static void
-ipq_rcv_sk(struct sock *sk, int len)
+ipq_rcv_skb(struct sk_buff *skb)
{
- struct sk_buff *skb;
- unsigned int qlen;
-
mutex_lock(&ipqnl_mutex);
-
- for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
- skb = skb_dequeue(&sk->sk_receive_queue);
- ipq_rcv_skb(skb);
- kfree_skb(skb);
- }
-
+ __ipq_rcv_skb(skb);
mutex_unlock(&ipqnl_mutex);
}
@@ -556,6 +546,9 @@ ipq_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
ipq_dev_drop(dev->ifindex);
@@ -575,7 +568,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
if (event == NETLINK_URELEASE &&
n->protocol == NETLINK_FIREWALL && n->pid) {
write_lock_bh(&queue_lock);
- if (n->pid == peer_pid)
+ if ((n->net == &init_net) && (n->pid == peer_pid))
__ipq_reset();
write_unlock_bh(&queue_lock);
}
@@ -667,14 +660,14 @@ static int __init ip_queue_init(void)
struct proc_dir_entry *proc;
netlink_register_notifier(&ipq_nl_notifier);
- ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
- NULL, THIS_MODULE);
+ ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0,
+ ipq_rcv_skb, NULL, THIS_MODULE);
if (ipqnl == NULL) {
printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
- proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
+ proc = proc_net_create(&init_net, IPQ_PROC_FS_NAME, 0, ipq_get_info);
if (proc)
proc->owner = THIS_MODULE;
else {
@@ -695,8 +688,7 @@ static int __init ip_queue_init(void)
cleanup_sysctl:
unregister_sysctl_table(ipq_sysctl_header);
unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
-
+ proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
cleanup_ipqnl:
sock_release(ipqnl->sk_socket);
mutex_lock(&ipqnl_mutex);
@@ -715,7 +707,7 @@ static void __exit ip_queue_fini(void)
unregister_sysctl_table(ipq_sysctl_header);
unregister_netdevice_notifier(&ipq_dev_notifier);
- proc_net_remove(IPQ_PROC_FS_NAME);
+ proc_net_remove(&init_net, IPQ_PROC_FS_NAME);
sock_release(ipqnl->sk_socket);
mutex_lock(&ipqnl_mutex);
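
ip_queue shows the netlink conversion most completely: netlink_kernel_create() gains a leading struct net * argument, and the input callback now receives one skb at a time instead of draining sk_receive_queue in a loop. A sketch of the new pairing, modeled on the inet_diag handler above (NETLINK_EXAMPLE and all names are hypothetical):

	#include <linux/module.h>
	#include <linux/mutex.h>
	#include <linux/netlink.h>
	#include <net/netlink.h>
	#include <net/net_namespace.h>

	#define NETLINK_EXAMPLE 31	/* hypothetical protocol number */

	static DEFINE_MUTEX(example_mutex);

	static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
	{
		return 0;	/* decode nlh and act on it here */
	}

	static void example_rcv(struct sk_buff *skb)
	{
		mutex_lock(&example_mutex);
		netlink_rcv_skb(skb, &example_rcv_msg);
		mutex_unlock(&example_mutex);
	}

	static int __init example_init(void)
	{
		struct sock *nlsk;

		nlsk = netlink_kernel_create(&init_net, NETLINK_EXAMPLE, 0,
					     example_rcv, NULL, THIS_MODULE);
		return nlsk ? 0 : -ENOMEM;
	}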
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 69bd362b5fa..27f14e1ebd8 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -25,6 +25,7 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/net_namespace.h>
#include <net/checksum.h>
#define CLUSTERIP_VERSION "0.8"
@@ -400,7 +401,7 @@ checkentry(const char *tablename,
return false;
}
- dev = dev_get_by_name(e->ip.iniface);
+ dev = dev_get_by_name(&init_net, e->ip.iniface);
if (!dev) {
printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
return false;
@@ -726,7 +727,7 @@ static int __init ipt_clusterip_init(void)
goto cleanup_target;
#ifdef CONFIG_PROC_FS
- clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
+ clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
if (!clusterip_procdir) {
printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
ret = -ENOMEM;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 7c4e4be7c8b..3e0b562b2db 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -125,6 +125,9 @@ static int masq_device_event(struct notifier_block *this,
{
const struct net_device *dev = ptr;
+ if (dev->nd_net != &init_net)
+ return NOTIFY_DONE;
+
if (event == NETDEV_DOWN) {
/* Device was downed. Search entire table for
conntracks which were associated with that device,
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6ca43e4ca7e..c636d6d6357 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -409,7 +409,8 @@ static int __init ipt_ulog_init(void)
for (i = 0; i < ULOG_MAXNLGROUPS; i++)
setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
- nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+ nflognl = netlink_kernel_create(&init_net,
+ NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
NULL, THIS_MODULE);
if (!nflognl)
return -ENOMEM;
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 6d0c0f7364a..11d39fb5f38 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -24,6 +24,7 @@
#include <linux/bitops.h>
#include <linux/skbuff.h>
#include <linux/inet.h>
+#include <net/net_namespace.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_recent.h>
@@ -380,25 +381,14 @@ static const struct seq_operations recent_seq_ops = {
static int recent_seq_open(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
- struct seq_file *seq;
struct recent_iter_state *st;
- int ret;
- st = kzalloc(sizeof(*st), GFP_KERNEL);
+ st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
if (st == NULL)
return -ENOMEM;
- ret = seq_open(file, &recent_seq_ops);
- if (ret) {
- kfree(st);
- goto out;
- }
-
st->table = pde->data;
- seq = file->private_data;
- seq->private = st;
-out:
- return ret;
+ return 0;
}
static ssize_t recent_proc_write(struct file *file, const char __user *input,
@@ -487,7 +477,7 @@ static int __init ipt_recent_init(void)
#ifdef CONFIG_PROC_FS
if (err)
return err;
- proc_dir = proc_mkdir("ipt_recent", proc_net);
+ proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
if (proc_dir == NULL) {
xt_unregister_match(&recent_match);
err = -ENOMEM;
@@ -501,7 +491,7 @@ static void __exit ipt_recent_exit(void)
BUG_ON(!list_empty(&tables));
xt_unregister_match(&recent_match);
#ifdef CONFIG_PROC_FS
- remove_proc_entry("ipt_recent", proc_net);
+ remove_proc_entry("ipt_recent", init_net.proc_net);
#endif
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index f813e02aab3..2fcb9249a8d 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -360,35 +360,32 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
-static int ipv4_tuple_to_nfattr(struct sk_buff *skb,
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t),
+ NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t),
&tuple->src.u3.ip);
- NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t),
+ NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t),
&tuple->dst.u3.ip);
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
-static const size_t cta_min_ip[CTA_IP_MAX] = {
- [CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
- [CTA_IP_V4_DST-1] = sizeof(u_int32_t),
+static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
+ [CTA_IP_V4_SRC] = { .type = NLA_U32 },
+ [CTA_IP_V4_DST] = { .type = NLA_U32 },
};
-static int ipv4_nfattr_to_tuple(struct nfattr *tb[],
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
struct nf_conntrack_tuple *t)
{
- if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1])
+ if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
return -EINVAL;
- if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
- return -EINVAL;
-
- t->src.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
- t->dst.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+ t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]);
return 0;
}
@@ -411,8 +408,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.print_conntrack = ipv4_print_conntrack,
.get_l4proto = ipv4_get_l4proto,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = ipv4_tuple_to_nfattr,
- .nfattr_to_tuple = ipv4_nfattr_to_tuple,
+ .tuple_to_nlattr = ipv4_tuple_to_nlattr,
+ .nlattr_to_tuple = ipv4_nlattr_to_tuple,
+ .nla_policy = ipv4_nla_policy,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
.ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
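
This file and the conntrack/NAT files below move from the old NFA_* helpers to the generic netlink attribute API. Two points worth spelling out: NLA_PUT() hides a goto to the nla_put_failure label on buffer overflow, and the hand-rolled cta_min_ip[] size table is replaced by an nla_policy that the netlink core checks while parsing. Approximately (per include/net/netlink.h of this era):

	/* what NLA_PUT() expands to, approximately */
	#define NLA_PUT_SKETCH(skb, attrtype, attrlen, data)		\
	do {								\
		if (nla_put(skb, attrtype, attrlen, data))		\
			goto nla_put_failure;				\
	} while (0)

A policy entry such as { .type = NLA_U32 } tells nla_parse() to reject attributes shorter than sizeof(u32), which is exactly what the deleted nfattr_bad_size() calls used to verify by hand.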
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index b3dd5de9a25..741f3dfaa5a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/percpu.h>
+#include <net/net_namespace.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -173,22 +174,8 @@ static const struct seq_operations ct_seq_ops = {
static int ct_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- struct ct_iter_state *st;
- int ret;
-
- st = kzalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
- if (st == NULL)
- return -ENOMEM;
- ret = seq_open(file, &ct_seq_ops);
- if (ret)
- goto out_free;
- seq = file->private_data;
- seq->private = st;
- return ret;
-out_free:
- kfree(st);
- return ret;
+ return seq_open_private(file, &ct_seq_ops,
+ sizeof(struct ct_iter_state));
}
static const struct file_operations ct_file_ops = {
@@ -290,22 +277,8 @@ static const struct seq_operations exp_seq_ops = {
static int exp_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- struct ct_expect_iter_state *st;
- int ret;
-
- st = kzalloc(sizeof(struct ct_expect_iter_state), GFP_KERNEL);
- if (!st)
- return -ENOMEM;
- ret = seq_open(file, &exp_seq_ops);
- if (ret)
- goto out_free;
- seq = file->private_data;
- seq->private = st;
- return ret;
-out_free:
- kfree(st);
- return ret;
+ return seq_open_private(file, &exp_seq_ops,
+ sizeof(struct ct_expect_iter_state));
}
static const struct file_operations ip_exp_file_ops = {
@@ -408,16 +381,16 @@ int __init nf_conntrack_ipv4_compat_init(void)
{
struct proc_dir_entry *proc, *proc_exp, *proc_stat;
- proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
+ proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops);
if (!proc)
goto err1;
- proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
+ proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440,
&ip_exp_file_ops);
if (!proc_exp)
goto err2;
- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
+ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat);
if (!proc_stat)
goto err3;
@@ -427,16 +400,16 @@ int __init nf_conntrack_ipv4_compat_init(void)
return 0;
err3:
- proc_net_remove("ip_conntrack_expect");
+ proc_net_remove(&init_net, "ip_conntrack_expect");
err2:
- proc_net_remove("ip_conntrack");
+ proc_net_remove(&init_net, "ip_conntrack");
err1:
return -ENOMEM;
}
void __exit nf_conntrack_ipv4_compat_fini(void)
{
- remove_proc_entry("ip_conntrack", proc_net_stat);
- proc_net_remove("ip_conntrack_expect");
- proc_net_remove("ip_conntrack");
+ remove_proc_entry("ip_conntrack", init_net.proc_net_stat);
+ proc_net_remove(&init_net, "ip_conntrack_expect");
+ proc_net_remove(&init_net, "ip_conntrack");
}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 6593fd2c5b1..11fedc73049 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -232,45 +232,42 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
-static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+static int icmp_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *t)
{
- NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+ NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
&t->src.u.icmp.id);
- NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+ NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
&t->dst.u.icmp.type);
- NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+ NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
&t->dst.u.icmp.code);
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
-static const size_t cta_min_proto[CTA_PROTO_MAX] = {
- [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t),
- [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t)
+static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 },
+ [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 },
};
-static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+static int icmp_nlattr_to_tuple(struct nlattr *tb[],
struct nf_conntrack_tuple *tuple)
{
- if (!tb[CTA_PROTO_ICMP_TYPE-1]
- || !tb[CTA_PROTO_ICMP_CODE-1]
- || !tb[CTA_PROTO_ICMP_ID-1])
- return -EINVAL;
-
- if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+ if (!tb[CTA_PROTO_ICMP_TYPE]
+ || !tb[CTA_PROTO_ICMP_CODE]
+ || !tb[CTA_PROTO_ICMP_ID])
return -EINVAL;
tuple->dst.u.icmp.type =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+ *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]);
tuple->dst.u.icmp.code =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+ *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]);
tuple->src.u.icmp.id =
- *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+ *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]);
if (tuple->dst.u.icmp.type >= sizeof(invmap)
|| !invmap[tuple->dst.u.icmp.type])
@@ -327,8 +324,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.destroy = NULL,
.me = NULL,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .tuple_to_nfattr = icmp_tuple_to_nfattr,
- .nfattr_to_tuple = icmp_nfattr_to_tuple,
+ .tuple_to_nlattr = icmp_tuple_to_nlattr,
+ .nlattr_to_tuple = icmp_nlattr_to_tuple,
+ .nla_policy = icmp_nla_policy,
#endif
#ifdef CONFIG_SYSCTL
.ctl_table_header = &icmp_sysctl_header,
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index deab27facba..7221aa20e6f 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -544,46 +544,46 @@ EXPORT_SYMBOL(nf_nat_protocol_unregister);
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
int
-nf_nat_port_range_to_nfattr(struct sk_buff *skb,
+nf_nat_port_range_to_nlattr(struct sk_buff *skb,
const struct nf_nat_range *range)
{
- NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
+ NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16),
&range->min.tcp.port);
- NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
+ NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16),
&range->max.tcp.port);
return 0;
-nfattr_failure:
+nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nf_nat_port_nfattr_to_range);
+EXPORT_SYMBOL_GPL(nf_nat_port_nlattr_to_range);
int
-nf_nat_port_nfattr_to_range(struct nfattr *tb[], struct nf_nat_range *range)
+nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range)
{
int ret = 0;
/* we have to return whether we actually parsed something or not */
- if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+ if (tb[CTA_PROTONAT_PORT_MIN]) {
ret = 1;
range->min.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+ *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]);
}
- if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+ if (!tb[CTA_PROTONAT_PORT_MAX]) {
if (ret)
range->max.tcp.port = range->min.tcp.port;
} else {
ret = 1;
range->max.tcp.port =
- *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+ *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]);
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nfattr);
+EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nlattr);
#endif
/* No one is using conntrack by the time this is called. */
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 2e40cc83526..d562290b182 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -142,8 +142,8 @@ static struct nf_nat_protocol gre __read_mostly = {
.in_range = gre_in_range,
.unique_tuple = gre_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nfattr = nf_nat_port_range_to_nfattr,
- .nfattr_to_range = nf_nat_port_nfattr_to_range,
+ .range_to_nlattr = nf_nat_port_range_to_nlattr,
+ .nlattr_to_range = nf_nat_port_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index f71ef9b5f42..898d7377115 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -79,7 +79,7 @@ struct nf_nat_protocol nf_nat_protocol_icmp = {
.in_range = icmp_in_range,
.unique_tuple = icmp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nfattr = nf_nat_port_range_to_nfattr,
- .nfattr_to_range = nf_nat_port_nfattr_to_range,
+ .range_to_nlattr = nf_nat_port_range_to_nlattr,
+ .nlattr_to_range = nf_nat_port_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 123c95913f2..5bbbb2acdc7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -145,7 +145,7 @@ struct nf_nat_protocol nf_nat_protocol_tcp = {
.in_range = tcp_in_range,
.unique_tuple = tcp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nfattr = nf_nat_port_range_to_nfattr,
- .nfattr_to_range = nf_nat_port_nfattr_to_range,
+ .range_to_nlattr = nf_nat_port_range_to_nlattr,
+ .nlattr_to_range = nf_nat_port_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 1c4c70e25cd..a0af4fd9558 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -135,7 +135,7 @@ struct nf_nat_protocol nf_nat_protocol_udp = {
.in_range = udp_in_range,
.unique_tuple = udp_unique_tuple,
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
- .range_to_nfattr = nf_nat_port_range_to_nfattr,
- .nfattr_to_range = nf_nat_port_nfattr_to_range,
+ .range_to_nlattr = nf_nat_port_range_to_nlattr,
+ .nlattr_to_range = nf_nat_port_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3b690cf2a4e..e5b05b03910 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -34,6 +34,7 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
@@ -123,33 +124,30 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
static const struct snmp_mib snmp4_icmp_list[] = {
SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
- SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
- SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
- SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
- SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
- SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
- SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
- SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
- SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
- SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
- SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
- SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
- SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
- SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
- SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
- SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
- SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
- SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
- SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
- SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
- SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
- SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
- SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
SNMP_MIB_SENTINEL
};
+static struct {
+ char *name;
+ int index;
+} icmpmibmap[] = {
+ { "DestUnreachs", ICMP_DEST_UNREACH },
+ { "TimeExcds", ICMP_TIME_EXCEEDED },
+ { "ParmProbs", ICMP_PARAMETERPROB },
+ { "SrcQuenchs", ICMP_SOURCE_QUENCH },
+ { "Redirects", ICMP_REDIRECT },
+ { "Echos", ICMP_ECHO },
+ { "EchoReps", ICMP_ECHOREPLY },
+ { "Timestamps", ICMP_TIMESTAMP },
+ { "TimestampReps", ICMP_TIMESTAMPREPLY },
+ { "AddrMasks", ICMP_ADDRESS },
+ { "AddrMaskReps", ICMP_ADDRESSREPLY },
+ { NULL, 0 }
+};
+
+
static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
@@ -244,9 +242,79 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+ SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
+ SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
SNMP_MIB_SENTINEL
};
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE 16
+
+ int j, i, count;
+ static int out[PERLINE];
+
+ count = 0;
+ for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+
+ if (snmp_fold_field((void **) icmpmsg_statistics, i))
+ out[count++] = i;
+ if (count < PERLINE)
+ continue;
+
+ seq_printf(seq, "\nIcmpMsg:");
+ for (j = 0; j < PERLINE; ++j)
+ seq_printf(seq, " %sType%u",
+ out[j] & 0x100 ? "Out" : "In",
+ out[j] & 0xff);
+ seq_printf(seq, "\nIcmpMsg: ");
+ for (j = 0; j < PERLINE; ++j)
+ seq_printf(seq, " %lu",
+ snmp_fold_field((void **) icmpmsg_statistics,
+ out[j]));
+ seq_putc(seq, '\n');
+ count = 0;
+ }
+ if (count) {
+ seq_printf(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" :
+ "In", out[j] & 0xff);
+ seq_printf(seq, "\nIcmpMsg:");
+ for (j = 0; j < count; ++j)
+ seq_printf(seq, " %lu", snmp_fold_field((void **)
+ icmpmsg_statistics, out[j]));
+ }
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+ int i;
+
+ seq_puts(seq, "\nIcmp: InMsgs InErrors");
+ for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " In%s", icmpmibmap[i].name);
+ seq_printf(seq, " OutMsgs OutErrors");
+ for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " Out%s", icmpmibmap[i].name);
+ seq_printf(seq, "\nIcmp: %lu %lu",
+ snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS));
+ for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field((void **) icmpmsg_statistics,
+ icmpmibmap[i].index));
+ seq_printf(seq, " %lu %lu",
+ snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS));
+ for (i=0; icmpmibmap[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field((void **) icmpmsg_statistics,
+ icmpmibmap[i].index));
+}
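
Taken together, icmp_put() keeps the RFC 2011 style "Icmp:" header/value pair in /proc/net/snmp (now generated from icmpmibmap instead of the trimmed snmp4_icmp_list), while icmpmsg_put() emits "IcmpMsg:" lines listing only the message types with non-zero counters, sixteen per line; bit 0x100 of the index distinguishes Out from In. A hypothetical excerpt, with invented counter values and the middle of the Icmp header elided:

    Icmp: InMsgs InErrors InDestUnreachs ... OutAddrMasks OutAddrMaskReps
    Icmp: 104 0 21 ... 0 0
    IcmpMsg: InType3 InType8 OutType0 OutType3
    IcmpMsg: 21 83 83 17

Here InType3/OutType3 are the destination-unreachable counters and InType8/OutType0 the echo request/reply pair.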
+
/*
* Called from the PROCfs module. This outputs /proc/net/snmp.
*/
@@ -267,15 +335,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
snmp_fold_field((void **)ip_statistics,
snmp4_ipstats_list[i].entry));
- seq_puts(seq, "\nIcmp:");
- for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
- seq_printf(seq, " %s", snmp4_icmp_list[i].name);
-
- seq_puts(seq, "\nIcmp:");
- for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field((void **)icmp_statistics,
- snmp4_icmp_list[i].entry));
+ icmp_put(seq); /* RFC 2011 compatibility */
+ icmpmsg_put(seq);
seq_puts(seq, "\nTcp:");
for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
@@ -332,6 +393,8 @@ static const struct file_operations snmp_seq_fops = {
.release = single_release,
};
+
+
/*
* Output /proc/net/netstat
*/
@@ -380,20 +443,20 @@ int __init ip_misc_proc_init(void)
{
int rc = 0;
- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
+ if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops))
goto out_netstat;
- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
+ if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops))
goto out_snmp;
- if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
+ if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops))
goto out_sockstat;
out:
return rc;
out_sockstat:
- proc_net_remove("snmp");
+ proc_net_remove(&init_net, "snmp");
out_snmp:
- proc_net_remove("netstat");
+ proc_net_remove(&init_net, "netstat");
out_netstat:
rc = -ENOMEM;
goto out;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index c6d71526f62..3916faca3af 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,6 +59,7 @@
#include <linux/in_route.h>
#include <linux/route.h>
#include <linux/skbuff.h>
+#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
#include <linux/gfp.h>
@@ -313,6 +314,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+ if (iph->protocol == IPPROTO_ICMP)
+ icmp_out_count(((struct icmphdr *)
+ skb_transport_header(skb))->type);
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
dst_output);
@@ -898,24 +902,8 @@ static const struct seq_operations raw_seq_ops = {
static int raw_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct raw_iter_state *s;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- goto out;
- rc = seq_open(file, &raw_seq_ops);
- if (rc)
- goto out_kfree;
-
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &raw_seq_ops,
+ sizeof(struct raw_iter_state));
}
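
seq_open_private() collapses the allocate/seq_open/attach dance that the removed lines spelled out by hand. A minimal sketch of the pattern (the foo_* names are hypothetical; the release side presumably stays seq_release_private(), which frees seq->private):

    static int foo_seq_open(struct inode *inode, struct file *file)
    {
        /* Allocates a zeroed private area of the given size,
         * calls seq_open(), and stores the allocation in
         * seq->private; on failure it unwinds and returns the
         * error, exactly like the old open-coded version.
         */
        return seq_open_private(file, &foo_seq_ops,
                                sizeof(struct foo_iter_state));
    }

The identical conversion appears below for rt_cache_seq_open() in route.c.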
static const struct file_operations raw_seq_fops = {
@@ -928,13 +916,13 @@ static const struct file_operations raw_seq_fops = {
int __init raw_proc_init(void)
{
- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
+ if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops))
return -ENOMEM;
return 0;
}
void __init raw_proc_exit(void)
{
- proc_net_remove("raw");
+ proc_net_remove(&init_net, "raw");
}
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c7ca94bd152..21b12de9e65 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -81,6 +81,7 @@
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
+#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
@@ -91,6 +92,7 @@
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -135,7 +137,8 @@ static unsigned long rt_deadline;
#define RTprint(a...) printk(KERN_DEBUG a)
static struct timer_list rt_flush_timer;
-static struct timer_list rt_periodic_timer;
+static void rt_check_expire(struct work_struct *work);
+static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
static struct timer_list rt_secret_timer;
/*
@@ -243,7 +246,7 @@ static spinlock_t *rt_hash_locks;
static struct rt_hash_bucket *rt_hash_table;
static unsigned rt_hash_mask;
-static int rt_hash_log;
+static unsigned int rt_hash_log;
static unsigned int rt_hash_rnd;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
@@ -372,23 +375,8 @@ static const struct seq_operations rt_cache_seq_ops = {
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int rc = -ENOMEM;
- struct rt_cache_iter_state *s;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- goto out;
- rc = seq_open(file, &rt_cache_seq_ops);
- if (rc)
- goto out_kfree;
- seq = file->private_data;
- seq->private = s;
-out:
- return rc;
-out_kfree:
- kfree(s);
- goto out;
+ return seq_open_private(file, &rt_cache_seq_ops,
+ sizeof(struct rt_cache_iter_state));
}
static const struct file_operations rt_cache_seq_fops = {
@@ -571,33 +559,32 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
(fl1->iif ^ fl2->iif)) == 0;
}
-/* This runs via a timer and thus is always in BH context. */
-static void rt_check_expire(unsigned long dummy)
+static void rt_check_expire(struct work_struct *work)
{
static unsigned int rover;
unsigned int i = rover, goal;
struct rtable *rth, **rthp;
- unsigned long now = jiffies;
u64 mult;
mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
if (ip_rt_gc_timeout > 1)
do_div(mult, ip_rt_gc_timeout);
goal = (unsigned int)mult;
- if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
+ if (goal > rt_hash_mask)
+ goal = rt_hash_mask + 1;
for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
i = (i + 1) & rt_hash_mask;
rthp = &rt_hash_table[i].chain;
- if (*rthp == 0)
+ if (*rthp == NULL)
continue;
- spin_lock(rt_hash_lock_addr(i));
+ spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
- if (time_before_eq(now, rth->u.dst.expires)) {
+ if (time_before_eq(jiffies, rth->u.dst.expires)) {
tmo >>= 1;
rthp = &rth->u.dst.rt_next;
continue;
@@ -612,14 +599,10 @@ static void rt_check_expire(unsigned long dummy)
*rthp = rth->u.dst.rt_next;
rt_free(rth);
}
- spin_unlock(rt_hash_lock_addr(i));
-
- /* Fallback loop breaker. */
- if (time_after(jiffies, now))
- break;
+ spin_unlock_bh(rt_hash_lock_addr(i));
}
rover = i;
- mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
+ schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/* This can run from both BH and non-BH contexts, the latter
@@ -1404,8 +1387,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
struct rtable *rt = (struct rtable *) dst;
struct in_device *idev = rt->idev;
- if (dev != &loopback_dev && idev && idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(&loopback_dev);
+ if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
+ struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
if (loopback_idev) {
rt->idev = loopback_idev;
in_dev_put(idev);
@@ -1557,7 +1540,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = &loopback_dev;
+ rth->u.dst.dev = init_net.loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->fl.oif = 0;
@@ -1814,7 +1797,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
int result;
result = fib_validate_source(saddr, daddr, tos,
- loopback_dev.ifindex,
+ init_net.loopback_dev->ifindex,
dev, &spec_dst, &itag);
if (result < 0)
goto martian_source;
@@ -1881,7 +1864,7 @@ local_input:
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = &loopback_dev;
+ rth->u.dst.dev = init_net.loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
@@ -2151,7 +2134,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
RT_SCOPE_UNIVERSE),
} },
.mark = oldflp->mark,
- .iif = loopback_dev.ifindex,
+ .iif = init_net.loopback_dev->ifindex,
.oif = oldflp->oif };
struct fib_result res;
unsigned flags = 0;
@@ -2212,7 +2195,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
if (oldflp->oif) {
- dev_out = dev_get_by_index(oldflp->oif);
+ dev_out = dev_get_by_index(&init_net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;
@@ -2245,9 +2228,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
if (dev_out)
dev_put(dev_out);
- dev_out = &loopback_dev;
+ dev_out = init_net.loopback_dev;
dev_hold(dev_out);
- fl.oif = loopback_dev.ifindex;
+ fl.oif = init_net.loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2292,7 +2275,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
fl.fl4_src = fl.fl4_dst;
if (dev_out)
dev_put(dev_out);
- dev_out = &loopback_dev;
+ dev_out = init_net.loopback_dev;
dev_hold(dev_out);
fl.oif = dev_out->ifindex;
if (res.fi)
@@ -2591,7 +2574,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (iif) {
struct net_device *dev;
- dev = __dev_get_by_index(iif);
+ dev = __dev_get_by_index(&init_net, iif);
if (dev == NULL) {
err = -ENODEV;
goto errout_free;
@@ -2992,17 +2975,14 @@ int __init ip_rt_init(void)
init_timer(&rt_flush_timer);
rt_flush_timer.function = rt_run_flush;
- init_timer(&rt_periodic_timer);
- rt_periodic_timer.function = rt_check_expire;
init_timer(&rt_secret_timer);
rt_secret_timer.function = rt_secret_rebuild;
/* All the timers started at system startup tend
to synchronize. Perturb them a bit.
*/
- rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
- ip_rt_gc_interval;
- add_timer(&rt_periodic_timer);
+ schedule_delayed_work(&expires_work,
+ net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
ip_rt_secret_interval;
@@ -3011,15 +2991,15 @@ int __init ip_rt_init(void)
#ifdef CONFIG_PROC_FS
{
struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
+ if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
!(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
- proc_net_stat))) {
+ init_net.proc_net_stat))) {
return -ENOMEM;
}
rtstat_pde->proc_fops = &rt_cpu_seq_fops;
}
#ifdef CONFIG_NET_CLS_ROUTE
- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
+ create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
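
The rt_periodic_timer removal above is the standard timer-to-workqueue conversion: rt_check_expire() now runs in process context (hence spin_lock_bh() instead of spin_lock(), and no need for the jiffies-based loop breaker), and it re-arms itself with schedule_delayed_work() in place of mod_timer(). A condensed sketch of the idiom, under hypothetical names:

    #include <linux/workqueue.h>

    #define SCAN_INTERVAL (10 * HZ)   /* delay argument is in jiffies */

    static void scan_handler(struct work_struct *work);
    static DECLARE_DELAYED_WORK(scan_work, scan_handler);

    static void scan_handler(struct work_struct *work)
    {
        /* ... walk and expire entries in process context ... */

        /* re-arm, like mod_timer(t, jiffies + SCAN_INTERVAL) */
        schedule_delayed_work(&scan_work, SCAN_INTERVAL);
    }

The initial schedule_delayed_work() call from ip_rt_init(), perturbed by net_random(), plays the role of the old add_timer().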
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 53ef0f4bbda..eb286abcf5d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
#include <linux/sysctl.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
+#include <linux/seqlock.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -89,6 +90,74 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
return 1;
}
+extern seqlock_t sysctl_port_range_lock;
+extern int sysctl_local_port_range[2];
+
+/* Update system visible IP port range */
+static void set_local_port_range(int range[2])
+{
+ write_seqlock(&sysctl_port_range_lock);
+ sysctl_local_port_range[0] = range[0];
+ sysctl_local_port_range[1] = range[1];
+ write_sequnlock(&sysctl_port_range_lock);
+}
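
The write side above pairs with lockless readers that retry if a writer intervened; the port allocation paths (note the inet_connection_sock.c entry in the diffstat) presumably read under the same seqlock. A minimal sketch of the matching read side, assuming the same two globals:

    static void get_local_port_range(int *low, int *high)
    {
        unsigned seq;

        /* retry until the copy was not raced by a writer */
        do {
            seq = read_seqbegin(&sysctl_port_range_lock);
            *low = sysctl_local_port_range[0];
            *high = sysctl_local_port_range[1];
        } while (read_seqretry(&sysctl_port_range_lock, seq));
    }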
+
+/* Validate changes from /proc interface. */
+static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+ int range[2] = { sysctl_local_port_range[0],
+ sysctl_local_port_range[1] };
+ ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &ip_local_port_range_min,
+ .extra2 = &ip_local_port_range_max,
+ };
+
+ ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ if (range[1] <= range[0])
+ ret = -EINVAL;
+ else
+ set_local_port_range(range);
+ }
+
+ return ret;
+}
+
+/* Validate changes from sysctl interface. */
+static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
+ int nlen, void __user *oldval,
+ size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ int ret;
+ int range[2] = { sysctl_local_port_range[0],
+ sysctl_local_port_range[1] };
+ ctl_table tmp = {
+ .data = &range,
+ .maxlen = sizeof(range),
+ .mode = table->mode,
+ .extra1 = &ip_local_port_range_min,
+ .extra2 = &ip_local_port_range_max,
+ };
+
+ ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen);
+ if (ret == 0 && newval && newlen) {
+ if (range[1] <= range[0])
+ ret = -EINVAL;
+ else
+ set_local_port_range(range);
+ }
+ return ret;
+}
+
+
static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -427,10 +496,8 @@ ctl_table ipv4_table[] = {
.data = &sysctl_local_port_range,
.maxlen = sizeof(sysctl_local_port_range),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = ip_local_port_range_min,
- .extra2 = ip_local_port_range_max
+ .proc_handler = &ipv4_local_port_range,
+ .strategy = &ipv4_sysctl_local_port_range,
},
{
.ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7e740112b23..4f322003835 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -247,6 +247,7 @@
* TCP_CLOSE socket is finished
*/
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
@@ -2014,7 +2015,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rx_opt.tstamp_ok)
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- if (tp->rx_opt.sack_ok)
+ if (tcp_is_sack(tp))
info->tcpi_options |= TCPI_OPT_SACK;
if (tp->rx_opt.wscale_ok) {
info->tcpi_options |= TCPI_OPT_WSCALE;
@@ -2030,8 +2031,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- info->tcpi_unacked = tp->packets_out;
- info->tcpi_sacked = tp->sacked_out;
+ if (sk->sk_state == TCP_LISTEN) {
+ info->tcpi_unacked = sk->sk_ack_backlog;
+ info->tcpi_sacked = sk->sk_max_ack_backlog;
+ } else {
+ info->tcpi_unacked = tp->packets_out;
+ info->tcpi_sacked = tp->sacked_out;
+ }
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
@@ -2210,7 +2216,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
goto out;
mss = skb_shinfo(skb)->gso_size;
- skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
segs = NULL;
goto out;
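
DIV_ROUND_UP() comes from linux/kernel.h, hence the new include at the top of this file; it expands to the same ceiling division the old expression open-coded, (n + d - 1) / d. For example, a 4500-byte payload with a 1448-byte gso_size gives (4500 + 1447) / 1448 = 4 segments, identical to the previous arithmetic, just stated by name.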
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 4586211e375..5dba0fc8f57 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -210,7 +210,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 485d7ea35f7..80bd084a9f9 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -314,7 +314,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
- if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+ if (icsk->icsk_ca_state == TCP_CA_Open) {
cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
ca->delayed_ack += cnt;
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 57c5f0b10e6..3904d2158a9 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -25,11 +25,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN)
+ if (sk->sk_state == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
- else
+ r->idiag_wqueue = sk->sk_max_ack_backlog;
+ } else {
r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
- r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ }
if (info != NULL)
tcp_get_info(sk, info);
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f893e90061e..0a42e934034 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2;
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-int sysctl_tcp_frto __read_mostly;
+int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
@@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */
+#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -111,13 +112,10 @@ int sysctl_tcp_abc __read_mostly;
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
-#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
-#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
-#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
-
#define IsSackFrto() (sysctl_tcp_frto == 0x2)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
@@ -198,6 +196,55 @@ static inline int tcp_in_quickack_mode(const struct sock *sk)
return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}
+static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
+{
+ if (tp->ecn_flags&TCP_ECN_OK)
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (tcp_hdr(skb)->cwr)
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+{
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ if (tp->ecn_flags&TCP_ECN_OK) {
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ /* Funny extension: if ECT is not set on a segment,
+ * it is surely a retransmit. This is not in the ECN RFC,
+ * but Linux follows this rule. */
+ else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+ tcp_enter_quickack_mode((struct sock *)tp);
+ }
+}
+
+static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
+{
+ if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr))
+ tp->ecn_flags &= ~TCP_ECN_OK;
+}
+
+static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
+{
+ if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK))
+ return 1;
+ return 0;
+}
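
These inlines appear to take over from the ECN helpers previously kept in a separate header, and all of them key off tp->ecn_flags. For orientation, the flag bits being tested and set (values as defined in include/net/tcp.h of this era; shown here only as a reference sketch):

    #define TCP_ECN_OK         1  /* ECN was negotiated for this connection */
    #define TCP_ECN_QUEUE_CWR  2  /* a CWR notification still needs sending */
    #define TCP_ECN_DEMAND_CWR 4  /* keep demanding CWR (echoing ECE) */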
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -810,6 +857,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
}
}
+/*
+ * Packet counting in FACK is based on in-order assumptions; therefore TCP
+ * disables it when reordering is detected.
+ */
+static void tcp_disable_fack(struct tcp_sock *tp)
+{
+ tp->rx_opt.sack_ok &= ~2;
+}
+
+/* Take note that the peer is sending D-SACKs */
+static void tcp_dsack_seen(struct tcp_sock *tp)
+{
+ tp->rx_opt.sack_ok |= 4;
+}
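
tcp_disable_fack() and tcp_dsack_seen() encapsulate bit twiddling that used to be open-coded, and the IsReno()/IsFack()/IsDSack() macros removed earlier give way to the tcp_is_reno()/tcp_is_fack()/tcp_is_sack() predicates used throughout the rest of this patch. Those helpers presumably live in include/net/tcp.h; inferred from the removed macros, they amount to:

    /* sketch, inferred from the old macros -- not part of this file */
    static inline int tcp_is_sack(const struct tcp_sock *tp)
    {
        return tp->rx_opt.sack_ok;      /* 0 means plain Reno */
    }

    static inline int tcp_is_reno(const struct tcp_sock *tp)
    {
        return !tcp_is_sack(tp);
    }

    static inline int tcp_is_fack(const struct tcp_sock *tp)
    {
        return tp->rx_opt.sack_ok & 2;  /* the bit tcp_disable_fack() clears */
    }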
+
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
@@ -831,7 +893,7 @@ static void tcp_init_metrics(struct sock *sk)
}
if (dst_metric(dst, RTAX_REORDERING) &&
tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
- tp->rx_opt.sack_ok &= ~2;
+ tcp_disable_fack(tp);
tp->reordering = dst_metric(dst, RTAX_REORDERING);
}
@@ -893,9 +955,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
/* This exciting event is worth to be remembered. 8) */
if (ts)
NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
- else if (IsReno(tp))
+ else if (tcp_is_reno(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
- else if (IsFack(tp))
+ else if (tcp_is_fack(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
@@ -907,8 +969,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
- /* Disable FACK yet. */
- tp->rx_opt.sack_ok &= ~2;
+ tcp_disable_fack(tp);
}
}
@@ -959,7 +1020,216 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
* for retransmitted and already SACKed segment -> reordering..
* Both of these heuristics are not used in Loss state, when we cannot
* account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included in the range even though it would be
+ * valid, because it would mean the receiver is rather inconsistent with
+ * itself, reporting SACK reneging when it should advance SND.UNA. Such a
+ * SACK block is perfectly valid, however, in light of RFC2018, which
+ * explicitly states that a "SACK block MUST reflect the newest segment.
+ * Even if the newest segment is going to be discarded ...", not that it
+ * looks very clever in case of the head skb. Due to potential receiver
+ * driven attacks, we choose to avoid an immediate walk of the write queue
+ * on reneging and defer the head skb's loss recovery to the standard loss
+ * recovery procedure that will eventually trigger (nothing forbids us
+ * doing this).
+ *
+ * This also guards against start_seq wrap-around. The problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ * <- outs wnd -> <- wrapzone ->
+ * u e n u_w e_w s n_w
+ * | | | | | | |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->| |<--------...
+ * ...---- >2^31 ------>| |<--------...
+ *
+ * The current code wouldn't be vulnerable, but it's better still to discard
+ * such crazy SACK blocks. Doing this check for start_seq alone closes the
+ * somewhat similar case (end_seq after snd_nxt wrap), as the earlier
+ * reversed check in the snd_nxt wrap -> snd_una region will then become
+ * "well defined", i.e., equal to the ideal case (infinite seqno space
+ * without wrap-caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, a D-SACK block must not go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends: TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in any way and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
+ */
+static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
+ u32 start_seq, u32 end_seq)
+{
+ /* Too far in future, or reversed (interpretation is ambiguous) */
+ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+ return 0;
+
+ /* Nasty start_seq wrap-around check (see comments above) */
+ if (!before(start_seq, tp->snd_nxt))
+ return 0;
+
+ /* In outstanding window? ...This is a valid exit for D-SACKs too.
+ * start_seq == snd_una is nonsensical (see comments above)
+ */
+ if (after(start_seq, tp->snd_una))
+ return 1;
+
+ if (!is_dsack || !tp->undo_marker)
+ return 0;
+
+ /* ...Then it's D-SACK, and must reside below snd_una completely */
+ if (after(end_seq, tp->snd_una))
+ return 0;
+
+ if (!before(start_seq, tp->undo_marker))
+ return 1;
+
+ /* Too old */
+ if (!after(end_seq, tp->undo_marker))
+ return 0;
+
+ /* Undo_marker boundary crossing (overestimates a lot). Known already:
+ * start_seq < undo_marker and end_seq >= undo_marker.
+ */
+ return !before(start_seq, end_seq - tp->max_window);
+}
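
A concrete pass through the checks, with invented, non-wrapping sequence numbers: let snd_una = 1000, snd_nxt = 5000 and undo_marker = 600. A SACK block 2000-3000 clears the first two tests (end_seq not beyond snd_nxt; start_seq before both end_seq and snd_nxt) and is accepted at the after(start_seq, snd_una) exit. A plain SACK 1000-2000 starting exactly at snd_una fails that exit and is discarded. A D-SACK 700-900 lies wholly below snd_una but at or above undo_marker, so it is accepted, while a D-SACK 100-500 sits entirely below undo_marker and is rejected as too old.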
+
+/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
+ * Event "C". Later note: FACK people cheated me again 8), we have to account
+ * for reordering! Ugly, but should help.
+ *
+ * Search retransmitted skbs from write_queue that were sent when snd_nxt was
+ * less than what is now known to be received by the other end (derived from
+ * SACK blocks by the caller). Also calculate the lowest snd_nxt among the
+ * remaining retransmitted skbs to avoid some costly processing per ACK.
*/
+static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ int flag = 0;
+ int cnt = 0;
+ u32 new_low_seq = 0;
+
+ tcp_for_write_queue(skb, sk) {
+ u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+ if (skb == tcp_send_head(sk))
+ break;
+ if (cnt == tp->retrans_out)
+ break;
+ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+ continue;
+
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ if (after(received_upto, ack_seq) &&
+ (tcp_is_fack(tp) ||
+ !before(received_upto,
+ ack_seq + tp->reordering * tp->mss_cache))) {
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ tp->lost_out += tcp_skb_pcount(skb);
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ flag |= FLAG_DATA_SACKED;
+ NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else {
+ if (!new_low_seq || before(ack_seq, new_low_seq))
+ new_low_seq = ack_seq;
+ cnt += tcp_skb_pcount(skb);
+ }
+ }
+
+ if (tp->retrans_out)
+ tp->lost_retrans_low = new_low_seq;
+
+ return flag;
+}
+
+static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
+ struct tcp_sack_block_wire *sp, int num_sacks,
+ u32 prior_snd_una)
+{
+ u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
+ u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
+ int dup_sack = 0;
+
+ if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
+ dup_sack = 1;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
+ } else if (num_sacks > 1) {
+ u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
+ u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));
+
+ if (!after(end_seq_0, end_seq_1) &&
+ !before(start_seq_0, start_seq_1)) {
+ dup_sack = 1;
+ tcp_dsack_seen(tp);
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
+ }
+ }
+
+ /* D-SACK for already forgotten data... Do dumb counting. */
+ if (dup_sack &&
+ !after(end_seq_0, prior_snd_una) &&
+ after(end_seq_0, tp->undo_marker))
+ tp->undo_retrans--;
+
+ return dup_sack;
+}
+
+/* Check if skb is fully within the SACK block. In the presence of GSO skbs,
+ * the incoming SACK may not exactly match, but we can find a smaller,
+ * MSS-aligned portion of it that matches. Therefore we might need to
+ * fragment, which may fail and creates some hassle (the caller must handle
+ * error returns).
+ */
+int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
+ u32 start_seq, u32 end_seq)
+{
+ int in_sack, err;
+ unsigned int pkt_len;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ if (tcp_skb_pcount(skb) > 1 && !in_sack &&
+ after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+
+ if (!in_sack)
+ pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
+ else
+ pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
+ err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size);
+ if (err < 0)
+ return err;
+ }
+
+ return in_sack;
+}
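
A worked instance of the GSO branch, with invented numbers: take an skb covering 1000-3896 (two 1448-byte segments) and a SACK block 1000-2448. The skb is not fully inside the block, since its end_seq 3896 exceeds the block's end, but end_seq is after start_seq and start_seq does not cut into the skb, so pkt_len = end_seq - seq = 1448; tcp_fragment() splits at that MSS-aligned point and the function reports the resulting 1448-byte head (1000-2448) as inside the block.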
+
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
@@ -972,38 +1242,24 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
int reord = tp->packets_out;
int prior_fackets;
- u32 lost_retrans = 0;
+ u32 highest_sack_end_seq = 0;
int flag = 0;
int found_dup_sack = 0;
int cached_fack_count;
int i;
int first_sack_index;
- if (!tp->sacked_out)
- tp->fackets_out = 0;
+ if (!tp->sacked_out) {
+ if (WARN_ON(tp->fackets_out))
+ tp->fackets_out = 0;
+ tp->highest_sack = tp->snd_una;
+ }
prior_fackets = tp->fackets_out;
- /* Check for D-SACK. */
- if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) {
+ found_dup_sack = tcp_check_dsack(tp, ack_skb, sp,
+ num_sacks, prior_snd_una);
+ if (found_dup_sack)
flag |= FLAG_DSACKING_ACK;
- found_dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
- } else if (num_sacks > 1 &&
- !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) &&
- !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) {
- flag |= FLAG_DSACKING_ACK;
- found_dup_sack = 1;
- tp->rx_opt.sack_ok |= 4;
- NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
- }
-
- /* D-SACK for already forgotten data...
- * Do dumb counting. */
- if (found_dup_sack &&
- !after(ntohl(sp[0].end_seq), prior_snd_una) &&
- after(ntohl(sp[0].end_seq), tp->undo_marker))
- tp->undo_retrans--;
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1083,6 +1339,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int fack_count;
int dup_sack = (found_dup_sack && (i == first_sack_index));
+ if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) {
+ if (dup_sack) {
+ if (!tp->undo_marker)
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO);
+ else
+ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD);
+ } else {
+ /* Don't count olds caused by ACK reordering */
+ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
+ !after(end_seq, tp->snd_una))
+ continue;
+ NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD);
+ }
+ continue;
+ }
+
skb = cached_skb;
fack_count = cached_fack_count;
@@ -1091,7 +1363,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
flag |= FLAG_DATA_LOST;
tcp_for_write_queue_from(skb, sk) {
- int in_sack, pcount;
+ int in_sack;
u8 sacked;
if (skb == tcp_send_head(sk))
@@ -1110,30 +1382,11 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
- in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
- !before(end_seq, TCP_SKB_CB(skb)->end_seq);
-
- pcount = tcp_skb_pcount(skb);
-
- if (pcount > 1 && !in_sack &&
- after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
- unsigned int pkt_len;
-
- in_sack = !after(start_seq,
- TCP_SKB_CB(skb)->seq);
-
- if (!in_sack)
- pkt_len = (start_seq -
- TCP_SKB_CB(skb)->seq);
- else
- pkt_len = (end_seq -
- TCP_SKB_CB(skb)->seq);
- if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
- break;
- pcount = tcp_skb_pcount(skb);
- }
+ in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
+ if (in_sack < 0)
+ break;
- fack_count += pcount;
+ fack_count += tcp_skb_pcount(skb);
sacked = TCP_SKB_CB(skb)->sacked;
@@ -1160,11 +1413,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
continue;
}
- if ((sacked&TCPCB_SACKED_RETRANS) &&
- after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
- (!lost_retrans || after(end_seq, lost_retrans)))
- lost_retrans = end_seq;
-
if (!in_sack)
continue;
@@ -1217,6 +1465,11 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (fack_count > tp->fackets_out)
tp->fackets_out = fack_count;
+
+ if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) {
+ tp->highest_sack = TCP_SKB_CB(skb)->seq;
+ highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq;
+ }
} else {
if (dup_sack && (sacked&TCPCB_RETRANS))
reord = min(fack_count, reord);
@@ -1236,45 +1489,12 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
}
}
- /* Check for lost retransmit. This superb idea is
- * borrowed from "ratehalving". Event "C".
- * Later note: FACK people cheated me again 8),
- * we have to account for reordering! Ugly,
- * but should help.
- */
- if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
- struct sk_buff *skb;
-
- tcp_for_write_queue(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
- if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
- after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
- (IsFack(tp) ||
- !before(lost_retrans,
- TCP_SKB_CB(skb)->ack_seq + tp->reordering *
- tp->mss_cache))) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
-
- /* clear lost hint */
- tp->retransmit_skb_hint = NULL;
+ if (tp->retrans_out &&
+ after(highest_sack_end_seq, tp->lost_retrans_low) &&
+ icsk->icsk_ca_state == TCP_CA_Recovery)
+ flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq);
- if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tp->lost_out += tcp_skb_pcount(skb);
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- flag |= FLAG_DATA_SACKED;
- NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
- }
- }
- }
- }
-
- tp->left_out = tp->sacked_out + tp->lost_out;
+ tcp_verify_left_out(tp);
if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
(!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
@@ -1289,6 +1509,56 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
return flag;
}
+/* If we receive more dupacks than we expected when counting segments
+ * under the assumption of no reordering, interpret this as reordering.
+ * The only other explanation would be a bug in the receiver's TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 holes;
+
+ holes = max(tp->lost_out, 1U);
+ holes = min(holes, tp->packets_out);
+
+ if ((tp->sacked_out + holes) > tp->packets_out) {
+ tp->sacked_out = tp->packets_out - holes;
+ tcp_update_reordering(sk, tp->packets_out + addend, 0);
+ }
+}
+
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->sacked_out++;
+ tcp_check_reno_reordering(sk, 0);
+ tcp_verify_left_out(tp);
+}
+
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
+
+static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (acked > 0) {
+ /* One ACK acked hole. The rest eat duplicate ACKs. */
+ if (acked-1 >= tp->sacked_out)
+ tp->sacked_out = 0;
+ else
+ tp->sacked_out -= acked-1;
+ }
+ tcp_check_reno_reordering(sk, acked);
+ tcp_verify_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+ tp->sacked_out = 0;
+}
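
Since a receiver without SACK can only signal losses through duplicate ACKs, sacked_out in these helpers counts dupacks rather than SACKed segments. A quick sanity check of tcp_check_reno_reordering() with invented numbers: with packets_out = 10 and lost_out = 1, holes = 1, so as soon as sacked_out would exceed 9 the dupack count can no longer be explained without reordering; sacked_out is clamped to 9 and reordering is raised via tcp_update_reordering(sk, packets_out + addend, 0).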
+
/* F-RTO can only be used if TCP has never retransmitted anything other than
* head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
@@ -1376,11 +1646,13 @@ void tcp_enter_frto(struct sock *sk)
tp->undo_retrans = 0;
skb = tcp_write_queue_head(sk);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+ tp->undo_marker = 0;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
/* Earlier loss recovery underway (see RFC4138; Appendix B).
* The last condition is necessary at least in tp->frto_counter case.
@@ -1405,17 +1677,15 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt = 0;
- tp->sacked_out = 0;
tp->lost_out = 0;
- tp->fackets_out = 0;
tp->retrans_out = 0;
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
- cnt += tcp_skb_pcount(skb);
/*
* Count the retransmission made on RTO correctly (only when
* waiting for the first ACK and did not get it)...
@@ -1427,30 +1697,25 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
/* ...enter this if branch just for the first segment */
flag |= FLAG_DATA_ACKED;
} else {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+ tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
}
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
- /* Do not mark those segments lost that were
- * forward transmitted after RTO
- */
- if (!after(TCP_SKB_CB(skb)->end_seq,
- tp->frto_highmark)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- tp->lost_out += tcp_skb_pcount(skb);
- }
- } else {
- tp->sacked_out += tcp_skb_pcount(skb);
- tp->fackets_out = cnt;
+ /* Don't mark skbs lost that were forward transmitted after RTO */
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) &&
+ !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->undo_marker = 0;
tp->frto_counter = 0;
+ tp->bytes_acked = 0;
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
@@ -1458,22 +1723,26 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
- clear_all_retrans_hints(tp);
+ tcp_clear_retrans_hints_partial(tp);
}
-void tcp_clear_retrans(struct tcp_sock *tp)
+static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
- tp->left_out = 0;
tp->retrans_out = 0;
-
- tp->fackets_out = 0;
- tp->sacked_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = 0;
}
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+ tcp_clear_retrans_partial(tp);
+
+ tp->fackets_out = 0;
+ tp->sacked_out = 0;
+}
+
/* Enter Loss state. If "how" is not zero, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
@@ -1483,7 +1752,6 @@ void tcp_enter_loss(struct sock *sk, int how)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt = 0;
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
@@ -1497,17 +1765,26 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->bytes_acked = 0;
- tcp_clear_retrans(tp);
+ tcp_clear_retrans_partial(tp);
+
+ if (tcp_is_reno(tp))
+ tcp_reset_reno_sack(tp);
- /* Push undo marker, if it was plain RTO and nothing
- * was retransmitted. */
- if (!how)
+ if (!how) {
+ /* Push undo marker, if it was plain RTO and nothing
+ * was retransmitted. */
tp->undo_marker = tp->snd_una;
+ tcp_clear_retrans_hints_partial(tp);
+ } else {
+ tp->sacked_out = 0;
+ tp->fackets_out = 0;
+ tcp_clear_all_retrans_hints(tp);
+ }
tcp_for_write_queue(skb, sk) {
if (skb == tcp_send_head(sk))
break;
- cnt += tcp_skb_pcount(skb);
+
if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
@@ -1515,12 +1792,9 @@ void tcp_enter_loss(struct sock *sk, int how)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
- } else {
- tp->sacked_out += tcp_skb_pcount(skb);
- tp->fackets_out = cnt;
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
tp->reordering = min_t(unsigned int, tp->reordering,
sysctl_tcp_reordering);
@@ -1529,8 +1803,6 @@ void tcp_enter_loss(struct sock *sk, int how)
TCP_ECN_queue_cwr(tp);
/* Abort FRTO algorithm if one is in progress */
tp->frto_counter = 0;
-
- clear_all_retrans_hints(tp);
}
static int tcp_check_sack_reneging(struct sock *sk)
@@ -1560,7 +1832,7 @@ static int tcp_check_sack_reneging(struct sock *sk)
static inline int tcp_fackets_out(struct tcp_sock *tp)
{
- return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
+ return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out;
}
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
@@ -1708,55 +1980,18 @@ static int tcp_time_to_recover(struct sock *sk)
return 0;
}
-/* If we receive more dupacks than we expected counting segments
- * in assumption of absent reordering, interpret this as reordering.
- * The only another reason could be bug in receiver TCP.
+/* RFC: This is from the original; I doubt that this is necessary at all:
+ * clear the xmit_retrans hint if the seq of this skb is beyond the hint.
+ * How could we have retransmitted past LOST markings in the first place?
+ * I'm not fully sure about undo and end of connection cases, which can
+ * cause R without L?
*/
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 holes;
-
- holes = max(tp->lost_out, 1U);
- holes = min(holes, tp->packets_out);
-
- if ((tp->sacked_out + holes) > tp->packets_out) {
- tp->sacked_out = tp->packets_out - holes;
- tcp_update_reordering(sk, tp->packets_out + addend, 0);
- }
-}
-
-/* Emulate SACKs for SACKless connection: account for a new dupack. */
-
-static void tcp_add_reno_sack(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- tcp_sync_left_out(tp);
-}
-
-/* Account for ACK, ACKing some data in Reno Recovery phase. */
-
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (acked > 0) {
- /* One ACK acked hole. The rest eat duplicate ACKs. */
- if (acked-1 >= tp->sacked_out)
- tp->sacked_out = 0;
- else
- tp->sacked_out -= acked-1;
- }
- tcp_check_reno_reordering(sk, acked);
- tcp_sync_left_out(tp);
-}
-
-static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
+ struct sk_buff *skb)
{
- tp->sacked_out = 0;
- tp->left_out = tp->lost_out;
+ if ((tp->retransmit_skb_hint != NULL) &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ tp->retransmit_skb_hint = NULL;
}
/* Mark head of queue up as lost. */
@@ -1786,20 +2021,13 @@ static void tcp_mark_head_lost(struct sock *sk,
cnt += tcp_skb_pcount(skb);
if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-
- /* clear xmit_retransmit_queue hints
- * if this is beyond hint */
- if (tp->retransmit_skb_hint != NULL &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
- tp->retransmit_skb_hint = NULL;
-
+ tcp_verify_retransmit_hint(tp, skb);
}
}
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
}
/* Account newly detected lost packet(s) */
@@ -1808,7 +2036,7 @@ static void tcp_update_scoreboard(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (IsFack(tp)) {
+ if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
@@ -1822,7 +2050,7 @@ static void tcp_update_scoreboard(struct sock *sk)
* Hence, we can detect timed out packets during fast
* retransmit without falling to slow start.
*/
- if (!IsReno(tp) && tcp_head_timedout(sk)) {
+ if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) {
struct sk_buff *skb;
skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
@@ -1837,19 +2065,13 @@ static void tcp_update_scoreboard(struct sock *sk)
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-
- /* clear xmit_retrans hint */
- if (tp->retransmit_skb_hint &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-
- tp->retransmit_skb_hint = NULL;
+ tcp_verify_retransmit_hint(tp, skb);
}
}
tp->scoreboard_skb_hint = skb;
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
}
}
@@ -1880,7 +2102,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag)
int decr = tp->snd_cwnd_cnt + 1;
if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) ||
- (IsReno(tp) && !(flag&FLAG_NOT_DUP))) {
+ (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) {
tp->snd_cwnd_cnt = decr&1;
decr >>= 1;
@@ -1913,7 +2135,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
msg,
NIPQUAD(inet->daddr), ntohs(inet->dport),
- tp->snd_cwnd, tp->left_out,
+ tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
@@ -1945,7 +2167,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
/* There is something screwy going on with the retrans hints after
an undo */
- clear_all_retrans_hints(tp);
+ tcp_clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1971,7 +2193,7 @@ static int tcp_try_undo_recovery(struct sock *sk)
NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
tp->undo_marker = 0;
}
- if (tp->snd_una == tp->high_seq && IsReno(tp)) {
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
@@ -2001,7 +2223,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Partial ACK arrived. Force Hoe's retransmit. */
- int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
+ int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering;
if (tcp_may_undo(tp)) {
/* Plain luck! Hole is filled with delayed
@@ -2038,16 +2260,15 @@ static int tcp_try_undo_loss(struct sock *sk)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
- clear_all_retrans_hints(tp);
+ tcp_clear_all_retrans_hints(tp);
DBGUNDO(sk, "partial loss");
tp->lost_out = 0;
- tp->left_out = tp->sacked_out;
tcp_undo_cwr(sk, 1);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
inet_csk(sk)->icsk_retransmits = 0;
tp->undo_marker = 0;
- if (!IsReno(tp))
+ if (tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
return 1;
}
@@ -2066,7 +2287,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
if (tp->retrans_out == 0)
tp->retrans_stamp = 0;
@@ -2077,7 +2298,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
- if (tp->left_out || tp->retrans_out || tp->undo_marker)
+ if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2130,7 +2351,7 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
* tcp_xmit_retransmit_queue().
*/
static void
-tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
+tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -2142,8 +2363,8 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
* 1. Reno does not count dupacks (sacked_out) automatically. */
if (!tp->packets_out)
tp->sacked_out = 0;
- /* 2. SACK counts snd_fack in packets inaccurately. */
- if (tp->sacked_out == 0)
+
+ if (WARN_ON(!tp->sacked_out && tp->fackets_out))
tp->fackets_out = 0;
/* Now state machine starts.
@@ -2164,8 +2385,8 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
- /* D. Synchronize left_out to current state. */
- tcp_sync_left_out(tp);
+ /* D. Check consistency of the current state. */
+ tcp_verify_left_out(tp);
/* E. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
@@ -2194,14 +2415,14 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
if (!tp->undo_marker ||
/* For SACK case do not Open to allow to undo
* catching for all duplicate ACKs. */
- IsReno(tp) || tp->snd_una != tp->high_seq) {
+ tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
tp->undo_marker = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
- if (IsReno(tp))
+ if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk))
return;
@@ -2214,14 +2435,10 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
- if (IsReno(tp) && is_dupack)
+ if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
- } else {
- int acked = prior_packets - tp->packets_out;
- if (IsReno(tp))
- tcp_remove_reno_sacks(sk, acked);
- do_lost = tcp_try_undo_partial(sk, acked);
- }
+ } else
+ do_lost = tcp_try_undo_partial(sk, pkts_acked);
break;
case TCP_CA_Loss:
if (flag&FLAG_DATA_ACKED)
@@ -2235,7 +2452,7 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
return;
/* Loss is undone; fall through to processing in Open state. */
default:
- if (IsReno(tp)) {
+ if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
if (is_dupack)
@@ -2263,7 +2480,7 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag)
/* Otherwise enter Recovery state */
- if (IsReno(tp))
+ if (tcp_is_reno(tp))
NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
else
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
@@ -2361,8 +2578,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack,
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
-
-static void tcp_ack_packets_out(struct sock *sk)
+static void tcp_rearm_rto(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2373,158 +2589,143 @@ static void tcp_ack_packets_out(struct sock *sk)
}
}
-static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
- __u32 now, __s32 *seq_rtt)
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- __u32 seq = tp->snd_una;
- __u32 packets_acked;
- int acked = 0;
+ u32 packets_acked;
- /* If we get here, the whole TSO packet has not been
- * acked.
- */
- BUG_ON(!after(scb->end_seq, seq));
+ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
packets_acked = tcp_skb_pcount(skb);
- if (tcp_trim_head(sk, skb, seq - scb->seq))
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
- __u8 sacked = scb->sacked;
-
- acked |= FLAG_DATA_ACKED;
- if (sacked) {
- if (sacked & TCPCB_RETRANS) {
- if (sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= packets_acked;
- acked |= FLAG_RETRANS_DATA_ACKED;
- *seq_rtt = -1;
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
- if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= packets_acked;
- if (sacked & TCPCB_LOST)
- tp->lost_out -= packets_acked;
- if (sacked & TCPCB_URG) {
- if (tp->urg_mode &&
- !before(seq, tp->snd_up))
- tp->urg_mode = 0;
- }
- } else if (*seq_rtt < 0)
- *seq_rtt = now - scb->when;
-
- if (tp->fackets_out) {
- __u32 dval = min(tp->fackets_out, packets_acked);
- tp->fackets_out -= dval;
- }
- /* hint's skb might be NULL but we don't need to care */
- tp->fastpath_cnt_hint -= min_t(u32, packets_acked,
- tp->fastpath_cnt_hint);
- tp->packets_out -= packets_acked;
-
BUG_ON(tcp_skb_pcount(skb) == 0);
- BUG_ON(!before(scb->seq, scb->end_seq));
+ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
}
- return acked;
+ return packets_acked;
}
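
tcp_tso_acked() now returns the number of sub-segments the ACK freed from a partially acked TSO skb instead of FLAG_* bits; the per-flag bookkeeping moves to the caller. A self-contained model of the arithmetic (pcount() stands in for tcp_skb_pcount(); tcp_trim_head() is assumed to shorten the skb by the acked bytes):

#include <stdio.h>

/* pcount() mirrors tcp_skb_pcount() after the DIV_ROUND_UP change
 * in tcp_set_skb_tso_segs() below; the rest is a toy model. */
static unsigned pcount(unsigned len, unsigned mss)
{
	return (len + mss - 1) / mss;
}

int main(void)
{
	unsigned mss = 1000, len = 3500, acked_bytes = 2200;
	unsigned packets_acked = pcount(len, mss)
			       - pcount(len - acked_bytes, mss);

	printf("packets_acked = %u\n", packets_acked);	/* 4 - 2 = 2 */
	return 0;
}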
-/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
- __u32 now = tcp_time_stamp;
- int acked = 0;
+ u32 now = tcp_time_stamp;
+ int fully_acked = 1;
+ int flag = 0;
int prior_packets = tp->packets_out;
- __s32 seq_rtt = -1;
+ s32 seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
- while ((skb = tcp_write_queue_head(sk)) &&
- skb != tcp_send_head(sk)) {
+ while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- __u8 sacked = scb->sacked;
+ u32 end_seq;
+ u32 packets_acked;
+ u8 sacked = scb->sacked;
- /* If our packet is before the ack sequence we can
- * discard it as it's confirmed to have arrived at
- * the other end.
- */
if (after(scb->end_seq, tp->snd_una)) {
- if (tcp_skb_pcount(skb) > 1 &&
- after(tp->snd_una, scb->seq))
- acked |= tcp_tso_acked(sk, skb,
- now, &seq_rtt);
- break;
- }
+ if (tcp_skb_pcount(skb) == 1 ||
+ !after(tp->snd_una, scb->seq))
+ break;
- /* Initial outgoing SYN's get put onto the write_queue
- * just like anything else we transmit. It is not
- * true data, and if we misinform our callers that
- * this ACK acks real data, we will erroneously exit
- * connection startup slow start one packet too
- * quickly. This is severely frowned upon behavior.
- */
- if (!(scb->flags & TCPCB_FLAG_SYN)) {
- acked |= FLAG_DATA_ACKED;
+ packets_acked = tcp_tso_acked(sk, skb);
+ if (!packets_acked)
+ break;
+
+ fully_acked = 0;
+ end_seq = tp->snd_una;
} else {
- acked |= FLAG_SYN_ACKED;
- tp->retrans_stamp = 0;
+ packets_acked = tcp_skb_pcount(skb);
+ end_seq = scb->end_seq;
}
/* MTU probing checks */
- if (icsk->icsk_mtup.probe_size) {
- if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
- tcp_mtup_probe_success(sk, skb);
- }
+ if (fully_acked && icsk->icsk_mtup.probe_size &&
+ !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
+ tcp_mtup_probe_success(sk, skb);
}
if (sacked) {
if (sacked & TCPCB_RETRANS) {
if (sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(skb);
- acked |= FLAG_RETRANS_DATA_ACKED;
+ tp->retrans_out -= packets_acked;
+ flag |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
+ if ((flag & FLAG_DATA_ACKED) ||
+ (packets_acked > 1))
+ flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
- last_ackt = skb->tstamp;
+ if (fully_acked)
+ last_ackt = skb->tstamp;
}
+
if (sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= tcp_skb_pcount(skb);
+ tp->sacked_out -= packets_acked;
if (sacked & TCPCB_LOST)
- tp->lost_out -= tcp_skb_pcount(skb);
- if (sacked & TCPCB_URG) {
- if (tp->urg_mode &&
- !before(scb->end_seq, tp->snd_up))
- tp->urg_mode = 0;
- }
+ tp->lost_out -= packets_acked;
+
+ if ((sacked & TCPCB_URG) && tp->urg_mode &&
+ !before(end_seq, tp->snd_up))
+ tp->urg_mode = 0;
} else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
- last_ackt = skb->tstamp;
+ if (fully_acked)
+ last_ackt = skb->tstamp;
+ }
+ tp->packets_out -= packets_acked;
+
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if (!(scb->flags & TCPCB_FLAG_SYN)) {
+ flag |= FLAG_DATA_ACKED;
+ } else {
+ flag |= FLAG_SYN_ACKED;
+ tp->retrans_stamp = 0;
}
- tcp_dec_pcount_approx(&tp->fackets_out, skb);
- tcp_packets_out_dec(tp, skb);
+
+ if (!fully_acked)
+ break;
+
tcp_unlink_write_queue(skb, sk);
sk_stream_free_skb(sk, skb);
- clear_all_retrans_hints(tp);
+ tcp_clear_all_retrans_hints(tp);
}
- if (acked&FLAG_ACKED) {
+ if (flag & FLAG_ACKED) {
u32 pkts_acked = prior_packets - tp->packets_out;
const struct tcp_congestion_ops *ca_ops
= inet_csk(sk)->icsk_ca_ops;
- tcp_ack_update_rtt(sk, acked, seq_rtt);
- tcp_ack_packets_out(sk);
+ tcp_ack_update_rtt(sk, flag, seq_rtt);
+ tcp_rearm_rto(sk);
+
+ tp->fackets_out -= min(pkts_acked, tp->fackets_out);
+ /* hint's skb might be NULL but we don't need to care */
+ tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
+ tp->fastpath_cnt_hint);
+ if (tcp_is_reno(tp))
+ tcp_remove_reno_sacks(sk, pkts_acked);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
/* Is the ACK triggering packet unambiguous? */
- if (!(acked & FLAG_RETRANS_DATA_ACKED)) {
+ if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
@@ -2543,8 +2744,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
BUG_TRAP((int)tp->sacked_out >= 0);
BUG_TRAP((int)tp->lost_out >= 0);
BUG_TRAP((int)tp->retrans_out >= 0);
- if (!tp->packets_out && tp->rx_opt.sack_ok) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ if (!tp->packets_out && tcp_is_sack(tp)) {
+ icsk = inet_csk(sk);
if (tp->lost_out) {
printk(KERN_DEBUG "Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
@@ -2563,7 +2764,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
}
#endif
*seq_rtt_p = seq_rtt;
- return acked;
+ return flag;
}
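
The hunk at -2543 elides the RTT sample fed to ca_ops->pkts_acked(): Karn's rule forbids sampling when retransmitted data was acked, a ktime-based delta is preferred when the congestion module asks for TCP_CONG_RTT_STAMP, and the jiffies-based seq_rtt is the fallback. A userspace model of that decision (names and units illustrative):

#include <stdint.h>
#include <stdio.h>

static int64_t pick_rtt_us(int retrans_acked, int have_hires,
			   int64_t hires_us, int32_t seq_rtt_jiffies, int hz)
{
	if (retrans_acked)
		return -1;		/* ambiguous ACK: no sample (Karn) */
	if (have_hires)
		return hires_us;	/* ktime_us_delta(now, last_ackt) */
	if (seq_rtt_jiffies > 0)
		return (int64_t)seq_rtt_jiffies * 1000000 / hz;
	return -1;
}

int main(void)
{
	printf("%lld\n", (long long)pick_rtt_us(0, 1, 2500, -1, 100));
	return 0;
}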
static void tcp_ack_probe(struct sock *sk)
@@ -2658,6 +2859,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
{
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd_cnt = 0;
+ tp->bytes_acked = 0;
TCP_ECN_queue_cwr(tp);
tcp_moderate_cwnd(tp);
}
@@ -2712,18 +2914,22 @@ static int tcp_process_frto(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
/* Duplicate the behavior from Loss state (fastretrans_alert) */
if (flag&FLAG_DATA_ACKED)
inet_csk(sk)->icsk_retransmits = 0;
+ if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
+ ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
+ tp->undo_marker = 0;
+
if (!before(tp->snd_una, tp->frto_highmark)) {
tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
return 1;
}
- if (!IsSackFrto() || IsReno(tp)) {
+ if (!IsSackFrto() || tcp_is_reno(tp)) {
/* RFC4138 shortcoming in step 2; should also have case c):
* ACK isn't duplicate nor advances window, e.g., opposite dir
* data, winupdate
@@ -2782,6 +2988,8 @@ static int tcp_process_frto(struct sock *sk, int flag)
break;
}
tp->frto_counter = 0;
+ tp->undo_marker = 0;
+ NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS);
}
return 0;
}
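
The two new clears of tp->undo_marker record that a retransmission has provably reached the receiver, so a later spurious-RTO undo must not be attempted. Written out as a predicate (the FLAG_* values here are illustrative, not copied from tcp_input.c):

#include <stdio.h>

#define FLAG_RETRANS_DATA_ACKED		0x08	/* illustrative */
#define FLAG_NONHEAD_RETRANS_ACKED	0x1000	/* illustrative */

static int frto_clears_undo_marker(int flag, int frto_counter)
{
	return (flag & FLAG_NONHEAD_RETRANS_ACKED) ||
	       (frto_counter >= 2 && (flag & FLAG_RETRANS_DATA_ACKED));
}

int main(void)
{
	/* Head retransmission acked after the second new segment: clear. */
	printf("%d\n", frto_clears_undo_marker(FLAG_RETRANS_DATA_ACKED, 2));
	return 0;
}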
@@ -2862,6 +3070,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+ /* Guarantee sacktag reordering detection against wrap-arounds */
+ if (before(tp->frto_highmark, tp->snd_una))
+ tp->frto_highmark = 0;
if (tp->frto_counter)
frto_cwnd = tcp_process_frto(sk, flag);
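
Resetting frto_highmark once snd_una passes it is safe because the sequence comparators are wrap-aware; a self-contained model of before()/after() from include/net/tcp.h:

#include <stdint.h>
#include <stdio.h>

/* Signed difference of u32 sequence numbers orders them correctly
 * across the 2^32 wrap. */
static int before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}
#define after(seq2, seq1)	before(seq1, seq2)

int main(void)
{
	/* snd_una wrapped past frto_highmark: still ordered correctly. */
	uint32_t frto_highmark = 0xfffffff0u, snd_una = 0x00000010u;

	printf("%d\n", before(frto_highmark, snd_una));	/* prints 1 */
	return 0;
}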
@@ -2870,7 +3081,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, prior_in_flight, 0);
- tcp_fastretrans_alert(sk, prior_packets, flag);
+ tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag);
} else {
if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
tcp_cong_avoid(sk, ack, prior_in_flight, 1);
@@ -3207,7 +3418,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
* Probably, we should reset in this case. For now drop them.
*/
__skb_queue_purge(&tp->out_of_order_queue);
- if (tp->rx_opt.sack_ok)
+ if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_stream_mem_reclaim(sk);
@@ -3237,7 +3448,7 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_se
static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
{
- if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
if (before(seq, tp->rcv_nxt))
NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
else
@@ -3267,7 +3478,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
- if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
@@ -3583,7 +3794,7 @@ drop:
if (!skb_peek(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
- if (tp->rx_opt.sack_ok) {
+ if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
tp->rx_opt.dsack = 0;
tp->rx_opt.eff_sacks = 1;
@@ -3648,7 +3859,7 @@ drop:
}
add_sack:
- if (tp->rx_opt.sack_ok)
+ if (tcp_is_sack(tp))
tcp_sack_new_ofo_skb(sk, seq, end_seq);
}
}
@@ -3837,7 +4048,7 @@ static int tcp_prune_queue(struct sock *sk)
* is in a sad state like this, we care only about integrity
* of the connection not performance.
*/
- if (tp->rx_opt.sack_ok)
+ if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_stream_mem_reclaim(sk);
}
@@ -4538,8 +4749,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tp->tcp_header_len = sizeof(struct tcphdr);
}
- if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
- tp->rx_opt.sack_ok |= 2;
+ if (tcp_is_sack(tp) && sysctl_tcp_fack)
+ tcp_enable_fack(tp);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
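
All of the open-coded rx_opt.sack_ok tests replaced in this file go through a small set of accessors over the same bitfield (bit 0 = SACK negotiated, bit 1 = FACK enabled). A reduced userspace model; in the kernel the helpers take a struct tcp_sock * and read tp->rx_opt.sack_ok:

#include <stdio.h>

struct rx_opt { unsigned int sack_ok; };

static int tcp_is_sack(const struct rx_opt *rx)  { return rx->sack_ok & 1; }
static int tcp_is_reno(const struct rx_opt *rx)  { return !tcp_is_sack(rx); }
static int tcp_is_fack(const struct rx_opt *rx)  { return rx->sack_ok & 2; }
static void tcp_enable_fack(struct rx_opt *rx)   { rx->sack_ok |= 2; }

int main(void)
{
	struct rx_opt rx = { .sack_ok = 1 };	/* SACK negotiated */

	tcp_enable_fack(&rx);			/* sysctl_tcp_fack set */
	printf("sack=%d fack=%d reno=%d\n",
	       tcp_is_sack(&rx), !!tcp_is_fack(&rx), tcp_is_reno(&rx));
	return 0;
}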
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e089a978e12..38cf73a5673 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -62,6 +62,7 @@
#include <linux/init.h>
#include <linux/times.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
@@ -2249,7 +2250,7 @@ int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
afinfo->seq_fops->llseek = seq_lseek;
afinfo->seq_fops->release = seq_release_private;
- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
if (p)
p->data = afinfo;
else
@@ -2261,7 +2262,7 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
if (!afinfo)
return;
- proc_net_remove(afinfo->name);
+ proc_net_remove(&init_net, afinfo->name);
memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
@@ -2469,6 +2470,5 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
-EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a12b08fca5a..b61b76847ad 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -368,6 +368,12 @@ void tcp_twsk_destructor(struct sock *sk)
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
+ struct request_sock *req)
+{
+ tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+}
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -399,7 +405,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
- newtp->left_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
newtp->fackets_out = 0;
@@ -440,7 +445,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
if (sysctl_tcp_fack)
- newtp->rx_opt.sack_ok |= 2;
+ tcp_enable_fack(newtp);
}
newtp->window_clamp = req->window_clamp;
newtp->rcv_ssthresh = req->rcv_wnd;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 666d8a58d14..324b4207254 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+static inline void tcp_packets_out_inc(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ int orig = tp->packets_out;
+
+ tp->packets_out += tcp_skb_pcount(skb);
+ if (!orig)
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+}
+
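
Note the edge trigger: the retransmit timer is armed only when packets_out goes from zero to nonzero, since tcp_rearm_rto() keeps it running afterwards. A toy model of that transition:

#include <stdio.h>

static int packets_out;

static void packets_out_inc(int pcount)
{
	int orig = packets_out;

	packets_out += pcount;
	if (!orig)
		printf("arm retransmit timer\n"); /* inet_csk_reset_xmit_timer */
}

int main(void)
{
	packets_out_inc(2);	/* arms the timer */
	packets_out_inc(3);	/* already running: no re-arm */
	return 0;
}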
static void update_send_head(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -269,6 +281,56 @@ static u16 tcp_select_window(struct sock *sk)
return new_win;
}
+static inline void TCP_ECN_send_synack(struct tcp_sock *tp,
+ struct sk_buff *skb)
+{
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+ if (!(tp->ecn_flags&TCP_ECN_OK))
+ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+}
+
+static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->ecn_flags = 0;
+ if (sysctl_tcp_ecn) {
+ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
+ tp->ecn_flags = TCP_ECN_OK;
+ }
+}
+
+static __inline__ void
+TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
+{
+ if (inet_rsk(req)->ecn_ok)
+ th->ece = 1;
+}
+
+static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
+ int tcp_header_len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->ecn_flags & TCP_ECN_OK) {
+ /* Not-retransmitted data segment: set ECT and inject CWR. */
+ if (skb->len != tcp_header_len &&
+ !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
+ INET_ECN_xmit(sk);
+ if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+ tcp_hdr(skb)->cwr = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+ }
+ } else {
+ /* ACK or retransmitted segment: clear ECT|CE */
+ INET_ECN_dontxmit(sk);
+ }
+ if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
+ tcp_hdr(skb)->ece = 1;
+ }
+}
+
static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
__u32 tstamp, __u8 **md5_hash)
{
@@ -584,16 +646,32 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned
skb_shinfo(skb)->gso_size = 0;
skb_shinfo(skb)->gso_type = 0;
} else {
- unsigned int factor;
-
- factor = skb->len + (mss_now - 1);
- factor /= mss_now;
- skb_shinfo(skb)->gso_segs = factor;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
skb_shinfo(skb)->gso_size = mss_now;
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
}
}
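
DIV_ROUND_UP(n, d) is the kernel.h macro ((n) + (d) - 1) / (d); the three deleted lines computed exactly that by hand:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int len = 4000, mss_now = 1460;

	printf("gso_segs = %u\n", DIV_ROUND_UP(len, mss_now));	/* 3 */
	return 0;
}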
+/* When a modification to fackets out becomes necessary, we need to check
+ * skb is counted to fackets_out or not. Another important thing is to
+ * tweak SACK fastpath hint too as it would overwrite all changes unless
+ * hint is also changed.
+ */
+static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb,
+ int decr)
+{
+ if (!tp->sacked_out || tcp_is_reno(tp))
+ return;
+
+ if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq))
+ tp->fackets_out -= decr;
+
+ /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
+ if (tp->fastpath_skb_hint != NULL &&
+ after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
+ tp->fastpath_cnt_hint -= decr;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -609,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
BUG_ON(len > skb->len);
- clear_all_retrans_hints(tp);
+ tcp_clear_retrans_hints_partial(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -634,6 +712,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+ if (tcp_is_sack(tp) && tp->sacked_out &&
+ (TCP_SKB_CB(skb)->seq == tp->highest_sack))
+ tp->highest_sack = TCP_SKB_CB(buff)->seq;
+
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
@@ -682,32 +764,15 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= diff;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
tp->lost_out -= diff;
- tp->left_out -= diff;
- }
-
- if (diff > 0) {
- /* Adjust Reno SACK estimate. */
- if (!tp->rx_opt.sack_ok) {
- tp->sacked_out -= diff;
- if ((int)tp->sacked_out < 0)
- tp->sacked_out = 0;
- tcp_sync_left_out(tp);
- }
- tp->fackets_out -= diff;
- if ((int)tp->fackets_out < 0)
- tp->fackets_out = 0;
- /* SACK fastpath might overwrite it unless dealt with */
- if (tp->fastpath_skb_hint != NULL &&
- after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq,
- TCP_SKB_CB(skb)->seq)) {
- tp->fastpath_cnt_hint -= diff;
- if ((int)tp->fastpath_cnt_hint < 0)
- tp->fastpath_cnt_hint = 0;
- }
+ /* Adjust Reno SACK estimate. */
+ if (tcp_is_reno(tp) && diff > 0) {
+ tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
+ tcp_verify_left_out(tp);
}
+ tcp_adjust_fackets_out(tp, skb, diff);
}
/* Link BUFF into the send queue. */
@@ -1654,8 +1719,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1);
- /* changing transmit queue under us so clear hints */
- clear_all_retrans_hints(tp);
+ if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out &&
+ (TCP_SKB_CB(next_skb)->seq == tp->highest_sack)))
+ return;
/* Ok. We will be able to collapse the packet. */
tcp_unlink_write_queue(next_skb, sk);
@@ -1683,21 +1749,23 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
tp->retrans_out -= tcp_skb_pcount(next_skb);
- if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
+ if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST)
tp->lost_out -= tcp_skb_pcount(next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
- }
/* Reno case is special. Sigh... */
- if (!tp->rx_opt.sack_ok && tp->sacked_out) {
+ if (tcp_is_reno(tp) && tp->sacked_out)
tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
- tp->left_out -= tcp_skb_pcount(next_skb);
+
+ tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb));
+ tp->packets_out -= tcp_skb_pcount(next_skb);
+
+ /* changed transmit queue under us so clear hints */
+ tcp_clear_retrans_hints_partial(tp);
+ /* manually tune sacktag skb hint */
+ if (tp->fastpath_skb_hint == next_skb) {
+ tp->fastpath_skb_hint = skb;
+ tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
}
- /* Not quite right: it can be > snd.fack, but
- * it is better to underestimate fackets.
- */
- tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
- tcp_packets_out_dec(tp, next_skb);
sk_stream_free_skb(sk, next_skb);
}
}
@@ -1731,12 +1799,12 @@ void tcp_simple_retransmit(struct sock *sk)
}
}
- clear_all_retrans_hints(tp);
+ tcp_clear_all_retrans_hints(tp);
if (!lost)
return;
- tcp_sync_left_out(tp);
+ tcp_verify_left_out(tp);
/* Don't muck with the congestion window here.
* Reason is that we do not increase amount of _data_
@@ -1846,6 +1914,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
printk(KERN_DEBUG "retrans_out leaked.\n");
}
#endif
+ if (!tp->retrans_out)
+ tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -1938,40 +2008,35 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
return;
/* No forward retransmissions in Reno are possible. */
- if (!tp->rx_opt.sack_ok)
+ if (tcp_is_reno(tp))
return;
/* Yeah, we have to make difficult choice between forward transmission
* and retransmission... Both ways have their merits...
*
* For now we do not retransmit anything, while we have some new
- * segments to send.
+ * segments to send. In the other cases, follow rule 3 for
+ * NextSeg() specified in RFC3517.
*/
if (tcp_may_send_now(sk))
return;
- if (tp->forward_skb_hint) {
+ /* If nothing is SACKed, highest_sack in the loop won't be valid */
+ if (!tp->sacked_out)
+ return;
+
+ if (tp->forward_skb_hint)
skb = tp->forward_skb_hint;
- packet_cnt = tp->forward_cnt_hint;
- } else{
+ else
skb = tcp_write_queue_head(sk);
- packet_cnt = 0;
- }
tcp_for_write_queue_from(skb, sk) {
if (skb == tcp_send_head(sk))
break;
- tp->forward_cnt_hint = packet_cnt;
tp->forward_skb_hint = skb;
- /* Similar to the retransmit loop above we
- * can pretend that the retransmitted SKB
- * we send out here will be composed of one
- * real MSS sized packet because tcp_retransmit_skb()
- * will fragment it if necessary.
- */
- if (++packet_cnt > tp->fackets_out)
+ if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack))
break;
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index b76398d1b45..87dd5bff315 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/time.h>
+#include <net/net_namespace.h>
#include <net/tcp.h>
@@ -228,7 +229,7 @@ static __init int tcpprobe_init(void)
if (!tcp_probe.log)
goto err0;
- if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops))
+ if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops))
goto err0;
ret = register_jprobe(&tcp_jprobe);
@@ -238,7 +239,7 @@ static __init int tcpprobe_init(void)
pr_info("TCP probe registered (port=%d)\n", port);
return 0;
err1:
- proc_net_remove(procname);
+ proc_net_remove(&init_net, procname);
err0:
kfree(tcp_probe.log);
return ret;
@@ -247,7 +248,7 @@ module_init(tcpprobe_init);
static __exit void tcpprobe_exit(void)
{
- proc_net_remove(procname);
+ proc_net_remove(&init_net, procname);
unregister_jprobe(&tcp_jprobe);
kfree(tcp_probe.log);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e9b151b3a59..d8970ecfcfc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -315,7 +315,7 @@ static void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
if (icsk->icsk_ca_state == TCP_CA_Disorder ||
icsk->icsk_ca_state == TCP_CA_Recovery) {
- if (tp->rx_opt.sack_ok) {
+ if (tcp_is_sack(tp)) {
if (icsk->icsk_ca_state == TCP_CA_Recovery)
NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
else
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 69d4bd10f9c..cb9fc58efb2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -98,6 +98,7 @@
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/checksum.h>
@@ -113,9 +114,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
-static int udp_port_rover;
-
-static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[])
+static inline int __udp_lib_lport_inuse(__u16 num,
+ const struct hlist_head udptable[])
{
struct sock *sk;
struct hlist_node *node;
@@ -132,11 +132,10 @@ static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[])
* @sk: socket struct in question
* @snum: port number to look up
* @udptable: hash list table, must be of UDP_HTABLE_SIZE
- * @port_rover: pointer to record of last unallocated port
* @saddr_comp: AF-dependent comparison of bound local IP addresses
*/
int __udp_lib_get_port(struct sock *sk, unsigned short snum,
- struct hlist_head udptable[], int *port_rover,
+ struct hlist_head udptable[],
int (*saddr_comp)(const struct sock *sk1,
const struct sock *sk2 ) )
{
@@ -146,49 +145,56 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
int error = 1;
write_lock_bh(&udp_hash_lock);
- if (snum == 0) {
- int best_size_so_far, best, result, i;
-
- if (*port_rover > sysctl_local_port_range[1] ||
- *port_rover < sysctl_local_port_range[0])
- *port_rover = sysctl_local_port_range[0];
- best_size_so_far = 32767;
- best = result = *port_rover;
- for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
- int size;
-
- head = &udptable[result & (UDP_HTABLE_SIZE - 1)];
- if (hlist_empty(head)) {
- if (result > sysctl_local_port_range[1])
- result = sysctl_local_port_range[0] +
- ((result - sysctl_local_port_range[0]) &
- (UDP_HTABLE_SIZE - 1));
+
+ if (!snum) {
+ int i, low, high;
+ unsigned rover, best, best_size_so_far;
+
+ inet_get_local_port_range(&low, &high);
+
+ best_size_so_far = UINT_MAX;
+ best = rover = net_random() % (high - low) + low;
+
+ /* 1st pass: look for empty (or shortest) hash chain */
+ for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+ int size = 0;
+
+ head = &udptable[rover & (UDP_HTABLE_SIZE - 1)];
+ if (hlist_empty(head))
goto gotit;
- }
- size = 0;
+
sk_for_each(sk2, node, head) {
if (++size >= best_size_so_far)
goto next;
}
best_size_so_far = size;
- best = result;
+ best = rover;
next:
- ;
+ /* fold back if end of range */
+ if (++rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
+
}
- result = best;
- for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE;
- i++, result += UDP_HTABLE_SIZE) {
- if (result > sysctl_local_port_range[1])
- result = sysctl_local_port_range[0]
- + ((result - sysctl_local_port_range[0]) &
- (UDP_HTABLE_SIZE - 1));
- if (! __udp_lib_lport_inuse(result, udptable))
- break;
+
+ /* 2nd pass: find hole in shortest hash chain */
+ rover = best;
+ for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
+ if (! __udp_lib_lport_inuse(rover, udptable))
+ goto gotit;
+ rover += UDP_HTABLE_SIZE;
+ if (rover > high)
+ rover = low + ((rover - low)
+ & (UDP_HTABLE_SIZE - 1));
}
- if (i >= (1 << 16) / UDP_HTABLE_SIZE)
- goto fail;
+
+ /* All ports in use! */
+ goto fail;
+
gotit:
- *port_rover = snum = result;
+ snum = rover;
} else {
head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
@@ -201,6 +207,7 @@ gotit:
(*saddr_comp)(sk, sk2) )
goto fail;
}
+
inet_sk(sk)->num = snum;
sk->sk_hash = snum;
if (sk_unhashed(sk)) {
@@ -217,7 +224,7 @@ fail:
int udp_get_port(struct sock *sk, unsigned short snum,
int (*scmp)(const struct sock *, const struct sock *))
{
- return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp);
+ return __udp_lib_get_port(sk, snum, udp_hash, scmp);
}
int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
@@ -1560,7 +1567,7 @@ int udp_proc_register(struct udp_seq_afinfo *afinfo)
afinfo->seq_fops->llseek = seq_lseek;
afinfo->seq_fops->release = seq_release_private;
- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops);
if (p)
p->data = afinfo;
else
@@ -1572,7 +1579,7 @@ void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
{
if (!afinfo)
return;
- proc_net_remove(afinfo->name);
+ proc_net_remove(&init_net, afinfo->name);
memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 820a477cfaa..6c55828e41b 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -9,7 +9,7 @@ extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int );
extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []);
extern int __udp_lib_get_port(struct sock *sk, unsigned short snum,
- struct hlist_head udptable[], int *port_rover,
+ struct hlist_head udptable[],
int (*)(const struct sock*,const struct sock*));
extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index f34fd686a8f..94977205abb 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -16,12 +16,11 @@
DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly;
struct hlist_head udplite_hash[UDP_HTABLE_SIZE];
-static int udplite_port_rover;
int udplite_get_port(struct sock *sk, unsigned short p,
int (*c)(const struct sock *, const struct sock *))
{
- return __udp_lib_get_port(sk, p, udplite_hash, &udplite_port_rover, c);
+ return __udp_lib_get_port(sk, p, udplite_hash, c);
}
static int udplite_v4_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 2fa10824541..e9bbfde19ac 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -54,12 +54,14 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
int xfrm_nr = 0;
int decaps = 0;
int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq);
+ unsigned int nhoff = offsetof(struct iphdr, protocol);
if (err != 0)
goto drop;
do {
const struct iphdr *iph = ip_hdr(skb);
+ int nexthdr;
if (xfrm_nr == XFRM_MAX_DEPTH)
goto drop;
@@ -82,9 +84,12 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
if (xfrm_state_check_expire(x))
goto drop_unlock;
- if (x->type->input(x, skb))
+ nexthdr = x->type->input(x, skb);
+ if (nexthdr <= 0)
goto drop_unlock;
+ skb_network_header(skb)[nhoff] = nexthdr;
+
/* only the first xfrm gets the encap type */
encap_type = 0;
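
The transform's ->input hook now returns the inner protocol number (or a negative error) instead of a 0/nonzero status, and the loop writes it back into the outer header's protocol byte via the precomputed nhoff. A self-contained model of that write-back (the struct is a stand-in for struct iphdr, whose protocol field likewise sits at offset 9):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct iphdr_model {
	uint8_t  ihl_version, tos;
	uint16_t tot_len, id, frag_off;
	uint8_t  ttl, protocol;
	uint16_t check;
	uint32_t saddr, daddr;
};

int main(void)
{
	unsigned char hdr[sizeof(struct iphdr_model)] = { 0 };
	size_t nhoff = offsetof(struct iphdr_model, protocol);
	int nexthdr = 4;	/* IPPROTO_IP, as ipip_xfrm_rcv now returns */

	/* skb_network_header(skb)[nhoff] = nexthdr; */
	hdr[nhoff] = (unsigned char)nexthdr;
	printf("protocol byte at offset %zu = %u\n", nhoff, hdr[nhoff]);
	return 0;
}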
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index a73e710740c..73d2338bec5 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -20,38 +20,33 @@
/* Add encapsulation header.
*
* The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
- * The following fields in it shall be filled in by x->type->output:
- * tot_len
- * check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
*/
static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
{
+ struct ip_beet_phdr *ph;
struct iphdr *iph, *top_iph;
int hdrlen, optlen;
iph = ip_hdr(skb);
- skb->transport_header = skb->network_header;
hdrlen = 0;
optlen = iph->ihl * 4 - sizeof(*iph);
if (unlikely(optlen))
hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
- skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen);
- skb_reset_network_header(skb);
- top_iph = ip_hdr(skb);
- skb->transport_header += sizeof(*iph) - hdrlen;
+ skb_set_network_header(skb, IPV4_BEET_PHMAXLEN - x->props.header_len -
+ hdrlen);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*iph);
+
+ ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen);
+ top_iph = ip_hdr(skb);
memmove(top_iph, iph, sizeof(*iph));
if (unlikely(optlen)) {
- struct ip_beet_phdr *ph;
-
BUG_ON(optlen < 0);
- ph = (struct ip_beet_phdr *)skb_transport_header(skb);
ph->padlen = 4 - (optlen & 4);
ph->hdrlen = optlen / 8;
ph->nexthdr = top_iph->protocol;
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 601047161ea..fd840c7d75e 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -17,18 +17,17 @@
*
* The IP header will be moved forward to make space for the encapsulation
* header.
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
*/
static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
int ihl = iph->ihl * 4;
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
skb->transport_header = skb->network_header + ihl;
- skb_push(skb, x->props.header_len);
- skb_reset_network_header(skb);
+ __skb_pull(skb, ihl);
memmove(skb_network_header(skb), iph, ihl);
return 0;
}
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 9963700e74c..1ae9d32276f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -31,13 +31,7 @@ static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
/* Add encapsulation header.
*
- * The top IP header will be constructed per RFC 2401. The following fields
- * in it shall be filled in by x->type->output:
- * tot_len
- * check
- *
- * On exit, skb->h will be set to the start of the payload to be processed
- * by x->type->output and skb->nh will be set to the top IP header.
+ * The top IP header will be constructed per RFC 2401.
*/
static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
{
@@ -47,10 +41,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
int flags;
iph = ip_hdr(skb);
- skb->transport_header = skb->network_header;
- skb_push(skb, x->props.header_len);
- skb_reset_network_header(skb);
+ skb_set_network_header(skb, -x->props.header_len);
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*iph);
top_iph = ip_hdr(skb);
top_iph->ihl = 5;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 44ef208a75c..434ef302ba8 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -12,7 +12,6 @@
#include <linux/if_ether.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
-#include <linux/spinlock.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
#include <net/xfrm.h>
@@ -41,58 +40,32 @@ out:
return ret;
}
-static int xfrm4_output_one(struct sk_buff *skb)
+static inline int xfrm4_output_one(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct xfrm_state *x = dst->xfrm;
+ struct iphdr *iph;
int err;
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- err = skb_checksum_help(skb);
- if (err)
- goto error_nolock;
- }
-
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = xfrm4_tunnel_check_size(skb);
if (err)
goto error_nolock;
}
- do {
- spin_lock_bh(&x->lock);
- err = xfrm_state_check(x, skb);
- if (err)
- goto error;
-
- err = x->mode->output(x, skb);
- if (err)
- goto error;
-
- err = x->type->output(x, skb);
- if (err)
- goto error;
-
- x->curlft.bytes += skb->len;
- x->curlft.packets++;
+ err = xfrm_output(skb);
+ if (err)
+ goto error_nolock;
- spin_unlock_bh(&x->lock);
-
- if (!(skb->dst = dst_pop(dst))) {
- err = -EHOSTUNREACH;
- goto error_nolock;
- }
- dst = skb->dst;
- x = dst->xfrm;
- } while (x && (x->props.mode != XFRM_MODE_TUNNEL));
+ iph = ip_hdr(skb);
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
err = 0;
out_exit:
return err;
-error:
- spin_unlock_bh(&x->lock);
error_nolock:
kfree_skb(skb);
goto out_exit;
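
With the transform loop moved into the generic xfrm_output(), the only IPv4-specific fixup left here is filling tot_len and recomputing the header checksum. A userspace model of what ip_send_check() computes (RFC 1071 one's-complement sum over the header with the check field zeroed):

#include <stdint.h>
#include <stdio.h>

static uint16_t ip_checksum(const uint8_t *hdr, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(hdr[i] << 8 | hdr[i + 1]);
	while (sum >> 16)			/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Classic worked example; check field (bytes 10-11) zeroed. */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46, 0x40, 0x00,
			    0x40, 0x06, 0x00, 0x00, 0xac, 0x10, 0x0a, 0x63,
			    0xac, 0x10, 0x0a, 0x0c };

	printf("check = 0x%04x\n", ip_checksum(hdr, sizeof(hdr)));	/* 0xb1e6 */
	return 0;
}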
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 4ff8ed30024..329825ca68f 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -306,7 +306,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xdst = (struct xfrm_dst *)dst;
if (xdst->u.rt.idev->dev == dev) {
- struct in_device *loopback_idev = in_dev_get(&loopback_dev);
+ struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
BUG_ON(!loopback_idev);
do {
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 9275c79119b..1312417608e 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -12,17 +12,13 @@
static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
{
- struct iphdr *iph = ip_hdr(skb);
-
- iph->tot_len = htons(skb->len);
- ip_send_check(iph);
-
+ skb_push(skb, -skb_network_offset(skb));
return 0;
}
static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
{
- return 0;
+ return IPPROTO_IP;
}
static int ipip_init_state(struct xfrm_state *x)