diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-24 10:01:50 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-24 10:01:50 -0700 |
commit | 3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (patch) | |
tree | 3df72faaacd494d5ac8c9668df4f529b1b5e4457 /net/core | |
parent | e017507f37d5cb8b541df165a824958bc333bec3 (diff) | |
parent | 320f5ea0cedc08ef65d67e056bcb9d181386ef2c (diff) | |
download | kernel-common-3c4cfadef6a1665d9cd02a543782d03d3e6740c6.tar.gz kernel-common-3c4cfadef6a1665d9cd02a543782d03d3e6740c6.tar.bz2 kernel-common-3c4cfadef6a1665d9cd02a543782d03d3e6740c6.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking changes from David S Miller:
1) Remove the ipv4 routing cache. Now lookups go directly into the FIB
trie and use prebuilt routes cached there.
No more garbage collection, no more rDOS attacks on the routing
cache. Instead we now get predictable and consistent performance,
no matter what the pattern of traffic we service.
This has been almost 2 years in the making. Special thanks to
Julian Anastasov, Eric Dumazet, Steffen Klassert, and others who
have helped along the way.
I'm sure that with a change of this magnitude there will be some
kind of fallout, but such things ought the be simple to fix at this
point. Luckily I'm not European so I'll be around all of August to
fix things :-)
The major stages of this work here are each fronted by a forced
merge commit whose commit message contains a top-level description
of the motivations and implementation issues.
2) Pre-demux of established ipv4 TCP sockets, saves a route demux on
input.
3) TCP SYN/ACK performance tweaks from Eric Dumazet.
4) Add namespace support for netfilter L4 conntrack helpers, from Gao
Feng.
5) Add config mechanism for Energy Efficient Ethernet to ethtool, from
Yuval Mintz.
6) Remove quadratic behavior from /proc/net/unix, from Eric Dumazet.
7) Support for connection tracker helpers in userspace, from Pablo
Neira Ayuso.
8) Allow userspace driven TX load balancing functions in TEAM driver,
from Jiri Pirko.
9) Kill off NLMSG_PUT and RTA_PUT macros, more gross stuff with
embedded gotos.
10) TCP Small Queues, essentially minimize the amount of TCP data queued
up in the packet scheduler layer. Whereas the existing BQL (Byte
Queue Limits) limits the pkt_sched --> netdevice queuing levels,
this controls the TCP --> pkt_sched queueing levels.
From Eric Dumazet.
11) Reduce the number of get_page/put_page ops done on SKB fragments,
from Alexander Duyck.
12) Implement protection against blind resets in TCP (RFC 5961), from
Eric Dumazet.
13) Support the client side of TCP Fast Open, basically the ability to
send data in the SYN exchange, from Yuchung Cheng.
Basically, the sender queues up data with a sendmsg() call using
MSG_FASTOPEN, then they do the connect() which emits the queued up
fastopen data.
14) Avoid all the problems we get into in TCP when timers or PMTU events
hit a locked socket. The TCP Small Queues changes added a
tcp_release_cb() that allows us to queue work up to the
release_sock() caller, and that's what we use here too. From Eric
Dumazet.
15) Zero copy on TX support for TUN driver, from Michael S. Tsirkin.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1870 commits)
genetlink: define lockdep_genl_is_held() when CONFIG_LOCKDEP
r8169: revert "add byte queue limit support".
ipv4: Change rt->rt_iif encoding.
net: Make skb->skb_iif always track skb->dev
ipv4: Prepare for change of rt->rt_iif encoding.
ipv4: Remove all RTCF_DIRECTSRC handliing.
ipv4: Really ignore ICMP address requests/replies.
decnet: Don't set RTCF_DIRECTSRC.
net/ipv4/ip_vti.c: Fix __rcu warnings detected by sparse.
ipv4: Remove redundant assignment
rds: set correct msg_namelen
openvswitch: potential NULL deref in sample()
tcp: dont drop MTU reduction indications
bnx2x: Add new 57840 device IDs
tcp: avoid oops in tcp_metrics and reset tcpm_stamp
niu: Change niu_rbr_fill() to use unlikely() to check niu_rbr_add_page() return value
niu: Fix to check for dma mapping errors.
net: Fix references to out-of-scope variables in put_cmsg_compat()
net: ethernet: davinci_emac: add pm_runtime support
net: ethernet: davinci_emac: Remove unnecessary #include
...
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 1 | ||||
-rw-r--r-- | net/core/dev.c | 44 | ||||
-rw-r--r-- | net/core/dst.c | 25 | ||||
-rw-r--r-- | net/core/ethtool.c | 45 | ||||
-rw-r--r-- | net/core/fib_rules.c | 4 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 5 | ||||
-rw-r--r-- | net/core/neighbour.c | 31 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 74 | ||||
-rw-r--r-- | net/core/netpoll.c | 10 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 53 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 57 | ||||
-rw-r--r-- | net/core/skbuff.c | 71 | ||||
-rw-r--r-- | net/core/sock.c | 15 | ||||
-rw-r--r-- | net/core/sock_diag.c | 42 |
14 files changed, 284 insertions, 193 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index ae6acf6a3dea..0337e2b76862 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -248,7 +248,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ - trace_kfree_skb(skb, skb_free_datagram_locked); __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); diff --git a/net/core/dev.c b/net/core/dev.c index 1cb0d8a6aa6c..0ebaea16632f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1632,6 +1632,8 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + return -ENOMEM; atomic_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } @@ -1691,7 +1693,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } -/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * @@ -1793,6 +1796,18 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif +/** + * netif_get_num_default_rss_queues - default number of RSS queues + * + * This routine should set an upper limit on the number of RSS queues + * used by default by multiqueue devices. + */ +int netif_get_num_default_rss_queues(void) +{ + return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); +} +EXPORT_SYMBOL(netif_get_num_default_rss_queues); + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; @@ -2459,6 +2474,23 @@ static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 /** + * dev_loopback_xmit - loop back @skb + * @skb: buffer to transmit + */ +int dev_loopback_xmit(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->pkt_type = PACKET_LOOPBACK; + skb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(skb)); + skb_dst_force(skb); + netif_rx_ni(skb); + return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + +/** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit * @@ -3141,8 +3173,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->skb_iif) - skb->skb_iif = skb->dev->ifindex; orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3154,6 +3184,7 @@ static int __netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); another_round: + skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); @@ -3232,7 +3263,10 @@ ncls: } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + ret = -ENOMEM; + else + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); @@ -5646,7 +5680,7 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); -/* +/** * netdev_wait_allrefs - wait until all references are gone. * * This is called when unregistering network devices. diff --git a/net/core/dst.c b/net/core/dst.c index 43d94cedbf7c..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -94,7 +94,7 @@ loop: * But we do not have state "obsoleted, but * referenced by parent", so it is right. */ - if (dst->obsolete > 1) + if (dst->obsolete > 0) continue; ___dst_free(dst); @@ -152,7 +152,7 @@ EXPORT_SYMBOL(dst_discard); const u32 dst_default_metrics[RTAX_MAX]; void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags) + int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; @@ -171,7 +171,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; - RCU_INIT_POINTER(dst->_neighbour, NULL); #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -188,6 +187,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; + dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); @@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - dst->obsolete = 2; + dst->obsolete = DST_OBSOLETE_DEAD; } void __dst_free(struct dst_entry *dst) @@ -224,19 +224,12 @@ EXPORT_SYMBOL(__dst_free); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; - struct neighbour *neigh; smp_rmb(); again: - neigh = rcu_dereference_protected(dst->_neighbour, 1); child = dst->child; - if (neigh) { - RCU_INIT_POINTER(dst->_neighbour, NULL); - neigh_release(neigh); - } - if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); @@ -360,19 +353,9 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { - struct neighbour *neigh; - dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); - if (neigh && neigh->dev == dev) { - neigh->dev = dst->dev; - dev_hold(dst->dev); - dev_put(dev); - } - rcu_read_unlock(); } } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 9c2afb480270..cbf033dcaf1f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -729,6 +729,40 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_wol(dev, &wol); } +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + int rc; + + if (!dev->ethtool_ops->get_eee) + return -EOPNOTSUPP; + + memset(&edata, 0, sizeof(struct ethtool_eee)); + edata.cmd = ETHTOOL_GEEE; + rc = dev->ethtool_ops->get_eee(dev, &edata); + + if (rc) + return rc; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + + if (!dev->ethtool_ops->set_eee) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_eee(dev, &edata); +} + static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) @@ -1409,6 +1443,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: + case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: @@ -1417,6 +1452,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: + case ETHTOOL_GSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1429,8 +1465,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GRXFHINDIR: case ETHTOOL_GFEATURES: + case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: + case ETHTOOL_GEEE: break; default: if (!capable(CAP_NET_ADMIN)) @@ -1471,6 +1510,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); break; + case ETHTOOL_GEEE: + rc = ethtool_get_eee(dev, useraddr); + break; + case ETHTOOL_SEEE: + rc = ethtool_set_eee(dev, useraddr); + break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 72cceb79d0d4..ab7db83236c9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); } } @@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a225089df5b6..466820b6e344 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -4,6 +4,7 @@ #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <net/ip.h> +#include <net/ipv6.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> @@ -55,8 +56,8 @@ ipv6: return false; ip_proto = iph->nexthdr; - flow->src = iph->saddr.s6_addr32[3]; - flow->dst = iph->daddr.s6_addr32[3]; + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); break; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d81d026138f0..117afaf51268 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -474,8 +474,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev) +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; @@ -535,14 +535,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { - neigh_hold(n1); + if (want_ref) + neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; - neigh_hold(n); + if (want_ref) + neigh_hold(n); rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); @@ -558,7 +560,7 @@ out_neigh_release: neigh_release(n); goto out; } -EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, int key_len) { @@ -1199,10 +1201,23 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_unlock_bh(&neigh->lock); rcu_read_lock(); - /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) - n1 = n2; + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } n1->output(n1, skb); + if (n2) + neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fdf9e61d0651..72607174ea5a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -417,72 +417,6 @@ static struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; - -#ifdef CONFIG_WIRELESS_EXT_SYSFS -/* helper function that does all the locking etc for wireless stats */ -static ssize_t wireless_show(struct device *d, char *buf, - ssize_t (*format)(const struct iw_statistics *, - char *)) -{ - struct net_device *dev = to_net_dev(d); - const struct iw_statistics *iw; - ssize_t ret = -EINVAL; - - if (!rtnl_trylock()) - return restart_syscall(); - if (dev_isalive(dev)) { - iw = get_wireless_stats(dev); - if (iw) - ret = (*format)(iw, buf); - } - rtnl_unlock(); - - return ret; -} - -/* show function template for wireless fields */ -#define WIRELESS_SHOW(name, field, format_string) \ -static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ -{ \ - return sprintf(buf, format_string, iw->field); \ -} \ -static ssize_t show_iw_##name(struct device *d, \ - struct device_attribute *attr, char *buf) \ -{ \ - return wireless_show(d, buf, format_iw_##name); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) - -WIRELESS_SHOW(status, status, fmt_hex); -WIRELESS_SHOW(link, qual.qual, fmt_dec); -WIRELESS_SHOW(level, qual.level, fmt_dec); -WIRELESS_SHOW(noise, qual.noise, fmt_dec); -WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); -WIRELESS_SHOW(crypt, discard.code, fmt_dec); -WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); -WIRELESS_SHOW(misc, discard.misc, fmt_dec); -WIRELESS_SHOW(retries, discard.retries, fmt_dec); -WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); - -static struct attribute *wireless_attrs[] = { - &dev_attr_status.attr, - &dev_attr_link.attr, - &dev_attr_level.attr, - &dev_attr_noise.attr, - &dev_attr_nwid.attr, - &dev_attr_crypt.attr, - &dev_attr_fragment.attr, - &dev_attr_retries.attr, - &dev_attr_misc.attr, - &dev_attr_beacon.attr, - NULL -}; - -static struct attribute_group wireless_group = { - .name = "wireless", - .attrs = wireless_attrs, -}; -#endif #endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS @@ -1463,14 +1397,6 @@ int netdev_register_kobject(struct net_device *net) groups++; *groups++ = &netstat_group; -#ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->ieee80211_ptr) - *groups++ = &wireless_group; -#ifdef CONFIG_WIRELESS_EXT - else if (net->wireless_handlers) - *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index f9f40b932e4b..b4c90e42b443 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,14 +715,16 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { - struct net_device *ndev = np->dev; struct netpoll_info *npinfo; const struct net_device_ops *ops; unsigned long flags; int err; + np->dev = ndev; + strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || !ndev->netdev_ops->ndo_poll_controller) { np_err(np, "%s doesn't support polling, aborting\n", @@ -851,13 +853,11 @@ int netpoll_setup(struct netpoll *np) np_info(np, "local IP %pI4\n", &np->local_ip); } - np->dev = ndev; - /* fill up the skb queue */ refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np); + err = __netpoll_setup(np, ndev); rtnl_unlock(); if (err) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b2e9caa1ad1a..63d15e8f80e9 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -25,6 +25,8 @@ #include <net/sock.h> #include <net/netprio_cgroup.h> +#include <linux/fdtable.h> + #define PRIOIDX_SZ 128 static unsigned long prioidx_map[PRIOIDX_SZ]; @@ -272,6 +274,56 @@ out_free_devname: return ret; } +void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *p; + char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + + if (!tmp) { + pr_warn("Unable to attach cgrp due to alloc failure!\n"); + return; + } + + cgroup_taskset_for_each(p, cgrp, tset) { + unsigned int fd; + struct fdtable *fdt; + struct files_struct *files; + + task_lock(p); + files = p->files; + if (!files) { + task_unlock(p); + continue; + } + + rcu_read_lock(); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + char *path; + struct file *file; + struct socket *sock; + unsigned long s; + int rv, err = 0; + + file = fcheck_files(files, fd); + if (!file) + continue; + + path = d_path(&file->f_path, tmp, PAGE_SIZE); + rv = sscanf(path, "socket:[%lu]", &s); + if (rv <= 0) + continue; + + sock = sock_from_file(file, &err); + if (!err) + sock_update_netprioidx(sock->sk, p); + } + rcu_read_unlock(); + task_unlock(p); + } + kfree(tmp); +} + static struct cftype ss_files[] = { { .name = "prioidx", @@ -289,6 +341,7 @@ struct cgroup_subsys net_prio_subsys = { .name = "net_prio", .create = cgrp_create, .destroy = cgrp_destroy, + .attach = net_prio_attach, #ifdef CONFIG_NETPRIO_CGROUP .subsys_id = net_prio_subsys_id, #endif diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21318d15bbc3..334b930e0de3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -541,19 +541,6 @@ static const int rta_max[RTM_NR_FAMILIES] = [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, }; -void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) -{ - struct rtattr *rta; - int size = RTA_LENGTH(attrlen); - - rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); - rta->rta_type = attrtype; - rta->rta_len = size; - memcpy(RTA_DATA(rta), data, attrlen); - memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); -} -EXPORT_SYMBOL(__rta_fill); - int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; @@ -628,7 +615,7 @@ nla_put_failure: EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - u32 ts, u32 tsage, long expires, u32 error) + long expires, u32 error) { struct rta_cacheinfo ci = { .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), @@ -636,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, - .rta_ts = ts, - .rta_tsage = tsage, }; if (expires) @@ -786,6 +771,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(ext_filter_mask @@ -904,6 +891,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || +#ifdef CONFIG_RPS + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +#endif (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || (dev->master && @@ -1121,6 +1112,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, + [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, + [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1639,17 +1632,22 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, { int err; struct net_device *dev; - unsigned int num_queues = 1; + unsigned int num_tx_queues = 1; + unsigned int num_rx_queues = 1; - if (ops->get_tx_queues) { - err = ops->get_tx_queues(src_net, tb); - if (err < 0) - goto err; - num_queues = err; - } + if (tb[IFLA_NUM_TX_QUEUES]) + num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); + else if (ops->get_num_tx_queues) + num_tx_queues = ops->get_num_tx_queues(); + + if (tb[IFLA_NUM_RX_QUEUES]) + num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); + else if (ops->get_num_rx_queues) + num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, + num_tx_queues, num_rx_queues); if (!dev) goto err; @@ -2189,7 +2187,7 @@ skip: } /** - * ndo_dflt_fdb_dump: default netdevice operation to dump an FDB table. + * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @nlh: netlink message header * @dev: netdevice * @@ -2366,8 +2364,13 @@ static struct notifier_block rtnetlink_dev_notifier = { static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d124306b81fd..368f65c15e4f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -160,8 +160,8 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. @@ -296,9 +296,12 @@ EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { struct page *page; unsigned int offset; + unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) + /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size @@ -317,17 +320,26 @@ void *netdev_alloc_frag(unsigned int fragsz) if (unlikely(!nc->page)) { refill: nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); + if (unlikely(!nc->page)) + goto end; +recycle: + atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS); + nc->pagecnt_bias = NETDEV_PAGECNT_BIAS; nc->offset = 0; } - if (likely(nc->page)) { - if (nc->offset + fragsz > PAGE_SIZE) { - put_page(nc->page); - goto refill; - } - data = page_address(nc->page) + nc->offset; - nc->offset += fragsz; - get_page(nc->page); + + if (nc->offset + fragsz > PAGE_SIZE) { + /* avoid unnecessary locked operations if possible */ + if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) || + atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count)) + goto recycle; + goto refill; } + + data = page_address(nc->page) + nc->offset; + nc->offset += fragsz; + nc->pagecnt_bias--; +end: local_irq_restore(flags); return data; } @@ -713,7 +725,8 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -/* skb_copy_ubufs - copy userspace skb frags buffers to kernel +/** + * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * @@ -738,7 +751,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) u8 *vaddr; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - page = alloc_page(GFP_ATOMIC); + page = alloc_page(gfp_mask); if (!page) { while (head) { struct page *next = (struct page *)head->private; @@ -756,22 +769,22 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } /* skb frags release userspace buffers */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); uarg->callback(uarg); /* skb frags point to kernel buffers */ - for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { - __skb_fill_page_desc(skb, i-1, head, 0, - skb_shinfo(skb)->frags[i - 1].size); + for (i = num_frags - 1; i >= 0; i--) { + __skb_fill_page_desc(skb, i, head, 0, + skb_shinfo(skb)->frags[i].size); head = (struct page *)head->private; } skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; } - +EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff @@ -791,10 +804,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - return NULL; - } + if (skb_orphan_frags(skb, gfp_mask)) + return NULL; n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && @@ -914,12 +925,10 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) { - kfree_skb(n); - n = NULL; - goto out; - } + if (skb_orphan_frags(skb, gfp_mask)) { + kfree_skb(n); + n = NULL; + goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; @@ -992,10 +1001,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, */ if (skb_cloned(skb)) { /* copy this zero copy skb frags */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - goto nofrags; - } + if (skb_orphan_frags(skb, gfp_mask)) + goto nofrags; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -2614,7 +2621,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); /** - * skb_append_datato_frags: - append the user data to a skb + * skb_append_datato_frags - append the user data to a skb * @sk: sock structure * @skb: skb structure to be appened with user data. * @getfrag: call back function to be used for getting the user data diff --git a/net/core/sock.c b/net/core/sock.c index 9e5b71fda6ec..2676a88f533e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1180,12 +1180,12 @@ void sock_update_classid(struct sock *sk) } EXPORT_SYMBOL(sock_update_classid); -void sock_update_netprioidx(struct sock *sk) +void sock_update_netprioidx(struct sock *sk, struct task_struct *task) { if (in_interrupt()) return; - sk->sk_cgrp_prioidx = task_netprioidx(current); + sk->sk_cgrp_prioidx = task_netprioidx(task); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif @@ -1215,7 +1215,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_netprioidx(sk, current); } return sk; @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); int sock_i_uid(struct sock *sk) { @@ -2154,6 +2159,10 @@ void release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); + + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 5fd146720f39..9d8755e4a7a5 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -4,7 +4,6 @@ #include <net/netlink.h> #include <net/net_namespace.h> #include <linux/module.h> -#include <linux/rtnetlink.h> #include <net/sock.h> #include <linux/inet_diag.h> @@ -35,9 +34,7 @@ EXPORT_SYMBOL_GPL(sock_diag_save_cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { - __u32 *mem; - - mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32))); + u32 mem[SK_MEMINFO_VARS]; mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; @@ -46,11 +43,9 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - return 0; - -rtattr_failure: - return -EMSGSIZE; + return nla_put(skb, attrtype, sizeof(mem), &mem); } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); @@ -120,7 +115,7 @@ static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; - struct sock_diag_req *req = NLMSG_DATA(nlh); + struct sock_diag_req *req = nlmsg_data(nlh); const struct sock_diag_handler *hndl; if (nlmsg_len(nlh) < sizeof(*req)) @@ -171,19 +166,36 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } -struct sock *sock_diag_nlsk; -EXPORT_SYMBOL_GPL(sock_diag_nlsk); +static int __net_init diag_net_init(struct net *net) +{ + struct netlink_kernel_cfg cfg = { + .input = sock_diag_rcv, + }; + + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, + THIS_MODULE, &cfg); + return net->diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __net_exit diag_net_exit(struct net *net) +{ + netlink_kernel_release(net->diag_nlsk); + net->diag_nlsk = NULL; +} + +static struct pernet_operations diag_net_ops = { + .init = diag_net_init, + .exit = diag_net_exit, +}; static int __init sock_diag_init(void) { - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, - sock_diag_rcv, NULL, THIS_MODULE); - return sock_diag_nlsk == NULL ? -ENOMEM : 0; + return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { - netlink_kernel_release(sock_diag_nlsk); + unregister_pernet_subsys(&diag_net_ops); } module_init(sock_diag_init); |