diff options
author | Chris Mason <chris.mason@oracle.com> | 2010-05-11 15:09:45 -0700 |
---|---|---|
committer | Andy Grover <andy.grover@oracle.com> | 2010-09-08 18:15:08 -0700 |
commit | 38a4e5e61344490f18241333d7b1b368a3a38748 (patch) | |
tree | 60268171e32aff9938334806d0debd5c35b3dc56 | |
parent | e4c52c98e04937ea87b0979a81354d0040d284f9 (diff) | |
download | linux-3.10-38a4e5e61344490f18241333d7b1b368a3a38748.tar.gz linux-3.10-38a4e5e61344490f18241333d7b1b368a3a38748.tar.bz2 linux-3.10-38a4e5e61344490f18241333d7b1b368a3a38748.zip |
rds: Use RCU for the bind lookup searches
The RDS bind lookups are somewhat expensive in terms of CPU
time and locking overhead. This commit changes them into a
faster RCU based hash tree instead of the rbtrees they were using
before.
On large NUMA systems it is a significant improvement.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | net/rds/af_rds.c | 8 | ||||
-rw-r--r-- | net/rds/bind.c | 90 | ||||
-rw-r--r-- | net/rds/ib_rdma.c | 2 | ||||
-rw-r--r-- | net/rds/rds.h | 2 |
4 files changed, 57 insertions, 45 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ef09340cf7a..f16d2a92cb8 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -72,7 +72,15 @@ static int rds_release(struct socket *sock) * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we sychronize_rcu before we free our + * entry + */ rds_remove_bound(rs); + synchronize_rcu(); + rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); diff --git a/net/rds/bind.c b/net/rds/bind.c index a65afff6f4b..2f6b3fcc79f 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -34,45 +34,52 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/if_arp.h> +#include <linux/jhash.h> #include "rds.h" -/* - * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't - * particularly zippy. - * - * This is now called for every incoming frame so we arguably care much more - * about it than we used to. - */ -static DEFINE_RWLOCK(rds_bind_lock); -static struct rb_root rds_bind_tree = RB_ROOT; +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; +static DEFINE_SPINLOCK(rds_bind_lock); + +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} -static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, - struct rds_sock *insert) +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) { - struct rb_node **p = &rds_bind_tree.rb_node; - struct rb_node *parent = NULL; struct rds_sock *rs; + struct hlist_node *node; + struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - while (*p) { - parent = *p; - rs = rb_entry(parent, struct rds_sock, rs_bound_node); - + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (needle < cmp) - p = &(*p)->rb_left; - else if (needle > cmp) - p = &(*p)->rb_right; - else + if (cmp == needle) { + rcu_read_unlock(); return rs; + } } + rcu_read_unlock(); if (insert) { - rb_link_node(&insert->rs_bound_node, parent, p); - rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); } return NULL; } @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; - unsigned long flags; - read_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_tree_walk(addr, port, NULL); + rs = rds_bind_lookup(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - read_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - write_lock_irqsave(&rds_bind_lock, flags); + spin_lock_irqsave(&rds_bind_lock, flags); do { if (rover == 0) rover++; - if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) { - *port = cpu_to_be16(rover); + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); break; } } while (rover++ != last); - if (ret == 0) { - rs->rs_bound_addr = addr; - rs->rs_bound_port = *port; - rds_sock_addref(rs); - - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); - } - - write_unlock_irqrestore(&rds_bind_lock, flags); + spin_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - write_lock_irqsave(&rds_bind_lock, flags); + spin_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - rb_erase(&rs->rs_bound_node, &rds_bind_tree); + hlist_del_init_rcu(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - write_unlock_irqrestore(&rds_bind_lock, flags); + spin_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); return ret; } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index cc341cd70c8..4ba01b9ffd4 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } + memset(ibmr, 0, sizeof(*ibmr)); + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | diff --git a/net/rds/rds.h b/net/rds/rds.h index c22bd7b4946..241a0859d16 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -452,7 +452,7 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ - struct rb_node rs_bound_node; + struct hlist_node rs_bound_node; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; |