diff options
author | Eli Cohen <eli@dev.mellanox.co.il> | 2010-10-13 21:26:51 +0200 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2010-10-13 15:46:43 -0700 |
commit | 3c86aa70bf677a31b71c8292e349242e26cbc743 (patch) | |
tree | 7f38edd826e444b1232185e154f313e70966d250 /drivers/infiniband | |
parent | fac70d51914674ce8ae742ed73441ddb4770ad20 (diff) | |
download | linux-3.10-3c86aa70bf677a31b71c8292e349242e26cbc743.tar.gz linux-3.10-3c86aa70bf677a31b71c8292e349242e26cbc743.tar.bz2 linux-3.10-3c86aa70bf677a31b71c8292e349242e26cbc743.zip |
RDMA/cm: Add RDMA CM support for IBoE devices
Add support for IBoE device binding and IP --> GID resolution. Path
resolving and multicast joining are implemented within cma.c by
filling in the responses and running callbacks in the CMA work queue.
IP --> GID resolution always yields IPv6 link local addresses; remote
GIDs are derived from the destination MAC address of the remote port.
Multicast GIDs are always mapped to multicast MACs as is done in IPv6.
(IPv4 multicast is enabled by translating IPv4 multicast addresses to
IPv6 multicast as described in
<http://www.mail-archive.com/ipng@sunroof.eng.sun.com/msg02134.html>.)
Some helper functions are added to ib_addr.h.
Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/core/cma.c | 309 | ||||
-rw-r--r-- | drivers/infiniband/core/sa_query.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/core/ucma.c | 45 |
3 files changed, 333 insertions, 26 deletions
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index b930b8110a6..f61bc073848 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -59,6 +59,7 @@ MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) +#define CMA_IBOE_PACKET_LIFETIME 18 static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -157,6 +158,7 @@ struct cma_multicast { struct list_head list; void *context; struct sockaddr_storage addr; + struct kref mcref; }; struct cma_work { @@ -173,6 +175,12 @@ struct cma_ndev_work { struct rdma_cm_event event; }; +struct iboe_mcast_work { + struct work_struct work; + struct rdma_id_private *id; + struct cma_multicast *mc; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -281,6 +289,8 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, atomic_inc(&cma_dev->refcount); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; + id_priv->id.route.addr.dev_addr.transport = + rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } @@ -290,6 +300,14 @@ static inline void cma_deref_dev(struct cma_device *cma_dev) complete(&cma_dev->comp); } +static inline void release_mc(struct kref *kref) +{ + struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref); + + kfree(mc->multicast.ib); + kfree(mc); +} + static void cma_detach_from_dev(struct rdma_id_private *id_priv) { list_del(&id_priv->list); @@ -323,22 +341,63 @@ static int cma_set_qkey(struct rdma_id_private *id_priv) return ret; } +static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) +{ + int i; + int err; + struct ib_port_attr props; + union ib_gid tmp; + + err = ib_query_port(device, port_num, &props); + if (err) + return 1; + + for (i = 0; i < props.gid_tbl_len; ++i) { + err = ib_query_gid(device, port_num, i, &tmp); + if (err) + return 1; + if (!memcmp(&tmp, gid, sizeof tmp)) + return 0; + } + + return -EAGAIN; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid; + union ib_gid gid, iboe_gid; int ret = -ENODEV; + u8 port; + enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; - rdma_addr_get_sgid(dev_addr, &gid); + iboe_addr_get_sgid(dev_addr, &iboe_gid); + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { - ret = ib_find_cached_gid(cma_dev->device, &gid, - &id_priv->id.port_num, NULL); - if (!ret) { - cma_attach_to_dev(id_priv, cma_dev); - break; + for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { + if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { + if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) + ret = find_gid_port(cma_dev->device, &iboe_gid, port); + else + ret = find_gid_port(cma_dev->device, &gid, port); + + if (!ret) { + id_priv->id.port_num = port; + goto out; + } else if (ret == 1) + break; + } } } + +out: + if (!ret) + cma_attach_to_dev(id_priv, cma_dev); + return ret; } @@ -556,10 +615,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; + u16 pkey; + + if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == + IB_LINK_LAYER_INFINIBAND) + pkey = ib_addr_get_pkey(dev_addr); + else + pkey = 0xffff; ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, - ib_addr_get_pkey(dev_addr), - &qp_attr->pkey_index); + pkey, &qp_attr->pkey_index); if (ret) return ret; @@ -737,8 +802,8 @@ static inline int cma_user_data_offset(enum rdma_port_space ps) static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: + switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); break; @@ -816,8 +881,17 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); + switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } } } @@ -833,7 +907,7 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_lock(&lock); if (id_priv->cma_dev) { mutex_unlock(&lock); - switch (rdma_node_get_transport(id->device->node_type)) { + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1708,6 +1782,77 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) return 0; } +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; + int ret; + struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; + struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; + struct net_device *ndev = NULL; + + if (src_addr->sin_family != dst_addr->sin_family) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + + route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + route->num_paths = 1; + + iboe_mac_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr); + iboe_mac_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr); + + route->path_rec->hop_limit = 1; + route->path_rec->reversible = 1; + route->path_rec->pkey = cpu_to_be16(0xffff); + route->path_rec->mtu_selector = IB_SA_EQ; + + if (addr->dev_addr.bound_dev_if) + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) { + ret = -ENODEV; + goto err2; + } + + route->path_rec->mtu = iboe_get_mtu(ndev->mtu); + route->path_rec->rate_selector = IB_SA_EQ; + route->path_rec->rate = iboe_get_rate(ndev); + dev_put(ndev); + route->path_rec->packet_life_time_selector = IB_SA_EQ; + route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + if (!route->path_rec->mtu) { + ret = -EINVAL; + goto err2; + } + + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + work->event.status = 0; + + queue_work(cma_wq, &work->work); + + return 0; + +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) { struct rdma_id_private *id_priv; @@ -1720,7 +1865,16 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) atomic_inc(&id_priv->refcount); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - ret = cma_resolve_ib_route(id_priv, timeout_ms); + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_resolve_ib_route(id_priv, timeout_ms); + break; + case IB_LINK_LAYER_ETHERNET: + ret = cma_resolve_iboe_route(id_priv); + break; + default: + ret = -ENOSYS; + } break; case RDMA_TRANSPORT_IWARP: ret = cma_resolve_iw_route(id_priv, timeout_ms); @@ -1773,7 +1927,7 @@ port_found: goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB) ? + (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); @@ -2758,6 +2912,102 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, return 0; } +static void iboe_mcast_work_handler(struct work_struct *work) +{ + struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); + struct cma_multicast *mc = mw->mc; + struct ib_sa_multicast *m = mc->multicast.ib; + + mc->multicast.ib->context = mc; + cma_ib_mc_handler(0, m); + kref_put(&mc->mcref, release_mc); + kfree(mw); +} + +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if (addr->sa_family == AF_INET6) { + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + mgid->raw[0] = 0xff; + mgid->raw[1] = 0x0e; + mgid->raw[2] = 0; + mgid->raw[3] = 0; + mgid->raw[4] = 0; + mgid->raw[5] = 0; + mgid->raw[6] = 0; + mgid->raw[7] = 0; + mgid->raw[8] = 0; + mgid->raw[9] = 0; + mgid->raw[10] = 0xff; + mgid->raw[11] = 0xff; + *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; + } +} + +static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct iboe_mcast_work *work; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int err; + struct sockaddr *addr = (struct sockaddr *)&mc->addr; + struct net_device *ndev = NULL; + + if (cma_zero_addr((struct sockaddr *)&mc->addr)) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL); + if (!mc->multicast.ib) { + err = -ENOMEM; + goto out1; + } + + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + + mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); + if (id_priv->id.ps == RDMA_PS_UDP) + mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!ndev) { + err = -ENODEV; + goto out2; + } + mc->multicast.ib->rec.rate = iboe_get_rate(ndev); + mc->multicast.ib->rec.hop_limit = 1; + mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + dev_put(ndev); + if (!mc->multicast.ib->rec.mtu) { + err = -EINVAL; + goto out2; + } + iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid); + work->id = id_priv; + work->mc = mc; + INIT_WORK(&work->work, iboe_mcast_work_handler); + kref_get(&mc->mcref); + queue_work(cma_wq, &work->work); + + return 0; + +out2: + kfree(mc->multicast.ib); +out1: + kfree(work); + return err; +} + int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, void *context) { @@ -2784,7 +3034,17 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - ret = cma_join_ib_multicast(id_priv, mc); + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ret = cma_join_ib_multicast(id_priv, mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_init(&mc->mcref); + ret = cma_iboe_join_multicast(id_priv, mc); + break; + default: + ret = -EINVAL; + } break; default: ret = -ENOSYS; @@ -2817,8 +3077,19 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) ib_detach_mcast(id->qp, &mc->multicast.ib->rec.mgid, mc->multicast.ib->rec.mlid); - ib_sa_free_multicast(mc->multicast.ib); - kfree(mc); + if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_get_link_layer(id->device, id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ib_sa_free_multicast(mc->multicast.ib); + kfree(mc); + break; + case IB_LINK_LAYER_ETHERNET: + kref_put(&mc->mcref, release_mc); + break; + default: + break; + } + } return; } } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 27674c790a7..91a660310b7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -496,6 +496,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; + int force_grh; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -505,7 +506,9 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - if (rec->hop_limit > 1) { + force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; + + if (rec->hop_limit > 1 || force_grh) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ac7edc24165..3d3c9264c45 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -583,6 +583,34 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, } } +static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, + struct rdma_route *route) +{ + struct rdma_dev_addr *dev_addr; + + resp->num_paths = route->num_paths; + switch (route->num_paths) { + case 0: + dev_addr = &route->addr.dev_addr; + iboe_mac_to_ll((union ib_gid *) &resp->ib_route[0].dgid, + dev_addr->dst_dev_addr); + iboe_addr_get_sgid(dev_addr, + (union ib_gid *) &resp->ib_route[0].sgid); + resp->ib_route[0].pkey = cpu_to_be16(0xffff); + break; + case 2: + ib_copy_path_rec_to_user(&resp->ib_route[1], + &route->path_rec[1]); + /* fall through */ + case 1: + ib_copy_path_rec_to_user(&resp->ib_route[0], + &route->path_rec[0]); + break; + default: + break; + } +} + static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -617,12 +645,17 @@ static ssize_t ucma_query_route(struct ucma_file *file, resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - ucma_copy_ib_route(&resp, &ctx->cm_id->route); - break; - default: - break; + if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) { + case IB_LINK_LAYER_INFINIBAND: + ucma_copy_ib_route(&resp, &ctx->cm_id->route); + break; + case IB_LINK_LAYER_ETHERNET: + ucma_copy_iboe_route(&resp, &ctx->cm_id->route); + break; + default: + break; + } } out: |