summaryrefslogtreecommitdiff
path: root/drivers/infiniband
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /drivers/infiniband
downloadlinux-3.10-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz
linux-3.10-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2
linux-3.10-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--drivers/infiniband/Kconfig14
-rw-r--r--drivers/infiniband/Makefile3
-rw-r--r--drivers/infiniband/core/Makefile12
-rw-r--r--drivers/infiniband/core/agent.c373
-rw-r--r--drivers/infiniband/core/agent.h55
-rw-r--r--drivers/infiniband/core/agent_priv.h63
-rw-r--r--drivers/infiniband/core/cache.c365
-rw-r--r--drivers/infiniband/core/core_priv.h52
-rw-r--r--drivers/infiniband/core/device.c614
-rw-r--r--drivers/infiniband/core/fmr_pool.c507
-rw-r--r--drivers/infiniband/core/mad.c2714
-rw-r--r--drivers/infiniband/core/mad_priv.h199
-rw-r--r--drivers/infiniband/core/packer.c201
-rw-r--r--drivers/infiniband/core/sa_query.c866
-rw-r--r--drivers/infiniband/core/smi.c234
-rw-r--r--drivers/infiniband/core/smi.h67
-rw-r--r--drivers/infiniband/core/sysfs.c762
-rw-r--r--drivers/infiniband/core/ud_header.c365
-rw-r--r--drivers/infiniband/core/user_mad.c840
-rw-r--r--drivers/infiniband/core/verbs.c434
-rw-r--r--drivers/infiniband/hw/mthca/Kconfig16
-rw-r--r--drivers/infiniband/hw/mthca/Makefile12
-rw-r--r--drivers/infiniband/hw/mthca/mthca_allocator.c179
-rw-r--r--drivers/infiniband/hw/mthca/mthca_av.c241
-rw-r--r--drivers/infiniband/hw/mthca/mthca_cmd.c1767
-rw-r--r--drivers/infiniband/hw/mthca/mthca_cmd.h310
-rw-r--r--drivers/infiniband/hw/mthca/mthca_config_reg.h51
-rw-r--r--drivers/infiniband/hw/mthca/mthca_cq.c918
-rw-r--r--drivers/infiniband/hw/mthca/mthca_dev.h437
-rw-r--r--drivers/infiniband/hw/mthca/mthca_doorbell.h95
-rw-r--r--drivers/infiniband/hw/mthca/mthca_eq.c964
-rw-r--r--drivers/infiniband/hw/mthca/mthca_mad.c323
-rw-r--r--drivers/infiniband/hw/mthca/mthca_main.c1123
-rw-r--r--drivers/infiniband/hw/mthca/mthca_mcg.c376
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.c465
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.h161
-rw-r--r--drivers/infiniband/hw/mthca/mthca_mr.c416
-rw-r--r--drivers/infiniband/hw/mthca/mthca_pd.c80
-rw-r--r--drivers/infiniband/hw/mthca/mthca_profile.c266
-rw-r--r--drivers/infiniband/hw/mthca/mthca_profile.h58
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c660
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.h251
-rw-r--r--drivers/infiniband/hw/mthca/mthca_qp.c2056
-rw-r--r--drivers/infiniband/hw/mthca/mthca_reset.c232
-rw-r--r--drivers/infiniband/hw/mthca/mthca_uar.c78
-rw-r--r--drivers/infiniband/include/ib_cache.h103
-rw-r--r--drivers/infiniband/include/ib_fmr_pool.h92
-rw-r--r--drivers/infiniband/include/ib_mad.h404
-rw-r--r--drivers/infiniband/include/ib_pack.h245
-rw-r--r--drivers/infiniband/include/ib_sa.h308
-rw-r--r--drivers/infiniband/include/ib_smi.h96
-rw-r--r--drivers/infiniband/include/ib_user_mad.h123
-rw-r--r--drivers/infiniband/include/ib_verbs.h1252
-rw-r--r--drivers/infiniband/ulp/ipoib/Kconfig33
-rw-r--r--drivers/infiniband/ulp/ipoib/Makefile11
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h353
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_fs.c287
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c668
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c1103
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c991
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c260
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_vlan.c177
62 files changed, 26781 insertions, 0 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
new file mode 100644
index 00000000000..3cc3ff0cccb
--- /dev/null
+++ b/drivers/infiniband/Kconfig
@@ -0,0 +1,14 @@
+menu "InfiniBand support"
+
+config INFINIBAND
+ tristate "InfiniBand support"
+ ---help---
+ Core support for InfiniBand (IB). Make sure to also select
+ any protocols you wish to use as well as drivers for your
+ InfiniBand hardware.
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+endmenu
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
new file mode 100644
index 00000000000..d256cf79821
--- /dev/null
+++ b/drivers/infiniband/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND) += core/
+obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/
+obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 00000000000..d2dbfb52c0a
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o ib_umad.o
+
+ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
+ device.o fmr_pool.o cache.o
+
+ib_mad-y := mad.o smi.o agent.o
+
+ib_sa-y := sa_query.o
+
+ib_umad-y := user_mad.o
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 00000000000..7aee5ebf3f0
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/dma-mapping.h>
+
+#include <asm/bug.h>
+
+#include <ib_smi.h>
+
+#include "smi.h"
+#include "agent_priv.h"
+#include "mad_priv.h"
+#include "agent.h"
+
+spinlock_t ib_agent_port_list_lock;
+static LIST_HEAD(ib_agent_port_list);
+
+/*
+ * Caller must hold ib_agent_port_list_lock
+ */
+static inline struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num,
+ struct ib_mad_agent *mad_agent)
+{
+ struct ib_agent_port_private *entry;
+
+ BUG_ON(!(!!device ^ !!mad_agent)); /* Exactly one MUST be (!NULL) */
+
+ if (device) {
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->smp_agent->device == device &&
+ entry->port_num == port_num)
+ return entry;
+ }
+ } else {
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if ((entry->smp_agent == mad_agent) ||
+ (entry->perf_mgmt_agent == mad_agent))
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static inline struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num,
+ struct ib_mad_agent *mad_agent)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num, mad_agent);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return entry;
+}
+
+int smi_check_local_dr_smp(struct ib_smp *smp,
+ struct ib_device *device,
+ int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+
+ if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return 1;
+ port_priv = ib_get_agent_port(device, port_num, NULL);
+ if (!port_priv) {
+ printk(KERN_DEBUG SPFX "smi_check_local_dr_smp %s port %d "
+ "not open\n",
+ device->name, port_num);
+ return 1;
+ }
+
+ return smi_check_local_smp(port_priv->smp_agent, smp);
+}
+
+static int agent_mad_send(struct ib_mad_agent *mad_agent,
+ struct ib_agent_port_private *port_priv,
+ struct ib_mad_private *mad_priv,
+ struct ib_grh *grh,
+ struct ib_wc *wc)
+{
+ struct ib_agent_send_wr *agent_send_wr;
+ struct ib_sge gather_list;
+ struct ib_send_wr send_wr;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+ int ret = 1;
+
+ agent_send_wr = kmalloc(sizeof(*agent_send_wr), GFP_KERNEL);
+ if (!agent_send_wr)
+ goto out;
+ agent_send_wr->mad = mad_priv;
+
+ /* PCI mapping */
+ gather_list.addr = dma_map_single(mad_agent->device->dma_device,
+ &mad_priv->mad,
+ sizeof(mad_priv->mad),
+ DMA_TO_DEVICE);
+ gather_list.length = sizeof(mad_priv->mad);
+ gather_list.lkey = (*port_priv->mr).lkey;
+
+ send_wr.next = NULL;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.sg_list = &gather_list;
+ send_wr.num_sge = 1;
+ send_wr.wr.ud.remote_qpn = wc->src_qp; /* DQPN */
+ send_wr.wr.ud.timeout_ms = 0;
+ send_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+ ah_attr.dlid = wc->slid;
+ ah_attr.port_num = mad_agent->port_num;
+ ah_attr.src_path_bits = wc->dlid_path_bits;
+ ah_attr.sl = wc->sl;
+ ah_attr.static_rate = 0;
+ ah_attr.ah_flags = 0; /* No GRH */
+ if (mad_priv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT) {
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ /* Should sgid be looked up ? */
+ ah_attr.grh.sgid_index = 0;
+ ah_attr.grh.hop_limit = grh->hop_limit;
+ ah_attr.grh.flow_label = be32_to_cpup(
+ &grh->version_tclass_flow) & 0xfffff;
+ ah_attr.grh.traffic_class = (be32_to_cpup(
+ &grh->version_tclass_flow) >> 20) & 0xff;
+ memcpy(ah_attr.grh.dgid.raw,
+ grh->sgid.raw,
+ sizeof(ah_attr.grh.dgid));
+ }
+ }
+
+ agent_send_wr->ah = ib_create_ah(mad_agent->qp->pd, &ah_attr);
+ if (IS_ERR(agent_send_wr->ah)) {
+ printk(KERN_ERR SPFX "No memory for address handle\n");
+ kfree(agent_send_wr);
+ goto out;
+ }
+
+ send_wr.wr.ud.ah = agent_send_wr->ah;
+ if (mad_priv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT) {
+ send_wr.wr.ud.pkey_index = wc->pkey_index;
+ send_wr.wr.ud.remote_qkey = IB_QP1_QKEY;
+ } else { /* for SMPs */
+ send_wr.wr.ud.pkey_index = 0;
+ send_wr.wr.ud.remote_qkey = 0;
+ }
+ send_wr.wr.ud.mad_hdr = &mad_priv->mad.mad.mad_hdr;
+ send_wr.wr_id = (unsigned long)agent_send_wr;
+
+ pci_unmap_addr_set(agent_send_wr, mapping, gather_list.addr);
+
+ /* Send */
+ spin_lock_irqsave(&port_priv->send_list_lock, flags);
+ if (ib_post_send_mad(mad_agent, &send_wr, &bad_send_wr)) {
+ spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+ dma_unmap_single(mad_agent->device->dma_device,
+ pci_unmap_addr(agent_send_wr, mapping),
+ sizeof(mad_priv->mad),
+ DMA_TO_DEVICE);
+ ib_destroy_ah(agent_send_wr->ah);
+ kfree(agent_send_wr);
+ } else {
+ list_add_tail(&agent_send_wr->send_list,
+ &port_priv->send_posted_list);
+ spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+ ret = 0;
+ }
+
+out:
+ return ret;
+}
+
+int agent_send(struct ib_mad_private *mad,
+ struct ib_grh *grh,
+ struct ib_wc *wc,
+ struct ib_device *device,
+ int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_mad_agent *mad_agent;
+
+ port_priv = ib_get_agent_port(device, port_num, NULL);
+ if (!port_priv) {
+ printk(KERN_DEBUG SPFX "agent_send %s port %d not open\n",
+ device->name, port_num);
+ return 1;
+ }
+
+ /* Get mad agent based on mgmt_class in MAD */
+ switch (mad->mad.mad.mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ mad_agent = port_priv->smp_agent;
+ break;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ mad_agent = port_priv->perf_mgmt_agent;
+ break;
+ default:
+ return 1;
+ }
+
+ return agent_mad_send(mad_agent, port_priv, mad, grh, wc);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_agent_send_wr *agent_send_wr;
+ unsigned long flags;
+
+ /* Find matching MAD agent */
+ port_priv = ib_get_agent_port(NULL, 0, mad_agent);
+ if (!port_priv) {
+ printk(KERN_ERR SPFX "agent_send_handler: no matching MAD "
+ "agent %p\n", mad_agent);
+ return;
+ }
+
+ agent_send_wr = (struct ib_agent_send_wr *)(unsigned long)mad_send_wc->wr_id;
+ spin_lock_irqsave(&port_priv->send_list_lock, flags);
+ /* Remove completed send from posted send MAD list */
+ list_del(&agent_send_wr->send_list);
+ spin_unlock_irqrestore(&port_priv->send_list_lock, flags);
+
+ /* Unmap PCI */
+ dma_unmap_single(mad_agent->device->dma_device,
+ pci_unmap_addr(agent_send_wr, mapping),
+ sizeof(agent_send_wr->mad->mad),
+ DMA_TO_DEVICE);
+
+ ib_destroy_ah(agent_send_wr->ah);
+
+ /* Release allocated memory */
+ kmem_cache_free(ib_mad_cache, agent_send_wr->mad);
+ kfree(agent_send_wr);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ int ret;
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ /* First, check if port already open for SMI */
+ port_priv = ib_get_agent_port(device, port_num, NULL);
+ if (port_priv) {
+ printk(KERN_DEBUG SPFX "%s port %d already open\n",
+ device->name, port_num);
+ return 0;
+ }
+
+ /* Create new device info */
+ port_priv = kmalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ memset(port_priv, 0, sizeof *port_priv);
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->send_list_lock);
+ INIT_LIST_HEAD(&port_priv->send_posted_list);
+
+ /* Obtain send only MAD agent for SM class (SMI QP) */
+ port_priv->smp_agent = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI,
+ NULL, 0,
+ &agent_send_handler,
+ NULL, NULL);
+
+ if (IS_ERR(port_priv->smp_agent)) {
+ ret = PTR_ERR(port_priv->smp_agent);
+ goto error2;
+ }
+
+ /* Obtain send only MAD agent for PerfMgmt class (GSI QP) */
+ port_priv->perf_mgmt_agent = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI,
+ NULL, 0,
+ &agent_send_handler,
+ NULL, NULL);
+ if (IS_ERR(port_priv->perf_mgmt_agent)) {
+ ret = PTR_ERR(port_priv->perf_mgmt_agent);
+ goto error3;
+ }
+
+ port_priv->mr = ib_get_dma_mr(port_priv->smp_agent->qp->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(port_priv->mr)) {
+ printk(KERN_ERR SPFX "Couldn't get DMA MR\n");
+ ret = PTR_ERR(port_priv->mr);
+ goto error4;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error4:
+ ib_unregister_mad_agent(port_priv->perf_mgmt_agent);
+error3:
+ ib_unregister_mad_agent(port_priv->smp_agent);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num, NULL);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_dereg_mr(port_priv->mr);
+
+ ib_unregister_mad_agent(port_priv->perf_mgmt_agent);
+ ib_unregister_mad_agent(port_priv->smp_agent);
+ kfree(port_priv);
+
+ return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 00000000000..d9426842254
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+extern spinlock_t ib_agent_port_list_lock;
+
+extern int ib_agent_port_open(struct ib_device *device,
+ int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern int agent_send(struct ib_mad_private *mad,
+ struct ib_grh *grh,
+ struct ib_wc *wc,
+ struct ib_device *device,
+ int port_num);
+
+#endif /* __AGENT_H_ */
diff --git a/drivers/infiniband/core/agent_priv.h b/drivers/infiniband/core/agent_priv.h
new file mode 100644
index 00000000000..17a0cce5813
--- /dev/null
+++ b/drivers/infiniband/core/agent_priv.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: agent_priv.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __IB_AGENT_PRIV_H__
+#define __IB_AGENT_PRIV_H__
+
+#include <linux/pci.h>
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_send_wr {
+ struct list_head send_list;
+ struct ib_ah *ah;
+ struct ib_mad_private *mad;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct list_head send_posted_list;
+ spinlock_t send_list_lock;
+ int port_num;
+ struct ib_mad_agent *smp_agent; /* SM class */
+ struct ib_mad_agent *perf_mgmt_agent; /* PerfMgmt class */
+ struct ib_mr *mr;
+};
+
+#endif /* __IB_AGENT_PRIV_H__ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 00000000000..3042360c97e
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include <ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+ return device->node_type == IB_NODE_SWITCH ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+ return device->node_type == IB_NODE_SWITCH ? 0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.gid_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *gid = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int p, i;
+ int ret = -ENOENT;
+
+ *port_num = -1;
+ if (index)
+ *index = -1;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ cache = device->cache.gid_cache[p];
+ for (i = 0; i < cache->table_len; ++i) {
+ if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+ *port_num = p + start_port(device);
+ if (index)
+ *index = i;
+ ret = 0;
+ goto found;
+ }
+ }
+ }
+found:
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ int i;
+ int ret;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+ sizeof *pkey_cache->table, GFP_KERNEL);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+ sizeof *gid_cache->table, GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
+
+ gid_cache->table_len = tprops->gid_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i, gid_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ write_lock_irq(&device->cache.lock);
+
+ old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+ old_gid_cache = device->cache.gid_cache [port - start_port(device)];
+
+ device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+ device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+ write_unlock_irq(&device->cache.lock);
+
+ kfree(old_pkey_cache);
+ kfree(old_gid_cache);
+ kfree(tprops);
+ return;
+
+err:
+ kfree(pkey_cache);
+ kfree(gid_cache);
+ kfree(tprops);
+}
+
+static void ib_cache_task(void *work_ptr)
+{
+ struct ib_update_work *work = work_ptr;
+
+ ib_cache_update(work->device, work->port_num);
+ kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE) {
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(&work->work, ib_cache_task, work);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ schedule_work(&work->work);
+ }
+ }
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+
+ rwlock_init(&device->cache.lock);
+
+ device->cache.pkey_cache =
+ kmalloc(sizeof *device->cache.pkey_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+ device->cache.gid_cache =
+ kmalloc(sizeof *device->cache.pkey_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+ if (!device->cache.pkey_cache || !device->cache.gid_cache) {
+ printk(KERN_WARNING "Couldn't allocate cache "
+ "for %s\n", device->name);
+ goto err;
+ }
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ device->cache.pkey_cache[p] = NULL;
+ device->cache.gid_cache [p] = NULL;
+ ib_cache_update(device, p + start_port(device));
+ }
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ if (ib_register_event_handler(&device->cache.event_handler))
+ goto err_cache;
+
+ return;
+
+err_cache:
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+err:
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+ int p;
+
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_scheduled_work();
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+}
+
+static struct ib_client cache_client = {
+ .name = "cache",
+ .add = ib_cache_setup_one,
+ .remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+ return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+ ib_unregister_client(&cache_client);
+}
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 00000000000..797049626ff
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <ib_verbs.h>
+
+int ib_device_register_sysfs(struct ib_device *device);
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 00000000000..9197e92d708
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include <asm/semaphore.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+ struct list_head list;
+ struct ib_client *client;
+ void * data;
+};
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_sem protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list. In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DECLARE_MUTEX(device_sem);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(query_pkey),
+ IB_MANDATORY_FUNC(query_gid),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_ah),
+ IB_MANDATORY_FUNC(destroy_ah),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(dereg_mr)
+ };
+ int i;
+
+ for (i = 0; i < sizeof mandatory_table / sizeof mandatory_table[0]; ++i) {
+ if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+ printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+ return device;
+
+ return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+ long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = (long *) get_zeroed_page(GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (!sscanf(device->name, name, &i))
+ continue;
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free_page((unsigned long) inuse);
+ snprintf(buf, sizeof buf, name, i);
+
+ if (__ib_device_get_by_name(buf))
+ return -ENFILE;
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+ void *dev;
+
+ BUG_ON(size < sizeof (struct ib_device));
+
+ dev = kmalloc(size, GFP_KERNEL);
+ if (!dev)
+ return NULL;
+
+ memset(dev, 0, size);
+
+ return dev;
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ if (device->reg_state == IB_DEV_UNINITIALIZED) {
+ kfree(device);
+ return;
+ }
+
+ BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+ ib_device_unregister_sysfs(device);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context) {
+ printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+ device->name, client->name);
+ return -ENOMEM;
+ }
+
+ context->client = client;
+ context->data = NULL;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_add(&context->list, &device->client_data_list);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return 0;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device)
+{
+ int ret;
+
+ down(&device_sem);
+
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ INIT_LIST_HEAD(&device->client_data_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+
+ ret = ib_device_register_sysfs(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+ device->name);
+ goto out;
+ }
+
+ list_add_tail(&device->core_list, &device_list);
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ {
+ struct ib_client *client;
+
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+ }
+
+ out:
+ up(&device_sem);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+ struct ib_client *client;
+ struct ib_client_data *context, *tmp;
+ unsigned long flags;
+
+ down(&device_sem);
+
+ list_for_each_entry_reverse(client, &client_list, list)
+ if (client->remove)
+ client->remove(device);
+
+ list_del(&device->core_list);
+
+ up(&device_sem);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ kfree(context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+
+ down(&device_sem);
+
+ list_add_tail(&client->list, &client_list);
+ list_for_each_entry(device, &device_list, core_list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+
+ up(&device_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_client_data *context, *tmp;
+ struct ib_device *device;
+ unsigned long flags;
+
+ down(&device_sem);
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (client->remove)
+ client->remove(device);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ if (context->client == client) {
+ list_del(&context->list);
+ kfree(context);
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ }
+ list_del(&client->list);
+
+ up(&device_sem);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ void *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ ret = context->data;
+ break;
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Get IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ context->data = data;
+ goto out;
+ }
+
+ printk(KERN_WARNING "No client context found for %s/%s\n",
+ device->name, client->name);
+
+out:
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler (struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_add_tail(&event_handler->list,
+ &event_handler->device->event_handler_list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_del(&event_handler->list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+ unsigned long flags;
+ struct ib_event_handler *handler;
+
+ spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+ list_for_each_entry(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr)
+{
+ return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr)
+{
+ return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid)
+{
+ return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ return device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+static int __init ib_core_init(void)
+{
+ int ret;
+
+ ret = ib_sysfs_setup();
+ if (ret)
+ printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+
+ ret = ib_cache_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ ib_sysfs_cleanup();
+ }
+
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ ib_cache_cleanup();
+ ib_sysfs_cleanup();
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 00000000000..2e9469f1892
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: fmr_pool.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+enum {
+ IB_FMR_MAX_REMAPS = 32,
+
+ IB_FMR_HASH_BITS = 8,
+ IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS,
+ IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < IB_FMR_MAX_REMAPS) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped). In either of
+ * these cases it is a bug if the ref_count is not 0. In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled). This is independent of the reference
+ * count of the FMR. When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate. However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped. In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR). When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+ spinlock_t pool_lock;
+
+ int pool_size;
+ int max_pages;
+ int dirty_watermark;
+ int dirty_len;
+ struct list_head free_list;
+ struct list_head dirty_list;
+ struct hlist_head *cache_bucket;
+
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+
+ struct task_struct *thread;
+
+ atomic_t req_ser;
+ atomic_t flush_ser;
+
+ wait_queue_head_t force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+ return jhash_2words((u32) first_page,
+ (u32) (first_page >> 32),
+ 0);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+ u64 *page_list,
+ int page_list_len,
+ u64 io_virtual_address)
+{
+ struct hlist_head *bucket;
+ struct ib_pool_fmr *fmr;
+ struct hlist_node *pos;
+
+ if (!pool->cache_bucket)
+ return NULL;
+
+ bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+ hlist_for_each_entry(fmr, pos, bucket, cache_node)
+ if (io_virtual_address == fmr->io_virtual_address &&
+ page_list_len == fmr->page_list_len &&
+ !memcmp(page_list, fmr->page_list,
+ page_list_len * sizeof *page_list))
+ return fmr;
+
+ return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+ int ret;
+ struct ib_pool_fmr *fmr;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+
+ spin_lock_irq(&pool->pool_lock);
+
+ list_for_each_entry(fmr, &pool->dirty_list, list) {
+ hlist_del_init(&fmr->cache_node);
+ fmr->remap_count = 0;
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+ if (fmr->ref_count !=0) {
+ printk(KERN_WARNING "Unmapping FMR 0x%08x with ref count %d",
+ fmr, fmr->ref_count);
+ }
+#endif
+ }
+
+ list_splice(&pool->dirty_list, &unmap_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+ pool->dirty_len = 0;
+
+ spin_unlock_irq(&pool->pool_lock);
+
+ if (list_empty(&unmap_list)) {
+ return;
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING "ib_unmap_fmr returned %d", ret);
+
+ spin_lock_irq(&pool->pool_lock);
+ list_splice(&unmap_list, &pool->free_list);
+ spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+ struct ib_fmr_pool *pool = pool_ptr;
+
+ do {
+ if (pool->dirty_len >= pool->dirty_watermark ||
+ atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+ ib_fmr_batch_release(pool);
+
+ atomic_inc(&pool->flush_ser);
+ wake_up_interruptible(&pool->force_wait);
+
+ if (pool->flush_function)
+ pool->flush_function(pool, pool->flush_arg);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (pool->dirty_len < pool->dirty_watermark &&
+ atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs. Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params)
+{
+ struct ib_device *device;
+ struct ib_fmr_pool *pool;
+ int i;
+ int ret;
+
+ if (!params)
+ return ERR_PTR(-EINVAL);
+
+ device = pd->device;
+ if (!device->alloc_fmr || !device->dealloc_fmr ||
+ !device->map_phys_fmr || !device->unmap_fmr) {
+ printk(KERN_WARNING "Device %s does not support fast memory regions",
+ device->name);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ pool = kmalloc(sizeof *pool, GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING "couldn't allocate pool struct");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->cache_bucket = NULL;
+
+ pool->flush_function = params->flush_function;
+ pool->flush_arg = params->flush_arg;
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+
+ if (params->cache) {
+ pool->cache_bucket =
+ kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+ GFP_KERNEL);
+ if (!pool->cache_bucket) {
+ printk(KERN_WARNING "Failed to allocate cache in pool");
+ ret = -ENOMEM;
+ goto out_free_pool;
+ }
+
+ for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+ INIT_HLIST_HEAD(pool->cache_bucket + i);
+ }
+
+ pool->pool_size = 0;
+ pool->max_pages = params->max_pages_per_fmr;
+ pool->dirty_watermark = params->dirty_watermark;
+ pool->dirty_len = 0;
+ spin_lock_init(&pool->pool_lock);
+ atomic_set(&pool->req_ser, 0);
+ atomic_set(&pool->flush_ser, 0);
+ init_waitqueue_head(&pool->force_wait);
+
+ pool->thread = kthread_create(ib_fmr_cleanup_thread,
+ pool,
+ "ib_fmr(%s)",
+ device->name);
+ if (IS_ERR(pool->thread)) {
+ printk(KERN_WARNING "couldn't start cleanup thread");
+ ret = PTR_ERR(pool->thread);
+ goto out_free_pool;
+ }
+
+ {
+ struct ib_pool_fmr *fmr;
+ struct ib_fmr_attr attr = {
+ .max_pages = params->max_pages_per_fmr,
+ .max_maps = IB_FMR_MAX_REMAPS,
+ .page_size = PAGE_SHIFT
+ };
+
+ for (i = 0; i < params->pool_size; ++i) {
+ fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64),
+ GFP_KERNEL);
+ if (!fmr) {
+ printk(KERN_WARNING "failed to allocate fmr struct "
+ "for FMR %d", i);
+ goto out_fail;
+ }
+
+ fmr->pool = pool;
+ fmr->remap_count = 0;
+ fmr->ref_count = 0;
+ INIT_HLIST_NODE(&fmr->cache_node);
+
+ fmr->fmr = ib_alloc_fmr(pd, params->access, &attr);
+ if (IS_ERR(fmr->fmr)) {
+ printk(KERN_WARNING "fmr_create failed for FMR %d", i);
+ kfree(fmr);
+ goto out_fail;
+ }
+
+ list_add_tail(&fmr->list, &pool->free_list);
+ ++pool->pool_size;
+ }
+ }
+
+ return pool;
+
+ out_free_pool:
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return ERR_PTR(ret);
+
+ out_fail:
+ ib_destroy_fmr_pool(pool);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+int ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+ struct ib_pool_fmr *fmr;
+ struct ib_pool_fmr *tmp;
+ int i;
+
+ kthread_stop(pool->thread);
+ ib_fmr_batch_release(pool);
+
+ i = 0;
+ list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+ ib_dealloc_fmr(fmr->fmr);
+ list_del(&fmr->list);
+ kfree(fmr);
+ ++i;
+ }
+
+ if (i < pool->pool_size)
+ printk(KERN_WARNING "pool still has %d regions registered",
+ pool->pool_size - i);
+
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+ int serial;
+
+ atomic_inc(&pool->req_ser);
+ /*
+ * It's OK if someone else bumps req_ser again here -- we'll
+ * just wait a little longer.
+ */
+ serial = atomic_read(&pool->req_ser);
+
+ wake_up_process(pool->thread);
+
+ if (wait_event_interruptible(pool->force_wait,
+ atomic_read(&pool->flush_ser) -
+ atomic_read(&pool->req_ser) >= 0))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys -
+ * @pool:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 *io_virtual_address)
+{
+ struct ib_fmr_pool *pool = pool_handle;
+ struct ib_pool_fmr *fmr;
+ unsigned long flags;
+ int result;
+
+ if (list_len < 1 || list_len > pool->max_pages)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ fmr = ib_fmr_cache_lookup(pool,
+ page_list,
+ list_len,
+ *io_virtual_address);
+ if (fmr) {
+ /* found in cache */
+ ++fmr->ref_count;
+ if (fmr->ref_count == 1) {
+ list_del(&fmr->list);
+ }
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return fmr;
+ }
+
+ if (list_empty(&pool->free_list)) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+ list_del(&fmr->list);
+ hlist_del_init(&fmr->cache_node);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+ *io_virtual_address);
+
+ if (result) {
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ list_add(&fmr->list, &pool->free_list);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ printk(KERN_WARNING "fmr_map returns %d",
+ result);
+
+ return ERR_PTR(result);
+ }
+
+ ++fmr->remap_count;
+ fmr->ref_count = 1;
+
+ if (pool->cache_bucket) {
+ fmr->io_virtual_address = *io_virtual_address;
+ fmr->page_list_len = list_len;
+ memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ hlist_add_head(&fmr->cache_node,
+ pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR. The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+ struct ib_fmr_pool *pool;
+ unsigned long flags;
+
+ pool = fmr->pool;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ --fmr->ref_count;
+ if (!fmr->ref_count) {
+ if (fmr->remap_count < IB_FMR_MAX_REMAPS) {
+ list_add_tail(&fmr->list, &pool->free_list);
+ } else {
+ list_add_tail(&fmr->list, &pool->dirty_list);
+ ++pool->dirty_len;
+ wake_up_process(pool->thread);
+ }
+ }
+
+#ifdef DEBUG
+ if (fmr->ref_count < 0)
+ printk(KERN_WARNING "FMR %p has ref count %d < 0",
+ fmr, fmr->ref_count);
+#endif
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 00000000000..4ec7fff29b5
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,2714 @@
+/*
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mad.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+
+#include <ib_mad.h>
+
+#include "mad_priv.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+
+kmem_cache_t *ib_mad_cache;
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static spinlock_t ib_mad_port_list_lock;
+
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad, int solicited);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+static void timeout_sends(void *data);
+static void cancel_sends(void *data);
+static void local_completions(void *data);
+static int solicited_mad(struct ib_mad *mad);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+
+ list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+ if (entry->device == device && entry->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ entry = __ib_get_mad_port(device, port_num);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+ /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+ switch (qp_type)
+ {
+ case IB_QPT_SMI:
+ return 0;
+ case IB_QPT_GSI:
+ return 1;
+ default:
+ return -1;
+ }
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+ return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+ if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+ (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 0;
+ return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+ if (oui[0] || oui[1] || oui[2])
+ return 1;
+ return 0;
+}
+
+static int is_vendor_method_in_use(
+ struct ib_mad_mgmt_vendor_class *vendor_class,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ struct ib_mad_mgmt_method_table *method;
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+ method = vendor_class->method_table[i];
+ if (method) {
+ if (method_in_use(&method, mad_reg_req))
+ return 1;
+ else
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ unsigned long flags;
+ u8 mgmt_class, vclass;
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1)
+ goto error1;
+
+ if (rmpp_version)
+ goto error1; /* XXX: until RMPP implemented */
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+ goto error1;
+ if (!recv_handler)
+ goto error1;
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ goto error1;
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui))
+ goto error1;
+ }
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ goto error1;
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ goto error1;
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kmalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ if (mad_reg_req) {
+ reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error2;
+ }
+ /* Make a copy of the MAD registration request */
+ memcpy(reg_req, mad_reg_req, sizeof *reg_req);
+ }
+
+ /* Now, fill in the various structures */
+ memset(mad_agent_priv, 0, sizeof *mad_agent_priv);
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error3;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error3;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error3;
+ }
+ }
+
+ /* Add mad agent into port's agent list */
+ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_WORK(&mad_agent_priv->timed_work, timeout_sends, mad_agent_priv);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions,
+ mad_agent_priv);
+ INIT_LIST_HEAD(&mad_agent_priv->canceled_list);
+ INIT_WORK(&mad_agent_priv->canceled_work, cancel_sends, mad_agent_priv);
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_waitqueue_head(&mad_agent_priv->wait);
+
+ return &mad_agent_priv->agent;
+
+error3:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ kfree(reg_req);
+error2:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (/*IB_MAD_SNOOP_POSTED_SENDS |
+ IB_MAD_SNOOP_RMPP_SENDS |*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+ IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (IB_MAD_SNOOP_RECVS /*|
+ IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_snoop_private **new_snoop_table;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ /* Check for empty slot in array. */
+ for (i = 0; i < qp_info->snoop_table_size; i++)
+ if (!qp_info->snoop_table[i])
+ break;
+
+ if (i == qp_info->snoop_table_size) {
+ /* Grow table. */
+ new_snoop_table = kmalloc(sizeof mad_snoop_priv *
+ qp_info->snoop_table_size + 1,
+ GFP_ATOMIC);
+ if (!new_snoop_table) {
+ i = -ENOMEM;
+ goto out;
+ }
+ if (qp_info->snoop_table) {
+ memcpy(new_snoop_table, qp_info->snoop_table,
+ sizeof mad_snoop_priv *
+ qp_info->snoop_table_size);
+ kfree(qp_info->snoop_table);
+ }
+ qp_info->snoop_table = new_snoop_table;
+ qp_info->snoop_table_size++;
+ }
+ qp_info->snoop_table[i] = mad_snoop_priv;
+ atomic_inc(&qp_info->snoop_count);
+out:
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ int qpn;
+
+ /* Validate parameters */
+ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+ (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+ /* Allocate structures */
+ mad_snoop_priv = kmalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+ if (!mad_snoop_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ /* Now, fill in the various structures */
+ memset(mad_snoop_priv, 0, sizeof *mad_snoop_priv);
+ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_snoop_priv->agent.device = device;
+ mad_snoop_priv->agent.recv_handler = recv_handler;
+ mad_snoop_priv->agent.snoop_handler = snoop_handler;
+ mad_snoop_priv->agent.context = context;
+ mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_snoop_priv->agent.port_num = port_num;
+ mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+ init_waitqueue_head(&mad_snoop_priv->wait);
+ mad_snoop_priv->snoop_index = register_snoop_agent(
+ &port_priv->qp_info[qpn],
+ mad_snoop_priv);
+ if (mad_snoop_priv->snoop_index < 0) {
+ ret = ERR_PTR(mad_snoop_priv->snoop_index);
+ goto error2;
+ }
+
+ atomic_set(&mad_snoop_priv->refcount, 1);
+ return &mad_snoop_priv->agent;
+
+error2:
+ kfree(mad_snoop_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ /* Note that we could still be handling received MADs */
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+
+ port_priv = mad_agent_priv->qp_info->port_priv;
+
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ flush_workqueue(port_priv->wq);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ remove_mad_reg_req(mad_agent_priv);
+ list_del(&mad_agent_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ /* XXX: Cleanup pending RMPP receives for this agent */
+
+ atomic_dec(&mad_agent_priv->refcount);
+ wait_event(mad_agent_priv->wait,
+ !atomic_read(&mad_agent_priv->refcount));
+
+ if (mad_agent_priv->reg_req)
+ kfree(mad_agent_priv->reg_req);
+ kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_qp_info *qp_info;
+ unsigned long flags;
+
+ qp_info = mad_snoop_priv->qp_info;
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+ atomic_dec(&qp_info->snoop_count);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+ atomic_dec(&mad_snoop_priv->refcount);
+ wait_event(mad_snoop_priv->wait,
+ !atomic_read(&mad_snoop_priv->refcount));
+
+ kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+
+ /* If the TID is zero, the agent can only snoop. */
+ if (mad_agent->hi_tid) {
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+ unregister_mad_agent(mad_agent_priv);
+ } else {
+ mad_snoop_priv = container_of(mad_agent,
+ struct ib_mad_snoop_private,
+ agent);
+ unregister_mad_snoop(mad_snoop_priv);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+ struct ib_mad_queue *mad_queue;
+ unsigned long flags;
+
+ BUG_ON(!mad_list->mad_queue);
+ mad_queue = mad_list->mad_queue;
+ spin_lock_irqsave(&mad_queue->lock, flags);
+ list_del(&mad_list->list);
+ mad_queue->count--;
+ spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+ struct ib_send_wr *send_wr,
+ struct ib_mad_send_wc *mad_send_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+ send_wr, mad_send_wc);
+ if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+ wake_up(&mad_snoop_priv->wait);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_recv_wc);
+ if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+ wake_up(&mad_snoop_priv->wait);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void build_smp_wc(u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+ struct ib_wc *wc)
+{
+ memset(wc, 0, sizeof *wc);
+ wc->wr_id = wr_id;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = IB_WC_RECV;
+ wc->pkey_index = pkey_index;
+ wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+ wc->src_qp = IB_QP0;
+ wc->qp_num = IB_QP0;
+ wc->slid = slid;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_smp *smp,
+ struct ib_send_wr *send_wr)
+{
+ int ret, solicited;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num = mad_agent_priv->agent.port_num;
+ struct ib_wc mad_wc;
+
+ if (!smi_handle_dr_smp_send(smp, device->node_type, port_num)) {
+ ret = -EINVAL;
+ printk(KERN_ERR PFX "Invalid directed route\n");
+ goto out;
+ }
+ /* Check to post send on QP or process locally */
+ ret = smi_check_local_dr_smp(smp, device, port_num);
+ if (!ret || !device->process_mad)
+ goto out;
+
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ printk(KERN_ERR PFX "No memory for local response MAD\n");
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(send_wr->wr_id, smp->dr_slid, send_wr->wr.ud.pkey_index,
+ send_wr->wr.ud.port_num, &mad_wc);
+
+ /* No GRH for DR SMP */
+ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+ (struct ib_mad *)smp,
+ (struct ib_mad *)&mad_priv->mad);
+ switch (ret)
+ {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ /*
+ * See if response is solicited and
+ * there is a recv handler
+ */
+ if (solicited_mad(&mad_priv->mad.mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ atomic_inc(&mad_agent_priv->refcount);
+ } else
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ solicited = solicited_mad(&mad_priv->mad.mad);
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ mad_priv->mad.mad.mad_hdr.tid =
+ ((struct ib_mad *)smp)->mad_hdr.tid;
+ recv_mad_agent = find_mad_agent(port_priv,
+ &mad_priv->mad.mad,
+ solicited);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(local);
+ ret = 0;
+ goto out;
+ }
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->send_wr = *send_wr;
+ local->send_wr.sg_list = local->sg_list;
+ memcpy(local->sg_list, send_wr->sg_list,
+ sizeof *send_wr->sg_list * send_wr->num_sge);
+ local->send_wr.next = NULL;
+ local->tid = send_wr->wr.ud.mad_hdr->tid;
+ local->wr_id = send_wr->wr_id;
+ /* Reference MAD agent until send side of local completion handled */
+ atomic_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
+static int ib_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct ib_send_wr *bad_send_wr;
+ unsigned long flags;
+ int ret;
+
+ /* Replace user's WR ID with our own to find WR upon completion */
+ qp_info = mad_agent_priv->qp_info;
+ mad_send_wr->wr_id = mad_send_wr->send_wr.wr_id;
+ mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count++ < qp_info->send_queue.max_active) {
+ list_add_tail(&mad_send_wr->mad_list.list,
+ &qp_info->send_queue.list);
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ ret = ib_post_send(mad_agent_priv->agent.qp,
+ &mad_send_wr->send_wr, &bad_send_wr);
+ if (ret) {
+ printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+ dequeue_mad(&mad_send_wr->mad_list);
+ }
+ } else {
+ list_add_tail(&mad_send_wr->mad_list.list,
+ &qp_info->overflow_list);
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_agent *mad_agent,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ int ret = -EINVAL;
+ struct ib_mad_agent_private *mad_agent_priv;
+
+ /* Validate supplied parameters */
+ if (!bad_send_wr)
+ goto error1;
+
+ if (!mad_agent || !send_wr)
+ goto error2;
+
+ if (!mad_agent->send_handler)
+ goto error2;
+
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+
+ /* Walk list of send WRs and post each on send list */
+ while (send_wr) {
+ unsigned long flags;
+ struct ib_send_wr *next_send_wr;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_smp *smp;
+
+ /* Validate more parameters */
+ if (send_wr->num_sge > IB_MAD_SEND_REQ_MAX_SG)
+ goto error2;
+
+ if (send_wr->wr.ud.timeout_ms && !mad_agent->recv_handler)
+ goto error2;
+
+ if (!send_wr->wr.ud.mad_hdr) {
+ printk(KERN_ERR PFX "MAD header must be supplied "
+ "in WR %p\n", send_wr);
+ goto error2;
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_wr = (struct ib_send_wr *)send_wr->next;
+
+ smp = (struct ib_smp *)send_wr->wr.ud.mad_hdr;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv, smp,
+ send_wr);
+ if (ret < 0) /* error */
+ goto error2;
+ else if (ret == 1) /* locally consumed */
+ goto next;
+ }
+
+ /* Allocate MAD send WR tracking structure */
+ mad_send_wr = kmalloc(sizeof *mad_send_wr, GFP_ATOMIC);
+ if (!mad_send_wr) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_send_wr_private\n");
+ ret = -ENOMEM;
+ goto error2;
+ }
+
+ mad_send_wr->send_wr = *send_wr;
+ mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+ memcpy(mad_send_wr->sg_list, send_wr->sg_list,
+ sizeof *send_wr->sg_list * send_wr->num_sge);
+ mad_send_wr->send_wr.next = NULL;
+ mad_send_wr->tid = send_wr->wr.ud.mad_hdr->tid;
+ mad_send_wr->agent = mad_agent;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_wr->wr.
+ ud.timeout_ms);
+ mad_send_wr->retry = 0;
+ /* One reference for each work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ ret = ib_send_mad(mad_agent_priv, mad_send_wr);
+ if (ret) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error2;
+ }
+next:
+ send_wr = next_send_wr;
+ }
+ return 0;
+
+error2:
+ *bad_send_wr = send_wr;
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *entry;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+
+ /*
+ * Walk receive buffer list associated with this WC
+ * No need to remove them from list of receive buffers
+ */
+ list_for_each_entry(entry, &mad_recv_wc->recv_buf.list, list) {
+ /* Free previous receive buffer */
+ kmem_cache_free(ib_mad_cache, priv);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ }
+
+ /* Free last buffer */
+ kmem_cache_free(ib_mad_cache, priv);
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+ void *buf)
+{
+ printk(KERN_ERR PFX "ib_coalesce_recv_mad() not implemented yet\n");
+}
+EXPORT_SYMBOL(ib_coalesce_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ return ERR_PTR(-EINVAL); /* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc)
+{
+ printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+ return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ int i;
+
+ for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ if ((*method)->agent[i]) {
+ printk(KERN_ERR PFX "Method %d already in use\n", i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+ /* Allocate management method table */
+ *method = kmalloc(sizeof **method, GFP_ATOMIC);
+ if (!*method) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_method_table\n");
+ return -ENOMEM;
+ }
+ /* Clear management method table */
+ memset(*method, 0, sizeof **method);
+
+ return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ if (vendor_class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ char *oui)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp(vendor_class->oui[i], oui, 3))
+ return i;
+
+ return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+ if (vendor->vendor_class[i])
+ return 1;
+
+ return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ struct ib_mad_agent_private *agent)
+{
+ int i;
+
+ /* Remove any methods for this mad agent */
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+ if (method->agent[i] == agent) {
+ method->agent[i] = NULL;
+ }
+ }
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kmalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_class_table\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+ /* Clear management class table */
+ memset(*class, 0, sizeof(**class));
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for (i = find_first_bit(mad_reg_req->method_mask,
+ IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ (*method)->agent[i] = agent_priv;
+ }
+ return 0;
+
+error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+error2:
+ kfree(*class);
+ *class = NULL;
+error1:
+ return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kmalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_vendor_class_table\n");
+ goto error1;
+ }
+ /* Clear management vendor class table */
+ memset(vendor, 0, sizeof(*vendor));
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kmalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_vendor_class\n");
+ goto error2;
+ }
+ memset(vendor_class, 0, sizeof(*vendor_class));
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(!*method);
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* OUI slot available ? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(*method);
+ /* Allocate method table for this OUI */
+ if ((ret = allocate_method_table(method)))
+ goto error3;
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ printk(KERN_ERR PFX "All OUI slots in use\n");
+ goto error3;
+
+check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for (i = find_first_bit(mad_reg_req->method_mask,
+ IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ (*method)->agent[i] = agent_priv;
+ }
+ return 0;
+
+error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+error3:
+ if (vendor_class) {
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+error2:
+ if (vendor) {
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+error1:
+ return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was MAD registration request supplied
+ * with original registration ?
+ */
+ if (!agent_priv->reg_req) {
+ goto out;
+ }
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3);
+ /* Any OUIs left ? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left ? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+static int response_mad(struct ib_mad *mad)
+{
+ /* Trap represses are responses although response bit is reset */
+ return ((mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ (mad->mad_hdr.method & IB_MGMT_METHOD_RESP));
+}
+
+static int solicited_mad(struct ib_mad *mad)
+{
+ /* CM MADs are never solicited */
+ if (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CM) {
+ return 0;
+ }
+
+ /* XXX: Determine whether MAD is using RMPP */
+
+ /* Not using RMPP */
+ /* Is this MAD a response to a previous MAD ? */
+ return response_mad(mad);
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad,
+ int solicited)
+{
+ struct ib_mad_agent_private *mad_agent = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+
+ /*
+ * Whether MAD was solicited determines type of routing to
+ * MAD client.
+ */
+ if (solicited) {
+ u32 hi_tid;
+ struct ib_mad_agent_private *entry;
+
+ /*
+ * Routing is based on high 32 bits of transaction ID
+ * of MAD.
+ */
+ hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ list_for_each_entry(entry, &port_priv->agent_list,
+ agent_list) {
+ if (entry->agent.hi_tid == hi_tid) {
+ mad_agent = entry;
+ break;
+ }
+ }
+ } else {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_vendor_mad *vendor_mad;
+ int index;
+
+ /*
+ * Routing is based on version, class, and method
+ * For "newer" vendor MADs, also based on OUI
+ */
+ if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ goto out;
+ if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ class = port_priv->version[
+ mad->mad_hdr.class_version].class;
+ if (!class)
+ goto out;
+ method = class->method_table[convert_mgmt_class(
+ mad->mad_hdr.mgmt_class)];
+ if (method)
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ } else {
+ vendor = port_priv->version[
+ mad->mad_hdr.class_version].vendor;
+ if (!vendor)
+ goto out;
+ vendor_class = vendor->vendor_class[vendor_class_index(
+ mad->mad_hdr.mgmt_class)];
+ if (!vendor_class)
+ goto out;
+ /* Find matching OUI */
+ vendor_mad = (struct ib_vendor_mad *)mad;
+ index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ if (index == -1)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ }
+ }
+ }
+
+ if (mad_agent) {
+ if (mad_agent->agent.recv_handler)
+ atomic_inc(&mad_agent->refcount);
+ else {
+ printk(KERN_NOTICE PFX "No receive handler for client "
+ "%p on port %d\n",
+ &mad_agent->agent, port_priv->port_num);
+ mad_agent = NULL;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return mad_agent;
+}
+
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+ int valid = 0;
+
+ /* Make sure MAD base version is understood */
+ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+ printk(KERN_ERR PFX "MAD received with unsupported base "
+ "version %d\n", mad->mad_hdr.base_version);
+ goto out;
+ }
+
+ /* Filter SMI packets sent to other than QP0 */
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if (qp_num == 0)
+ valid = 1;
+ } else {
+ /* Filter GSI packets sent to QP0 */
+ if (qp_num != 0)
+ valid = 1;
+ }
+
+out:
+ return valid;
+}
+
+/*
+ * Return start of fully reassembled MAD, or NULL, if MAD isn't assembled yet
+ */
+static struct ib_mad_private *
+reassemble_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_private *recv)
+{
+ /* Until we have RMPP, all receives are reassembled!... */
+ INIT_LIST_HEAD(&recv->header.recv_wc.recv_buf.list);
+ return recv;
+}
+
+static struct ib_mad_send_wr_private*
+find_send_req(struct ib_mad_agent_private *mad_agent_priv,
+ u64 tid)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (mad_send_wr->tid == tid)
+ return mad_send_wr;
+ }
+
+ /*
+ * It's possible to receive the response before we've
+ * been notified that the send has completed
+ */
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (mad_send_wr->tid == tid && mad_send_wr->timeout) {
+ /* Verify request has not been canceled */
+ return (mad_send_wr->status == IB_WC_SUCCESS) ?
+ mad_send_wr : NULL;
+ }
+ }
+ return NULL;
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_private *recv,
+ int solicited)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ /* Fully reassemble receive before processing */
+ recv = reassemble_recv(mad_agent_priv, recv);
+ if (!recv) {
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+ return;
+ }
+
+ /* Complete corresponding request */
+ if (solicited) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_req(mad_agent_priv,
+ recv->mad.mad.mad_hdr.tid);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ib_free_recv_mad(&recv->header.recv_wc);
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+ return;
+ }
+ /* Timeout = 0 means that we won't wait for a response */
+ mad_send_wr->timeout = 0;
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ recv->header.recv_wc.wc->wr_id = mad_send_wr->wr_id;
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &recv->header.recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.wr_id = mad_send_wr->wr_id;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ } else {
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &recv->header.recv_wc);
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+ }
+}
+
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_agent_private *mad_agent;
+ int solicited;
+
+ response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!response)
+ printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
+ "for response buffer\n");
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ dma_unmap_single(port_priv->device->dma_device,
+ pci_unmap_addr(&recv->header, mapping),
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+
+ /* Setup MAD receive work completion from "normal" work completion */
+ recv->header.recv_wc.wc = wc;
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+ /* Validate MAD */
+ if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ goto out;
+
+ if (recv->mad.mad.mad_hdr.mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (!smi_handle_dr_smp_recv(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_priv->port_num,
+ port_priv->device->phys_port_cnt))
+ goto out;
+ if (!smi_check_forward_dr_smp(&recv->mad.smp))
+ goto local;
+ if (!smi_handle_dr_smp_send(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_priv->port_num))
+ goto out;
+ if (!smi_check_local_dr_smp(&recv->mad.smp,
+ port_priv->device,
+ port_priv->port_num))
+ goto out;
+ }
+
+local:
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->process_mad) {
+ int ret;
+
+ if (!response) {
+ printk(KERN_ERR PFX "No memory for response MAD\n");
+ /*
+ * Is it better to assume that
+ * it wouldn't be processed ?
+ */
+ goto out;
+ }
+
+ ret = port_priv->device->process_mad(port_priv->device, 0,
+ port_priv->port_num,
+ wc, &recv->grh,
+ &recv->mad.mad,
+ &response->mad.mad);
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ /* Send response */
+ if (!agent_send(response, &recv->grh, wc,
+ port_priv->device,
+ port_priv->port_num))
+ response = NULL;
+ goto out;
+ }
+ }
+ }
+
+ /* Determine corresponding MAD agent for incoming receive MAD */
+ solicited = solicited_mad(&recv->mad.mad);
+ mad_agent = find_mad_agent(port_priv, &recv->mad.mad, solicited);
+ if (mad_agent) {
+ ib_mad_complete_recv(mad_agent, recv, solicited);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv
+ * or via recv_handler in ib_mad_complete_recv()
+ */
+ recv = NULL;
+ }
+
+out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ if (recv)
+ kmem_cache_free(ib_mad_cache, recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long delay;
+
+ if (list_empty(&mad_agent_priv->wait_list)) {
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ } else {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_agent_priv->timeout,
+ mad_send_wr->timeout)) {
+ mad_agent_priv->timeout = mad_send_wr->timeout;
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+ }
+}
+
+static void wait_for_response(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr )
+{
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ list_add(&mad_send_wr->agent_list, list_item);
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) {
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ queue_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+}
+
+/*
+ * Process a send work completion
+ */
+static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+
+ mad_agent_priv = container_of(mad_send_wr->agent,
+ struct ib_mad_agent_private, agent);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_agent_priv, mad_send_wr);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS )
+ mad_send_wc->status = mad_send_wr->status;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+
+ kfree(mad_send_wr);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_send_wr *bad_send_wr;
+ unsigned long flags;
+ int ret;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+retry:
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) {
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_del(&mad_list->list);
+ list_add_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ /* Restore client wr_id in WC and complete send */
+ wc->wr_id = mad_send_wr->wr_id;
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_send(qp_info, &mad_send_wr->send_wr,
+ (struct ib_mad_send_wc *)wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ ib_mad_complete_send_wr(mad_send_wr, (struct ib_mad_send_wc *)wc);
+
+ if (queued_send_wr) {
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret) {
+ printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr;
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_list_head *mad_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+ mad_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ mad_send_wr->retry = 1;
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int ret;
+
+ /* Determine if failure was a send or receive */
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ if (mad_list->mad_queue == &qp_info->recv_queue)
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+
+ /*
+ * Send errors will transition the QP to SQE - move
+ * QP to RTS and repost flushed work requests
+ */
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ if (wc->status == IB_WC_WR_FLUSH_ERR) {
+ if (mad_send_wr->retry) {
+ /* Repost send */
+ struct ib_send_wr *bad_send_wr;
+
+ mad_send_wr->retry = 0;
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret)
+ ib_mad_send_done_handler(port_priv, wc);
+ } else
+ ib_mad_send_done_handler(port_priv, wc);
+ } else {
+ struct ib_qp_attr *attr;
+
+ /* Transition QP to RTS and fail offending send */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (attr) {
+ attr->qp_state = IB_QPS_RTS;
+ attr->cur_qp_state = IB_QPS_SQE;
+ ret = ib_modify_qp(qp_info->qp, attr,
+ IB_QP_STATE | IB_QP_CUR_STATE);
+ kfree(attr);
+ if (ret)
+ printk(KERN_ERR PFX "mad_error_handler - "
+ "ib_modify_qp to RTS : %d\n", ret);
+ else
+ mark_sends_for_retry(qp_info);
+ }
+ ib_mad_send_done_handler(port_priv, wc);
+ }
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(void *data)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_wc wc;
+
+ port_priv = (struct ib_mad_port_private *)data;
+ ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ ib_mad_send_done_handler(port_priv, &wc);
+ break;
+ case IB_WC_RECV:
+ ib_mad_recv_done_handler(port_priv, &wc);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ } else
+ mad_error_handler(port_priv, &wc);
+ }
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ struct list_head cancel_list;
+
+ INIT_LIST_HEAD(&cancel_list);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->send_list, agent_list) {
+ if (mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+ }
+
+ /* Empty wait list to prevent receives from finding a request */
+ list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Report all cancelled requests */
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &cancel_list, agent_list) {
+ mad_send_wc.wr_id = mad_send_wr->wr_id;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ list_del(&mad_send_wr->agent_list);
+ kfree(mad_send_wr);
+ atomic_dec(&mad_agent_priv->refcount);
+ }
+}
+
+static struct ib_mad_send_wr_private*
+find_send_by_wr_id(struct ib_mad_agent_private *mad_agent_priv,
+ u64 wr_id)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (mad_send_wr->wr_id == wr_id)
+ return mad_send_wr;
+ }
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (mad_send_wr->wr_id == wr_id)
+ return mad_send_wr;
+ }
+ return NULL;
+}
+
+void cancel_sends(void *data)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ mad_agent_priv = data;
+
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->canceled_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->canceled_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ mad_send_wc.wr_id = mad_send_wr->wr_id;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ kfree(mad_send_wr);
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ u64 wr_id)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_by_wr_id(mad_agent_priv, wr_id);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ goto out;
+ }
+
+ if (mad_send_wr->status == IB_WC_SUCCESS)
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+
+ if (mad_send_wr->refcount != 0) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ goto out;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->canceled_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->canceled_work);
+out:
+ return;
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(void *data)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_local_private *local;
+ struct ib_mad_agent_private *recv_mad_agent;
+ unsigned long flags;
+ struct ib_wc wc;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_agent_priv = (struct ib_mad_agent_private *)data;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->local_list)) {
+ local = list_entry(mad_agent_priv->local_list.next,
+ struct ib_mad_local_private,
+ completion_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (local->mad_priv) {
+ recv_mad_agent = local->recv_mad_agent;
+ if (!recv_mad_agent) {
+ printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+ kmem_cache_free(ib_mad_cache, local->mad_priv);
+ goto local_send_completion;
+ }
+
+ /*
+ * Defined behavior is to complete response
+ * before request
+ */
+ build_smp_wc(local->wr_id, IB_LID_PERMISSIVE,
+ 0 /* pkey index */,
+ recv_mad_agent->agent.port_num, &wc);
+
+ local->mad_priv->header.recv_wc.wc = &wc;
+ local->mad_priv->header.recv_wc.mad_len =
+ sizeof(struct ib_mad);
+ INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.recv_buf.list);
+ local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+ local->mad_priv->header.recv_wc.recv_buf.mad =
+ &local->mad_priv->mad.mad;
+ if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+ snoop_recv(recv_mad_agent->qp_info,
+ &local->mad_priv->header.recv_wc,
+ IB_MAD_SNOOP_RECVS);
+ recv_mad_agent->agent.recv_handler(
+ &recv_mad_agent->agent,
+ &local->mad_priv->header.recv_wc);
+ spin_lock_irqsave(&recv_mad_agent->lock, flags);
+ atomic_dec(&recv_mad_agent->refcount);
+ spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+ }
+
+local_send_completion:
+ /* Complete send */
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.wr_id = local->wr_id;
+ if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+ snoop_send(mad_agent_priv->qp_info, &local->send_wr,
+ &mad_send_wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&local->completion_list);
+ atomic_dec(&mad_agent_priv->refcount);
+ kfree(local);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void timeout_sends(void *data)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags, delay;
+
+ mad_agent_priv = (struct ib_mad_agent_private *)data;
+
+ mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->wait_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_send_wr->timeout, jiffies)) {
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ break;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ mad_send_wc.wr_id = mad_send_wr->wr_id;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ kfree(mad_send_wr);
+ atomic_dec(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+
+ queue_work(port_priv->wq, &port_priv->work);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad)
+{
+ unsigned long flags;
+ int post, ret;
+ struct ib_mad_private *mad_priv;
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+ /* Initialize common scatter list fields */
+ sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+ sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+ /* Initialize common receive WR fields */
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+
+ do {
+ /* Allocate and map receive buffer */
+ if (mad) {
+ mad_priv = mad;
+ mad = NULL;
+ } else {
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!mad_priv) {
+ printk(KERN_ERR PFX "No memory for receive buffer\n");
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ sg_list.addr = dma_map_single(qp_info->port_priv->
+ device->dma_device,
+ &mad_priv->grh,
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ pci_unmap_addr_set(&mad_priv->header, mapping, sg_list.addr);
+ recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+ mad_priv->header.mad_list.mad_queue = recv_queue;
+
+ /* Post receive WR */
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ post = (++recv_queue->count < recv_queue->max_active);
+ list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+ if (ret) {
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ list_del(&mad_priv->header.mad_list.list);
+ recv_queue->count--;
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ dma_unmap_single(qp_info->port_priv->device->dma_device,
+ pci_unmap_addr(&mad_priv->header,
+ mapping),
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+ break;
+ }
+ } while (post);
+
+ return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv;
+ struct ib_mad_list_head *mad_list;
+
+ while (!list_empty(&qp_info->recv_queue.list)) {
+
+ mad_list = list_entry(qp_info->recv_queue.list.next,
+ struct ib_mad_list_head, list);
+ mad_priv_hdr = container_of(mad_list,
+ struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+
+ /* Remove from posted receive MAD list */
+ list_del(&mad_list->list);
+
+ /* Undo PCI mapping */
+ dma_unmap_single(qp_info->port_priv->device->dma_device,
+ pci_unmap_addr(&recv->header, mapping),
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, recv);
+ }
+
+ qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+ int ret, i;
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ qp = port_priv->qp_info[i].qp;
+ /*
+ * PKey index for QP1 is irrelevant but
+ * one is needed for the Reset to Init transition
+ */
+ attr->qp_state = IB_QPS_INIT;
+ attr->pkey_index = 0;
+ attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "INIT: %d\n", i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "RTR: %d\n", i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTS;
+ attr->sq_psn = IB_MAD_SEND_Q_PSN;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "RTS: %d\n", i, ret);
+ goto out;
+ }
+ }
+
+ ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ printk(KERN_ERR PFX "Failed to request completion "
+ "notification: %d\n", ret);
+ goto out;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+ goto out;
+ }
+ }
+out:
+ kfree(attr);
+ return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct ib_mad_qp_info *qp_info = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+ event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_queue *mad_queue)
+{
+ mad_queue->qp_info = qp_info;
+ mad_queue->count = 0;
+ spin_lock_init(&mad_queue->lock);
+ INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info)
+{
+ qp_info->port_priv = port_priv;
+ init_mad_queue(qp_info, &qp_info->send_queue);
+ init_mad_queue(qp_info, &qp_info->recv_queue);
+ INIT_LIST_HEAD(&qp_info->overflow_list);
+ spin_lock_init(&qp_info->snoop_lock);
+ qp_info->snoop_table = NULL;
+ qp_info->snoop_table_size = 0;
+ atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+ enum ib_qp_type qp_type)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.send_cq = qp_info->port_priv->cq;
+ qp_init_attr.recv_cq = qp_info->port_priv->cq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = IB_MAD_QP_SEND_SIZE;
+ qp_init_attr.cap.max_recv_wr = IB_MAD_QP_RECV_SIZE;
+ qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+ qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+ qp_init_attr.qp_type = qp_type;
+ qp_init_attr.port_num = qp_info->port_priv->port_num;
+ qp_init_attr.qp_context = qp_info;
+ qp_init_attr.event_handler = qp_event_handler;
+ qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+ if (IS_ERR(qp_info->qp)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
+ ret = PTR_ERR(qp_info->qp);
+ goto error;
+ }
+ /* Use minimum queue sizes unless the CQ is resized */
+ qp_info->send_queue.max_active = IB_MAD_QP_SEND_SIZE;
+ qp_info->recv_queue.max_active = IB_MAD_QP_RECV_SIZE;
+ return 0;
+
+error:
+ return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+ ib_destroy_qp(qp_info->qp);
+ if (qp_info->snoop_table)
+ kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+ int port_num)
+{
+ int ret, cq_size;
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+ char name[sizeof "ib_mad123"];
+
+ /* First, check if port already open at MAD layer */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (port_priv) {
+ printk(KERN_DEBUG PFX "%s port %d already open\n",
+ device->name, port_num);
+ return 0;
+ }
+
+ /* Create new device info */
+ port_priv = kmalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+ return -ENOMEM;
+ }
+ memset(port_priv, 0, sizeof *port_priv);
+ port_priv->device = device;
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->reg_lock);
+ INIT_LIST_HEAD(&port_priv->agent_list);
+ init_mad_qp(port_priv, &port_priv->qp_info[0]);
+ init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+ cq_size = (IB_MAD_QP_SEND_SIZE + IB_MAD_QP_RECV_SIZE) * 2;
+ port_priv->cq = ib_create_cq(port_priv->device,
+ (ib_comp_handler)
+ ib_mad_thread_completion_handler,
+ NULL, port_priv, cq_size);
+ if (IS_ERR(port_priv->cq)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+ ret = PTR_ERR(port_priv->cq);
+ goto error3;
+ }
+
+ port_priv->pd = ib_alloc_pd(device);
+ if (IS_ERR(port_priv->pd)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error4;
+ }
+
+ port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(port_priv->mr)) {
+ printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+ ret = PTR_ERR(port_priv->mr);
+ goto error5;
+ }
+
+ ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+ if (ret)
+ goto error6;
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+
+ snprintf(name, sizeof name, "ib_mad%d", port_num);
+ port_priv->wq = create_singlethread_workqueue(name);
+ if (!port_priv->wq) {
+ ret = -ENOMEM;
+ goto error8;
+ }
+ INIT_WORK(&port_priv->work, ib_mad_completion_handler, port_priv);
+
+ ret = ib_mad_port_start(port_priv);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't start port\n");
+ goto error9;
+ }
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ return 0;
+
+error9:
+ destroy_workqueue(port_priv->wq);
+error8:
+ destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+ destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+ ib_dereg_mr(port_priv->mr);
+error5:
+ ib_dealloc_pd(port_priv->pd);
+error4:
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+ kfree(port_priv);
+
+ return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ port_priv = __ib_get_mad_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ printk(KERN_ERR PFX "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ /* Stop processing completions. */
+ flush_workqueue(port_priv->wq);
+ destroy_workqueue(port_priv->wq);
+ destroy_mad_qp(&port_priv->qp_info[1]);
+ destroy_mad_qp(&port_priv->qp_info[0]);
+ ib_dereg_mr(port_priv->mr);
+ ib_dealloc_pd(port_priv->pd);
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+ /* XXX: Handle deallocation of MAD registration tables */
+
+ kfree(port_priv);
+
+ return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+ int ret, num_ports, cur_port, i, ret2;
+
+ if (device->node_type == IB_NODE_SWITCH) {
+ num_ports = 1;
+ cur_port = 0;
+ } else {
+ num_ports = device->phys_port_cnt;
+ cur_port = 1;
+ }
+ for (i = 0; i < num_ports; i++, cur_port++) {
+ ret = ib_mad_port_open(device, cur_port);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't open %s port %d\n",
+ device->name, cur_port);
+ goto error_device_open;
+ }
+ ret = ib_agent_port_open(device, cur_port);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't open %s port %d "
+ "for agents\n",
+ device->name, cur_port);
+ goto error_device_open;
+ }
+ }
+
+ goto error_device_query;
+
+error_device_open:
+ while (i > 0) {
+ cur_port--;
+ ret2 = ib_agent_port_close(device, cur_port);
+ if (ret2) {
+ printk(KERN_ERR PFX "Couldn't close %s port %d "
+ "for agents\n",
+ device->name, cur_port);
+ }
+ ret2 = ib_mad_port_close(device, cur_port);
+ if (ret2) {
+ printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+ device->name, cur_port);
+ }
+ i--;
+ }
+
+error_device_query:
+ return;
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+ int ret = 0, i, num_ports, cur_port, ret2;
+
+ if (device->node_type == IB_NODE_SWITCH) {
+ num_ports = 1;
+ cur_port = 0;
+ } else {
+ num_ports = device->phys_port_cnt;
+ cur_port = 1;
+ }
+ for (i = 0; i < num_ports; i++, cur_port++) {
+ ret2 = ib_agent_port_close(device, cur_port);
+ if (ret2) {
+ printk(KERN_ERR PFX "Couldn't close %s port %d "
+ "for agents\n",
+ device->name, cur_port);
+ if (!ret)
+ ret = ret2;
+ }
+ ret2 = ib_mad_port_close(device, cur_port);
+ if (ret2) {
+ printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+ device->name, cur_port);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+}
+
+static struct ib_client mad_client = {
+ .name = "mad",
+ .add = ib_mad_init_device,
+ .remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+ int ret;
+
+ spin_lock_init(&ib_mad_port_list_lock);
+ spin_lock_init(&ib_agent_port_list_lock);
+
+ ib_mad_cache = kmem_cache_create("ib_mad",
+ sizeof(struct ib_mad_private),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL,
+ NULL);
+ if (!ib_mad_cache) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ INIT_LIST_HEAD(&ib_mad_port_list);
+
+ if (ib_register_client(&mad_client)) {
+ printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+ ret = -EINVAL;
+ goto error2;
+ }
+
+ return 0;
+
+error2:
+ kmem_cache_destroy(ib_mad_cache);
+error1:
+ return ret;
+}
+
+static void __exit ib_mad_cleanup_module(void)
+{
+ ib_unregister_client(&mad_client);
+
+ if (kmem_cache_destroy(ib_mad_cache)) {
+ printk(KERN_DEBUG PFX "Failed to destroy ib_mad cache\n");
+ }
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 00000000000..4ba9f726bf1
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mad_priv.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/pci.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <ib_mad.h>
+#include <ib_smi.h>
+
+
+#define PFX "ib_mad: "
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+ struct list_head list;
+ struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+ struct ib_mad_list_head mad_list;
+ struct ib_mad_recv_wc recv_wc;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+ struct ib_mad_private_header header;
+ struct ib_grh grh;
+ union {
+ struct ib_mad mad;
+ struct ib_rmpp_mad rmpp_mad;
+ struct ib_smp smp;
+ } mad;
+} __attribute__ ((packed));
+
+struct ib_mad_agent_private {
+ struct list_head agent_list;
+ struct ib_mad_agent agent;
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info;
+
+ spinlock_t lock;
+ struct list_head send_list;
+ struct list_head wait_list;
+ struct work_struct timed_work;
+ unsigned long timeout;
+ struct list_head local_list;
+ struct work_struct local_work;
+ struct list_head canceled_list;
+ struct work_struct canceled_work;
+
+ atomic_t refcount;
+ wait_queue_head_t wait;
+ u8 rmpp_version;
+};
+
+struct ib_mad_snoop_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index;
+ int mad_snoop_flags;
+ atomic_t refcount;
+ wait_queue_head_t wait;
+};
+
+struct ib_mad_send_wr_private {
+ struct ib_mad_list_head mad_list;
+ struct list_head agent_list;
+ struct ib_mad_agent *agent;
+ struct ib_send_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ u64 wr_id; /* client WR ID */
+ u64 tid;
+ unsigned long timeout;
+ int retry;
+ int refcount;
+ enum ib_wc_status status;
+};
+
+struct ib_mad_local_private {
+ struct list_head completion_list;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_agent_private *recv_mad_agent;
+ struct ib_send_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ u64 wr_id; /* client WR ID */
+ u64 tid;
+};
+
+struct ib_mad_mgmt_method_table {
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+ u8 oui[MAX_MGMT_OUI][3];
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int max_active;
+ struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+ struct ib_mad_port_private *port_priv;
+ struct ib_qp *qp;
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list;
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table;
+ int snoop_table_size;
+ atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+ struct list_head port_list;
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct list_head agent_list;
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+extern kmem_cache_t *ib_mad_cache;
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 00000000000..5f15feffeae
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+ switch (size) {
+ case 1: return *(u8 *) (structure + offset);
+ case 2: return be16_to_cpup((__be16 *) (structure + offset));
+ case 4: return be32_to_cpup((__be32 *) (structure + offset));
+ case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ return 0;
+ }
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+ switch (size * 8) {
+ case 8: *( u8 *) (structure + offset) = val; break;
+ case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ }
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (!desc[i].struct_size_bytes)
+ continue;
+
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ u32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be32 *) buf + desc[i].offset_words;
+ val = (be32_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ u64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be64 *) buf + desc[i].offset_words;
+ val = (be64_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ memcpy(structure + desc[i].struct_offset_bytes,
+ buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 00000000000..d4233ee61c3
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: sa_query.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * These two structures must be packed because they have 64-bit fields
+ * that are only 32-bit aligned. 64-bit architectures will lay them
+ * out wrong otherwise. (And unfortunately they are sent on the wire
+ * so we can't change the layout)
+ */
+struct ib_sa_hdr {
+ u64 sm_key;
+ u16 attr_offset;
+ u16 reserved;
+ ib_sa_comp_mask comp_mask;
+} __attribute__ ((packed));
+
+struct ib_sa_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ struct ib_sa_hdr sa_hdr;
+ u8 data[200];
+} __attribute__ ((packed));
+
+struct ib_sa_sm_ah {
+ struct ib_ah *ah;
+ struct kref ref;
+};
+
+struct ib_sa_port {
+ struct ib_mad_agent *agent;
+ struct ib_mr *mr;
+ struct ib_sa_sm_ah *sm_ah;
+ struct work_struct update_task;
+ spinlock_t ah_lock;
+ u8 port_num;
+};
+
+struct ib_sa_device {
+ int start_port, end_port;
+ struct ib_event_handler event_handler;
+ struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+ void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_port *port;
+ struct ib_sa_mad *mad;
+ struct ib_sa_sm_ah *sm_ah;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+ int id;
+};
+
+struct ib_sa_path_query {
+ void (*callback)(int, struct ib_sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+static spinlock_t tid_lock;
+static u32 tid;
+
+enum {
+ IB_SA_ATTR_CLASS_PORTINFO = 0x01,
+ IB_SA_ATTR_NOTICE = 0x02,
+ IB_SA_ATTR_INFORM_INFO = 0x03,
+ IB_SA_ATTR_NODE_REC = 0x11,
+ IB_SA_ATTR_PORT_INFO_REC = 0x12,
+ IB_SA_ATTR_SL2VL_REC = 0x13,
+ IB_SA_ATTR_SWITCH_REC = 0x14,
+ IB_SA_ATTR_LINEAR_FDB_REC = 0x15,
+ IB_SA_ATTR_RANDOM_FDB_REC = 0x16,
+ IB_SA_ATTR_MCAST_FDB_REC = 0x17,
+ IB_SA_ATTR_SM_INFO_REC = 0x18,
+ IB_SA_ATTR_LINK_REC = 0x20,
+ IB_SA_ATTR_GUID_INFO_REC = 0x30,
+ IB_SA_ATTR_SERVICE_REC = 0x31,
+ IB_SA_ATTR_PARTITION_REC = 0x33,
+ IB_SA_ATTR_RANGE_REC = 0x34,
+ IB_SA_ATTR_PATH_REC = 0x35,
+ IB_SA_ATTR_VL_ARB_REC = 0x36,
+ IB_SA_ATTR_MC_GROUP_REC = 0x37,
+ IB_SA_ATTR_MC_MEMBER_REC = 0x38,
+ IB_SA_ATTR_TRACE_REC = 0x39,
+ IB_SA_ATTR_MULTI_PATH_REC = 0x3a,
+ IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b
+};
+
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \
+ .field_name = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ ib_destroy_ah(sm_ah->ah);
+ kfree(sm_ah);
+}
+
+static void update_sm_ah(void *port_ptr)
+{
+ struct ib_sa_port *port = port_ptr;
+ struct ib_sa_sm_ah *new_ah, *old_ah;
+ struct ib_port_attr port_attr;
+ struct ib_ah_attr ah_attr;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ printk(KERN_WARNING "Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+ if (!new_ah) {
+ printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+ return;
+ }
+
+ kref_init(&new_ah->ref);
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port_attr.sm_lid;
+ ah_attr.sl = port_attr.sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(new_ah->ah)) {
+ printk(KERN_WARNING "Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ spin_lock_irq(&port->ah_lock);
+ old_ah = port->sm_ah;
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+
+ if (old_ah)
+ kref_put(&old_ah->ref, free_sm_ah);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE) {
+ struct ib_sa_device *sa_dev =
+ ib_get_client_data(event->device, &sa_client);
+
+ schedule_work(&sa_dev->port[event->element.port_num -
+ sa_dev->start_port].update_task);
+ }
+}
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_agent *agent;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ if (idr_find(&query_idr, id) != query) {
+ spin_unlock_irqrestore(&idr_lock, flags);
+ return;
+ }
+ agent = query->port->agent;
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ ib_cancel_mad(agent, id);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms)
+{
+ struct ib_sa_port *port = query->port;
+ unsigned long flags;
+ int ret;
+ struct ib_sge gather_list;
+ struct ib_send_wr *bad_wr, wr = {
+ .opcode = IB_WR_SEND,
+ .sg_list = &gather_list,
+ .num_sge = 1,
+ .send_flags = IB_SEND_SIGNALED,
+ .wr = {
+ .ud = {
+ .mad_hdr = &query->mad->mad_hdr,
+ .remote_qpn = 1,
+ .remote_qkey = IB_QP1_QKEY,
+ .timeout_ms = timeout_ms
+ }
+ }
+ };
+
+retry:
+ if (!idr_pre_get(&query_idr, GFP_ATOMIC))
+ return -ENOMEM;
+ spin_lock_irqsave(&idr_lock, flags);
+ ret = idr_get_new(&query_idr, query, &query->id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ if (ret == -EAGAIN)
+ goto retry;
+ if (ret)
+ return ret;
+
+ wr.wr_id = query->id;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ kref_get(&port->sm_ah->ref);
+ query->sm_ah = port->sm_ah;
+ wr.wr.ud.ah = port->sm_ah->ah;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ gather_list.addr = dma_map_single(port->agent->device->dma_device,
+ query->mad,
+ sizeof (struct ib_sa_mad),
+ DMA_TO_DEVICE);
+ gather_list.length = sizeof (struct ib_sa_mad);
+ gather_list.lkey = port->mr->lkey;
+ pci_unmap_addr_set(query, mapping, gather_list.addr);
+
+ ret = ib_post_send_mad(port->agent, &wr, &bad_wr);
+ if (ret) {
+ dma_unmap_single(port->agent->device->dma_device,
+ pci_unmap_addr(query, mapping),
+ sizeof (struct ib_sa_mad),
+ DMA_TO_DEVICE);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, query->id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+
+ return ret;
+}
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_path_rec rec;
+
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(sa_query->mad);
+ kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port = &sa_dev->port[port_num - sa_dev->start_port];
+ struct ib_mad_agent *agent = port->agent;
+ int ret;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+ query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+ if (!query->sa_query.mad) {
+ kfree(query);
+ return -ENOMEM;
+ }
+
+ query->callback = callback;
+ query->context = context;
+
+ init_mad(query->sa_query.mad, agent);
+
+ query->sa_query.callback = ib_sa_path_rec_callback;
+ query->sa_query.release = ib_sa_path_rec_release;
+ query->sa_query.port = port;
+ query->sa_query.mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ query->sa_query.mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ rec, query->sa_query.mad->data);
+
+ *sa_query = &query->sa_query;
+ ret = send_mad(&query->sa_query, timeout_ms);
+ if (ret) {
+ *sa_query = NULL;
+ kfree(query->sa_query.mad);
+ kfree(query);
+ }
+
+ return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_mcmember_query *query =
+ container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_mcmember_rec rec;
+
+ ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(sa_query->mad);
+ kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_mcmember_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port = &sa_dev->port[port_num - sa_dev->start_port];
+ struct ib_mad_agent *agent = port->agent;
+ int ret;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+ query->sa_query.mad = kmalloc(sizeof *query->sa_query.mad, gfp_mask);
+ if (!query->sa_query.mad) {
+ kfree(query);
+ return -ENOMEM;
+ }
+
+ query->callback = callback;
+ query->context = context;
+
+ init_mad(query->sa_query.mad, agent);
+
+ query->sa_query.callback = ib_sa_mcmember_rec_callback;
+ query->sa_query.release = ib_sa_mcmember_rec_release;
+ query->sa_query.port = port;
+ query->sa_query.mad->mad_hdr.method = method;
+ query->sa_query.mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ query->sa_query.mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ rec, query->sa_query.mad->data);
+
+ *sa_query = &query->sa_query;
+ ret = send_mad(&query->sa_query, timeout_ms);
+ if (ret) {
+ *sa_query = NULL;
+ kfree(query->sa_query.mad);
+ kfree(query);
+ }
+
+ return ret ? ret : query->sa_query.id;
+}
+EXPORT_SYMBOL(ib_sa_mcmember_rec_query);
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_sa_query *query;
+ unsigned long flags;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ query = idr_find(&query_idr, mad_send_wc->wr_id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ if (!query)
+ return;
+
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ /* No callback -- already got recv */
+ break;
+ case IB_WC_RESP_TIMEOUT_ERR:
+ query->callback(query, -ETIMEDOUT, NULL);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ query->callback(query, -EINTR, NULL);
+ break;
+ default:
+ query->callback(query, -EIO, NULL);
+ break;
+ }
+
+ dma_unmap_single(agent->device->dma_device,
+ pci_unmap_addr(query, mapping),
+ sizeof (struct ib_sa_mad),
+ DMA_TO_DEVICE);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+
+ query->release(query);
+
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, mad_send_wc->wr_id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_query *query;
+ unsigned long flags;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ query = idr_find(&query_idr, mad_recv_wc->wc->wr_id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ if (query) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->callback(query,
+ mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+ -EINVAL : 0,
+ (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ else
+ query->callback(query, -EIO, NULL);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev;
+ int s, e, i;
+
+ if (device->node_type == IB_NODE_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ sa_dev = kmalloc(sizeof *sa_dev +
+ (e - s + 1) * sizeof (struct ib_sa_port),
+ GFP_KERNEL);
+ if (!sa_dev)
+ return;
+
+ sa_dev->start_port = s;
+ sa_dev->end_port = e;
+
+ for (i = 0; i <= e - s; ++i) {
+ sa_dev->port[i].mr = NULL;
+ sa_dev->port[i].sm_ah = NULL;
+ sa_dev->port[i].port_num = i + s;
+ spin_lock_init(&sa_dev->port[i].ah_lock);
+
+ sa_dev->port[i].agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ NULL, 0, send_handler,
+ recv_handler, sa_dev);
+ if (IS_ERR(sa_dev->port[i].agent))
+ goto err;
+
+ sa_dev->port[i].mr = ib_get_dma_mr(sa_dev->port[i].agent->qp->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(sa_dev->port[i].mr)) {
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ goto err;
+ }
+
+ INIT_WORK(&sa_dev->port[i].update_task,
+ update_sm_ah, &sa_dev->port[i]);
+ }
+
+ ib_set_client_data(device, &sa_client, sa_dev);
+
+ /*
+ * We register our event handler after everything is set up,
+ * and then update our cached info after the event handler is
+ * registered to avoid any problems if a port changes state
+ * during our initialization.
+ */
+
+ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+ if (ib_register_event_handler(&sa_dev->event_handler))
+ goto err;
+
+ for (i = 0; i <= e - s; ++i)
+ update_sm_ah(&sa_dev->port[i]);
+
+ return;
+
+err:
+ while (--i >= 0) {
+ ib_dereg_mr(sa_dev->port[i].mr);
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ }
+
+ kfree(sa_dev);
+
+ return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ int i;
+
+ if (!sa_dev)
+ return;
+
+ ib_unregister_event_handler(&sa_dev->event_handler);
+
+ for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+ }
+
+ kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+ int ret;
+
+ spin_lock_init(&idr_lock);
+ spin_lock_init(&tid_lock);
+
+ get_random_bytes(&tid, sizeof tid);
+
+ ret = ib_register_client(&sa_client);
+ if (ret)
+ printk(KERN_ERR "Couldn't register ib_sa client\n");
+
+ return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+ ib_unregister_client(&sa_client);
+}
+
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 00000000000..b4b284324a3
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: smi.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return 0 if the SMP should be discarded
+ */
+int smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type,
+ int port_num)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 */
+ if (hop_cnt && hop_ptr == 0) {
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num);
+ }
+
+ /* C14-9:2 */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != IB_NODE_SWITCH)
+ return 0;
+
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (node_type == IB_NODE_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != IB_NODE_SWITCH)
+ return 0;
+
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ smp->hop_ptr--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (node_type == IB_NODE_SWITCH ||
+ smp->dr_slid == IB_LID_PERMISSIVE);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (hop_ptr == 0)
+ return 1;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return 0;
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return 0 if the SMP should be dropped
+ */
+int smi_handle_dr_smp_recv(struct ib_smp *smp,
+ u8 node_type,
+ int port_num,
+ int phys_port_cnt)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && hop_ptr == 0)
+ return 0;
+
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != IB_NODE_SWITCH)
+ return 0;
+
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+ return (smp->initial_path[hop_ptr+1] <= phys_port_cnt);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+
+ return (node_type == IB_NODE_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != IB_NODE_SWITCH)
+ return 0;
+
+ /* smp->hop_ptr updated when sending */
+ return (smp->return_path[hop_ptr-1] <= phys_port_cnt);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ if (smp->dr_slid == IB_LID_PERMISSIVE) {
+ /* giving SMP to SM - update hop_ptr */
+ smp->hop_ptr--;
+ return 1;
+ }
+ /* smp->hop_ptr updated when sending */
+ return (node_type == IB_NODE_SWITCH);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (hop_ptr == 0);
+ }
+}
+
+/*
+ * Return 1 if the received DR SMP should be forwarded to the send queue
+ * Return 0 if the SMP should be completed up the stack
+ */
+int smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return 1;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (smp->dr_dlid == IB_LID_PERMISSIVE);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return 1;
+ } else {
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return 1;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (smp->dr_slid != IB_LID_PERMISSIVE);
+ }
+ return 0;
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 00000000000..db25503a073
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: smi.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+int smi_handle_dr_smp_recv(struct ib_smp *smp,
+ u8 node_type,
+ int port_num,
+ int phys_port_cnt);
+extern int smi_check_forward_dr_smp(struct ib_smp *smp);
+extern int smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type,
+ int port_num);
+extern int smi_check_local_dr_smp(struct ib_smp *smp,
+ struct ib_device *device,
+ int port_num);
+
+/*
+ * Return 1 if the SMP should be handled by the local SMA/SM via process_mad
+ */
+static inline int smi_check_local_smp(struct ib_mad_agent *mad_agent,
+ struct ib_smp *smp)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return ((mad_agent->device->process_mad &&
+ !ib_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)));
+}
+
+#endif /* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 00000000000..3a413f72ff6
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: sysfs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include "core_priv.h"
+
+#include <ib_mad.h>
+
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct attribute_group gid_group;
+ struct attribute **gid_attr;
+ struct attribute_group pkey_group;
+ struct attribute **pkey_attr;
+ u8 port_num;
+};
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ int index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return 0;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 && attr.state <= ARRAY_SIZE(state_name) ?
+ state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+ struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case 2: speed = " DDR"; break;
+ case 4: speed = " QDR"; break;
+ }
+
+ rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
+ if (rate < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+ rate / 10, rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.phys_state) {
+ case 1: return sprintf(buf, "1: Sleep\n");
+ case 2: return sprintf(buf, "2: Polling\n");
+ case 3: return sprintf(buf, "3: Disabled\n");
+ case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
+ case 5: return sprintf(buf, "5: LinkUp\n");
+ case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
+ case 7: return sprintf(buf, "7: Phy Test\n");
+ default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+ }
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ ssize_t ret;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((u16 *) gid.raw)[0]),
+ be16_to_cpu(((u16 *) gid.raw)[1]),
+ be16_to_cpu(((u16 *) gid.raw)[2]),
+ be16_to_cpu(((u16 *) gid.raw)[3]),
+ be16_to_cpu(((u16 *) gid.raw)[4]),
+ be16_to_cpu(((u16 *) gid.raw)[5]),
+ be16_to_cpu(((u16 *) gid.raw)[6]),
+ be16_to_cpu(((u16 *) gid.raw)[7]));
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ ssize_t ret;
+
+ ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ struct ib_mad *in_mad = NULL;
+ struct ib_mad *out_mad = NULL;
+ ssize_t ret;
+
+ if (!p->ibdev->process_mad)
+ return sprintf(buf, "N/A (no PMA)\n");
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */
+
+ in_mad->data[41] = p->port_num; /* PortSelect field */
+
+ if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+ p->port_num, NULL, NULL, in_mad, out_mad) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ switch (width) {
+ case 4:
+ ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+ (offset % 4)) & 0xf);
+ break;
+ case 8:
+ ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+ break;
+ case 16:
+ ret = sprintf(buf, "%u\n",
+ be16_to_cpup((u16 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ case 32:
+ ret = sprintf(buf, "%u\n",
+ be32_to_cpup((u32 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ default:
+ ret = 0;
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ NULL
+};
+
+static struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ struct attribute *a;
+ int i;
+
+ for (i = 0; (a = p->gid_attr[i]); ++i) {
+ kfree(a->name);
+ kfree(a);
+ }
+
+ for (i = 0; (a = p->pkey_attr[i]); ++i) {
+ kfree(a->name);
+ kfree(a);
+ }
+
+ kfree(p->gid_attr);
+ kfree(p);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct class_device *cdev)
+{
+ struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+
+ kfree(dev);
+}
+
+static int ib_device_hotplug(struct class_device *cdev, char **envp,
+ int num_envp, char *buf, int size)
+{
+ struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+ int i = 0, len = 0;
+
+ if (add_hotplug_env_var(envp, num_envp, &i, buf, size, &len,
+ "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It might be nice to pass the node GUID to hotplug, but
+ * right now the only way to get it is to query the device
+ * provider, and this can crash during device removal because
+ * we are will be running after driver removal has started.
+ * We could add a node_guid field to struct ib_device, or we
+ * could just let the hotplug script read the node GUID from
+ * sysfs when devices are added.
+ */
+
+ envp[i] = NULL;
+ return 0;
+}
+
+static int alloc_group(struct attribute ***attr,
+ ssize_t (*show)(struct ib_port *,
+ struct port_attribute *, char *buf),
+ int len)
+{
+ struct port_table_attribute ***tab_attr =
+ (struct port_table_attribute ***) attr;
+ int i;
+ int ret;
+
+ *tab_attr = kmalloc((1 + len) * sizeof *tab_attr, GFP_KERNEL);
+ if (!*tab_attr)
+ return -ENOMEM;
+
+ memset(*tab_attr, 0, (1 + len) * sizeof *tab_attr);
+
+ for (i = 0; i < len; ++i) {
+ (*tab_attr)[i] = kmalloc(sizeof *(*tab_attr)[i], GFP_KERNEL);
+ if (!(*tab_attr)[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ memset((*tab_attr)[i], 0, sizeof *(*tab_attr)[i]);
+ (*tab_attr)[i]->attr.attr.name = kmalloc(8, GFP_KERNEL);
+ if (!(*tab_attr)[i]->attr.attr.name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (snprintf((*tab_attr)[i]->attr.attr.name, 8, "%d", i) >= 8) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ (*tab_attr)[i]->attr.attr.mode = S_IRUGO;
+ (*tab_attr)[i]->attr.attr.owner = THIS_MODULE;
+ (*tab_attr)[i]->attr.show = show;
+ (*tab_attr)[i]->index = i;
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < len; ++i) {
+ if ((*tab_attr)[i])
+ kfree((*tab_attr)[i]->attr.attr.name);
+ kfree((*tab_attr)[i]);
+ }
+
+ kfree(*tab_attr);
+
+ return ret;
+}
+
+static int add_port(struct ib_device *device, int port_num)
+{
+ struct ib_port *p;
+ struct ib_port_attr attr;
+ int i;
+ int ret;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ return ret;
+
+ p = kmalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ memset(p, 0, sizeof *p);
+
+ p->ibdev = device;
+ p->port_num = port_num;
+ p->kobj.ktype = &port_type;
+
+ p->kobj.parent = kobject_get(&device->ports_parent);
+ if (!p->kobj.parent) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ ret = kobject_set_name(&p->kobj, "%d", port_num);
+ if (ret)
+ goto err_put;
+
+ ret = kobject_register(&p->kobj);
+ if (ret)
+ goto err_put;
+
+ ret = sysfs_create_group(&p->kobj, &pma_group);
+ if (ret)
+ goto err_put;
+
+ ret = alloc_group(&p->gid_attr, show_port_gid, attr.gid_tbl_len);
+ if (ret)
+ goto err_remove_pma;
+
+ p->gid_group.name = "gids";
+ p->gid_group.attrs = p->gid_attr;
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ ret = alloc_group(&p->pkey_attr, show_port_pkey, attr.pkey_tbl_len);
+ if (ret)
+ goto err_remove_gid;
+
+ p->pkey_group.name = "pkeys";
+ p->pkey_group.attrs = p->pkey_attr;
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ list_add_tail(&p->kobj.entry, &device->port_list);
+
+ return 0;
+
+err_free_pkey:
+ for (i = 0; i < attr.pkey_tbl_len; ++i) {
+ kfree(p->pkey_attr[i]->name);
+ kfree(p->pkey_attr[i]);
+ }
+
+ kfree(p->pkey_attr);
+
+err_remove_gid:
+ sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+ for (i = 0; i < attr.gid_tbl_len; ++i) {
+ kfree(p->gid_attr[i]->name);
+ kfree(p->gid_attr[i]);
+ }
+
+ kfree(p->gid_attr);
+
+err_remove_pma:
+ sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+ kobject_put(&device->ports_parent);
+
+err:
+ kfree(p);
+ return ret;
+}
+
+static ssize_t show_node_type(struct class_device *cdev, char *buf)
+{
+ struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+
+ switch (dev->node_type) {
+ case IB_NODE_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
+ case IB_NODE_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case IB_NODE_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+}
+
+static ssize_t show_sys_image_guid(struct class_device *cdev, char *buf)
+{
+ struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+ struct ib_device_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_device(dev, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((u16 *) &attr.sys_image_guid)[0]),
+ be16_to_cpu(((u16 *) &attr.sys_image_guid)[1]),
+ be16_to_cpu(((u16 *) &attr.sys_image_guid)[2]),
+ be16_to_cpu(((u16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct class_device *cdev, char *buf)
+{
+ struct ib_device *dev = container_of(cdev, struct ib_device, class_dev);
+ struct ib_device_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_device(dev, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((u16 *) &attr.node_guid)[0]),
+ be16_to_cpu(((u16 *) &attr.node_guid)[1]),
+ be16_to_cpu(((u16 *) &attr.node_guid)[2]),
+ be16_to_cpu(((u16 *) &attr.node_guid)[3]));
+}
+
+static CLASS_DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static CLASS_DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static CLASS_DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+
+static struct class_device_attribute *ib_class_attributes[] = {
+ &class_device_attr_node_type,
+ &class_device_attr_sys_image_guid,
+ &class_device_attr_node_guid
+};
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .release = ib_device_release,
+ .hotplug = ib_device_hotplug,
+};
+
+int ib_device_register_sysfs(struct ib_device *device)
+{
+ struct class_device *class_dev = &device->class_dev;
+ int ret;
+ int i;
+
+ class_dev->class = &ib_class;
+ class_dev->class_data = device;
+ strlcpy(class_dev->class_id, device->name, BUS_ID_SIZE);
+
+ INIT_LIST_HEAD(&device->port_list);
+
+ ret = class_device_register(class_dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ ret = class_device_create_file(class_dev, ib_class_attributes[i]);
+ if (ret)
+ goto err_unregister;
+ }
+
+ device->ports_parent.parent = kobject_get(&class_dev->kobj);
+ if (!device->ports_parent.parent) {
+ ret = -EBUSY;
+ goto err_unregister;
+ }
+ ret = kobject_set_name(&device->ports_parent, "ports");
+ if (ret)
+ goto err_put;
+ ret = kobject_register(&device->ports_parent);
+ if (ret)
+ goto err_put;
+
+ if (device->node_type == IB_NODE_SWITCH) {
+ ret = add_port(device, 0);
+ if (ret)
+ goto err_put;
+ } else {
+ int i;
+
+ for (i = 1; i <= device->phys_port_cnt; ++i) {
+ ret = add_port(device, i);
+ if (ret)
+ goto err_put;
+ }
+ }
+
+ return 0;
+
+err_put:
+ {
+ struct kobject *p, *t;
+ struct ib_port *port;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct ib_port, kobj);
+ sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_unregister(p);
+ }
+ }
+
+ kobject_put(&class_dev->kobj);
+
+err_unregister:
+ class_device_unregister(class_dev);
+
+err:
+ return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ struct kobject *p, *t;
+ struct ib_port *port;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct ib_port, kobj);
+ sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_unregister(p);
+ }
+
+ kobject_unregister(&device->ports_parent);
+ class_device_unregister(&device->class_dev);
+}
+
+int ib_sysfs_setup(void)
+{
+ return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+ class_unregister(&ib_class);
+}
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 00000000000..dc4eb1db5e9
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ud_header.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+
+#include <ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+ .field_name = #header ":" #field
+
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @header:Structure to initialize
+ *
+ * ib_ud_header_init() initializes the lrh.link_version, lrh.link_next_header,
+ * lrh.packet_length, grh.ip_version, grh.payload_length,
+ * grh.next_header, bth.opcode, bth.pad_count and
+ * bth.transport_header_version fields of a &struct ib_ud_header given
+ * the payload length and whether a GRH will be included.
+ */
+void ib_ud_header_init(int payload_bytes,
+ int grh_present,
+ struct ib_ud_header *header)
+{
+ int header_len;
+
+ memset(header, 0, sizeof *header);
+
+ header_len =
+ IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES;
+ if (grh_present) {
+ header_len += IB_GRH_BYTES;
+ }
+
+ header->lrh.link_version = 0;
+ header->lrh.link_next_header =
+ grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+ header->lrh.packet_length = (IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) / 4; /* round up */
+
+ header->grh_present = grh_present;
+ if (grh_present) {
+ header->lrh.packet_length += IB_GRH_BYTES / 4;
+
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = 0x1b;
+ }
+
+ cpu_to_be16s(&header->lrh.packet_length);
+
+ if (header->immediate_present)
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ else
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ header->bth.pad_count = (4 - payload_bytes) & 3;
+ header->bth.transport_header_version = 0;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf)
+{
+ int len = 0;
+
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+ &header->lrh, buf);
+ len += IB_LRH_BYTES;
+
+ if (header->grh_present) {
+ ib_pack(grh_table, ARRAY_SIZE(grh_table),
+ &header->grh, buf + len);
+ len += IB_GRH_BYTES;
+ }
+
+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
+ &header->bth, buf + len);
+ len += IB_BTH_BYTES;
+
+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
+ &header->deth, buf + len);
+ len += IB_DETH_BYTES;
+
+ if (header->immediate_present) {
+ memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ len += sizeof header->immediate_data;
+ }
+
+ return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+ buf, &header->lrh);
+ buf += IB_LRH_BYTES;
+
+ if (header->lrh.link_version != 0) {
+ printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+ header->lrh.link_version);
+ return -EINVAL;
+ }
+
+ switch (header->lrh.link_next_header) {
+ case IB_LNH_IBA_LOCAL:
+ header->grh_present = 0;
+ break;
+
+ case IB_LNH_IBA_GLOBAL:
+ header->grh_present = 1;
+ ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+ buf, &header->grh);
+ buf += IB_GRH_BYTES;
+
+ if (header->grh.ip_version != 6) {
+ printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+ header->grh.ip_version);
+ return -EINVAL;
+ }
+ if (header->grh.next_header != 0x1b) {
+ printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+ header->lrh.link_next_header);
+ return -EINVAL;
+ }
+
+ ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+ buf, &header->bth);
+ buf += IB_BTH_BYTES;
+
+ switch (header->bth.opcode) {
+ case IB_OPCODE_UD_SEND_ONLY:
+ header->immediate_present = 0;
+ break;
+ case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+ header->immediate_present = 1;
+ break;
+ default:
+ printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+ header->bth.opcode);
+ return -EINVAL;
+ }
+
+ if (header->bth.transport_header_version != 0) {
+ printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+ header->bth.transport_header_version);
+ return -EINVAL;
+ }
+
+ ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+ buf, &header->deth);
+ buf += IB_DETH_BYTES;
+
+ if (header->immediate_present)
+ memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 00000000000..54b4f33b0bf
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: user_mad.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/kref.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <ib_mad.h>
+#include <ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UMAD_MAX_PORTS = 64,
+ IB_UMAD_MAX_AGENTS = 32,
+
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0
+};
+
+struct ib_umad_port {
+ int devnum;
+ struct cdev dev;
+ struct class_device class_dev;
+
+ int sm_devnum;
+ struct cdev sm_dev;
+ struct class_device sm_class_dev;
+ struct semaphore sm_sem;
+
+ struct ib_device *ib_dev;
+ struct ib_umad_device *umad_dev;
+ u8 port_num;
+};
+
+struct ib_umad_device {
+ int start_port, end_port;
+ struct kref ref;
+ struct ib_umad_port port[0];
+};
+
+struct ib_umad_file {
+ struct ib_umad_port *port;
+ spinlock_t recv_lock;
+ struct list_head recv_list;
+ wait_queue_head_t recv_wait;
+ struct rw_semaphore agent_mutex;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ struct ib_mr *mr[IB_UMAD_MAX_AGENTS];
+};
+
+struct ib_umad_packet {
+ struct ib_user_mad mad;
+ struct ib_ah *ah;
+ struct list_head list;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static spinlock_t map_lock;
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS * 2);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ down_read(&file->agent_mutex);
+ for (packet->mad.id = 0;
+ packet->mad.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.id++)
+ if (agent == file->agent[packet->mad.id]) {
+ spin_lock_irq(&file->recv_lock);
+ list_add_tail(&packet->list, &file->recv_list);
+ spin_unlock_irq(&file->recv_lock);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ up_read(&file->agent_mutex);
+
+ return ret;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet =
+ (void *) (unsigned long) send_wc->wr_id;
+
+ dma_unmap_single(agent->device->dma_device,
+ pci_unmap_addr(packet, mapping),
+ sizeof packet->mad.data,
+ DMA_TO_DEVICE);
+ ib_destroy_ah(packet->ah);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->mad.status = ETIMEDOUT;
+
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+
+ kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto out;
+
+ packet = kmalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto out;
+
+ memset(packet, 0, sizeof *packet);
+
+ memcpy(packet->mad.data, mad_recv_wc->recv_buf.mad, sizeof packet->mad.data);
+ packet->mad.status = 0;
+ packet->mad.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ packet->mad.lid = cpu_to_be16(mad_recv_wc->wc->slid);
+ packet->mad.sl = mad_recv_wc->wc->sl;
+ packet->mad.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.grh_present) {
+ /* XXX parse GRH */
+ packet->mad.gid_index = 0;
+ packet->mad.hop_limit = 0;
+ packet->mad.traffic_class = 0;
+ memset(packet->mad.gid, 0, 16);
+ packet->mad.flow_label = 0;
+ }
+
+ if (queue_packet(file, agent, packet))
+ kfree(packet);
+
+out:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < sizeof (struct ib_user_mad))
+ return -EINVAL;
+
+ spin_lock_irq(&file->recv_lock);
+
+ while (list_empty(&file->recv_list)) {
+ spin_unlock_irq(&file->recv_lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&file->recv_lock);
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ spin_unlock_irq(&file->recv_lock);
+
+ if (copy_to_user(buf, &packet->mad, sizeof packet->mad))
+ ret = -EFAULT;
+ else
+ ret = sizeof packet->mad;
+
+ kfree(packet);
+ return ret;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct ib_ah_attr ah_attr;
+ struct ib_sge gather_list;
+ struct ib_send_wr *bad_wr, wr = {
+ .opcode = IB_WR_SEND,
+ .sg_list = &gather_list,
+ .num_sge = 1,
+ .send_flags = IB_SEND_SIGNALED,
+ };
+ u8 method;
+ u64 *tid;
+ int ret;
+
+ if (count < sizeof (struct ib_user_mad))
+ return -EINVAL;
+
+ packet = kmalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, sizeof packet->mad)) {
+ kfree(packet);
+ return -EFAULT;
+ }
+
+ if (packet->mad.id < 0 || packet->mad.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ down_read(&file->agent_mutex);
+
+ agent = file->agent[packet->mad.id];
+ if (!agent) {
+ ret = -EINVAL;
+ goto err_up;
+ }
+
+ /*
+ * If userspace is generating a request that will generate a
+ * response, we need to make sure the high-order part of the
+ * transaction ID matches the agent being used to send the
+ * MAD.
+ */
+ method = ((struct ib_mad_hdr *) packet->mad.data)->method;
+
+ if (!(method & IB_MGMT_METHOD_RESP) &&
+ method != IB_MGMT_METHOD_TRAP_REPRESS &&
+ method != IB_MGMT_METHOD_SEND) {
+ tid = &((struct ib_mad_hdr *) packet->mad.data)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = be16_to_cpu(packet->mad.lid);
+ ah_attr.sl = packet->mad.sl;
+ ah_attr.src_path_bits = packet->mad.path_bits;
+ ah_attr.port_num = file->port->port_num;
+ if (packet->mad.grh_present) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ memcpy(ah_attr.grh.dgid.raw, packet->mad.gid, 16);
+ ah_attr.grh.flow_label = packet->mad.flow_label;
+ ah_attr.grh.hop_limit = packet->mad.hop_limit;
+ ah_attr.grh.traffic_class = packet->mad.traffic_class;
+ }
+
+ packet->ah = ib_create_ah(agent->qp->pd, &ah_attr);
+ if (IS_ERR(packet->ah)) {
+ ret = PTR_ERR(packet->ah);
+ goto err_up;
+ }
+
+ gather_list.addr = dma_map_single(agent->device->dma_device,
+ packet->mad.data,
+ sizeof packet->mad.data,
+ DMA_TO_DEVICE);
+ gather_list.length = sizeof packet->mad.data;
+ gather_list.lkey = file->mr[packet->mad.id]->lkey;
+ pci_unmap_addr_set(packet, mapping, gather_list.addr);
+
+ wr.wr.ud.mad_hdr = (struct ib_mad_hdr *) packet->mad.data;
+ wr.wr.ud.ah = packet->ah;
+ wr.wr.ud.remote_qpn = be32_to_cpu(packet->mad.qpn);
+ wr.wr.ud.remote_qkey = be32_to_cpu(packet->mad.qkey);
+ wr.wr.ud.timeout_ms = packet->mad.timeout_ms;
+
+ wr.wr_id = (unsigned long) packet;
+
+ ret = ib_post_send_mad(agent, &wr, &bad_wr);
+ if (ret) {
+ dma_unmap_single(agent->device->dma_device,
+ pci_unmap_addr(packet, mapping),
+ sizeof packet->mad.data,
+ DMA_TO_DEVICE);
+ goto err_up;
+ }
+
+ up_read(&file->agent_mutex);
+
+ return sizeof packet->mad;
+
+err_up:
+ up_read(&file->agent_mutex);
+
+err:
+ kfree(packet);
+ return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ib_umad_file *file = filp->private_data;
+
+ /* we will always be able to post a MAD send */
+ unsigned int mask = POLLOUT | POLLWRNORM;
+
+ poll_wait(filp, &file->recv_wait, wait);
+
+ if (!list_empty(&file->recv_list))
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent;
+ int agent_id;
+ int ret;
+
+ down_write(&file->agent_mutex);
+
+ if (copy_from_user(&ureq, (void __user *) arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!file->agent[agent_id])
+ goto found;
+
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask);
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ &req, 0, send_handler, recv_handler,
+ file);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto out;
+ }
+
+ file->agent[agent_id] = agent;
+
+ file->mr[agent_id] = ib_get_dma_mr(agent->qp->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(file->mr[agent_id])) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto err_mr;
+ }
+
+ ret = 0;
+ goto out;
+
+err_mr:
+ ib_dereg_mr(file->mr[agent_id]);
+
+err:
+ file->agent[agent_id] = NULL;
+ ib_unregister_mad_agent(agent);
+
+out:
+ up_write(&file->agent_mutex);
+ return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, unsigned long arg)
+{
+ u32 id;
+ int ret = 0;
+
+ down_write(&file->agent_mutex);
+
+ if (get_user(id, (u32 __user *) arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !file->agent[id]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ib_dereg_mr(file->mr[id]);
+ ib_unregister_mad_agent(file->agent[id]);
+ file->agent[id] = NULL;
+
+out:
+ up_write(&file->agent_mutex);
+ return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, arg);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port =
+ container_of(inode->i_cdev, struct ib_umad_port, dev);
+ struct ib_umad_file *file;
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ memset(file, 0, sizeof *file);
+
+ spin_lock_init(&file->recv_lock);
+ init_rwsem(&file->agent_mutex);
+ INIT_LIST_HEAD(&file->recv_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ filp->private_data = file;
+
+ return 0;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ int i;
+
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i]) {
+ ib_dereg_mr(file->mr[i]);
+ ib_unregister_mad_agent(file->agent[i]);
+ }
+
+ kfree(file);
+
+ return 0;
+}
+
+static struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+ .compat_ioctl = ib_umad_ioctl,
+ .open = ib_umad_open,
+ .release = ib_umad_close
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port =
+ container_of(inode->i_cdev, struct ib_umad_port, sm_dev);
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem))
+ return -EAGAIN;
+ } else {
+ if (down_interruptible(&port->sm_sem))
+ return -ERESTARTSYS;
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret) {
+ up(&port->sm_sem);
+ return ret;
+ }
+
+ filp->private_data = port;
+
+ return 0;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ up(&port->sm_sem);
+
+ return ret;
+}
+
+static struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close
+};
+
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one
+};
+
+static ssize_t show_dev(struct class_device *class_dev, char *buf)
+{
+ struct ib_umad_port *port = class_get_devdata(class_dev);
+
+ if (class_dev == &port->class_dev)
+ return print_dev_t(buf, port->dev.dev);
+ else
+ return print_dev_t(buf, port->sm_dev.dev);
+}
+static CLASS_DEVICE_ATTR(dev, S_IRUGO, show_dev, NULL);
+
+static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
+{
+ struct ib_umad_port *port = class_get_devdata(class_dev);
+
+ return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct class_device *class_dev, char *buf)
+{
+ struct ib_umad_port *port = class_get_devdata(class_dev);
+
+ return sprintf(buf, "%d\n", port->port_num);
+}
+static CLASS_DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+ struct ib_umad_device *dev =
+ container_of(ref, struct ib_umad_device, ref);
+
+ kfree(dev);
+}
+
+static void ib_umad_release_port(struct class_device *class_dev)
+{
+ struct ib_umad_port *port = class_get_devdata(class_dev);
+
+ if (class_dev == &port->class_dev) {
+ cdev_del(&port->dev);
+ clear_bit(port->devnum, dev_map);
+ } else {
+ cdev_del(&port->sm_dev);
+ clear_bit(port->sm_devnum, dev_map);
+ }
+
+ kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+}
+
+static struct class umad_class = {
+ .name = "infiniband_mad",
+ .release = ib_umad_release_port
+};
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_port *port)
+{
+ spin_lock(&map_lock);
+ port->devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (port->devnum >= IB_UMAD_MAX_PORTS) {
+ spin_unlock(&map_lock);
+ return -1;
+ }
+ port->sm_devnum = find_next_zero_bit(dev_map, IB_UMAD_MAX_PORTS * 2, IB_UMAD_MAX_PORTS);
+ if (port->sm_devnum >= IB_UMAD_MAX_PORTS * 2) {
+ spin_unlock(&map_lock);
+ return -1;
+ }
+ set_bit(port->devnum, dev_map);
+ set_bit(port->sm_devnum, dev_map);
+ spin_unlock(&map_lock);
+
+ port->ib_dev = device;
+ port->port_num = port_num;
+ init_MUTEX(&port->sm_sem);
+
+ cdev_init(&port->dev, &umad_fops);
+ port->dev.owner = THIS_MODULE;
+ kobject_set_name(&port->dev.kobj, "umad%d", port->devnum);
+ if (cdev_add(&port->dev, base_dev + port->devnum, 1))
+ return -1;
+
+ port->class_dev.class = &umad_class;
+ port->class_dev.dev = device->dma_device;
+
+ snprintf(port->class_dev.class_id, BUS_ID_SIZE, "umad%d", port->devnum);
+
+ if (class_device_register(&port->class_dev))
+ goto err_cdev;
+
+ class_set_devdata(&port->class_dev, port);
+ kref_get(&port->umad_dev->ref);
+
+ if (class_device_create_file(&port->class_dev, &class_device_attr_dev))
+ goto err_class;
+ if (class_device_create_file(&port->class_dev, &class_device_attr_ibdev))
+ goto err_class;
+ if (class_device_create_file(&port->class_dev, &class_device_attr_port))
+ goto err_class;
+
+ cdev_init(&port->sm_dev, &umad_sm_fops);
+ port->sm_dev.owner = THIS_MODULE;
+ kobject_set_name(&port->dev.kobj, "issm%d", port->sm_devnum - IB_UMAD_MAX_PORTS);
+ if (cdev_add(&port->sm_dev, base_dev + port->sm_devnum, 1))
+ return -1;
+
+ port->sm_class_dev.class = &umad_class;
+ port->sm_class_dev.dev = device->dma_device;
+
+ snprintf(port->sm_class_dev.class_id, BUS_ID_SIZE, "issm%d", port->sm_devnum - IB_UMAD_MAX_PORTS);
+
+ if (class_device_register(&port->sm_class_dev))
+ goto err_sm_cdev;
+
+ class_set_devdata(&port->sm_class_dev, port);
+ kref_get(&port->umad_dev->ref);
+
+ if (class_device_create_file(&port->sm_class_dev, &class_device_attr_dev))
+ goto err_sm_class;
+ if (class_device_create_file(&port->sm_class_dev, &class_device_attr_ibdev))
+ goto err_sm_class;
+ if (class_device_create_file(&port->sm_class_dev, &class_device_attr_port))
+ goto err_sm_class;
+
+ return 0;
+
+err_sm_class:
+ class_device_unregister(&port->sm_class_dev);
+
+err_sm_cdev:
+ cdev_del(&port->sm_dev);
+
+err_class:
+ class_device_unregister(&port->class_dev);
+
+err_cdev:
+ cdev_del(&port->dev);
+ clear_bit(port->devnum, dev_map);
+
+ return -1;
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+
+ if (device->node_type == IB_NODE_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ umad_dev = kmalloc(sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return;
+
+ memset(umad_dev, 0, sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port));
+
+ kref_init(&umad_dev->ref);
+
+ umad_dev->start_port = s;
+ umad_dev->end_port = e;
+
+ for (i = s; i <= e; ++i) {
+ umad_dev->port[i - s].umad_dev = umad_dev;
+
+ if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
+ goto err;
+ }
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return;
+
+err:
+ while (--i >= s) {
+ class_device_unregister(&umad_dev->port[i - s].class_dev);
+ class_device_unregister(&umad_dev->port[i - s].sm_class_dev);
+ }
+
+ kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ int i;
+
+ if (!umad_dev)
+ return;
+
+ for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) {
+ class_device_unregister(&umad_dev->port[i].class_dev);
+ class_device_unregister(&umad_dev->port[i].sm_class_dev);
+ }
+
+ kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ spin_lock_init(&map_lock);
+
+ ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't register device number\n");
+ goto out;
+ }
+
+ ret = class_register(&umad_class);
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ ret = class_create_file(&umad_class, &class_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_unregister(&umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+ return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&umad_client);
+ class_unregister(&umad_class);
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 00000000000..7c08ed0cd7d
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+
+#include <ib_verbs.h>
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+ struct ib_pd *pd;
+
+ pd = device->alloc_pd(device);
+
+ if (!IS_ERR(pd)) {
+ pd->device = device;
+ atomic_set(&pd->usecnt, 0);
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+ if (atomic_read(&pd->usecnt))
+ return -EBUSY;
+
+ return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct ib_ah *ah;
+
+ ah = pd->device->create_ah(pd, ah_attr);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Queue pairs */
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *qp;
+
+ qp = pd->device->create_qp(pd, qp_init_attr);
+
+ if (!IS_ERR(qp)) {
+ qp->device = pd->device;
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->recv_cq = qp_init_attr->recv_cq;
+ qp->srq = qp_init_attr->srq;
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ qp->qp_type = qp_init_attr->qp_type;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ if (qp_init_attr->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ }
+
+ return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ return qp->device->modify_qp(qp, qp_attr, qp_attr_mask);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ int ret;
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&scq->usecnt);
+ atomic_dec(&rcq->usecnt);
+ if (srq)
+ atomic_dec(&srq->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cqe);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+ }
+
+ return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq,
+ int cqe)
+{
+ int ret;
+
+ if (!cq->device->resize_cq)
+ return -ENOSYS;
+
+ ret = cq->device->resize_cq(cq, &cqe);
+ if (!ret)
+ cq->cqe = cqe;
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *mr;
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *mr;
+
+ mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_pd *old_pd;
+ int ret;
+
+ if (!mr->device->rereg_phys_mr)
+ return -ENOSYS;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ old_pd = mr->pd;
+
+ ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+ phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+ atomic_dec(&old_pd->usecnt);
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ return mr->device->query_mr ?
+ mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ pd = mr->pd;
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+ struct ib_mw *mw;
+
+ if (!pd->device->alloc_mw)
+ return ERR_PTR(-ENOSYS);
+
+ mw = pd->device->alloc_mw(pd);
+ if (!IS_ERR(mw)) {
+ mw->device = pd->device;
+ mw->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = mw->pd;
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (list_empty(fmr_list))
+ return 0;
+
+ fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ return qp->device->attach_mcast ?
+ qp->device->attach_mcast(qp, gid, lid) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ return qp->device->detach_mcast ?
+ qp->device->detach_mcast(qp, gid, lid) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 00000000000..e88be85b3d5
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_MTHCA
+ tristate "Mellanox HCA support"
+ depends on PCI && INFINIBAND
+ ---help---
+ This is a low-level driver for Mellanox InfiniHost host
+ channel adapters (HCAs), including the MT23108 PCI-X HCA
+ ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+ bool "Verbose debugging output"
+ depends on INFINIBAND_MTHCA
+ default n
+ ---help---
+ This option causes the mthca driver produce a bunch of debug
+ messages. Select this is you are developing the driver or
+ trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 00000000000..5dcbd43073e
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+EXTRA_CFLAGS += -DDEBUG
+endif
+
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+ mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+ mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+ mthca_provider.o mthca_memfree.o mthca_uar.o
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 00000000000..b1db48dd91d
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_allocator.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+ u32 obj;
+
+ spin_lock(&alloc->lock);
+ obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+ if (obj >= alloc->max) {
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+ obj = find_first_zero_bit(alloc->table, alloc->max);
+ }
+
+ if (obj < alloc->max) {
+ set_bit(obj, alloc->table);
+ obj |= alloc->top;
+ } else
+ obj = -1;
+
+ spin_unlock(&alloc->lock);
+
+ return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+ obj &= alloc->max - 1;
+ spin_lock(&alloc->lock);
+ clear_bit(obj, alloc->table);
+ alloc->last = min(alloc->last, obj);
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+ spin_unlock(&alloc->lock);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved)
+{
+ int i;
+
+ /* num must be a power of 2 */
+ if (num != 1 << (ffs(num) - 1))
+ return -EINVAL;
+
+ alloc->last = 0;
+ alloc->top = 0;
+ alloc->max = num;
+ alloc->mask = mask;
+ spin_lock_init(&alloc->lock);
+ alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+ GFP_KERNEL);
+ if (!alloc->table)
+ return -ENOMEM;
+
+ bitmap_zero(alloc->table, num);
+ for (i = 0; i < reserved; ++i)
+ set_bit(i, alloc->table);
+
+ return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+ kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages. Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (array->page_list[p].page) {
+ int i = index & (PAGE_SIZE / sizeof (void *) - 1);
+ return array->page_list[p].page[i];
+ } else
+ return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ /* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+ if (!array->page_list[p].page)
+ array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+ if (!array->page_list[p].page)
+ return -ENOMEM;
+
+ array->page_list[p].page[index & (PAGE_SIZE / sizeof (void *) - 1)] =
+ value;
+ ++array->page_list[p].used;
+
+ return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (--array->page_list[p].used == 0) {
+ free_page((unsigned long) array->page_list[p].page);
+ array->page_list[p].page = NULL;
+ }
+
+ if (array->page_list[p].used < 0)
+ pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+ array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+ int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int i;
+
+ array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+ if (!array->page_list)
+ return -ENOMEM;
+
+ for (i = 0; i < npage; ++i) {
+ array->page_list[i].page = NULL;
+ array->page_list[i].used = 0;
+ }
+
+ return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+ int i;
+
+ for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ free_page((unsigned long) array->page_list[i].page);
+
+ kfree(array->page_list);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 00000000000..426d32778e9
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_av.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+
+#include "mthca_dev.h"
+
+struct mthca_av {
+ u32 port_pd;
+ u8 reserved1;
+ u8 g_slid;
+ u16 dlid;
+ u8 reserved2;
+ u8 gid_index;
+ u8 msg_sr;
+ u8 hop_limit;
+ u32 sl_tclass_flowlabel;
+ u32 dgid[4];
+};
+
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah)
+{
+ u32 index = -1;
+ struct mthca_av *av = NULL;
+
+ ah->type = MTHCA_AH_PCI_POOL;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ ah->av = kmalloc(sizeof *ah->av, GFP_KERNEL);
+ if (!ah->av)
+ return -ENOMEM;
+
+ ah->type = MTHCA_AH_KMALLOC;
+ av = ah->av;
+ } else if (!atomic_read(&pd->sqp_count) &&
+ !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ index = mthca_alloc(&dev->av_table.alloc);
+
+ /* fall back to allocate in host memory */
+ if (index == -1)
+ goto on_hca_fail;
+
+ av = kmalloc(sizeof *av, GFP_KERNEL);
+ if (!av)
+ goto on_hca_fail;
+
+ ah->type = MTHCA_AH_ON_HCA;
+ ah->avdma = dev->av_table.ddr_av_base +
+ index * MTHCA_AV_SIZE;
+ }
+
+on_hca_fail:
+ if (ah->type == MTHCA_AH_PCI_POOL) {
+ ah->av = pci_pool_alloc(dev->av_table.pool,
+ SLAB_KERNEL, &ah->avdma);
+ if (!ah->av)
+ return -ENOMEM;
+
+ av = ah->av;
+ }
+
+ ah->key = pd->ntmr.ibmr.lkey;
+
+ memset(av, 0, MTHCA_AV_SIZE);
+
+ av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+ av->g_slid = ah_attr->src_path_bits;
+ av->dlid = cpu_to_be16(ah_attr->dlid);
+ av->msg_sr = (3 << 4) | /* 2K message */
+ ah_attr->static_rate;
+ av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ av->g_slid |= 0x80;
+ av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+ ah_attr->grh.sgid_index;
+ av->hop_limit = ah_attr->grh.hop_limit;
+ av->sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+ } else {
+ /* Arbel workaround -- low byte of GID must be 2 */
+ av->dgid[3] = cpu_to_be32(2);
+ }
+
+ if (0) {
+ int j;
+
+ mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+ av, (unsigned long) ah->avdma);
+ for (j = 0; j < 8; ++j)
+ printk(KERN_DEBUG " [%2x] %08x\n",
+ j * 4, be32_to_cpu(((u32 *) av)[j]));
+ }
+
+ if (ah->type == MTHCA_AH_ON_HCA) {
+ memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+ av, MTHCA_AV_SIZE);
+ kfree(av);
+ }
+
+ return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+ switch (ah->type) {
+ case MTHCA_AH_ON_HCA:
+ mthca_free(&dev->av_table.alloc,
+ (ah->avdma - dev->av_table.ddr_av_base) /
+ MTHCA_AV_SIZE);
+ break;
+
+ case MTHCA_AH_PCI_POOL:
+ pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+ break;
+
+ case MTHCA_AH_KMALLOC:
+ kfree(ah->av);
+ break;
+ }
+
+ return 0;
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header)
+{
+ if (ah->type == MTHCA_AH_ON_HCA)
+ return -EINVAL;
+
+ header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+ header->lrh.destination_lid = ah->av->dlid;
+ header->lrh.source_lid = ah->av->g_slid & 0x7f;
+ if (ah->av->g_slid & 0x80) {
+ header->grh_present = 1;
+ header->grh.traffic_class =
+ (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+ header->grh.flow_label =
+ ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ ib_get_cached_gid(&dev->ib_dev,
+ be32_to_cpu(ah->av->port_pd) >> 24,
+ ah->av->gid_index,
+ &header->grh.source_gid);
+ memcpy(header->grh.destination_gid.raw,
+ ah->av->dgid, 16);
+ } else {
+ header->grh_present = 0;
+ }
+
+ return 0;
+}
+
+int __devinit mthca_init_av_table(struct mthca_dev *dev)
+{
+ int err;
+
+ if (dev->hca_type == ARBEL_NATIVE)
+ return 0;
+
+ err = mthca_alloc_init(&dev->av_table.alloc,
+ dev->av_table.num_ddr_avs,
+ dev->av_table.num_ddr_avs - 1,
+ 0);
+ if (err)
+ return err;
+
+ dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+ MTHCA_AV_SIZE,
+ MTHCA_AV_SIZE, 0);
+ if (!dev->av_table.pool)
+ goto out_free_alloc;
+
+ if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+ dev->av_table.ddr_av_base -
+ dev->ddr_start,
+ dev->av_table.num_ddr_avs *
+ MTHCA_AV_SIZE);
+ if (!dev->av_table.av_map)
+ goto out_free_pool;
+ } else
+ dev->av_table.av_map = NULL;
+
+ return 0;
+
+ out_free_pool:
+ pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+ return -ENOMEM;
+}
+
+void __devexit mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return;
+
+ if (dev->av_table.av_map)
+ iounmap(dev->av_table.av_map);
+ pci_pool_destroy(dev->av_table.pool);
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 00000000000..9def0981f63
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cmd.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+ HCR_IN_PARAM_OFFSET = 0x00,
+ HCR_IN_MODIFIER_OFFSET = 0x08,
+ HCR_OUT_PARAM_OFFSET = 0x0c,
+ HCR_TOKEN_OFFSET = 0x14,
+ HCR_STATUS_OFFSET = 0x18,
+
+ HCR_OPMOD_SHIFT = 12,
+ HCA_E_BIT = 22,
+ HCR_GO_BIT = 23
+};
+
+enum {
+ /* initialization and general commands */
+ CMD_SYS_EN = 0x1,
+ CMD_SYS_DIS = 0x2,
+ CMD_MAP_FA = 0xfff,
+ CMD_UNMAP_FA = 0xffe,
+ CMD_RUN_FW = 0xff6,
+ CMD_MOD_STAT_CFG = 0x34,
+ CMD_QUERY_DEV_LIM = 0x3,
+ CMD_QUERY_FW = 0x4,
+ CMD_ENABLE_LAM = 0xff8,
+ CMD_DISABLE_LAM = 0xff7,
+ CMD_QUERY_DDR = 0x5,
+ CMD_QUERY_ADAPTER = 0x6,
+ CMD_INIT_HCA = 0x7,
+ CMD_CLOSE_HCA = 0x8,
+ CMD_INIT_IB = 0x9,
+ CMD_CLOSE_IB = 0xa,
+ CMD_QUERY_HCA = 0xb,
+ CMD_SET_IB = 0xc,
+ CMD_ACCESS_DDR = 0x2e,
+ CMD_MAP_ICM = 0xffa,
+ CMD_UNMAP_ICM = 0xff9,
+ CMD_MAP_ICM_AUX = 0xffc,
+ CMD_UNMAP_ICM_AUX = 0xffb,
+ CMD_SET_ICM_SIZE = 0xffd,
+
+ /* TPT commands */
+ CMD_SW2HW_MPT = 0xd,
+ CMD_QUERY_MPT = 0xe,
+ CMD_HW2SW_MPT = 0xf,
+ CMD_READ_MTT = 0x10,
+ CMD_WRITE_MTT = 0x11,
+ CMD_SYNC_TPT = 0x2f,
+
+ /* EQ commands */
+ CMD_MAP_EQ = 0x12,
+ CMD_SW2HW_EQ = 0x13,
+ CMD_HW2SW_EQ = 0x14,
+ CMD_QUERY_EQ = 0x15,
+
+ /* CQ commands */
+ CMD_SW2HW_CQ = 0x16,
+ CMD_HW2SW_CQ = 0x17,
+ CMD_QUERY_CQ = 0x18,
+ CMD_RESIZE_CQ = 0x2c,
+
+ /* SRQ commands */
+ CMD_SW2HW_SRQ = 0x35,
+ CMD_HW2SW_SRQ = 0x36,
+ CMD_QUERY_SRQ = 0x37,
+
+ /* QP/EE commands */
+ CMD_RST2INIT_QPEE = 0x19,
+ CMD_INIT2RTR_QPEE = 0x1a,
+ CMD_RTR2RTS_QPEE = 0x1b,
+ CMD_RTS2RTS_QPEE = 0x1c,
+ CMD_SQERR2RTS_QPEE = 0x1d,
+ CMD_2ERR_QPEE = 0x1e,
+ CMD_RTS2SQD_QPEE = 0x1f,
+ CMD_SQD2SQD_QPEE = 0x38,
+ CMD_SQD2RTS_QPEE = 0x20,
+ CMD_ERR2RST_QPEE = 0x21,
+ CMD_QUERY_QPEE = 0x22,
+ CMD_INIT2INIT_QPEE = 0x2d,
+ CMD_SUSPEND_QPEE = 0x32,
+ CMD_UNSUSPEND_QPEE = 0x33,
+ /* special QPs and management commands */
+ CMD_CONF_SPECIAL_QP = 0x23,
+ CMD_MAD_IFC = 0x24,
+
+ /* multicast commands */
+ CMD_READ_MGM = 0x25,
+ CMD_WRITE_MGM = 0x26,
+ CMD_MGID_HASH = 0x27,
+
+ /* miscellaneous commands */
+ CMD_DIAG_RPRT = 0x30,
+ CMD_NOP = 0x31,
+
+ /* debug commands */
+ CMD_QUERY_DEBUG_MSG = 0x2a,
+ CMD_SET_DEBUG_MSG = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands. So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+ CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+ CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1,
+ CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1
+};
+#else
+enum {
+ CMD_TIME_CLASS_A = 60 * HZ,
+ CMD_TIME_CLASS_B = 60 * HZ,
+ CMD_TIME_CLASS_C = 60 * HZ
+};
+#endif
+
+enum {
+ GO_BIT_TIMEOUT = HZ * 10
+};
+
+struct mthca_cmd_context {
+ struct completion done;
+ struct timer_list timer;
+ int result;
+ int next;
+ u64 out_param;
+ u16 token;
+ u8 status;
+};
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+ return readl(dev->hcr + HCR_STATUS_OFFSET) &
+ swab32(1 << HCR_GO_BIT);
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token,
+ int event)
+{
+ int err = 0;
+
+ if (down_interruptible(&dev->cmd.hcr_sem))
+ return -EINTR;
+
+ if (event) {
+ unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+ while (go_bit(dev) && time_before(jiffies, end)) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
+ }
+
+ if (go_bit(dev)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * We use writel (instead of something like memcpy_toio)
+ * because writes of less than 32 bits to the HCR don't work
+ * (and some architectures such as ia64 implement memcpy_toio
+ * in terms of writeb).
+ */
+ __raw_writel(cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4);
+ __raw_writel(cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4);
+ __raw_writel(cpu_to_be32(in_modifier), dev->hcr + 2 * 4);
+ __raw_writel(cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4);
+ __raw_writel(cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+ __raw_writel(cpu_to_be32(token << 16), dev->hcr + 5 * 4);
+
+ /* __raw_writel may not order writes. */
+ wmb();
+
+ __raw_writel(cpu_to_be32((1 << HCR_GO_BIT) |
+ (event ? (1 << HCA_E_BIT) : 0) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), dev->hcr + 6 * 4);
+
+out:
+ up(&dev->cmd.hcr_sem);
+ return err;
+}
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ int err = 0;
+ unsigned long end;
+
+ if (down_interruptible(&dev->cmd.poll_sem))
+ return -EINTR;
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, CMD_POLL_TOKEN, 0);
+ if (err)
+ goto out;
+
+ end = timeout + jiffies;
+ while (go_bit(dev) && time_before(jiffies, end)) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ if (go_bit(dev)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (out_is_imm) {
+ memcpy_fromio(out_param, dev->hcr + HCR_OUT_PARAM_OFFSET, sizeof (u64));
+ be64_to_cpus(out_param);
+ }
+
+ *status = be32_to_cpu(__raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+
+out:
+ up(&dev->cmd.poll_sem);
+ return err;
+}
+
+void mthca_cmd_event(struct mthca_dev *dev,
+ u16 token,
+ u8 status,
+ u64 out_param)
+{
+ struct mthca_cmd_context *context =
+ &dev->cmd.context[token & dev->cmd.token_mask];
+
+ /* previously timed out command completing at long last */
+ if (token != context->token)
+ return;
+
+ context->result = 0;
+ context->status = status;
+ context->out_param = out_param;
+
+ context->token += dev->cmd.token_mask + 1;
+
+ complete(&context->done);
+}
+
+static void event_timeout(unsigned long context_ptr)
+{
+ struct mthca_cmd_context *context =
+ (struct mthca_cmd_context *) context_ptr;
+
+ context->result = -EBUSY;
+ complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ int err = 0;
+ struct mthca_cmd_context *context;
+
+ if (down_interruptible(&dev->cmd.event_sem))
+ return -EINTR;
+
+ spin_lock(&dev->cmd.context_lock);
+ BUG_ON(dev->cmd.free_head < 0);
+ context = &dev->cmd.context[dev->cmd.free_head];
+ dev->cmd.free_head = context->next;
+ spin_unlock(&dev->cmd.context_lock);
+
+ init_completion(&context->done);
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, context->token, 1);
+ if (err)
+ goto out;
+
+ context->timer.expires = jiffies + timeout;
+ add_timer(&context->timer);
+
+ wait_for_completion(&context->done);
+ del_timer_sync(&context->timer);
+
+ err = context->result;
+ if (err)
+ goto out;
+
+ *status = context->status;
+ if (*status)
+ mthca_dbg(dev, "Command %02x completed with status %02x\n",
+ op, *status);
+
+ if (out_is_imm)
+ *out_param = context->out_param;
+
+out:
+ spin_lock(&dev->cmd.context_lock);
+ context->next = dev->cmd.free_head;
+ dev->cmd.free_head = context - dev->cmd.context;
+ spin_unlock(&dev->cmd.context_lock);
+
+ up(&dev->cmd.event_sem);
+ return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ if (dev->cmd.use_events)
+ return mthca_cmd_wait(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout, status);
+ else
+ return mthca_cmd_poll(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout, status);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+ u64 in_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ return mthca_cmd_box(dev, in_param, 0, in_modifier,
+ op_modifier, op, timeout, status);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ if (dev->cmd.use_events)
+ return mthca_cmd_wait(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout, status);
+ else
+ return mthca_cmd_poll(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout, status);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * event queue to command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+ sizeof (struct mthca_cmd_context),
+ GFP_KERNEL);
+ if (!dev->cmd.context)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->cmd.max_cmds; ++i) {
+ dev->cmd.context[i].token = i;
+ dev->cmd.context[i].next = i + 1;
+ init_timer(&dev->cmd.context[i].timer);
+ dev->cmd.context[i].timer.data =
+ (unsigned long) &dev->cmd.context[i];
+ dev->cmd.context[i].timer.function = event_timeout;
+ }
+
+ dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+ dev->cmd.free_head = 0;
+
+ sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+ spin_lock_init(&dev->cmd.context_lock);
+
+ for (dev->cmd.token_mask = 1;
+ dev->cmd.token_mask < dev->cmd.max_cmds;
+ dev->cmd.token_mask <<= 1)
+ ; /* nothing */
+ --dev->cmd.token_mask;
+
+ dev->cmd.use_events = 1;
+ down(&dev->cmd.poll_sem);
+
+ return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.use_events = 0;
+
+ for (i = 0; i < dev->cmd.max_cmds; ++i)
+ down(&dev->cmd.event_sem);
+
+ kfree(dev->cmd.context);
+
+ up(&dev->cmd.poll_sem);
+}
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status)
+{
+ u64 out;
+ int ret;
+
+ ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, HZ, status);
+
+ if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR)
+ mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+ "sladdr=%d, SPD source=%s\n",
+ (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+ (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+ return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, HZ, status);
+}
+
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+ u64 virt, u8 *status)
+{
+ u32 *inbox;
+ dma_addr_t indma;
+ struct mthca_icm_iter iter;
+ int lg;
+ int nent = 0;
+ int i;
+ int err = 0;
+ int ts = 0, tc = 0;
+
+ inbox = pci_alloc_consistent(dev->pdev, PAGE_SIZE, &indma);
+ if (!inbox)
+ return -ENOMEM;
+
+ memset(inbox, 0, PAGE_SIZE);
+
+ for (mthca_icm_first(icm, &iter);
+ !mthca_icm_last(&iter);
+ mthca_icm_next(&iter)) {
+ /*
+ * We have to pass pages that are aligned to their
+ * size, so find the least significant 1 in the
+ * address or size and use that as our log2 size.
+ */
+ lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+ if (lg < 12) {
+ mthca_warn(dev, "Got FW area not aligned to 4K (%llx/%lx).\n",
+ (unsigned long long) mthca_icm_addr(&iter),
+ mthca_icm_size(&iter));
+ err = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < mthca_icm_size(&iter) / (1 << lg); ++i, ++nent) {
+ if (virt != -1) {
+ *((__be64 *) (inbox + nent * 4)) =
+ cpu_to_be64(virt);
+ virt += 1 << lg;
+ }
+
+ *((__be64 *) (inbox + nent * 4 + 2)) =
+ cpu_to_be64((mthca_icm_addr(&iter) +
+ (i << lg)) | (lg - 12));
+ ts += 1 << (lg - 10);
+ ++tc;
+
+ if (nent == PAGE_SIZE / 16) {
+ err = mthca_cmd(dev, indma, nent, 0, op,
+ CMD_TIME_CLASS_B, status);
+ if (err || *status)
+ goto out;
+ nent = 0;
+ }
+ }
+ }
+
+ if (nent)
+ err = mthca_cmd(dev, indma, nent, 0, op,
+ CMD_TIME_CLASS_B, status);
+
+ switch (op) {
+ case CMD_MAP_FA:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM_AUX:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+ tc, ts, (unsigned long long) virt - (ts << 10));
+ break;
+ }
+
+out:
+ pci_free_consistent(dev->pdev, PAGE_SIZE, inbox, indma);
+ return err;
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1, status);
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
+{
+ u32 *outbox;
+ dma_addr_t outdma;
+ int err = 0;
+ u8 lg;
+
+#define QUERY_FW_OUT_SIZE 0x100
+#define QUERY_FW_VER_OFFSET 0x00
+#define QUERY_FW_MAX_CMD_OFFSET 0x0f
+#define QUERY_FW_ERR_START_OFFSET 0x30
+#define QUERY_FW_ERR_SIZE_OFFSET 0x38
+
+#define QUERY_FW_START_OFFSET 0x20
+#define QUERY_FW_END_OFFSET 0x28
+
+#define QUERY_FW_SIZE_OFFSET 0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+ outbox = pci_alloc_consistent(dev->pdev, QUERY_FW_OUT_SIZE, &outdma);
+ if (!outbox) {
+ return -ENOMEM;
+ }
+
+ err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_FW,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET);
+ /*
+ * FW subminor version is at more signifant bits than minor
+ * version, so swap here.
+ */
+ dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+ ((dev->fw_ver & 0xffff0000ull) >> 16) |
+ ((dev->fw_ver & 0x0000ffffull) << 16);
+
+ MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+ dev->cmd.max_cmds = 1 << lg;
+
+ mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+ (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+ mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+ /*
+ * Arbel page size is always 4 KB; round up number of
+ * system pages needed.
+ */
+ dev->fw.arbel.fw_pages =
+ (dev->fw.arbel.fw_pages + (1 << (PAGE_SHIFT - 12)) - 1) >>
+ (PAGE_SHIFT - 12);
+
+ mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+ (unsigned long long) dev->fw.arbel.clr_int_base,
+ (unsigned long long) dev->fw.arbel.eq_arm_base,
+ (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+ } else {
+ MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+ MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET);
+
+ mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+ (unsigned long long) dev->fw.tavor.fw_start,
+ (unsigned long long) dev->fw.tavor.fw_end);
+ }
+
+out:
+ pci_free_consistent(dev->pdev, QUERY_FW_OUT_SIZE, outbox, outdma);
+ return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+ u8 info;
+ u32 *outbox;
+ dma_addr_t outdma;
+ int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE 0x100
+#define ENABLE_LAM_START_OFFSET 0x00
+#define ENABLE_LAM_END_OFFSET 0x08
+#define ENABLE_LAM_INFO_OFFSET 0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK 0x3
+
+ outbox = pci_alloc_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, &outdma);
+ if (!outbox)
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_ENABLE_LAM,
+ CMD_TIME_CLASS_C, status);
+
+ if (err)
+ goto out;
+
+ if (*status == MTHCA_CMD_STAT_LAM_NOT_PRE)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET);
+ MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET);
+
+ if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ pci_free_consistent(dev->pdev, ENABLE_LAM_OUT_SIZE, outbox, outdma);
+ return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status)
+{
+ u8 info;
+ u32 *outbox;
+ dma_addr_t outdma;
+ int err = 0;
+
+#define QUERY_DDR_OUT_SIZE 0x100
+#define QUERY_DDR_START_OFFSET 0x00
+#define QUERY_DDR_END_OFFSET 0x08
+#define QUERY_DDR_INFO_OFFSET 0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK 0x3
+
+ outbox = pci_alloc_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, &outdma);
+ if (!outbox)
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DDR,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET);
+ MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET);
+
+ if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ pci_free_consistent(dev->pdev, QUERY_DDR_OUT_SIZE, outbox, outdma);
+ return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim, u8 *status)
+{
+ u32 *outbox;
+ dma_addr_t outdma;
+ u8 field;
+ u16 size;
+ int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE 0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0
+
+ outbox = pci_alloc_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, &outdma);
+ if (!outbox)
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_DEV_LIM,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+ dev_lim->max_srq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+ dev_lim->max_qp_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+ dev_lim->reserved_qps = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+ dev_lim->max_qps = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+ dev_lim->reserved_srqs = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+ dev_lim->max_srqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+ dev_lim->reserved_eecs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+ dev_lim->max_eecs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+ dev_lim->max_cq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+ dev_lim->reserved_cqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+ dev_lim->max_cqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+ dev_lim->max_mpts = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+ dev_lim->reserved_eqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+ dev_lim->max_eqs = 1 << (field & 0x7);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+ dev_lim->reserved_mtts = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+ dev_lim->max_mrw_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+ dev_lim->reserved_mrws = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+ dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+ dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+ dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+ dev_lim->max_rdma_global = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+ dev_lim->local_ca_ack_delay = field & 0x1f;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+ dev_lim->max_mtu = field >> 4;
+ dev_lim->max_port_width = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+ dev_lim->max_vl = field >> 4;
+ dev_lim->num_ports = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+ dev_lim->max_gids = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+ dev_lim->max_pkeys = 1 << (field & 0xf);
+ MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+ dev_lim->reserved_uars = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+ dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+ dev_lim->min_page_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+ dev_lim->max_sg = field;
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+ dev_lim->max_desc_sz = size;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+ dev_lim->max_qp_per_mcg = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+ dev_lim->reserved_mgms = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+ dev_lim->max_mcgs = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+ dev_lim->reserved_pds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+ dev_lim->max_pds = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+ dev_lim->reserved_rdds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+ dev_lim->max_rdds = 1 << (field & 0x3f);
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+ dev_lim->eec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+ dev_lim->qpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+ dev_lim->eeec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+ dev_lim->eqpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+ dev_lim->eqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+ dev_lim->cqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+ dev_lim->srq_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+ dev_lim->uar_scratch_entry_sz = size;
+
+ mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+ dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+ mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+ dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+ mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+ dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+ mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+ dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+ mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+ dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+ mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+ dev_lim->max_pds, dev_lim->reserved_mgms);
+
+ mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+ dev_lim->hca.arbel.resize_srq = field & 1;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET);
+ dev_lim->mtt_seg_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+ dev_lim->mpt_entry_sz = size;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+ dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+ MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+ QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+ MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+ QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+ dev_lim->hca.arbel.lam_required = field & 1;
+ MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+ QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+ if (dev_lim->hca.arbel.bmme_flags & 1)
+ mthca_dbg(dev, "Base MM extensions: yes "
+ "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+ dev_lim->hca.arbel.bmme_flags,
+ dev_lim->hca.arbel.max_pbl_sz,
+ dev_lim->hca.arbel.reserved_lkey);
+ else
+ mthca_dbg(dev, "Base MM extensions: no\n");
+
+ mthca_dbg(dev, "Max ICM size %lld MB\n",
+ (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+ } else {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+ dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+ dev_lim->mtt_seg_sz = MTHCA_MTT_SEG_SIZE;
+ dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+ }
+
+out:
+ pci_free_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, outbox, outdma);
+ return err;
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter, u8 *status)
+{
+ u32 *outbox;
+ dma_addr_t outdma;
+ int err;
+
+#define QUERY_ADAPTER_OUT_SIZE 0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10
+
+ outbox = pci_alloc_consistent(dev->pdev, QUERY_ADAPTER_OUT_SIZE, &outdma);
+ if (!outbox)
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, 0, 0, CMD_QUERY_ADAPTER,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(adapter->vendor_id, outbox, QUERY_ADAPTER_VENDOR_ID_OFFSET);
+ MTHCA_GET(adapter->device_id, outbox, QUERY_ADAPTER_DEVICE_ID_OFFSET);
+ MTHCA_GET(adapter->revision_id, outbox, QUERY_ADAPTER_REVISION_ID_OFFSET);
+ MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+out:
+ pci_free_consistent(dev->pdev, QUERY_DEV_LIM_OUT_SIZE, outbox, outdma);
+ return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param,
+ u8 *status)
+{
+ u32 *inbox;
+ dma_addr_t indma;
+ int err;
+
+#define INIT_HCA_IN_SIZE 0x200
+#define INIT_HCA_FLAGS_OFFSET 0x014
+#define INIT_HCA_QPC_OFFSET 0x020
+#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10)
+#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17)
+#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20)
+#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27)
+#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28)
+#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f)
+#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30)
+#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37)
+#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40)
+#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
+#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
+#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67)
+#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET 0x0b0
+#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0)
+#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET 0x0c0
+#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00)
+#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16)
+#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET 0x0f0
+#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00)
+#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09)
+#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b)
+#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET 0x120
+#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00)
+#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09)
+#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a)
+#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b)
+#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18)
+
+ inbox = pci_alloc_consistent(dev->pdev, INIT_HCA_IN_SIZE, &indma);
+ if (!inbox)
+ return -ENOMEM;
+
+ memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+#if defined(__LITTLE_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+ /* Check port for UD address vector: */
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+ /* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+ /* QPC/EEC/CQC/EQC/RDB attributes */
+
+ MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET);
+ MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+ MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+ MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET);
+ MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET);
+ MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET);
+
+ /* UD AV attributes */
+
+ /* multicast attributes */
+
+ MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+ /* TPT attributes */
+
+ MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET);
+ if (dev->hca_type != ARBEL_NATIVE)
+ MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET);
+
+ /* UAR attributes */
+ {
+ u8 uar_page_sz = PAGE_SHIFT - 12;
+ MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+ }
+
+ MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET);
+ }
+
+ err = mthca_cmd(dev, indma, 0, 0, CMD_INIT_HCA,
+ HZ, status);
+
+ pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+ return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port, u8 *status)
+{
+ u32 *inbox;
+ dma_addr_t indma;
+ int err;
+ u32 flags;
+
+#define INIT_IB_IN_SIZE 56
+#define INIT_IB_FLAGS_OFFSET 0x00
+#define INIT_IB_FLAG_SIG (1 << 18)
+#define INIT_IB_FLAG_NG (1 << 17)
+#define INIT_IB_FLAG_G0 (1 << 16)
+#define INIT_IB_FLAG_1X (1 << 8)
+#define INIT_IB_FLAG_4X (1 << 9)
+#define INIT_IB_FLAG_12X (1 << 11)
+#define INIT_IB_VL_SHIFT 4
+#define INIT_IB_MTU_SHIFT 12
+#define INIT_IB_MAX_GID_OFFSET 0x06
+#define INIT_IB_MAX_PKEY_OFFSET 0x0a
+#define INIT_IB_GUID0_OFFSET 0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET 0x20
+
+ inbox = pci_alloc_consistent(dev->pdev, INIT_IB_IN_SIZE, &indma);
+ if (!inbox)
+ return -ENOMEM;
+
+ memset(inbox, 0, INIT_IB_IN_SIZE);
+
+ flags = 0;
+ flags |= param->enable_1x ? INIT_IB_FLAG_1X : 0;
+ flags |= param->enable_4x ? INIT_IB_FLAG_4X : 0;
+ flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0;
+ flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0;
+ flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0;
+ flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+ flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+ MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET);
+ MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET);
+ MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET);
+ MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, indma, port, 0, CMD_INIT_IB,
+ CMD_TIME_CLASS_A, status);
+
+ pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+ return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status)
+{
+ return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, HZ, status);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, HZ, status);
+}
+
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port, u8 *status)
+{
+ u32 *inbox;
+ dma_addr_t indma;
+ int err;
+ u32 flags = 0;
+
+#define SET_IB_IN_SIZE 0x40
+#define SET_IB_FLAGS_OFFSET 0x00
+#define SET_IB_FLAG_SIG (1 << 18)
+#define SET_IB_FLAG_RQK (1 << 0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET 0x08
+
+ inbox = pci_alloc_consistent(dev->pdev, SET_IB_IN_SIZE, &indma);
+ if (!inbox)
+ return -ENOMEM;
+
+ memset(inbox, 0, SET_IB_IN_SIZE);
+
+ flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0;
+ flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0;
+ MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, indma, port, 0, CMD_SET_IB,
+ CMD_TIME_CLASS_B, status);
+
+ pci_free_consistent(dev->pdev, INIT_HCA_IN_SIZE, inbox, indma);
+ return err;
+}
+
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt, status);
+}
+
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status)
+{
+ u64 *inbox;
+ dma_addr_t indma;
+ int err;
+
+ inbox = pci_alloc_consistent(dev->pdev, 16, &indma);
+ if (!inbox)
+ return -ENOMEM;
+
+ inbox[0] = cpu_to_be64(virt);
+ inbox[1] = cpu_to_be64(dma_addr);
+
+ err = mthca_cmd(dev, indma, 1, 0, CMD_MAP_ICM, CMD_TIME_CLASS_B, status);
+
+ pci_free_consistent(dev->pdev, 16, inbox, indma);
+
+ if (!err)
+ mthca_dbg(dev, "Mapped page at %llx for ICM.\n",
+ (unsigned long long) virt);
+
+ return err;
+}
+
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status)
+{
+ mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+ page_count, (unsigned long long) virt);
+
+ return mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1, status);
+}
+
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+ u8 *status)
+{
+ int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, 0, CMD_SET_ICM_SIZE,
+ CMD_TIME_CLASS_A, status);
+
+ if (ret || status)
+ return ret;
+
+ /*
+ * Arbel page size is always 4 KB; round up number of system
+ * pages needed.
+ */
+ *aux_pages = (*aux_pages + (1 << (PAGE_SHIFT - 12)) - 1) >> (PAGE_SHIFT - 12);
+
+ return 0;
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+ int mpt_index, u8 *status)
+{
+ dma_addr_t indma;
+ int err;
+
+ indma = pci_map_single(dev->pdev, mpt_entry,
+ MTHCA_MPT_ENTRY_SIZE,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd(dev, indma, mpt_index, 0, CMD_SW2HW_MPT,
+ CMD_TIME_CLASS_B, status);
+
+ pci_unmap_single(dev->pdev, indma,
+ MTHCA_MPT_ENTRY_SIZE, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+ int mpt_index, u8 *status)
+{
+ dma_addr_t outdma = 0;
+ int err;
+
+ if (mpt_entry) {
+ outdma = pci_map_single(dev->pdev, mpt_entry,
+ MTHCA_MPT_ENTRY_SIZE,
+ PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(outdma))
+ return -ENOMEM;
+ }
+
+ err = mthca_cmd_box(dev, 0, outdma, mpt_index, !mpt_entry,
+ CMD_HW2SW_MPT,
+ CMD_TIME_CLASS_B, status);
+
+ if (mpt_entry)
+ pci_unmap_single(dev->pdev, outdma,
+ MTHCA_MPT_ENTRY_SIZE,
+ PCI_DMA_FROMDEVICE);
+ return err;
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+ int num_mtt, u8 *status)
+{
+ dma_addr_t indma;
+ int err;
+
+ indma = pci_map_single(dev->pdev, mtt_entry,
+ (num_mtt + 2) * 8,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd(dev, indma, num_mtt, 0, CMD_WRITE_MTT,
+ CMD_TIME_CLASS_B, status);
+
+ pci_unmap_single(dev->pdev, indma,
+ (num_mtt + 2) * 8, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num, u8 *status)
+{
+ mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+ unmap ? "Clearing" : "Setting",
+ (unsigned long long) event_mask, eq_num);
+ return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+ 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+ int eq_num, u8 *status)
+{
+ dma_addr_t indma;
+ int err;
+
+ indma = pci_map_single(dev->pdev, eq_context,
+ MTHCA_EQ_CONTEXT_SIZE,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd(dev, indma, eq_num, 0, CMD_SW2HW_EQ,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, indma,
+ MTHCA_EQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+ int eq_num, u8 *status)
+{
+ dma_addr_t outdma = 0;
+ int err;
+
+ outdma = pci_map_single(dev->pdev, eq_context,
+ MTHCA_EQ_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(outdma))
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, eq_num, 0,
+ CMD_HW2SW_EQ,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, outdma,
+ MTHCA_EQ_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ return err;
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+ int cq_num, u8 *status)
+{
+ dma_addr_t indma;
+ int err;
+
+ indma = pci_map_single(dev->pdev, cq_context,
+ MTHCA_CQ_CONTEXT_SIZE,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd(dev, indma, cq_num, 0, CMD_SW2HW_CQ,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, indma,
+ MTHCA_CQ_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+ int cq_num, u8 *status)
+{
+ dma_addr_t outdma = 0;
+ int err;
+
+ outdma = pci_map_single(dev->pdev, cq_context,
+ MTHCA_CQ_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(outdma))
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, cq_num, 0,
+ CMD_HW2SW_CQ,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, outdma,
+ MTHCA_CQ_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ return err;
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+ int is_ee, void *qp_context, u32 optmask,
+ u8 *status)
+{
+ static const u16 op[] = {
+ [MTHCA_TRANS_RST2INIT] = CMD_RST2INIT_QPEE,
+ [MTHCA_TRANS_INIT2INIT] = CMD_INIT2INIT_QPEE,
+ [MTHCA_TRANS_INIT2RTR] = CMD_INIT2RTR_QPEE,
+ [MTHCA_TRANS_RTR2RTS] = CMD_RTR2RTS_QPEE,
+ [MTHCA_TRANS_RTS2RTS] = CMD_RTS2RTS_QPEE,
+ [MTHCA_TRANS_SQERR2RTS] = CMD_SQERR2RTS_QPEE,
+ [MTHCA_TRANS_ANY2ERR] = CMD_2ERR_QPEE,
+ [MTHCA_TRANS_RTS2SQD] = CMD_RTS2SQD_QPEE,
+ [MTHCA_TRANS_SQD2SQD] = CMD_SQD2SQD_QPEE,
+ [MTHCA_TRANS_SQD2RTS] = CMD_SQD2RTS_QPEE,
+ [MTHCA_TRANS_ANY2RST] = CMD_ERR2RST_QPEE
+ };
+ u8 op_mod = 0;
+
+ dma_addr_t indma;
+ int err;
+
+ if (trans < 0 || trans >= ARRAY_SIZE(op))
+ return -EINVAL;
+
+ if (trans == MTHCA_TRANS_ANY2RST) {
+ indma = 0;
+ op_mod = 3; /* don't write outbox, any->reset */
+
+ /* For debugging */
+ qp_context = pci_alloc_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+ &indma);
+ op_mod = 2; /* write outbox, any->reset */
+ } else {
+ indma = pci_map_single(dev->pdev, qp_context,
+ MTHCA_QP_CONTEXT_SIZE,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ if (0) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" opt param mask: %08x\n", be32_to_cpup(qp_context));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+ }
+
+ if (trans == MTHCA_TRANS_ANY2RST) {
+ err = mthca_cmd_box(dev, 0, indma, (!!is_ee << 24) | num,
+ op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+ if (0) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" %08x\n", be32_to_cpup(qp_context));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+
+ } else
+ err = mthca_cmd(dev, indma, (!!is_ee << 24) | num,
+ op_mod, op[trans], CMD_TIME_CLASS_C, status);
+
+ if (trans != MTHCA_TRANS_ANY2RST)
+ pci_unmap_single(dev->pdev, indma,
+ MTHCA_QP_CONTEXT_SIZE, PCI_DMA_TODEVICE);
+ else
+ pci_free_consistent(dev->pdev, MTHCA_QP_CONTEXT_SIZE,
+ qp_context, indma);
+ return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ void *qp_context, u8 *status)
+{
+ dma_addr_t outdma = 0;
+ int err;
+
+ outdma = pci_map_single(dev->pdev, qp_context,
+ MTHCA_QP_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(outdma))
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, (!!is_ee << 24) | num, 0,
+ CMD_QUERY_QPEE,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, outdma,
+ MTHCA_QP_CONTEXT_SIZE,
+ PCI_DMA_FROMDEVICE);
+ return err;
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+ u8 *status)
+{
+ u8 op_mod;
+
+ switch (type) {
+ case IB_QPT_SMI:
+ op_mod = 0;
+ break;
+ case IB_QPT_GSI:
+ op_mod = 1;
+ break;
+ case IB_QPT_RAW_IPV6:
+ op_mod = 2;
+ break;
+ case IB_QPT_RAW_ETY:
+ op_mod = 3;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc* in_wc, struct ib_grh* in_grh,
+ void *in_mad, void *response_mad, u8 *status)
+{
+ void *box;
+ dma_addr_t dma;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE 0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET 0x104
+#define MAD_IFC_SL_OFFSET 0x108
+#define MAD_IFC_G_PATH_OFFSET 0x109
+#define MAD_IFC_RLID_OFFSET 0x10a
+#define MAD_IFC_PKEY_OFFSET 0x10e
+#define MAD_IFC_GRH_OFFSET 0x140
+
+ box = pci_alloc_consistent(dev->pdev, MAD_IFC_BOX_SIZE, &dma);
+ if (!box)
+ return -ENOMEM;
+
+ memcpy(box, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ if (in_wc) {
+ u8 val;
+
+ memset(box + 256, 0, 256);
+
+ MTHCA_PUT(box, in_wc->qp_num, MAD_IFC_MY_QPN_OFFSET);
+ MTHCA_PUT(box, in_wc->src_qp, MAD_IFC_RQPN_OFFSET);
+
+ val = in_wc->sl << 4;
+ MTHCA_PUT(box, val, MAD_IFC_SL_OFFSET);
+
+ val = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ MTHCA_PUT(box, val, MAD_IFC_GRH_OFFSET);
+
+ MTHCA_PUT(box, in_wc->slid, MAD_IFC_RLID_OFFSET);
+ MTHCA_PUT(box, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+ if (in_grh)
+ memcpy((u8 *) box + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+ op_modifier |= 0x10;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mthca_cmd_box(dev, dma, dma + 512, in_modifier, op_modifier,
+ CMD_MAD_IFC, CMD_TIME_CLASS_C, status);
+
+ if (!err && !*status)
+ memcpy(response_mad, box + 512, 256);
+
+ pci_free_consistent(dev->pdev, MAD_IFC_BOX_SIZE, box, dma);
+ return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+ u8 *status)
+{
+ dma_addr_t outdma = 0;
+ int err;
+
+ outdma = pci_map_single(dev->pdev, mgm,
+ MTHCA_MGM_ENTRY_SIZE,
+ PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(outdma))
+ return -ENOMEM;
+
+ err = mthca_cmd_box(dev, 0, outdma, index, 0,
+ CMD_READ_MGM,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, outdma,
+ MTHCA_MGM_ENTRY_SIZE,
+ PCI_DMA_FROMDEVICE);
+ return err;
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+ u8 *status)
+{
+ dma_addr_t indma;
+ int err;
+
+ indma = pci_map_single(dev->pdev, mgm,
+ MTHCA_MGM_ENTRY_SIZE,
+ PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd(dev, indma, index, 0, CMD_WRITE_MGM,
+ CMD_TIME_CLASS_A, status);
+
+ pci_unmap_single(dev->pdev, indma,
+ MTHCA_MGM_ENTRY_SIZE, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+ u8 *status)
+{
+ dma_addr_t indma;
+ u64 imm;
+ int err;
+
+ indma = pci_map_single(dev->pdev, gid, 16, PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(indma))
+ return -ENOMEM;
+
+ err = mthca_cmd_imm(dev, indma, &imm, 0, 0, CMD_MGID_HASH,
+ CMD_TIME_CLASS_A, status);
+ *hash = imm;
+
+ pci_unmap_single(dev->pdev, indma, 16, PCI_DMA_TODEVICE);
+ return err;
+}
+
+int mthca_NOP(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100), status);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 00000000000..a8bc6aa36ff
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cmd.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <ib_verbs.h>
+
+#define MTHCA_CMD_MAILBOX_ALIGN 16UL
+#define MTHCA_CMD_MAILBOX_EXTRA (MTHCA_CMD_MAILBOX_ALIGN - 1)
+
+enum {
+ /* command completed successfully: */
+ MTHCA_CMD_STAT_OK = 0x00,
+ /* Internal error (such as a bus error) occurred while processing command: */
+ MTHCA_CMD_STAT_INTERNAL_ERR = 0x01,
+ /* Operation/command not supported or opcode modifier not supported: */
+ MTHCA_CMD_STAT_BAD_OP = 0x02,
+ /* Parameter not supported or parameter out of range: */
+ MTHCA_CMD_STAT_BAD_PARAM = 0x03,
+ /* System not enabled or bad system state: */
+ MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04,
+ /* Attempt to access reserved or unallocaterd resource: */
+ MTHCA_CMD_STAT_BAD_RESOURCE = 0x05,
+ /* Requested resource is currently executing a command, or is otherwise busy: */
+ MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06,
+ /* memory error: */
+ MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07,
+ /* Required capability exceeds device limits: */
+ MTHCA_CMD_STAT_EXCEED_LIM = 0x08,
+ /* Resource is not in the appropriate state or ownership: */
+ MTHCA_CMD_STAT_BAD_RES_STATE = 0x09,
+ /* Index out of range: */
+ MTHCA_CMD_STAT_BAD_INDEX = 0x0a,
+ /* FW image corrupted: */
+ MTHCA_CMD_STAT_BAD_NVMEM = 0x0b,
+ /* Attempt to modify a QP/EE which is not in the presumed state: */
+ MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+ /* Bad segment parameters (Address/Size): */
+ MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20,
+ /* Memory Region has Memory Windows bound to: */
+ MTHCA_CMD_STAT_REG_BOUND = 0x21,
+ /* HCA local attached memory not present: */
+ MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22,
+ /* Bad management packet (silently discarded): */
+ MTHCA_CMD_STAT_BAD_PKT = 0x30,
+ /* More outstanding CQEs in CQ than new CQ size: */
+ MTHCA_CMD_STAT_BAD_SIZE = 0x40
+};
+
+enum {
+ MTHCA_TRANS_INVALID = 0,
+ MTHCA_TRANS_RST2INIT,
+ MTHCA_TRANS_INIT2INIT,
+ MTHCA_TRANS_INIT2RTR,
+ MTHCA_TRANS_RTR2RTS,
+ MTHCA_TRANS_RTS2RTS,
+ MTHCA_TRANS_SQERR2RTS,
+ MTHCA_TRANS_ANY2ERR,
+ MTHCA_TRANS_RTS2SQD,
+ MTHCA_TRANS_SQD2SQD,
+ MTHCA_TRANS_SQD2RTS,
+ MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+ DEV_LIM_FLAG_RC = 1 << 0,
+ DEV_LIM_FLAG_UC = 1 << 1,
+ DEV_LIM_FLAG_UD = 1 << 2,
+ DEV_LIM_FLAG_RD = 1 << 3,
+ DEV_LIM_FLAG_RAW_IPV6 = 1 << 4,
+ DEV_LIM_FLAG_RAW_ETHER = 1 << 5,
+ DEV_LIM_FLAG_SRQ = 1 << 6,
+ DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8,
+ DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9,
+ DEV_LIM_FLAG_MW = 1 << 16,
+ DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17,
+ DEV_LIM_FLAG_ATOMIC = 1 << 18,
+ DEV_LIM_FLAG_RAW_MULTI = 1 << 19,
+ DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+ DEV_LIM_FLAG_UD_MULTI = 1 << 21,
+};
+
+struct mthca_dev_lim {
+ int max_srq_sz;
+ int max_qp_sz;
+ int reserved_qps;
+ int max_qps;
+ int reserved_srqs;
+ int max_srqs;
+ int reserved_eecs;
+ int max_eecs;
+ int max_cq_sz;
+ int reserved_cqs;
+ int max_cqs;
+ int max_mpts;
+ int reserved_eqs;
+ int max_eqs;
+ int reserved_mtts;
+ int max_mrw_sz;
+ int reserved_mrws;
+ int max_mtt_seg;
+ int max_requester_per_qp;
+ int max_responder_per_qp;
+ int max_rdma_global;
+ int local_ca_ack_delay;
+ int max_mtu;
+ int max_port_width;
+ int max_vl;
+ int num_ports;
+ int max_gids;
+ int max_pkeys;
+ u32 flags;
+ int reserved_uars;
+ int uar_size;
+ int min_page_sz;
+ int max_sg;
+ int max_desc_sz;
+ int max_qp_per_mcg;
+ int reserved_mgms;
+ int max_mcgs;
+ int reserved_pds;
+ int max_pds;
+ int reserved_rdds;
+ int max_rdds;
+ int eec_entry_sz;
+ int qpc_entry_sz;
+ int eeec_entry_sz;
+ int eqpc_entry_sz;
+ int eqc_entry_sz;
+ int cqc_entry_sz;
+ int srq_entry_sz;
+ int uar_scratch_entry_sz;
+ int mtt_seg_sz;
+ int mpt_entry_sz;
+ union {
+ struct {
+ int max_avs;
+ } tavor;
+ struct {
+ int resize_srq;
+ int max_pbl_sz;
+ u8 bmme_flags;
+ u32 reserved_lkey;
+ int lam_required;
+ u64 max_icm_sz;
+ } arbel;
+ } hca;
+};
+
+struct mthca_adapter {
+ u32 vendor_id;
+ u32 device_id;
+ u32 revision_id;
+ u8 inta_pin;
+};
+
+struct mthca_init_hca_param {
+ u64 qpc_base;
+ u64 eec_base;
+ u64 srqc_base;
+ u64 cqc_base;
+ u64 eqpc_base;
+ u64 eeec_base;
+ u64 eqc_base;
+ u64 rdb_base;
+ u64 mc_base;
+ u64 mpt_base;
+ u64 mtt_base;
+ u64 uar_scratch_base;
+ u64 uarc_base;
+ u16 log_mc_entry_sz;
+ u16 mc_hash_sz;
+ u8 log_num_qps;
+ u8 log_num_eecs;
+ u8 log_num_srqs;
+ u8 log_num_cqs;
+ u8 log_num_eqs;
+ u8 log_mc_table_sz;
+ u8 mtt_seg_sz;
+ u8 log_mpt_sz;
+ u8 log_uar_sz;
+ u8 log_uarc_sz;
+};
+
+struct mthca_init_ib_param {
+ int enable_1x;
+ int enable_4x;
+ int vl_cap;
+ int mtu_cap;
+ u16 gid_cap;
+ u16 pkey_cap;
+ int set_guid0;
+ u64 guid0;
+ int set_node_guid;
+ u64 node_guid;
+ int set_si_guid;
+ u64 si_guid;
+};
+
+struct mthca_set_ib_param {
+ int set_si_guid;
+ int reset_qkey_viol;
+ u64 si_guid;
+ u32 cap_mask;
+};
+
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+ u8 status, u64 out_param);
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status);
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status);
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status);
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim, u8 *status);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter, u8 *status);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param,
+ u8 *status);
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port, u8 *status);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port, u8 *status);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+ u8 *status);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, void *mpt_entry,
+ int mpt_index, u8 *status);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, void *mpt_entry,
+ int mpt_index, u8 *status);
+int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
+ int num_mtt, u8 *status);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num, u8 *status);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
+ int eq_num, u8 *status);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, void *eq_context,
+ int eq_num, u8 *status);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, void *cq_context,
+ int cq_num, u8 *status);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, void *cq_context,
+ int cq_num, u8 *status);
+int mthca_MODIFY_QP(struct mthca_dev *dev, int trans, u32 num,
+ int is_ee, void *qp_context, u32 optmask,
+ u8 *status);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ void *qp_context, u8 *status);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+ u8 *status);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc* in_wc, struct ib_grh* in_grh,
+ void *in_mad, void *response_mad, u8 *status);
+int mthca_READ_MGM(struct mthca_dev *dev, int index, void *mgm,
+ u8 *status);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index, void *mgm,
+ u8 *status);
+int mthca_MGID_HASH(struct mthca_dev *dev, void *gid, u16 *hash,
+ u8 *status);
+int mthca_NOP(struct mthca_dev *dev, u8 *status);
+
+#define MAILBOX_ALIGN(x) ((void *) ALIGN((unsigned long) (x), MTHCA_CMD_MAILBOX_ALIGN))
+
+#endif /* MTHCA_CMD_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 00000000000..b4bfbbfe2c3
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_config_reg.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#include <asm/page.h>
+
+#define MTHCA_HCR_BASE 0x80680
+#define MTHCA_HCR_SIZE 0x0001c
+#define MTHCA_ECR_BASE 0x80700
+#define MTHCA_ECR_SIZE 0x00008
+#define MTHCA_ECR_CLR_BASE 0x80708
+#define MTHCA_ECR_CLR_SIZE 0x00008
+#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE 0xf00d8
+#define MTHCA_CLR_INT_SIZE 0x00008
+#define MTHCA_EQ_SET_CI_SIZE (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 00000000000..5dead2df7eb
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+ MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+ MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+ u32 flags;
+ u64 start;
+ u32 logsize_usrpage;
+ u32 error_eqn; /* Tavor only */
+ u32 comp_eqn;
+ u32 pd;
+ u32 lkey;
+ u32 last_notified_index;
+ u32 solicit_producer_index;
+ u32 consumer_index;
+ u32 producer_index;
+ u32 cqn;
+ u32 ci_db; /* Arbel only */
+ u32 state_db; /* Arbel only */
+ u32 reserved;
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_CQ_FLAG_TR ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED ( 0 << 8)
+#define MTHCA_CQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8)
+#define MTHCA_EQ_STATE_FIRED (10 << 8)
+
+enum {
+ MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+ SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ SYNDROME_LOCAL_EEC_OP_ERR = 0x03,
+ SYNDROME_LOCAL_PROT_ERR = 0x04,
+ SYNDROME_WR_FLUSH_ERR = 0x05,
+ SYNDROME_MW_BIND_ERR = 0x06,
+ SYNDROME_BAD_RESP_ERR = 0x10,
+ SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ SYNDROME_REMOTE_OP_ERR = 0x14,
+ SYNDROME_RETRY_EXC_ERR = 0x15,
+ SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20,
+ SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+ SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+ SYNDROME_INVAL_EECN_ERR = 0x23,
+ SYNDROME_INVAL_EEC_STATE_ERR = 0x24
+};
+
+struct mthca_cqe {
+ u32 my_qpn;
+ u32 my_ee;
+ u32 rqpn;
+ u16 sl_g_mlpath;
+ u16 rlid;
+ u32 imm_etype_pkey_eec;
+ u32 byte_cnt;
+ u32 wqe;
+ u8 opcode;
+ u8 is_send;
+ u8 reserved;
+ u8 owner;
+};
+
+struct mthca_err_cqe {
+ u32 my_qpn;
+ u32 reserved1[3];
+ u8 syndrome;
+ u8 reserved2;
+ u16 db_cnt;
+ u32 reserved3;
+ u32 wqe;
+ u8 opcode;
+ u8 reserved4[2];
+ u8 owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7)
+
+#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+ if (cq->is_direct)
+ return cq->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+ else
+ return cq->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+ + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
+{
+ struct mthca_cqe *cqe = get_cqe(cq, i);
+ return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
+}
+
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+ return cqe_sw(cq, cq->cons_index & cq->ibcq.cqe);
+}
+
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+ cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+ int incr)
+{
+ u32 doorbell[2];
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ *cq->set_ci_db = cpu_to_be32(cq->cons_index);
+ wmb();
+ } else {
+ doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
+ doorbell[1] = cpu_to_be32(incr - 1);
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
+{
+ struct mthca_cq *cq;
+
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+
+ if (!cq) {
+ mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ ++cq->arm_sn;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
+{
+ struct mthca_cq *cq;
+ struct mthca_cqe *cqe;
+ int prod_index;
+ int nfreed = 0;
+
+ spin_lock_irq(&dev->cq_table.lock);
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+ if (cq)
+ atomic_inc(&cq->refcount);
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ if (!cq)
+ return;
+
+ spin_lock_irq(&cq->lock);
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->cons_index;
+ cqe_sw(cq, prod_index & cq->ibcq.cqe);
+ ++prod_index)
+ if (prod_index == cq->cons_index + cq->ibcq.cqe)
+ break;
+
+ if (0)
+ mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+ qpn, cqn, cq->cons_index, prod_index);
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while (prod_index > cq->cons_index) {
+ cqe = get_cqe(cq, (prod_index - 1) & cq->ibcq.cqe);
+ if (cqe->my_qpn == cpu_to_be32(qpn))
+ ++nfreed;
+ else if (nfreed)
+ memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
+ cq->ibcq.cqe),
+ cqe,
+ MTHCA_CQ_ENTRY_SIZE);
+ --prod_index;
+ }
+
+ if (nfreed) {
+ wmb();
+ cq->cons_index += nfreed;
+ update_cons_index(dev, cq, nfreed);
+ }
+
+ spin_unlock_irq(&cq->lock);
+ if (atomic_dec_and_test(&cq->refcount))
+ wake_up(&cq->wait);
+}
+
+static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+ struct mthca_qp *qp, int wqe_index, int is_send,
+ struct mthca_err_cqe *cqe,
+ struct ib_wc *entry, int *free_cqe)
+{
+ int err;
+ int dbd;
+ u32 new_wqe;
+
+ if (1 && cqe->syndrome != SYNDROME_WR_FLUSH_ERR) {
+ int j;
+
+ mthca_dbg(dev, "%x/%d: error CQE -> QPN %06x, WQE @ %08x\n",
+ cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+ be32_to_cpu(cqe->wqe));
+
+ for (j = 0; j < 8; ++j)
+ printk(KERN_DEBUG " [%2x] %08x\n",
+ j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+ }
+
+ /*
+ * For completions in error, only work request ID, status (and
+ * freed resource count for RD) have to be set.
+ */
+ switch (cqe->syndrome) {
+ case SYNDROME_LOCAL_LENGTH_ERR:
+ entry->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case SYNDROME_LOCAL_QP_OP_ERR:
+ entry->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_EEC_OP_ERR:
+ entry->status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_PROT_ERR:
+ entry->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case SYNDROME_WR_FLUSH_ERR:
+ entry->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case SYNDROME_MW_BIND_ERR:
+ entry->status = IB_WC_MW_BIND_ERR;
+ break;
+ case SYNDROME_BAD_RESP_ERR:
+ entry->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case SYNDROME_LOCAL_ACCESS_ERR:
+ entry->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_REQ_ERR:
+ entry->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ACCESS_ERR:
+ entry->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_OP_ERR:
+ entry->status = IB_WC_REM_OP_ERR;
+ break;
+ case SYNDROME_RETRY_EXC_ERR:
+ entry->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_RNR_RETRY_EXC_ERR:
+ entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_LOCAL_RDD_VIOL_ERR:
+ entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+ entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ABORTED_ERR:
+ entry->status = IB_WC_REM_ABORT_ERR;
+ break;
+ case SYNDROME_INVAL_EECN_ERR:
+ entry->status = IB_WC_INV_EECN_ERR;
+ break;
+ case SYNDROME_INVAL_EEC_STATE_ERR:
+ entry->status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ default:
+ entry->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ err = mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+ if (err)
+ return err;
+
+ /*
+ * If we're at the end of the WQE chain, or we've used up our
+ * doorbell count, free the CQE. Otherwise just update it for
+ * the next poll operation.
+ */
+ if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+ return 0;
+
+ cqe->db_cnt = cpu_to_be16(be16_to_cpu(cqe->db_cnt) - dbd);
+ cqe->wqe = new_wqe;
+ cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+ *free_cqe = 0;
+
+ return 0;
+}
+
+static void dump_cqe(struct mthca_cqe *cqe)
+{
+ int j;
+
+ for (j = 0; j < 8; ++j)
+ printk(KERN_DEBUG " [%2x] %08x\n",
+ j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+ struct mthca_cq *cq,
+ struct mthca_qp **cur_qp,
+ int *freed,
+ struct ib_wc *entry)
+{
+ struct mthca_wq *wq;
+ struct mthca_cqe *cqe;
+ int wqe_index;
+ int is_error;
+ int is_send;
+ int free_cqe = 1;
+ int err = 0;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ if (0) {
+ mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+ cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+ be32_to_cpu(cqe->wqe));
+
+ dump_cqe(cqe);
+ }
+
+ is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK;
+ is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+ if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mthca_array_get(&dev->qp_table.qp,
+ be32_to_cpu(cqe->my_qpn) &
+ (dev->limits.num_qps - 1));
+ if (!*cur_qp) {
+ mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+ be32_to_cpu(cqe->my_qpn) & 0xffffff);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ entry->qp_num = (*cur_qp)->qpn;
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+ >> wq->wqe_shift);
+ entry->wr_id = (*cur_qp)->wrid[wqe_index +
+ (*cur_qp)->rq.max];
+ } else {
+ wq = &(*cur_qp)->rq;
+ wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift;
+ entry->wr_id = (*cur_qp)->wrid[wqe_index];
+ }
+
+ if (wq->last_comp < wqe_index)
+ wq->tail += wqe_index - wq->last_comp;
+ else
+ wq->tail += wqe_index + wq->max - wq->last_comp;
+
+ wq->last_comp = wqe_index;
+
+ if (0)
+ mthca_dbg(dev, "%s completion for QP %06x, index %d (nr %d)\n",
+ is_send ? "Send" : "Receive",
+ (*cur_qp)->qpn, wqe_index, wq->max);
+
+ if (is_error) {
+ err = handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+ (struct mthca_err_cqe *) cqe,
+ entry, &free_cqe);
+ goto out;
+ }
+
+ if (is_send) {
+ entry->opcode = IB_WC_SEND; /* XXX */
+ } else {
+ entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+ switch (cqe->opcode & 0x1f) {
+ case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV;
+ break;
+ case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ break;
+ default:
+ entry->wc_flags = 0;
+ entry->opcode = IB_WC_RECV;
+ break;
+ }
+ entry->slid = be16_to_cpu(cqe->rlid);
+ entry->sl = be16_to_cpu(cqe->sl_g_mlpath) >> 12;
+ entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff;
+ entry->dlid_path_bits = be16_to_cpu(cqe->sl_g_mlpath) & 0x7f;
+ entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+ entry->wc_flags |= be16_to_cpu(cqe->sl_g_mlpath) & 0x80 ?
+ IB_WC_GRH : 0;
+ }
+
+ entry->status = IB_WC_SUCCESS;
+
+ out:
+ if (likely(free_cqe)) {
+ set_cqe_hw(cqe);
+ ++(*freed);
+ ++cq->cons_index;
+ }
+
+ return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry)
+{
+ struct mthca_dev *dev = to_mdev(ibcq->device);
+ struct mthca_cq *cq = to_mcq(ibcq);
+ struct mthca_qp *qp = NULL;
+ unsigned long flags;
+ int err = 0;
+ int freed = 0;
+ int npolled;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ err = mthca_poll_one(dev, cq, &qp,
+ &freed, entry + npolled);
+ if (err)
+ break;
+ }
+
+ if (freed) {
+ wmb();
+ update_cons_index(dev, cq, freed);
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify)
+{
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ?
+ MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+ MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+ to_mcq(cq)->cqn);
+ doorbell[1] = 0xffffffff;
+
+ mthca_write64(doorbell,
+ to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+ struct mthca_cq *cq = to_mcq(ibcq);
+ u32 doorbell[2];
+ u32 sn;
+ u32 ci;
+
+ sn = cq->arm_sn & 3;
+ ci = cpu_to_be32(cq->cons_index);
+
+ doorbell[0] = ci;
+ doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+ (notify == IB_CQ_SOLICITED ? 1 : 2));
+
+ mthca_write_db_rec(doorbell, cq->arm_db);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ wmb();
+
+ doorbell[0] = cpu_to_be32((sn << 28) |
+ (notify == IB_CQ_SOLICITED ?
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+ MTHCA_ARBEL_CQ_DB_REQ_NOT) |
+ cq->cqn);
+ doorbell[1] = ci;
+
+ mthca_write64(doorbell,
+ to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+ return 0;
+}
+
+static void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+ int i;
+ int size;
+
+ if (cq->is_direct)
+ pci_free_consistent(dev->pdev,
+ (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
+ cq->queue.direct.buf,
+ pci_unmap_addr(&cq->queue.direct,
+ mapping));
+ else {
+ size = (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE;
+ for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ if (cq->queue.page_list[i].buf)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ cq->queue.page_list[i].buf,
+ pci_unmap_addr(&cq->queue.page_list[i],
+ mapping));
+
+ kfree(cq->queue.page_list);
+ }
+}
+
+static int mthca_alloc_cq_buf(struct mthca_dev *dev, int size,
+ struct mthca_cq *cq)
+{
+ int err = -ENOMEM;
+ int npages, shift;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ int i;
+
+ if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
+ cq->is_direct = 1;
+ npages = 1;
+ shift = get_order(size) + PAGE_SHIFT;
+
+ cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
+ size, &t);
+ if (!cq->queue.direct.buf)
+ return -ENOMEM;
+
+ pci_unmap_addr_set(&cq->queue.direct, mapping, t);
+
+ memset(cq->queue.direct.buf, 0, size);
+
+ while (t & ((1 << shift) - 1)) {
+ --shift;
+ npages *= 2;
+ }
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_free;
+
+ for (i = 0; i < npages; ++i)
+ dma_list[i] = t + i * (1 << shift);
+ } else {
+ cq->is_direct = 0;
+ npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ shift = PAGE_SHIFT;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ return -ENOMEM;
+
+ cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
+ GFP_KERNEL);
+ if (!cq->queue.page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ cq->queue.page_list[i].buf = NULL;
+
+ for (i = 0; i < npages; ++i) {
+ cq->queue.page_list[i].buf =
+ pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+ if (!cq->queue.page_list[i].buf)
+ goto err_free;
+
+ dma_list[i] = t;
+ pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);
+
+ memset(cq->queue.page_list[i].buf, 0, PAGE_SIZE);
+ }
+ }
+
+ err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+ dma_list, shift, npages,
+ 0, size,
+ MTHCA_MPT_FLAG_LOCAL_WRITE |
+ MTHCA_MPT_FLAG_LOCAL_READ,
+ &cq->mr);
+ if (err)
+ goto err_free;
+
+ kfree(dma_list);
+
+ return 0;
+
+err_free:
+ mthca_free_cq_buf(dev, cq);
+
+err_out:
+ kfree(dma_list);
+
+ return err;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_cq *cq)
+{
+ int size = nent * MTHCA_CQ_ENTRY_SIZE;
+ void *mailbox = NULL;
+ struct mthca_cq_context *cq_context;
+ int err = -ENOMEM;
+ u8 status;
+ int i;
+
+ might_sleep();
+
+ cq->ibcq.cqe = nent - 1;
+
+ cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+ if (cq->cqn == -1)
+ return -ENOMEM;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ cq->arm_sn = 1;
+
+ err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+ if (err)
+ goto err_out;
+
+ err = -ENOMEM;
+
+ cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+ cq->cqn, &cq->set_ci_db);
+ if (cq->set_ci_db_index < 0)
+ goto err_out_icm;
+
+ cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+ cq->cqn, &cq->arm_db);
+ if (cq->arm_db_index < 0)
+ goto err_out_ci;
+ }
+
+ mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox)
+ goto err_out_mailbox;
+
+ cq_context = MAILBOX_ALIGN(mailbox);
+
+ err = mthca_alloc_cq_buf(dev, size, cq);
+ if (err)
+ goto err_out_mailbox;
+
+ for (i = 0; i < nent; ++i)
+ set_cqe_hw(get_cqe(cq, i));
+
+ spin_lock_init(&cq->lock);
+ atomic_set(&cq->refcount, 1);
+ init_waitqueue_head(&cq->wait);
+
+ memset(cq_context, 0, sizeof *cq_context);
+ cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK |
+ MTHCA_CQ_STATE_DISARMED |
+ MTHCA_CQ_FLAG_TR);
+ cq_context->start = cpu_to_be64(0);
+ cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
+ dev->driver_uar.index);
+ cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+ cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+ cq_context->pd = cpu_to_be32(dev->driver_pd.pd_num);
+ cq_context->lkey = cpu_to_be32(cq->mr.ibmr.lkey);
+ cq_context->cqn = cpu_to_be32(cq->cqn);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index);
+ cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+ }
+
+ err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+ goto err_out_free_mr;
+ }
+
+ if (status) {
+ mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_free_mr;
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ if (mthca_array_set(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1),
+ cq)) {
+ spin_unlock_irq(&dev->cq_table.lock);
+ goto err_out_free_mr;
+ }
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ cq->cons_index = 0;
+
+ kfree(mailbox);
+
+ return 0;
+
+err_out_free_mr:
+ mthca_free_mr(dev, &cq->mr);
+ mthca_free_cq_buf(dev, cq);
+
+err_out_mailbox:
+ kfree(mailbox);
+
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+ return err;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq)
+{
+ void *mailbox;
+ int err;
+ u8 status;
+
+ might_sleep();
+
+ mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox) {
+ mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+ return;
+ }
+
+ err = mthca_HW2SW_CQ(dev, MAILBOX_ALIGN(mailbox), cq->cqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n",
+ status);
+
+ if (0) {
+ u32 *ctx = MAILBOX_ALIGN(mailbox);
+ int j;
+
+ printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+ cq->cqn, cq->cons_index, !!next_cqe_sw(cq));
+ for (j = 0; j < 16; ++j)
+ printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ mthca_array_clear(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1));
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+ synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+ else
+ synchronize_irq(dev->pdev->irq);
+
+ atomic_dec(&cq->refcount);
+ wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+ mthca_free_mr(dev, &cq->mr);
+ mthca_free_cq_buf(dev, cq);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+ }
+
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+ kfree(mailbox);
+}
+
+int __devinit mthca_init_cq_table(struct mthca_dev *dev)
+{
+ int err;
+
+ spin_lock_init(&dev->cq_table.lock);
+
+ err = mthca_alloc_init(&dev->cq_table.alloc,
+ dev->limits.num_cqs,
+ (1 << 24) - 1,
+ dev->limits.reserved_cqs);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->cq_table.cq,
+ dev->limits.num_cqs);
+ if (err)
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+ return err;
+}
+
+void __devexit mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+ mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 00000000000..56b2bfb5adb
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_dev.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME "ib_mthca"
+#define PFX DRV_NAME ": "
+#define DRV_VERSION "0.06-pre"
+#define DRV_RELDATE "November 8, 2004"
+
+/* Types of supported HCA */
+enum {
+ TAVOR, /* MT23108 */
+ ARBEL_COMPAT, /* MT25208 in Tavor compat mode */
+ ARBEL_NATIVE /* MT25208 with extended features */
+};
+
+enum {
+ MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+ MTHCA_FLAG_SRQ = 1 << 2,
+ MTHCA_FLAG_MSI = 1 << 3,
+ MTHCA_FLAG_MSI_X = 1 << 4,
+ MTHCA_FLAG_NO_LAM = 1 << 5
+};
+
+enum {
+ MTHCA_MAX_PORTS = 2
+};
+
+enum {
+ MTHCA_EQ_CONTEXT_SIZE = 0x40,
+ MTHCA_CQ_CONTEXT_SIZE = 0x40,
+ MTHCA_QP_CONTEXT_SIZE = 0x200,
+ MTHCA_RDB_ENTRY_SIZE = 0x20,
+ MTHCA_AV_SIZE = 0x20,
+ MTHCA_MGM_ENTRY_SIZE = 0x40,
+
+ /* Arbel FW gives us these, but we need them for Tavor */
+ MTHCA_MPT_ENTRY_SIZE = 0x40,
+ MTHCA_MTT_SEG_SIZE = 0x40,
+};
+
+enum {
+ MTHCA_EQ_CMD,
+ MTHCA_EQ_ASYNC,
+ MTHCA_EQ_COMP,
+ MTHCA_NUM_EQ
+};
+
+struct mthca_cmd {
+ int use_events;
+ struct semaphore hcr_sem;
+ struct semaphore poll_sem;
+ struct semaphore event_sem;
+ int max_cmds;
+ spinlock_t context_lock;
+ int free_head;
+ struct mthca_cmd_context *context;
+ u16 token_mask;
+};
+
+struct mthca_limits {
+ int num_ports;
+ int vl_cap;
+ int mtu_cap;
+ int gid_table_len;
+ int pkey_table_len;
+ int local_ca_ack_delay;
+ int num_uars;
+ int max_sg;
+ int num_qps;
+ int reserved_qps;
+ int num_srqs;
+ int reserved_srqs;
+ int num_eecs;
+ int reserved_eecs;
+ int num_cqs;
+ int reserved_cqs;
+ int num_eqs;
+ int reserved_eqs;
+ int num_mpts;
+ int num_mtt_segs;
+ int mtt_seg_size;
+ int reserved_mtts;
+ int reserved_mrws;
+ int reserved_uars;
+ int num_mgms;
+ int num_amgms;
+ int reserved_mcgs;
+ int num_pds;
+ int reserved_pds;
+};
+
+struct mthca_alloc {
+ u32 last;
+ u32 top;
+ u32 max;
+ u32 mask;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct mthca_array {
+ struct {
+ void **page;
+ int used;
+ } *page_list;
+};
+
+struct mthca_uar_table {
+ struct mthca_alloc alloc;
+ u64 uarc_base;
+ int uarc_size;
+};
+
+struct mthca_pd_table {
+ struct mthca_alloc alloc;
+};
+
+struct mthca_mr_table {
+ struct mthca_alloc mpt_alloc;
+ int max_mtt_order;
+ unsigned long **mtt_buddy;
+ u64 mtt_base;
+ struct mthca_icm_table *mtt_table;
+ struct mthca_icm_table *mpt_table;
+};
+
+struct mthca_eq_table {
+ struct mthca_alloc alloc;
+ void __iomem *clr_int;
+ u32 clr_mask;
+ u32 arm_mask;
+ struct mthca_eq eq[MTHCA_NUM_EQ];
+ u64 icm_virt;
+ struct page *icm_page;
+ dma_addr_t icm_dma;
+ int have_irq;
+ u8 inta_pin;
+};
+
+struct mthca_cq_table {
+ struct mthca_alloc alloc;
+ spinlock_t lock;
+ struct mthca_array cq;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+ struct mthca_alloc alloc;
+ u32 rdb_base;
+ int rdb_shift;
+ int sqp_start;
+ spinlock_t lock;
+ struct mthca_array qp;
+ struct mthca_icm_table *qp_table;
+ struct mthca_icm_table *eqp_table;
+};
+
+struct mthca_av_table {
+ struct pci_pool *pool;
+ int num_ddr_avs;
+ u64 ddr_av_base;
+ void __iomem *av_map;
+ struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+ struct semaphore sem;
+ struct mthca_alloc alloc;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_dev {
+ struct ib_device ib_dev;
+ struct pci_dev *pdev;
+
+ int hca_type;
+ unsigned long mthca_flags;
+ unsigned long device_cap_flags;
+
+ u32 rev_id;
+
+ /* firmware info */
+ u64 fw_ver;
+ union {
+ struct {
+ u64 fw_start;
+ u64 fw_end;
+ } tavor;
+ struct {
+ u64 clr_int_base;
+ u64 eq_arm_base;
+ u64 eq_set_ci_base;
+ struct mthca_icm *fw_icm;
+ struct mthca_icm *aux_icm;
+ u16 fw_pages;
+ } arbel;
+ } fw;
+
+ u64 ddr_start;
+ u64 ddr_end;
+
+ MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+ struct semaphore cap_mask_mutex;
+
+ void __iomem *hcr;
+ void __iomem *kar;
+ void __iomem *clr_base;
+ union {
+ struct {
+ void __iomem *ecr_base;
+ } tavor;
+ struct {
+ void __iomem *eq_arm;
+ void __iomem *eq_set_ci_base;
+ } arbel;
+ } eq_regs;
+
+ struct mthca_cmd cmd;
+ struct mthca_limits limits;
+
+ struct mthca_uar_table uar_table;
+ struct mthca_pd_table pd_table;
+ struct mthca_mr_table mr_table;
+ struct mthca_eq_table eq_table;
+ struct mthca_cq_table cq_table;
+ struct mthca_qp_table qp_table;
+ struct mthca_av_table av_table;
+ struct mthca_mcg_table mcg_table;
+
+ struct mthca_uar driver_uar;
+ struct mthca_db_table *db_tab;
+ struct mthca_pd driver_pd;
+ struct mthca_mr driver_mr;
+
+ struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MTHCA_MAX_PORTS];
+ spinlock_t sm_lock;
+};
+
+#define mthca_dbg(mdev, format, arg...) \
+ dev_dbg(&mdev->pdev->dev, format, ## arg)
+#define mthca_err(mdev, format, arg...) \
+ dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+ dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+ dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset) \
+ do { \
+ void *__p = (char *) (source) + (offset); \
+ switch (sizeof (dest)) { \
+ case 1: (dest) = *(u8 *) __p; break; \
+ case 2: (dest) = be16_to_cpup(__p); break; \
+ case 4: (dest) = be32_to_cpup(__p); break; \
+ case 8: (dest) = be64_to_cpup(__p); break; \
+ default: __buggy_use_of_MTHCA_GET(); \
+ } \
+ } while (0)
+
+#define MTHCA_PUT(dest, source, offset) \
+ do { \
+ __typeof__(source) *__p = \
+ (__typeof__(source) *) ((char *) (dest) + (offset)); \
+ switch (sizeof(source)) { \
+ case 1: *__p = (source); break; \
+ case 2: *__p = cpu_to_be16(source); break; \
+ case 4: *__p = cpu_to_be32(source); break; \
+ case 8: *__p = cpu_to_be64(source); break; \
+ default: __buggy_use_of_MTHCA_PUT(); \
+ } \
+ } while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, u32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+#endif /* MTHCA_DEV_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 00000000000..78b183cab54
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_doorbell.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL 0x00
+#define MTHCA_SEND_DOORBELL 0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL 0x20
+#define MTHCA_EQ_DOORBELL 0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically. s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ __raw_writeq(*(u64 *) val, dest);
+}
+
+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+ *(u64 *) db = *(u64 *) val;
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr)
+
+static inline void mthca_write64(u32 val[2], void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(doorbell_lock, flags);
+ __raw_writel(val[0], dest);
+ __raw_writel(val[1], dest + 4);
+ spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+ db[0] = val[0];
+ wmb();
+ db[1] = val[1];
+}
+
+#endif
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 00000000000..623daab5c92
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,964 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_eq.c 1382 2004-12-24 02:21:02Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+ MTHCA_NUM_ASYNC_EQE = 0x80,
+ MTHCA_NUM_CMD_EQE = 0x80,
+ MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+ u32 flags;
+ u64 start;
+ u32 logsize_usrpage;
+ u32 tavor_pd; /* reserved for Arbel */
+ u8 reserved1[3];
+ u8 intr;
+ u32 arbel_pd; /* lost_count for Tavor */
+ u32 lkey;
+ u32 reserved2[2];
+ u32 consumer_index;
+ u32 producer_index;
+ u32 reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_EQ_OWNER_SW ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_EQ_STATE_FIRED ( 2 << 8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8)
+#define MTHCA_EQ_STATE_ARBEL ( 8 << 8)
+
+enum {
+ MTHCA_EVENT_TYPE_COMP = 0x00,
+ MTHCA_EVENT_TYPE_PATH_MIG = 0x01,
+ MTHCA_EVENT_TYPE_COMM_EST = 0x02,
+ MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03,
+ MTHCA_EVENT_TYPE_SRQ_LAST_WQE = 0x13,
+ MTHCA_EVENT_TYPE_CQ_ERROR = 0x04,
+ MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05,
+ MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06,
+ MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07,
+ MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+ MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11,
+ MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12,
+ MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08,
+ MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09,
+ MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f,
+ MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e,
+ MTHCA_EVENT_TYPE_CMD = 0x0a
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \
+ (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \
+ (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \
+ (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \
+ (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_SRQ_LAST_WQE)
+#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ (3 << 24)
+#define MTHCA_EQ_DB_SET_CI (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+ u8 reserved1;
+ u8 type;
+ u8 reserved2;
+ u8 subtype;
+ union {
+ u32 raw[6];
+ struct {
+ u32 cqn;
+ } __attribute__((packed)) comp;
+ struct {
+ u16 reserved1;
+ u16 token;
+ u32 reserved2;
+ u8 reserved3[3];
+ u8 status;
+ u64 out_param;
+ } __attribute__((packed)) cmd;
+ struct {
+ u32 qpn;
+ } __attribute__((packed)) qp;
+ struct {
+ u32 cqn;
+ u32 reserved1;
+ u8 reserved2[3];
+ u8 syndrome;
+ } __attribute__((packed)) cq_err;
+ struct {
+ u32 reserved1[2];
+ u32 port;
+ } __attribute__((packed)) port_change;
+ } event;
+ u8 reserved3[3];
+ u8 owner;
+} __attribute__((packed));
+
+#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7)
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+ return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+ MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+ MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eq->eqn);
+ doorbell[1] = cpu_to_be32(ci & (eq->nent - 1));
+
+ /*
+ * This barrier makes sure that all updates to ownership bits
+ * done by set_eqe_hw() hit memory before the consumer index
+ * is updated. set_eq_ci() allows the HCA to possibly write
+ * more EQ entries, and we want to avoid the exceedingly
+ * unlikely possibility of the HCA writing an entry and then
+ * having set_eqe_hw() overwrite the owner field.
+ */
+ wmb();
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /* See comment in tavor_set_eq_ci() above. */
+ wmb();
+ __raw_writel(cpu_to_be32(ci), dev->eq_regs.arbel.eq_set_ci_base +
+ eq->eqn * 8);
+ /* We still want ordering, just not swabbing, so add a barrier */
+ mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ arbel_set_eq_ci(dev, eq, ci);
+ else
+ tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_REQ_NOT | eqn);
+ doorbell[1] = 0;
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+ writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+ if (dev->hca_type != ARBEL_NATIVE) {
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
+ doorbell[1] = cpu_to_be32(cqn);
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+ unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;
+ return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static inline struct mthca_eqe* next_eqe_sw(struct mthca_eq *eq)
+{
+ struct mthca_eqe* eqe;
+ eqe = get_eqe(eq, eq->cons_index);
+ return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+ eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+ struct ib_event record;
+
+ mthca_dbg(dev, "Port change to %s for port %d\n",
+ active ? "active" : "down", port);
+
+ record.device = &dev->ib_dev;
+ record.event = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+ record.element.port_num = port;
+
+ ib_dispatch_event(&record);
+}
+
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+ struct mthca_eqe *eqe;
+ int disarm_cqn;
+ int eqes_found = 0;
+
+ while ((eqe = next_eqe_sw(eq))) {
+ int set_ci = 0;
+
+ /*
+ * Make sure we read EQ entry contents after we've
+ * checked the ownership bit.
+ */
+ rmb();
+
+ switch (eqe->type) {
+ case MTHCA_EVENT_TYPE_COMP:
+ disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+ disarm_cq(dev, eq->eqn, disarm_cqn);
+ mthca_cq_event(dev, disarm_cqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG);
+ break;
+
+ case MTHCA_EVENT_TYPE_COMM_EST:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_COMM_EST);
+ break;
+
+ case MTHCA_EVENT_TYPE_SQ_DRAINED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_SQ_DRAINED);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_FATAL);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_REQ_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_CMD:
+ mthca_cmd_event(dev,
+ be16_to_cpu(eqe->event.cmd.token),
+ eqe->event.cmd.status,
+ be64_to_cpu(eqe->event.cmd.out_param));
+ /*
+ * cmd_event() may add more commands.
+ * The card will think the queue has overflowed if
+ * we don't tell it we've been processing events.
+ */
+ set_ci = 1;
+ break;
+
+ case MTHCA_EVENT_TYPE_PORT_CHANGE:
+ port_change(dev,
+ (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+ eqe->subtype == 0x4);
+ break;
+
+ case MTHCA_EVENT_TYPE_CQ_ERROR:
+ mthca_warn(dev, "CQ %s on CQN %08x\n",
+ eqe->event.cq_err.syndrome == 1 ?
+ "overrun" : "access violation",
+ be32_to_cpu(eqe->event.cq_err.cqn));
+ break;
+
+ case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+ mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_ECC_DETECT:
+ default:
+ mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+ eqe->type, eqe->subtype, eq->eqn);
+ break;
+ };
+
+ set_eqe_hw(eqe);
+ ++eq->cons_index;
+ eqes_found = 1;
+
+ if (unlikely(set_ci)) {
+ /*
+ * Conditional on hca_type is OK here because
+ * this is a rare case, not the fast path.
+ */
+ set_eq_ci(dev, eq, eq->cons_index);
+ set_ci = 0;
+ }
+ }
+
+ /*
+ * Rely on caller to set consumer index so that we don't have
+ * to test hca_type in our interrupt handling fast path.
+ */
+ return eqes_found;
+}
+
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+ struct mthca_dev *dev = dev_ptr;
+ u32 ecr;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+ if (ecr) {
+ writel(ecr, dev->eq_regs.tavor.ecr_base +
+ MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (ecr & dev->eq_table.eq[i].eqn_mask &&
+ mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+ }
+ }
+
+ return IRQ_RETVAL(ecr);
+}
+
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr,
+ struct pt_regs *regs)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ tavor_set_eq_ci(dev, eq, eq->cons_index);
+ tavor_eq_req_not(dev, eq->eqn);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+ struct mthca_dev *dev = dev_ptr;
+ int work = 0;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ work = 1;
+ arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ }
+
+ arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+ return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr,
+ struct pt_regs *regs)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ arbel_set_eq_ci(dev, eq, eq->cons_index);
+ arbel_eq_req_not(dev, eq->eqn_mask);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static int __devinit mthca_create_eq(struct mthca_dev *dev,
+ int nent,
+ u8 intr,
+ struct mthca_eq *eq)
+{
+ int npages = (nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ void *mailbox = NULL;
+ struct mthca_eq_context *eq_context;
+ int err = -ENOMEM;
+ int i;
+ u8 status;
+
+ /* Make sure EQ size is aligned to a power of 2 size. */
+ for (i = 1; i < nent; i <<= 1)
+ ; /* nothing */
+ nent = i;
+
+ eq->dev = dev;
+
+ eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+ GFP_KERNEL);
+ if (!eq->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ eq->page_list[i].buf = NULL;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out_free;
+
+ mailbox = kmalloc(sizeof *eq_context + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox)
+ goto err_out_free;
+ eq_context = MAILBOX_ALIGN(mailbox);
+
+ for (i = 0; i < npages; ++i) {
+ eq->page_list[i].buf = pci_alloc_consistent(dev->pdev,
+ PAGE_SIZE, &t);
+ if (!eq->page_list[i].buf)
+ goto err_out_free;
+
+ dma_list[i] = t;
+ pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+ memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+ }
+
+ for (i = 0; i < nent; ++i)
+ set_eqe_hw(get_eqe(eq, i));
+
+ eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+ if (eq->eqn == -1)
+ goto err_out_free;
+
+ err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+ dma_list, PAGE_SHIFT, npages,
+ 0, npages * PAGE_SIZE,
+ MTHCA_MPT_FLAG_LOCAL_WRITE |
+ MTHCA_MPT_FLAG_LOCAL_READ,
+ &eq->mr);
+ if (err)
+ goto err_out_free_eq;
+
+ eq->nent = nent;
+
+ memset(eq_context, 0, sizeof *eq_context);
+ eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK |
+ MTHCA_EQ_OWNER_HW |
+ MTHCA_EQ_STATE_ARMED |
+ MTHCA_EQ_FLAG_TR);
+ if (dev->hca_type == ARBEL_NATIVE)
+ eq_context->flags |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+ eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+ if (dev->hca_type == ARBEL_NATIVE) {
+ eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ } else {
+ eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ }
+ eq_context->intr = intr;
+ eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey);
+
+ err = mthca_SW2HW_EQ(dev, eq_context, eq->eqn, &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+ goto err_out_free_mr;
+ }
+ if (status) {
+ mthca_warn(dev, "SW2HW_EQ returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_free_mr;
+ }
+
+ kfree(dma_list);
+ kfree(mailbox);
+
+ eq->eqn_mask = swab32(1 << eq->eqn);
+ eq->cons_index = 0;
+
+ dev->eq_table.arm_mask |= eq->eqn_mask;
+
+ mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+ eq->eqn, nent);
+
+ return err;
+
+ err_out_free_mr:
+ mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+ mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free:
+ for (i = 0; i < npages; ++i)
+ if (eq->page_list[i].buf)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ pci_unmap_addr(&eq->page_list[i],
+ mapping));
+
+ kfree(eq->page_list);
+ kfree(dma_list);
+ kfree(mailbox);
+
+ err_out:
+ return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+ struct mthca_eq *eq)
+{
+ void *mailbox = NULL;
+ int err;
+ u8 status;
+ int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ int i;
+
+ mailbox = kmalloc(sizeof (struct mthca_eq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox)
+ return;
+
+ err = mthca_HW2SW_EQ(dev, MAILBOX_ALIGN(mailbox),
+ eq->eqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+ if (status)
+ mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n",
+ status);
+
+ dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
+ if (0) {
+ mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+ for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpup(MAILBOX_ALIGN(mailbox) + i * 4));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ mthca_free_mr(dev, &eq->mr);
+ for (i = 0; i < npages; ++i)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ pci_unmap_addr(&eq->page_list[i], mapping));
+
+ kfree(eq->page_list);
+ kfree(mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+ int i;
+
+ if (dev->eq_table.have_irq)
+ free_irq(dev->pdev->irq, dev);
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (dev->eq_table.eq[i].have_irq)
+ free_irq(dev->eq_table.eq[i].msi_x_vector,
+ dev->eq_table.eq + i);
+}
+
+static int __devinit mthca_map_reg(struct mthca_dev *dev,
+ unsigned long offset, unsigned long size,
+ void __iomem **map)
+{
+ unsigned long base = pci_resource_start(dev->pdev, 0);
+
+ if (!request_mem_region(base + offset, size, DRV_NAME))
+ return -EBUSY;
+
+ *map = ioremap(base + offset, size);
+ if (!*map) {
+ release_mem_region(base + offset, size);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mthca_unmap_reg(struct mthca_dev *dev, unsigned long offset,
+ unsigned long size, void __iomem *map)
+{
+ unsigned long base = pci_resource_start(dev->pdev, 0);
+
+ release_mem_region(base + offset, size);
+ iounmap(map);
+}
+
+static int __devinit mthca_map_eq_regs(struct mthca_dev *dev)
+{
+ unsigned long mthca_base;
+
+ mthca_base = pci_resource_start(dev->pdev, 0);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ /*
+ * We assume that the EQ arm and EQ set CI registers
+ * fall within the first BAR. We can't trust the
+ * values firmware gives us, since those addresses are
+ * valid on the HCA's side of the PCI bus but not
+ * necessarily the host side.
+ */
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Add 4 because we limit ourselves to EQs 0 ... 31,
+ * so we only need the low word of the register.
+ */
+ if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ &dev->eq_regs.arbel.eq_arm)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ &dev->eq_regs.arbel.eq_set_ci_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ dev->eq_regs.arbel.eq_arm);
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+ } else {
+ if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ &dev->eq_regs.tavor.ecr_base)) {
+ mthca_err(dev, "Couldn't map ecr register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+
+}
+
+static void __devexit mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ dev->eq_regs.arbel.eq_set_ci_base);
+ mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ dev->eq_regs.arbel.eq_arm);
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ } else {
+ mthca_unmap_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ dev->eq_regs.tavor.ecr_base);
+ mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ }
+}
+
+int __devinit mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+ int ret;
+ u8 status;
+
+ /*
+ * We assume that mapping one page is enough for the whole EQ
+ * context table. This is fine with all current HCAs, because
+ * we only use 32 EQs and each EQ uses 32 bytes of context
+ * memory, or 1 KB total.
+ */
+ dev->eq_table.icm_virt = icm_virt;
+ dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+ if (!dev->eq_table.icm_page)
+ return -ENOMEM;
+ dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+ PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+ if (pci_dma_mapping_error(dev->eq_table.icm_dma)) {
+ __free_page(dev->eq_table.icm_page);
+ return -ENOMEM;
+ }
+
+ ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt, &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+ }
+
+ return ret;
+}
+
+void __devexit mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+ u8 status;
+
+ mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, PAGE_SIZE / 4096, &status);
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+}
+
+int __devinit mthca_init_eq_table(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+ u8 intr;
+ int i;
+
+ err = mthca_alloc_init(&dev->eq_table.alloc,
+ dev->limits.num_eqs,
+ dev->limits.num_eqs - 1,
+ dev->limits.reserved_eqs);
+ if (err)
+ return err;
+
+ err = mthca_map_eq_regs(dev);
+ if (err)
+ goto err_out_free;
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI ||
+ dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ dev->eq_table.clr_mask = 0;
+ } else {
+ dev->eq_table.clr_mask =
+ swab32(1 << (dev->eq_table.inta_pin & 31));
+ dev->eq_table.clr_int = dev->clr_base +
+ (dev->eq_table.inta_pin < 31 ? 4 : 0);
+ }
+
+ dev->eq_table.arm_mask = 0;
+
+ intr = (dev->mthca_flags & MTHCA_FLAG_MSI) ?
+ 128 : dev->eq_table.inta_pin;
+
+ err = mthca_create_eq(dev, dev->limits.num_cqs,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_COMP]);
+ if (err)
+ goto err_out_unmap;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+ if (err)
+ goto err_out_comp;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_CMD]);
+ if (err)
+ goto err_out_async;
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ static const char *eq_name[] = {
+ [MTHCA_EQ_COMP] = DRV_NAME " (comp)",
+ [MTHCA_EQ_ASYNC] = DRV_NAME " (async)",
+ [MTHCA_EQ_CMD] = DRV_NAME " (cmd)"
+ };
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+ err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+ dev->hca_type == ARBEL_NATIVE ?
+ mthca_arbel_msi_x_interrupt :
+ mthca_tavor_msi_x_interrupt,
+ 0, eq_name[i], dev->eq_table.eq + i);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.eq[i].have_irq = 1;
+ }
+ } else {
+ err = request_irq(dev->pdev->irq,
+ dev->hca_type == ARBEL_NATIVE ?
+ mthca_arbel_interrupt :
+ mthca_tavor_interrupt,
+ SA_SHIRQ, DRV_NAME, dev);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.have_irq = 1;
+ }
+
+ err = mthca_MAP_EQ(dev, async_mask(dev),
+ 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+ if (status)
+ mthca_warn(dev, "MAP_EQ for async EQ %d returned status 0x%02x\n",
+ dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, status);
+
+ err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+ if (status)
+ mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);
+
+ for (i = 0; i < MTHCA_EQ_CMD; ++i)
+ if (dev->hca_type == ARBEL_NATIVE)
+ arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+ else
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+ return 0;
+
+err_out_cmd:
+ mthca_free_irqs(dev);
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+ mthca_unmap_eq_regs(dev);
+
+err_out_free:
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+ return err;
+}
+
+void __devexit mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+ u8 status;
+ int i;
+
+ mthca_free_irqs(dev);
+
+ mthca_MAP_EQ(dev, async_mask(dev),
+ 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+ mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+ mthca_unmap_eq_regs(dev);
+
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 00000000000..7df22364201
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+#include <ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+ MTHCA_VENDOR_CLASS1 = 0x9,
+ MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+struct mthca_trap_mad {
+ struct ib_mad *mad;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+static void update_sm_ah(struct mthca_dev *dev,
+ u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+ u8 port_num,
+ struct ib_mad *mad)
+{
+ struct ib_event event;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+ update_sm_ah(to_mdev(ibdev), port_num,
+ be16_to_cpup((__be16 *) (mad->data + 58)),
+ (*(u8 *) (mad->data + 76)) & 0xf);
+
+ event.device = ibdev;
+ event.event = IB_EVENT_LID_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+ event.device = ibdev;
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+ }
+}
+
+static void forward_trap(struct mthca_dev *dev,
+ u8 port_num,
+ struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct mthca_trap_mad *tmad;
+ struct ib_sge gather_list;
+ struct ib_send_wr *bad_wr, wr = {
+ .opcode = IB_WR_SEND,
+ .sg_list = &gather_list,
+ .num_sge = 1,
+ .send_flags = IB_SEND_SIGNALED,
+ .wr = {
+ .ud = {
+ .remote_qpn = qpn,
+ .remote_qkey = qpn ? IB_QP1_QKEY : 0,
+ .timeout_ms = 0
+ }
+ }
+ };
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+ unsigned long flags;
+
+ if (agent) {
+ tmad = kmalloc(sizeof *tmad, GFP_KERNEL);
+ if (!tmad)
+ return;
+
+ tmad->mad = kmalloc(sizeof *tmad->mad, GFP_KERNEL);
+ if (!tmad->mad) {
+ kfree(tmad);
+ return;
+ }
+
+ memcpy(tmad->mad, mad, sizeof *mad);
+
+ wr.wr.ud.mad_hdr = &tmad->mad->mad_hdr;
+ wr.wr_id = (unsigned long) tmad;
+
+ gather_list.addr = dma_map_single(agent->device->dma_device,
+ tmad->mad,
+ sizeof *tmad->mad,
+ DMA_TO_DEVICE);
+ gather_list.length = sizeof *tmad->mad;
+ gather_list.lkey = to_mpd(agent->qp->pd)->ntmr.ibmr.lkey;
+ pci_unmap_addr_set(tmad, mapping, gather_list.addr);
+
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (this is
+ * wrong following the IB spec strictly, but we know
+ * it's OK for our devices).
+ */
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ wr.wr.ud.ah = dev->sm_ah[port_num - 1];
+ if (wr.wr.ud.ah)
+ ret = ib_post_send_mad(agent, &wr, &bad_wr);
+ else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+ if (ret) {
+ dma_unmap_single(agent->device->dma_device,
+ pci_unmap_addr(tmad, mapping),
+ sizeof *tmad->mad,
+ DMA_TO_DEVICE);
+ kfree(tmad->mad);
+ kfree(tmad);
+ }
+ }
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ int err;
+ u8 status;
+ u16 slid = in_wc ? in_wc->slid : IB_LID_PERMISSIVE;
+
+ /* Forward locally generated traps to the SM */
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+ slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ /*
+ * Only handle SM gets, sets and trap represses for SM class
+ *
+ * Only handle PMA and Mellanox vendor-specific class gets and
+ * sets for other classes.
+ */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries or vendor-specific
+ * MADs -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+ ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+ IB_SMP_ATTR_VENDOR_MASK))
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+
+ err = mthca_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad,
+ &status);
+ if (err) {
+ mthca_err(to_mdev(ibdev), "MAD_IFC failed\n");
+ return IB_MAD_RESULT_FAILURE;
+ }
+ if (status == MTHCA_CMD_STAT_BAD_PKT)
+ return IB_MAD_RESULT_SUCCESS;
+ if (status) {
+ mthca_err(to_mdev(ibdev), "MAD_IFC returned status %02x\n",
+ status);
+ return IB_MAD_RESULT_FAILURE;
+ }
+
+ if (!out_mad->mad_hdr.status)
+ smp_snoop(ibdev, port_num, in_mad);
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct mthca_trap_mad *tmad =
+ (void *) (unsigned long) mad_send_wc->wr_id;
+
+ dma_unmap_single(agent->device->dma_device,
+ pci_unmap_addr(tmad, mapping),
+ sizeof *tmad->mad,
+ DMA_TO_DEVICE);
+ kfree(tmad->mad);
+ kfree(tmad);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ spin_lock_init(&dev->sm_lock);
+
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL);
+ if (IS_ERR(agent))
+ goto err;
+ dev->send_agent[p][q] = agent;
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return PTR_ERR(agent);
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->limits.num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 00000000000..9e782bc1c38
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_main.c 1396 2004-12-28 04:10:27Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 0;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+static int msi = 0;
+module_param(msi, int, 0444);
+MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+#define msi (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static const char mthca_version[] __devinitdata =
+ "ib_mthca: Mellanox InfiniBand HCA driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static struct mthca_profile default_profile = {
+ .num_qp = 1 << 16,
+ .rdb_per_qp = 4,
+ .num_cq = 1 << 16,
+ .num_mcg = 1 << 13,
+ .num_mpt = 1 << 17,
+ .num_mtt = 1 << 20,
+ .num_udav = 1 << 15, /* Tavor only */
+ .uarc_size = 1 << 18, /* Arbel only */
+};
+
+static int __devinit mthca_tune_pci(struct mthca_dev *mdev)
+{
+ int cap;
+ u16 val;
+
+ /* First try to max out Read Byte Count */
+ cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+ if (cap) {
+ if (pci_read_config_word(mdev->pdev, cap + PCI_X_CMD, &val)) {
+ mthca_err(mdev, "Couldn't read PCI-X command register, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ val = (val & ~PCI_X_CMD_MAX_READ) | (3 << 2);
+ if (pci_write_config_word(mdev->pdev, cap + PCI_X_CMD, val)) {
+ mthca_err(mdev, "Couldn't write PCI-X command register, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ } else if (mdev->hca_type == TAVOR)
+ mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+ cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
+ if (cap) {
+ if (pci_read_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, &val)) {
+ mthca_err(mdev, "Couldn't read PCI Express device control "
+ "register, aborting.\n");
+ return -ENODEV;
+ }
+ val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12);
+ if (pci_write_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, val)) {
+ mthca_err(mdev, "Couldn't write PCI Express device control "
+ "register, aborting.\n");
+ return -ENODEV;
+ }
+ } else if (mdev->hca_type == ARBEL_NATIVE ||
+ mdev->hca_type == ARBEL_COMPAT)
+ mthca_info(mdev, "No PCI Express capability, "
+ "not setting Max Read Request Size.\n");
+
+ return 0;
+}
+
+static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+ int err;
+ u8 status;
+
+ err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+ if (dev_lim->min_page_sz > PAGE_SIZE) {
+ mthca_err(mdev, "HCA minimum page size of %d bigger than "
+ "kernel PAGE_SIZE of %ld, aborting.\n",
+ dev_lim->min_page_sz, PAGE_SIZE);
+ return -ENODEV;
+ }
+ if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+ mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+ "aborting.\n",
+ dev_lim->num_ports, MTHCA_MAX_PORTS);
+ return -ENODEV;
+ }
+
+ mdev->limits.num_ports = dev_lim->num_ports;
+ mdev->limits.vl_cap = dev_lim->max_vl;
+ mdev->limits.mtu_cap = dev_lim->max_mtu;
+ mdev->limits.gid_table_len = dev_lim->max_gids;
+ mdev->limits.pkey_table_len = dev_lim->max_pkeys;
+ mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+ mdev->limits.max_sg = dev_lim->max_sg;
+ mdev->limits.reserved_qps = dev_lim->reserved_qps;
+ mdev->limits.reserved_srqs = dev_lim->reserved_srqs;
+ mdev->limits.reserved_eecs = dev_lim->reserved_eecs;
+ mdev->limits.reserved_cqs = dev_lim->reserved_cqs;
+ mdev->limits.reserved_eqs = dev_lim->reserved_eqs;
+ mdev->limits.reserved_mtts = dev_lim->reserved_mtts;
+ mdev->limits.reserved_mrws = dev_lim->reserved_mrws;
+ mdev->limits.reserved_uars = dev_lim->reserved_uars;
+ mdev->limits.reserved_pds = dev_lim->reserved_pds;
+
+ /* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+ May be doable since hardware supports it for SRQ.
+
+ IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+ IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+ supported by driver. */
+ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+ mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+ mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+ mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+ mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+ return 0;
+}
+
+static int __devinit mthca_init_tavor(struct mthca_dev *mdev)
+{
+ u8 status;
+ int err;
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+ struct mthca_adapter adapter;
+
+ err = mthca_SYS_EN(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "SYS_EN command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "SYS_EN returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_QUERY_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+ err = mthca_QUERY_DDR(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_DDR command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_DDR returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+
+ profile = default_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.uarc_size = 0;
+
+ err = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if (err < 0)
+ goto err_disable;
+
+ err = mthca_INIT_HCA(mdev, &init_hca, &status);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+
+ err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+ goto err_close;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_close;
+ }
+
+ mdev->eq_table.inta_pin = adapter.inta_pin;
+ mdev->rev_id = adapter.revision_id;
+
+ return 0;
+
+err_close:
+ mthca_CLOSE_HCA(mdev, 0, &status);
+
+err_disable:
+ mthca_SYS_DIS(mdev, &status);
+
+ return err;
+}
+
+static int __devinit mthca_load_fw(struct mthca_dev *mdev)
+{
+ u8 status;
+ int err;
+
+ /* FIXME: use HCA-attached memory for FW if present */
+
+ mdev->fw.arbel.fw_icm =
+ mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+ GFP_HIGHUSER | __GFP_NOWARN);
+ if (!mdev->fw.arbel.fw_icm) {
+ mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status);
+ if (err) {
+ mthca_err(mdev, "MAP_FA command failed, aborting.\n");
+ goto err_free;
+ }
+ if (status) {
+ mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_free;
+ }
+ err = mthca_RUN_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "RUN_FW command failed, aborting.\n");
+ goto err_unmap_fa;
+ }
+ if (status) {
+ mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_unmap_fa;
+ }
+
+ return 0;
+
+err_unmap_fa:
+ mthca_UNMAP_FA(mdev, &status);
+
+err_free:
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+ return err;
+}
+
+static int __devinit mthca_init_icm(struct mthca_dev *mdev,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca,
+ u64 icm_size)
+{
+ u64 aux_pages;
+ u8 status;
+ int err;
+
+ err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status);
+ if (err) {
+ mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+ (unsigned long long) icm_size >> 10,
+ (unsigned long long) aux_pages << 2);
+
+ mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+ GFP_HIGHUSER | __GFP_NOWARN);
+ if (!mdev->fw.arbel.aux_icm) {
+ mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status);
+ if (err) {
+ mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n");
+ goto err_free_aux;
+ }
+ if (status) {
+ mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_free_aux;
+ }
+
+ err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+ if (err) {
+ mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+ goto err_unmap_aux;
+ }
+
+ mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+ init_hca->mtt_seg_sz,
+ mdev->limits.num_mtt_segs,
+ mdev->limits.reserved_mtts, 1);
+ if (!mdev->mr_table.mtt_table) {
+ mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_eq;
+ }
+
+ mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+ dev_lim->mpt_entry_sz,
+ mdev->limits.num_mpts,
+ mdev->limits.reserved_mrws, 1);
+ if (!mdev->mr_table.mpt_table) {
+ mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mtt;
+ }
+
+ mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+ dev_lim->qpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps, 0);
+ if (!mdev->qp_table.qp_table) {
+ mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mpt;
+ }
+
+ mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+ dev_lim->eqpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps, 0);
+ if (!mdev->qp_table.eqp_table) {
+ mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_qp;
+ }
+
+ mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+ dev_lim->cqc_entry_sz,
+ mdev->limits.num_cqs,
+ mdev->limits.reserved_cqs, 0);
+ if (!mdev->cq_table.table) {
+ mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_eqp;
+ }
+
+ /*
+ * It's not strictly required, but for simplicity just map the
+ * whole multicast group table now. The table isn't very big
+ * and it's a lot easier than trying to track ref counts.
+ */
+ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+ MTHCA_MGM_ENTRY_SIZE,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ 0);
+ if (!mdev->mcg_table.table) {
+ mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_cq;
+ }
+
+ return 0;
+
+err_unmap_cq:
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_eqp:
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+ mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+ mthca_UNMAP_ICM_AUX(mdev, &status);
+
+err_free_aux:
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+ return err;
+}
+
+static int __devinit mthca_init_arbel(struct mthca_dev *mdev)
+{
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+ struct mthca_adapter adapter;
+ u64 icm_size;
+ u8 status;
+ int err;
+
+ err = mthca_QUERY_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_ENABLE_LAM(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n");
+ return err;
+ }
+ if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) {
+ mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+ mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+ } else if (status) {
+ mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_load_fw(mdev);
+ if (err) {
+ mthca_err(mdev, "Failed to start FW, aborting.\n");
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+ goto err_stop_fw;
+ }
+
+ profile = default_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.num_udav = 0;
+
+ icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if ((int) icm_size < 0) {
+ err = icm_size;
+ goto err_stop_fw;
+ }
+
+ err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+ if (err)
+ goto err_stop_fw;
+
+ err = mthca_INIT_HCA(mdev, &init_hca, &status);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+ goto err_free_icm;
+ }
+ if (status) {
+ mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_free_icm;
+ }
+
+ err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+ goto err_free_icm;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_free_icm;
+ }
+
+ mdev->eq_table.inta_pin = adapter.inta_pin;
+ mdev->rev_id = adapter.revision_id;
+
+ return 0;
+
+err_free_icm:
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+ mthca_unmap_eq_icm(mdev);
+
+ mthca_UNMAP_ICM_AUX(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+err_stop_fw:
+ mthca_UNMAP_FA(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+
+err_disable:
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev, &status);
+
+ return err;
+}
+
+static int __devinit mthca_init_hca(struct mthca_dev *mdev)
+{
+ if (mdev->hca_type == ARBEL_NATIVE)
+ return mthca_init_arbel(mdev);
+ else
+ return mthca_init_tavor(mdev);
+}
+
+static int __devinit mthca_setup_hca(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+
+ MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+ err = mthca_init_uar_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "user access region table, aborting.\n");
+ return err;
+ }
+
+ err = mthca_uar_alloc(dev, &dev->driver_uar);
+ if (err) {
+ mthca_err(dev, "Failed to allocate driver access region, "
+ "aborting.\n");
+ goto err_uar_table_free;
+ }
+
+ dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!dev->kar) {
+ mthca_err(dev, "Couldn't map kernel access region, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_uar_free;
+ }
+
+ err = mthca_init_pd_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "protection domain table, aborting.\n");
+ goto err_kar_unmap;
+ }
+
+ err = mthca_init_mr_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "memory region table, aborting.\n");
+ goto err_pd_table_free;
+ }
+
+ err = mthca_pd_alloc(dev, &dev->driver_pd);
+ if (err) {
+ mthca_err(dev, "Failed to create driver PD, "
+ "aborting.\n");
+ goto err_mr_table_free;
+ }
+
+ err = mthca_init_eq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "event queue table, aborting.\n");
+ goto err_pd_free;
+ }
+
+ err = mthca_cmd_use_events(dev);
+ if (err) {
+ mthca_err(dev, "Failed to switch to event-driven "
+ "firmware commands, aborting.\n");
+ goto err_eq_table_free;
+ }
+
+ err = mthca_NOP(dev, &status);
+ if (err || status) {
+ mthca_err(dev, "NOP command failed to generate interrupt, aborting.\n");
+ if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))
+ mthca_err(dev, "Try again with MSI/MSI-X disabled.\n");
+ else
+ mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+
+ goto err_cmd_poll;
+ }
+
+ mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+ err = mthca_init_cq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "completion queue table, aborting.\n");
+ goto err_cmd_poll;
+ }
+
+ err = mthca_init_qp_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "queue pair table, aborting.\n");
+ goto err_cq_table_free;
+ }
+
+ err = mthca_init_av_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "address vector table, aborting.\n");
+ goto err_qp_table_free;
+ }
+
+ err = mthca_init_mcg_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "multicast group table, aborting.\n");
+ goto err_av_table_free;
+ }
+
+ return 0;
+
+err_av_table_free:
+ mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+ mthca_cleanup_qp_table(dev);
+
+err_cq_table_free:
+ mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+ mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+ mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+ mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+ mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+ mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+ iounmap(dev->kar);
+
+err_uar_free:
+ mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+ mthca_cleanup_uar_table(dev);
+ return err;
+}
+
+static int __devinit mthca_request_regions(struct pci_dev *pdev,
+ int ddr_hidden)
+{
+ int err;
+
+ /*
+ * We can't just use pci_request_regions() because the MSI-X
+ * table is right in the middle of the first BAR. If we did
+ * pci_request_region and grab all of the first BAR, then
+ * setting up MSI-X would fail, since the PCI core wants to do
+ * request_mem_region on the MSI-X vector table.
+ *
+ * So just request what we need right now, and request any
+ * other regions we need when setting up EQs.
+ */
+ if (!request_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE, DRV_NAME))
+ return -EBUSY;
+
+ err = pci_request_region(pdev, 2, DRV_NAME);
+ if (err)
+ goto err_bar2_failed;
+
+ if (!ddr_hidden) {
+ err = pci_request_region(pdev, 4, DRV_NAME);
+ if (err)
+ goto err_bar4_failed;
+ }
+
+ return 0;
+
+err_bar4_failed:
+ pci_release_region(pdev, 2);
+
+err_bar2_failed:
+ release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE);
+
+ return err;
+}
+
+static void mthca_release_regions(struct pci_dev *pdev,
+ int ddr_hidden)
+{
+ if (!ddr_hidden)
+ pci_release_region(pdev, 4);
+
+ pci_release_region(pdev, 2);
+
+ release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE);
+}
+
+static int __devinit mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+ struct msix_entry entries[3];
+ int err;
+
+ entries[0].entry = 0;
+ entries[1].entry = 1;
+ entries[2].entry = 2;
+
+ err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries));
+ if (err) {
+ if (err > 0)
+ mthca_info(mdev, "Only %d MSI-X vectors available, "
+ "not using MSI-X\n", err);
+ return err;
+ }
+
+ mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+ mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+ mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector;
+
+ return 0;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+ u8 status;
+
+ mthca_CLOSE_HCA(mdev, 0, &status);
+
+ if (mdev->hca_type == ARBEL_NATIVE) {
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+ mthca_unmap_eq_icm(mdev);
+
+ mthca_UNMAP_ICM_AUX(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+
+ mthca_UNMAP_FA(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev, &status);
+ } else
+ mthca_SYS_DIS(mdev, &status);
+}
+
+static int __devinit mthca_init_one(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ static int mthca_version_printed = 0;
+ static int mthca_memfree_warned = 0;
+ int ddr_hidden = 0;
+ int err;
+ struct mthca_dev *mdev;
+
+ if (!mthca_version_printed) {
+ printk(KERN_INFO "%s", mthca_version);
+ ++mthca_version_printed;
+ }
+
+ printk(KERN_INFO PFX "Initializing %s (%s)\n",
+ pci_pretty_name(pdev), pci_name(pdev));
+
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot enable PCI device, "
+ "aborting.\n");
+ return err;
+ }
+
+ /*
+ * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+ * be present)
+ */
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 0) != 1 << 20) {
+ dev_err(&pdev->dev, "Missing DCS, aborting.");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 2) != 1 << 23) {
+ dev_err(&pdev->dev, "Missing UAR, aborting.");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+ ddr_hidden = 1;
+
+ err = mthca_request_regions(pdev, ddr_hidden);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+ "aborting.\n");
+ goto err_disable_pdev;
+ }
+
+ pci_set_master(pdev);
+
+ err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+ err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+ goto err_free_res;
+ }
+ }
+ err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+ "consistent PCI DMA mask.\n");
+ err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK);
+ if (err) {
+ dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+ "aborting.\n");
+ goto err_free_res;
+ }
+ }
+
+ mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+ if (!mdev) {
+ dev_err(&pdev->dev, "Device struct alloc failed, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_free_res;
+ }
+
+ mdev->pdev = pdev;
+ mdev->hca_type = id->driver_data;
+
+ if (mdev->hca_type == ARBEL_NATIVE && !mthca_memfree_warned++)
+ mthca_warn(mdev, "Warning: native MT25208 mode support is incomplete. "
+ "Your HCA may not work properly.\n");
+
+ if (ddr_hidden)
+ mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+ /*
+ * Now reset the HCA before we touch the PCI capabilities or
+ * attempt a firmware command, since a boot ROM may have left
+ * the HCA in an undefined state.
+ */
+ err = mthca_reset(mdev);
+ if (err) {
+ mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+ goto err_free_dev;
+ }
+
+ if (msi_x && !mthca_enable_msi_x(mdev))
+ mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+ if (msi && !(mdev->mthca_flags & MTHCA_FLAG_MSI_X) &&
+ !pci_enable_msi(pdev))
+ mdev->mthca_flags |= MTHCA_FLAG_MSI;
+
+ sema_init(&mdev->cmd.hcr_sem, 1);
+ sema_init(&mdev->cmd.poll_sem, 1);
+ mdev->cmd.use_events = 0;
+
+ mdev->hcr = ioremap(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE, MTHCA_HCR_SIZE);
+ if (!mdev->hcr) {
+ mthca_err(mdev, "Couldn't map command register, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_free_dev;
+ }
+
+ err = mthca_tune_pci(mdev);
+ if (err)
+ goto err_iounmap;
+
+ err = mthca_init_hca(mdev);
+ if (err)
+ goto err_iounmap;
+
+ err = mthca_setup_hca(mdev);
+ if (err)
+ goto err_close;
+
+ err = mthca_register_device(mdev);
+ if (err)
+ goto err_cleanup;
+
+ err = mthca_create_agents(mdev);
+ if (err)
+ goto err_unregister;
+
+ pci_set_drvdata(pdev, mdev);
+
+ return 0;
+
+err_unregister:
+ mthca_unregister_device(mdev);
+
+err_cleanup:
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+ mthca_cleanup_uar_table(mdev);
+
+err_close:
+ mthca_close_hca(mdev);
+
+err_iounmap:
+ iounmap(mdev->hcr);
+
+err_free_dev:
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+ pci_disable_msi(pdev);
+
+ ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+ mthca_release_regions(pdev, ddr_hidden);
+
+err_disable_pdev:
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ return err;
+}
+
+static void __devexit mthca_remove_one(struct pci_dev *pdev)
+{
+ struct mthca_dev *mdev = pci_get_drvdata(pdev);
+ u8 status;
+ int p;
+
+ if (mdev) {
+ mthca_free_agents(mdev);
+ mthca_unregister_device(mdev);
+
+ for (p = 1; p <= mdev->limits.num_ports; ++p)
+ mthca_CLOSE_IB(mdev, p, &status);
+
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+
+ iounmap(mdev->kar);
+ mthca_uar_free(mdev, &mdev->driver_uar);
+ mthca_cleanup_uar_table(mdev);
+
+ mthca_close_hca(mdev);
+
+ iounmap(mdev->hcr);
+
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+ pci_disable_msi(pdev);
+
+ ib_dealloc_device(&mdev->ib_dev);
+ mthca_release_regions(pdev, mdev->mthca_flags &
+ MTHCA_FLAG_DDR_HIDDEN);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ }
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+ .name = "ib_mthca",
+ .id_table = mthca_pci_table,
+ .probe = mthca_init_one,
+ .remove = __devexit_p(mthca_remove_one)
+};
+
+static int __init mthca_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&mthca_driver);
+ return ret < 0 ? ret : 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+ pci_unregister_driver(&mthca_driver);
+}
+
+module_init(mthca_init);
+module_exit(mthca_cleanup);
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 00000000000..70a6553a588
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mcg.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+ MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+struct mthca_mgm {
+ u32 next_gid_index;
+ u32 reserved[3];
+ u8 gid[16];
+ u32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16]; /* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore. gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ * Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+ u8 *gid, struct mthca_mgm *mgm,
+ u16 *hash, int *prev, int *index)
+{
+ void *mailbox;
+ u8 *mgid;
+ int err;
+ u8 status;
+
+ mailbox = kmalloc(16 + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+ if (!mailbox)
+ return -ENOMEM;
+ mgid = MAILBOX_ALIGN(mailbox);
+
+ memcpy(mgid, gid, 16);
+
+ err = mthca_MGID_HASH(dev, mgid, hash, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "MGID_HASH returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (0)
+ mthca_dbg(dev, "Hash for %04x:%04x:%04x:%04x:"
+ "%04x:%04x:%04x:%04x is %04x\n",
+ be16_to_cpu(((u16 *) gid)[0]), be16_to_cpu(((u16 *) gid)[1]),
+ be16_to_cpu(((u16 *) gid)[2]), be16_to_cpu(((u16 *) gid)[3]),
+ be16_to_cpu(((u16 *) gid)[4]), be16_to_cpu(((u16 *) gid)[5]),
+ be16_to_cpu(((u16 *) gid)[6]), be16_to_cpu(((u16 *) gid)[7]),
+ *hash);
+
+ *index = *hash;
+ *prev = -1;
+
+ do {
+ err = mthca_READ_MGM(dev, *index, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ return -EINVAL;
+ }
+
+ if (!memcmp(mgm->gid, zero_gid, 16)) {
+ if (*index != *hash) {
+ mthca_err(dev, "Found zero MGID in AMGM.\n");
+ err = -EINVAL;
+ }
+ goto out;
+ }
+
+ if (!memcmp(mgm->gid, gid, 16))
+ goto out;
+
+ *prev = *index;
+ *index = be32_to_cpu(mgm->next_gid_index) >> 5;
+ } while (*index);
+
+ *index = -1;
+
+ out:
+ kfree(mailbox);
+ return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ void *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int index, prev;
+ int link = 0;
+ int i;
+ int err;
+ u8 status;
+
+ mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+ if (!mailbox)
+ return -ENOMEM;
+ mgm = MAILBOX_ALIGN(mailbox);
+
+ if (down_interruptible(&dev->mcg_table.sem))
+ return -EINTR;
+
+ err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index != -1) {
+ if (!memcmp(mgm->gid, zero_gid, 16))
+ memcpy(mgm->gid, gid->raw, 16);
+ } else {
+ link = 1;
+
+ index = mthca_alloc(&dev->mcg_table.alloc);
+ if (index == -1) {
+ mthca_err(dev, "No AMGM entries left\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_READ_MGM(dev, index, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(mgm->gid, gid->raw, 16);
+ mgm->next_gid_index = 0;
+ }
+
+ for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+ if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+ mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+ break;
+ }
+
+ if (i == MTHCA_QP_PER_MGM) {
+ mthca_err(dev, "MGM at index %x is full.\n", index);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_WRITE_MGM(dev, index, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ }
+
+ if (!link)
+ goto out;
+
+ err = mthca_READ_MGM(dev, prev, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(index << 5);
+
+ err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ }
+
+ out:
+ up(&dev->mcg_table.sem);
+ kfree(mailbox);
+ return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ void *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int prev, index;
+ int i, loc;
+ int err;
+ u8 status;
+
+ mailbox = kmalloc(sizeof *mgm + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+ if (!mailbox)
+ return -ENOMEM;
+ mgm = MAILBOX_ALIGN(mailbox);
+
+ if (down_interruptible(&dev->mcg_table.sem))
+ return -EINTR;
+
+ err = find_mgm(dev, gid->raw, mgm, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index == -1) {
+ mthca_err(dev, "MGID %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x "
+ "not found\n",
+ be16_to_cpu(((u16 *) gid->raw)[0]),
+ be16_to_cpu(((u16 *) gid->raw)[1]),
+ be16_to_cpu(((u16 *) gid->raw)[2]),
+ be16_to_cpu(((u16 *) gid->raw)[3]),
+ be16_to_cpu(((u16 *) gid->raw)[4]),
+ be16_to_cpu(((u16 *) gid->raw)[5]),
+ be16_to_cpu(((u16 *) gid->raw)[6]),
+ be16_to_cpu(((u16 *) gid->raw)[7]));
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+ if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+ loc = i;
+ if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+ break;
+ }
+
+ if (loc == -1) {
+ mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->qp[loc] = mgm->qp[i - 1];
+ mgm->qp[i - 1] = 0;
+
+ err = mthca_WRITE_MGM(dev, index, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (i != 1)
+ goto out;
+
+ goto out;
+
+ if (prev == -1) {
+ /* Remove entry from MGM */
+ if (be32_to_cpu(mgm->next_gid_index) >> 5) {
+ err = mthca_READ_MGM(dev,
+ be32_to_cpu(mgm->next_gid_index) >> 5,
+ mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n",
+ status);
+ err = -EINVAL;
+ goto out;
+ }
+ } else
+ memset(mgm->gid, 0, 16);
+
+ err = mthca_WRITE_MGM(dev, index, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ /* Remove entry from AMGM */
+ index = be32_to_cpu(mgm->next_gid_index) >> 5;
+ err = mthca_READ_MGM(dev, prev, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(index << 5);
+
+ err = mthca_WRITE_MGM(dev, prev, mgm, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ out:
+ up(&dev->mcg_table.sem);
+ kfree(mailbox);
+ return err;
+}
+
+int __devinit mthca_init_mcg_table(struct mthca_dev *dev)
+{
+ int err;
+
+ err = mthca_alloc_init(&dev->mcg_table.alloc,
+ dev->limits.num_amgms,
+ dev->limits.num_amgms - 1,
+ 0);
+ if (err)
+ return err;
+
+ init_MUTEX(&dev->mcg_table.sem);
+
+ return 0;
+}
+
+void __devexit mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+ mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 00000000000..7730b596061
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+ MTHCA_ICM_ALLOC_SIZE = 1 << 18,
+ MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+ struct mthca_icm_chunk *chunk, *tmp;
+ int i;
+
+ if (!icm)
+ return;
+
+ list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+ if (chunk->nsg > 0)
+ pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ for (i = 0; i < chunk->npages; ++i)
+ __free_pages(chunk->mem[i].page,
+ get_order(chunk->mem[i].length));
+
+ kfree(chunk);
+ }
+
+ kfree(icm);
+}
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ unsigned int gfp_mask)
+{
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk = NULL;
+ int cur_order;
+
+ icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!icm)
+ return icm;
+
+ icm->refcount = 0;
+ INIT_LIST_HEAD(&icm->chunk_list);
+
+ cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+ while (npages > 0) {
+ if (!chunk) {
+ chunk = kmalloc(sizeof *chunk,
+ gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!chunk)
+ goto fail;
+
+ chunk->npages = 0;
+ chunk->nsg = 0;
+ list_add_tail(&chunk->list, &icm->chunk_list);
+ }
+
+ while (1 << cur_order > npages)
+ --cur_order;
+
+ chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
+ if (chunk->mem[chunk->npages].page) {
+ chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
+ chunk->mem[chunk->npages].offset = 0;
+
+ if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+
+ chunk = NULL;
+ }
+
+ npages -= 1 << cur_order;
+ } else {
+ --cur_order;
+ if (cur_order < 0)
+ goto fail;
+ }
+ }
+
+ if (chunk) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ return icm;
+
+fail:
+ mthca_free_icm(dev, icm);
+ return NULL;
+}
+
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ int ret = 0;
+ u8 status;
+
+ down(&table->mutex);
+
+ if (table->icm[i]) {
+ ++table->icm[i]->refcount;
+ goto out;
+ }
+
+ table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN);
+ if (!table->icm[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ &status) || status) {
+ mthca_free_icm(dev, table->icm[i]);
+ table->icm[i] = NULL;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++table->icm[i]->refcount;
+
+out:
+ up(&table->mutex);
+ return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ u8 status;
+
+ down(&table->mutex);
+
+ if (--table->icm[i]->refcount == 0) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+ mthca_free_icm(dev, table->icm[i]);
+ table->icm[i] = NULL;
+ }
+
+ up(&table->mutex);
+}
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem)
+{
+ struct mthca_icm_table *table;
+ int num_icm;
+ int i;
+ u8 status;
+
+ num_icm = obj_size * nobj / MTHCA_TABLE_CHUNK_SIZE;
+
+ table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->virt = virt;
+ table->num_icm = num_icm;
+ table->num_obj = nobj;
+ table->obj_size = obj_size;
+ table->lowmem = use_lowmem;
+ init_MUTEX(&table->mutex);
+
+ for (i = 0; i < num_icm; ++i)
+ table->icm[i] = NULL;
+
+ for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+ table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN);
+ if (!table->icm[i])
+ goto err;
+ if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ &status) || status) {
+ mthca_free_icm(dev, table->icm[i]);
+ table->icm[i] = NULL;
+ goto err;
+ }
+
+ /*
+ * Add a reference to this ICM chunk so that it never
+ * gets freed (since it contains reserved firmware objects).
+ */
+ ++table->icm[i]->refcount;
+ }
+
+ return table;
+
+err:
+ for (i = 0; i < num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+ mthca_free_icm(dev, table->icm[i]);
+ }
+
+ kfree(table);
+
+ return NULL;
+}
+
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+ int i;
+ u8 status;
+
+ for (i = 0; i < table->num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+ mthca_free_icm(dev, table->icm[i]);
+ }
+
+ kfree(table);
+}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, int page)
+{
+ return dev->uar_table.uarc_base +
+ dev->driver_uar.index * dev->uar_table.uarc_size +
+ page * 4096;
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db)
+{
+ int group;
+ int start, end, dir;
+ int i, j;
+ struct mthca_db_page *page;
+ int ret = 0;
+ u8 status;
+
+ down(&dev->db_tab->mutex);
+
+ switch (type) {
+ case MTHCA_DB_TYPE_CQ_ARM:
+ case MTHCA_DB_TYPE_SQ:
+ group = 0;
+ start = 0;
+ end = dev->db_tab->max_group1;
+ dir = 1;
+ break;
+
+ case MTHCA_DB_TYPE_CQ_SET_CI:
+ case MTHCA_DB_TYPE_RQ:
+ case MTHCA_DB_TYPE_SRQ:
+ group = 1;
+ start = dev->db_tab->npages - 1;
+ end = dev->db_tab->min_group2;
+ dir = -1;
+ break;
+
+ default:
+ return -1;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (dev->db_tab->page[i].db_rec &&
+ !bitmap_full(dev->db_tab->page[i].used,
+ MTHCA_DB_REC_PER_PAGE)) {
+ page = dev->db_tab->page + i;
+ goto found;
+ }
+
+ if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = dev->db_tab->page + end;
+ page->db_rec = dma_alloc_coherent(&dev->pdev->dev, 4096,
+ &page->mapping, GFP_KERNEL);
+ if (!page->db_rec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memset(page->db_rec, 0, 4096);
+
+ ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, i), &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ page->db_rec, page->mapping);
+ goto out;
+ }
+
+ bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+ if (group == 0)
+ ++dev->db_tab->max_group1;
+ else
+ --dev->db_tab->min_group2;
+
+found:
+ j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+ set_bit(j, page->used);
+
+ if (group == 1)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+ ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+ page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+ *db = (u32 *) &page->db_rec[j];
+
+out:
+ up(&dev->db_tab->mutex);
+
+ return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+ int i, j;
+ struct mthca_db_page *page;
+ u8 status;
+
+ i = db_index / MTHCA_DB_REC_PER_PAGE;
+ j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+ page = dev->db_tab->page + i;
+
+ down(&dev->db_tab->mutex);
+
+ page->db_rec[j] = 0;
+ if (i >= dev->db_tab->min_group2)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+ clear_bit(j, page->used);
+
+ if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+ i >= dev->db_tab->max_group1 - 1) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ page->db_rec, page->mapping);
+ page->db_rec = NULL;
+
+ if (i == dev->db_tab->max_group1) {
+ --dev->db_tab->max_group1;
+ /* XXX may be able to unmap more pages now */
+ }
+ if (i == dev->db_tab->min_group2)
+ ++dev->db_tab->min_group2;
+ }
+
+ up(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+ int i;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return 0;
+
+ dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+ if (!dev->db_tab)
+ return -ENOMEM;
+
+ init_MUTEX(&dev->db_tab->mutex);
+
+ dev->db_tab->npages = dev->uar_table.uarc_size / PAGE_SIZE;
+ dev->db_tab->max_group1 = 0;
+ dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+ dev->db_tab->page = kmalloc(dev->db_tab->npages *
+ sizeof *dev->db_tab->page,
+ GFP_KERNEL);
+ if (!dev->db_tab->page) {
+ kfree(dev->db_tab);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < dev->db_tab->npages; ++i)
+ dev->db_tab->page[i].db_rec = NULL;
+
+ return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+ int i;
+ u8 status;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return;
+
+ /*
+ * Because we don't always free our UARC pages when they
+ * become empty to make mthca_free_db() simpler we need to
+ * make a sweep through the doorbell pages and free any
+ * leftover pages now.
+ */
+ for (i = 0; i < dev->db_tab->npages; ++i) {
+ if (!dev->db_tab->page[i].db_rec)
+ continue;
+
+ if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+ mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ dev->db_tab->page[i].db_rec,
+ dev->db_tab->page[i].mapping);
+ }
+
+ kfree(dev->db_tab->page);
+ kfree(dev->db_tab);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 00000000000..a8fa97e140f
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+
+#include <asm/semaphore.h>
+
+#define MTHCA_ICM_CHUNK_LEN \
+ ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \
+ (sizeof (struct scatterlist)))
+
+struct mthca_icm_chunk {
+ struct list_head list;
+ int npages;
+ int nsg;
+ struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+struct mthca_icm {
+ struct list_head chunk_list;
+ int refcount;
+};
+
+struct mthca_icm_table {
+ u64 virt;
+ int num_icm;
+ int num_obj;
+ int obj_size;
+ int lowmem;
+ struct semaphore mutex;
+ struct mthca_icm *icm[0];
+};
+
+struct mthca_icm_iter {
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk;
+ int page_idx;
+};
+
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ unsigned int gfp_mask);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+
+static inline void mthca_icm_first(struct mthca_icm *icm,
+ struct mthca_icm_iter *iter)
+{
+ iter->icm = icm;
+ iter->chunk = list_empty(&icm->chunk_list) ?
+ NULL : list_entry(icm->chunk_list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+}
+
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+ return !iter->chunk;
+}
+
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+ if (++iter->page_idx >= iter->chunk->nsg) {
+ if (iter->chunk->list.next == &iter->icm->chunk_list) {
+ iter->chunk = NULL;
+ return;
+ }
+
+ iter->chunk = list_entry(iter->chunk->list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+ }
+}
+
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+ return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+ return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+enum {
+ MTHCA_DB_REC_PER_PAGE = 4096 / 8
+};
+
+struct mthca_db_page {
+ DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+ u64 *db_rec;
+ dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+ int npages;
+ int max_group1;
+ int min_group2;
+ struct mthca_db_page *page;
+ struct semaphore mutex;
+};
+
+enum {
+ MTHCA_DB_TYPE_INVALID = 0x0,
+ MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+ MTHCA_DB_TYPE_CQ_ARM = 0x2,
+ MTHCA_DB_TYPE_SQ = 0x3,
+ MTHCA_DB_TYPE_RQ = 0x4,
+ MTHCA_DB_TYPE_SRQ = 0x5,
+ MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 00000000000..80a0cd97881
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_mr.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+ u32 flags;
+ u32 page_size;
+ u32 key;
+ u32 pd;
+ u64 start;
+ u64 length;
+ u32 lkey;
+ u32 window_count;
+ u32 window_count_limit;
+ u64 mtt_seg;
+ u32 mtt_sz; /* Arbel only */
+ u32 reserved[2];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9)
+#define MTHCA_MPT_FLAG_REGION (1 << 8)
+
+#define MTHCA_MTT_FLAG_PRESENT 1
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_alloc_mtt(struct mthca_dev *dev, int order)
+{
+ int o;
+ int m;
+ u32 seg;
+
+ spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+ for (o = order; o <= dev->mr_table.max_mtt_order; ++o) {
+ m = 1 << (dev->mr_table.max_mtt_order - o);
+ seg = find_first_bit(dev->mr_table.mtt_buddy[o], m);
+ if (seg < m)
+ goto found;
+ }
+
+ spin_unlock(&dev->mr_table.mpt_alloc.lock);
+ return -1;
+
+ found:
+ clear_bit(seg, dev->mr_table.mtt_buddy[o]);
+
+ while (o > order) {
+ --o;
+ seg <<= 1;
+ set_bit(seg ^ 1, dev->mr_table.mtt_buddy[o]);
+ }
+
+ spin_unlock(&dev->mr_table.mpt_alloc.lock);
+
+ seg <<= order;
+
+ return seg;
+}
+
+static void mthca_free_mtt(struct mthca_dev *dev, u32 seg, int order)
+{
+ seg >>= order;
+
+ spin_lock(&dev->mr_table.mpt_alloc.lock);
+
+ while (test_bit(seg ^ 1, dev->mr_table.mtt_buddy[order])) {
+ clear_bit(seg ^ 1, dev->mr_table.mtt_buddy[order]);
+ seg >>= 1;
+ ++order;
+ }
+
+ set_bit(seg, dev->mr_table.mtt_buddy[order]);
+
+ spin_unlock(&dev->mr_table.mpt_alloc.lock);
+}
+
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return (ind >> 24) | (ind << 8);
+ else
+ return ind;
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return (key << 24) | (key >> 8);
+ else
+ return key;
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr)
+{
+ void *mailbox;
+ struct mthca_mpt_entry *mpt_entry;
+ u32 key;
+ int err;
+ u8 status;
+
+ might_sleep();
+
+ mr->order = -1;
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox) {
+ mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+ return -ENOMEM;
+ }
+ mpt_entry = MAILBOX_ALIGN(mailbox);
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_PHYSICAL |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+ mpt_entry->page_size = 0;
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ mpt_entry->start = 0;
+ mpt_entry->length = ~0ULL;
+
+ memset(&mpt_entry->lkey, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+ err = mthca_SW2HW_MPT(dev, mpt_entry,
+ key & (dev->limits.num_mpts - 1),
+ &status);
+ if (err)
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ else if (status) {
+ mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ }
+
+ kfree(mailbox);
+ return err;
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr)
+{
+ void *mailbox;
+ u64 *mtt_entry;
+ struct mthca_mpt_entry *mpt_entry;
+ u32 key;
+ int err = -ENOMEM;
+ u8 status;
+ int i;
+
+ might_sleep();
+ WARN_ON(buffer_size_shift >= 32);
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
+ i < list_len;
+ i <<= 1, ++mr->order)
+ ; /* nothing */
+
+ mr->first_seg = mthca_alloc_mtt(dev, mr->order);
+ if (mr->first_seg == -1)
+ goto err_out_mpt_free;
+
+ /*
+ * If list_len is odd, we add one more dummy entry for
+ * firmware efficiency.
+ */
+ mailbox = kmalloc(max(sizeof *mpt_entry,
+ (size_t) 8 * (list_len + (list_len & 1) + 2)) +
+ MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox)
+ goto err_out_free_mtt;
+
+ mtt_entry = MAILBOX_ALIGN(mailbox);
+
+ mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+ mr->first_seg * dev->limits.mtt_seg_size);
+ mtt_entry[1] = 0;
+ for (i = 0; i < list_len; ++i)
+ mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+ if (list_len & 1) {
+ mtt_entry[i + 2] = 0;
+ ++list_len;
+ }
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry\n");
+ for (i = 0; i < list_len + 2; ++i)
+ printk(KERN_ERR "[%2d] %016llx\n",
+ i, (unsigned long long) be64_to_cpu(mtt_entry[i]));
+ }
+
+ err = mthca_WRITE_MTT(dev, mtt_entry, list_len, &status);
+ if (err) {
+ mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+ goto err_out_mailbox_free;
+ }
+ if (status) {
+ mthca_warn(dev, "WRITE_MTT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_mailbox_free;
+ }
+
+ mpt_entry = MAILBOX_ALIGN(mailbox);
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+
+ mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ mpt_entry->start = cpu_to_be64(iova);
+ mpt_entry->length = cpu_to_be64(total_size);
+ memset(&mpt_entry->lkey, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+ mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base +
+ mr->first_seg * dev->limits.mtt_seg_size);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mpt_entry,
+ key & (dev->limits.num_mpts - 1),
+ &status);
+ if (err)
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ else if (status) {
+ mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ }
+
+ kfree(mailbox);
+ return err;
+
+ err_out_mailbox_free:
+ kfree(mailbox);
+
+ err_out_free_mtt:
+ mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+ err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+ return err;
+}
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+ int err;
+ u8 status;
+
+ might_sleep();
+
+ err = mthca_HW2SW_MPT(dev, NULL,
+ key_to_hw_index(dev, mr->ibmr.lkey) &
+ (dev->limits.num_mpts - 1),
+ &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
+ status);
+
+ if (mr->order >= 0)
+ mthca_free_mtt(dev, mr->first_seg, mr->order);
+
+ mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
+}
+
+int __devinit mthca_init_mr_table(struct mthca_dev *dev)
+{
+ int err;
+ int i, s;
+
+ err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+ dev->limits.num_mpts,
+ ~0, dev->limits.reserved_mrws);
+ if (err)
+ return err;
+
+ err = -ENOMEM;
+
+ for (i = 1, dev->mr_table.max_mtt_order = 0;
+ i < dev->limits.num_mtt_segs;
+ i <<= 1, ++dev->mr_table.max_mtt_order)
+ ; /* nothing */
+
+ dev->mr_table.mtt_buddy = kmalloc((dev->mr_table.max_mtt_order + 1) *
+ sizeof (long *),
+ GFP_KERNEL);
+ if (!dev->mr_table.mtt_buddy)
+ goto err_out;
+
+ for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+ dev->mr_table.mtt_buddy[i] = NULL;
+
+ for (i = 0; i <= dev->mr_table.max_mtt_order; ++i) {
+ s = BITS_TO_LONGS(1 << (dev->mr_table.max_mtt_order - i));
+ dev->mr_table.mtt_buddy[i] = kmalloc(s * sizeof (long),
+ GFP_KERNEL);
+ if (!dev->mr_table.mtt_buddy[i])
+ goto err_out_free;
+ bitmap_zero(dev->mr_table.mtt_buddy[i],
+ 1 << (dev->mr_table.max_mtt_order - i));
+ }
+
+ set_bit(0, dev->mr_table.mtt_buddy[dev->mr_table.max_mtt_order]);
+
+ for (i = 0; i < dev->mr_table.max_mtt_order; ++i)
+ if (1 << i >= dev->limits.reserved_mtts)
+ break;
+
+ if (i == dev->mr_table.max_mtt_order) {
+ mthca_err(dev, "MTT table of order %d is "
+ "too small.\n", i);
+ goto err_out_free;
+ }
+
+ (void) mthca_alloc_mtt(dev, i);
+
+ return 0;
+
+ err_out_free:
+ for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+ kfree(dev->mr_table.mtt_buddy[i]);
+
+ err_out:
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+ return err;
+}
+
+void __devexit mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+ int i;
+
+ /* XXX check if any MRs are still allocated? */
+ for (i = 0; i <= dev->mr_table.max_mtt_order; ++i)
+ kfree(dev->mr_table.mtt_buddy[i]);
+ kfree(dev->mr_table.mtt_buddy);
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 00000000000..ea66847e4ea
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_pd.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+ int err;
+
+ might_sleep();
+
+ atomic_set(&pd->sqp_count, 0);
+ pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+ if (pd->pd_num == -1)
+ return -ENOMEM;
+
+ err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+ MTHCA_MPT_FLAG_LOCAL_READ |
+ MTHCA_MPT_FLAG_LOCAL_WRITE,
+ &pd->ntmr);
+ if (err)
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+
+ return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+ might_sleep();
+ mthca_free_mr(dev, &pd->ntmr);
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int __devinit mthca_init_pd_table(struct mthca_dev *dev)
+{
+ return mthca_alloc_init(&dev->pd_table.alloc,
+ dev->limits.num_pds,
+ (1 << 24) - 1,
+ dev->limits.reserved_pds);
+}
+
+void __devexit mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+ /* XXX check if any PDs are still allocated? */
+ mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 00000000000..7881a8a919c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_profile.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include "mthca_profile.h"
+
+enum {
+ MTHCA_RES_QP,
+ MTHCA_RES_EEC,
+ MTHCA_RES_SRQ,
+ MTHCA_RES_CQ,
+ MTHCA_RES_EQP,
+ MTHCA_RES_EEEC,
+ MTHCA_RES_EQ,
+ MTHCA_RES_RDB,
+ MTHCA_RES_MCG,
+ MTHCA_RES_MPT,
+ MTHCA_RES_MTT,
+ MTHCA_RES_UAR,
+ MTHCA_RES_UDAV,
+ MTHCA_RES_UARC,
+ MTHCA_RES_NUM
+};
+
+enum {
+ MTHCA_NUM_EQS = 32,
+ MTHCA_NUM_PDS = 1 << 15
+};
+
+u64 mthca_make_profile(struct mthca_dev *dev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca)
+{
+ struct mthca_resource {
+ u64 size;
+ u64 start;
+ int type;
+ int num;
+ int log_num;
+ };
+
+ u64 mem_base, mem_avail;
+ u64 total_size = 0;
+ struct mthca_resource *profile;
+ struct mthca_resource tmp;
+ int i, j;
+
+ profile = kmalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+ if (!profile)
+ return -ENOMEM;
+
+ memset(profile, 0, MTHCA_RES_NUM * sizeof *profile);
+
+ profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz;
+ profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz;
+ profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz;
+ profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz;
+ profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz;
+ profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+ profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz;
+ profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE;
+ profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE;
+ profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz;
+ profile[MTHCA_RES_MTT].size = dev_lim->mtt_seg_sz;
+ profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz;
+ profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+ profile[MTHCA_RES_UARC].size = request->uarc_size;
+
+ profile[MTHCA_RES_QP].num = request->num_qp;
+ profile[MTHCA_RES_EQP].num = request->num_qp;
+ profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp;
+ profile[MTHCA_RES_CQ].num = request->num_cq;
+ profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS;
+ profile[MTHCA_RES_MCG].num = request->num_mcg;
+ profile[MTHCA_RES_MPT].num = request->num_mpt;
+ profile[MTHCA_RES_MTT].num = request->num_mtt;
+ profile[MTHCA_RES_UAR].num = request->num_uar;
+ profile[MTHCA_RES_UARC].num = request->num_uar;
+ profile[MTHCA_RES_UDAV].num = request->num_udav;
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ profile[i].type = i;
+ profile[i].log_num = max(ffs(profile[i].num) - 1, 0);
+ profile[i].size *= profile[i].num;
+ if (dev->hca_type == ARBEL_NATIVE)
+ profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+ }
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mem_base = 0;
+ mem_avail = dev_lim->hca.arbel.max_icm_sz;
+ } else {
+ mem_base = dev->ddr_start;
+ mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+ }
+
+ /*
+ * Sort the resources in decreasing order of size. Since they
+ * all have sizes that are powers of 2, we'll be able to keep
+ * resources aligned to their size and pack them without gaps
+ * using the sorted order.
+ */
+ for (i = MTHCA_RES_NUM; i > 0; --i)
+ for (j = 1; j < i; ++j) {
+ if (profile[j].size > profile[j - 1].size) {
+ tmp = profile[j];
+ profile[j] = profile[j - 1];
+ profile[j - 1] = tmp;
+ }
+ }
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ if (profile[i].size) {
+ profile[i].start = mem_base + total_size;
+ total_size += profile[i].size;
+ }
+ if (total_size > mem_avail) {
+ mthca_err(dev, "Profile requires 0x%llx bytes; "
+ "won't in 0x%llx bytes of context memory.\n",
+ (unsigned long long) total_size,
+ (unsigned long long) mem_avail);
+ kfree(profile);
+ return -ENOMEM;
+ }
+
+ if (profile[i].size)
+ mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+ "(size 0x%8llx)\n",
+ i, profile[i].type, profile[i].log_num,
+ (unsigned long long) profile[i].start,
+ (unsigned long long) profile[i].size);
+ }
+
+ if (dev->hca_type == ARBEL_NATIVE)
+ mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+ (int) (total_size >> 10));
+ else
+ mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+ (int) (total_size >> 10), (int) (mem_avail >> 10),
+ (int) ((mem_avail - total_size) >> 10));
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ switch (profile[i].type) {
+ case MTHCA_RES_QP:
+ dev->limits.num_qps = profile[i].num;
+ init_hca->qpc_base = profile[i].start;
+ init_hca->log_num_qps = profile[i].log_num;
+ break;
+ case MTHCA_RES_EEC:
+ dev->limits.num_eecs = profile[i].num;
+ init_hca->eec_base = profile[i].start;
+ init_hca->log_num_eecs = profile[i].log_num;
+ break;
+ case MTHCA_RES_SRQ:
+ dev->limits.num_srqs = profile[i].num;
+ init_hca->srqc_base = profile[i].start;
+ init_hca->log_num_srqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_CQ:
+ dev->limits.num_cqs = profile[i].num;
+ init_hca->cqc_base = profile[i].start;
+ init_hca->log_num_cqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_EQP:
+ init_hca->eqpc_base = profile[i].start;
+ break;
+ case MTHCA_RES_EEEC:
+ init_hca->eeec_base = profile[i].start;
+ break;
+ case MTHCA_RES_EQ:
+ dev->limits.num_eqs = profile[i].num;
+ init_hca->eqc_base = profile[i].start;
+ init_hca->log_num_eqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_RDB:
+ for (dev->qp_table.rdb_shift = 0;
+ profile[MTHCA_RES_QP].num << dev->qp_table.rdb_shift <
+ profile[i].num;
+ ++dev->qp_table.rdb_shift)
+ ; /* nothing */
+ dev->qp_table.rdb_base = (u32) profile[i].start;
+ init_hca->rdb_base = profile[i].start;
+ break;
+ case MTHCA_RES_MCG:
+ dev->limits.num_mgms = profile[i].num >> 1;
+ dev->limits.num_amgms = profile[i].num >> 1;
+ init_hca->mc_base = profile[i].start;
+ init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+ init_hca->log_mc_table_sz = profile[i].log_num;
+ init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1);
+ break;
+ case MTHCA_RES_MPT:
+ dev->limits.num_mpts = profile[i].num;
+ init_hca->mpt_base = profile[i].start;
+ init_hca->log_mpt_sz = profile[i].log_num;
+ break;
+ case MTHCA_RES_MTT:
+ dev->limits.num_mtt_segs = profile[i].num;
+ dev->limits.mtt_seg_size = dev_lim->mtt_seg_sz;
+ dev->mr_table.mtt_base = profile[i].start;
+ init_hca->mtt_base = profile[i].start;
+ init_hca->mtt_seg_sz = ffs(dev_lim->mtt_seg_sz) - 7;
+ break;
+ case MTHCA_RES_UAR:
+ dev->limits.num_uars = profile[i].num;
+ init_hca->uar_scratch_base = profile[i].start;
+ break;
+ case MTHCA_RES_UDAV:
+ dev->av_table.ddr_av_base = profile[i].start;
+ dev->av_table.num_ddr_avs = profile[i].num;
+ break;
+ case MTHCA_RES_UARC:
+ dev->uar_table.uarc_size = request->uarc_size;
+ dev->uar_table.uarc_base = profile[i].start;
+ init_hca->uarc_base = profile[i].start;
+ init_hca->log_uarc_sz = ffs(request->uarc_size) - 13;
+ init_hca->log_uar_sz = ffs(request->num_uar) - 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * PDs don't take any HCA memory, but we assign them as part
+ * of the HCA profile anyway.
+ */
+ dev->limits.num_pds = MTHCA_NUM_PDS;
+
+ kfree(profile);
+ return total_size;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 00000000000..daaf7999486
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_profile.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {
+ int num_qp;
+ int rdb_per_qp;
+ int num_cq;
+ int num_mcg;
+ int num_mpt;
+ int num_mtt;
+ int num_udav;
+ int num_uar;
+ int uarc_size;
+};
+
+u64 mthca_make_profile(struct mthca_dev *mdev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 00000000000..bbf74cf4334
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_provider.c 1397 2004-12-28 05:09:00Z roland $
+ */
+
+#include <ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+static int mthca_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ struct mthca_dev* mdev = to_mdev(ibdev);
+
+ u8 status;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ props->fw_ver = mdev->fw_ver;
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mthca_MAD_IFC(mdev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ props->device_cap_flags = mdev->device_cap_flags;
+ props->vendor_id = be32_to_cpup((u32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((u16 *) (out_mad->data + 30));
+ props->hw_ver = be16_to_cpup((u16 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+ memcpy(&props->node_guid, out_mad->data + 12, 8);
+
+ err = 0;
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ props->lid = be16_to_cpup((u16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((u16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((u32 *) (out_mad->data + 20));
+ props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len;
+ props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len;
+ props->qkey_viol_cntr = be16_to_cpup((u16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct mthca_set_ib_param set_ib;
+ struct ib_port_attr attr;
+ int err;
+ u8 status;
+
+ if (down_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+ return -ERESTARTSYS;
+
+ err = mthca_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ set_ib.set_si_guid = 0;
+ set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+
+ set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port, &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ up(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ *pkey = be16_to_cpu(((u16 *) out_mad->data)[index % 32]);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ memset(in_mad, 0, sizeof *in_mad);
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 16, 8);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev)
+{
+ struct mthca_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_pd_alloc(to_mdev(ibdev), pd);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+ mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ int err;
+ struct mthca_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_KERNEL);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+ if (err) {
+ kfree(ah);
+ return ERR_PTR(err);
+ }
+
+ return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+ mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+ kfree(ah);
+
+ return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr)
+{
+ struct mthca_qp *qp;
+ int err;
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ {
+ qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->sq.max = init_attr->cap.max_send_wr;
+ qp->rq.max = init_attr->cap.max_recv_wr;
+ qp->sq.max_gs = init_attr->cap.max_send_sge;
+ qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+ err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->qp_type, init_attr->sq_sig_type,
+ qp);
+ qp->ibqp.qp_num = qp->qpn;
+ break;
+ }
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->sq.max = init_attr->cap.max_send_wr;
+ qp->rq.max = init_attr->cap.max_recv_wr;
+ qp->sq.max_gs = init_attr->cap.max_send_sge;
+ qp->rq.max_gs = init_attr->cap.max_recv_sge;
+
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->sq_sig_type,
+ qp->ibqp.qp_num, init_attr->port_num,
+ to_msqp(qp));
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-ENOSYS);
+ }
+
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+ mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+ kfree(qp);
+ return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries)
+{
+ struct mthca_cq *cq;
+ int nent;
+ int err;
+
+ cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ for (nent = 1; nent <= entries; nent <<= 1)
+ ; /* nothing */
+
+ err = mthca_init_cq(to_mdev(ibdev), nent, cq);
+ if (err) {
+ kfree(cq);
+ cq = ERR_PTR(err);
+ }
+
+ return &cq->ibcq;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+ mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+ kfree(cq);
+
+ return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) |
+ MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mthca_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc,
+ u64 *iova_start)
+{
+ struct mthca_mr *mr;
+ u64 *page_list;
+ u64 total_size;
+ u64 mask;
+ int shift;
+ int npages;
+ int err;
+ int i, j, n;
+
+ /* First check that we have enough alignment */
+ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK))
+ return ERR_PTR(-EINVAL);
+
+ if (num_phys_buf > 1 &&
+ ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK))
+ return ERR_PTR(-EINVAL);
+
+ mask = 0;
+ total_size = 0;
+ for (i = 0; i < num_phys_buf; ++i) {
+ if (buffer_list[i].addr & ~PAGE_MASK)
+ return ERR_PTR(-EINVAL);
+ if (i != 0 && i != num_phys_buf - 1 &&
+ (buffer_list[i].size & ~PAGE_MASK))
+ return ERR_PTR(-EINVAL);
+
+ total_size += buffer_list[i].size;
+ if (i > 0)
+ mask |= buffer_list[i].addr;
+ }
+
+ /* Find largest page shift we can use to cover buffers */
+ for (shift = PAGE_SHIFT; shift < 31; ++shift)
+ if (num_phys_buf > 1) {
+ if ((1ULL << shift) & mask)
+ break;
+ } else {
+ if (1ULL << shift >=
+ buffer_list[0].size +
+ (buffer_list[0].addr & ((1ULL << shift) - 1)))
+ break;
+ }
+
+ buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+ buffer_list[0].addr &= ~0ull << shift;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ npages = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+ if (!npages)
+ return &mr->ibmr;
+
+ page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+ if (!page_list) {
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ n = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ for (j = 0;
+ j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+ ++j)
+ page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+ mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+ "in PD %x; shift %d, npages %d.\n",
+ (unsigned long long) buffer_list[0].addr,
+ (unsigned long long) *iova_start,
+ to_mpd(pd)->pd_num,
+ shift, npages);
+
+ err = mthca_mr_alloc_phys(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ page_list, shift, npages,
+ *iova_start, total_size,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ kfree(page_list);
+ return &mr->ibmr;
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+ mthca_free_mr(to_mdev(mr->device), to_mmr(mr));
+ kfree(mr);
+ return 0;
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+ struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+ return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+ struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+ return sprintf(buf, "%x.%x.%x\n", (int) (dev->fw_ver >> 32),
+ (int) (dev->fw_ver >> 16) & 0xffff,
+ (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+ struct mthca_dev *dev = container_of(cdev, struct mthca_dev, ib_dev.class_dev);
+ switch (dev->hca_type) {
+ case TAVOR: return sprintf(buf, "MT23108\n");
+ case ARBEL_COMPAT: return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+ case ARBEL_NATIVE: return sprintf(buf, "MT25208\n");
+ default: return sprintf(buf, "unknown\n");
+ }
+}
+
+static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+
+static struct class_device_attribute *mthca_class_attributes[] = {
+ &class_device_attr_hw_rev,
+ &class_device_attr_fw_ver,
+ &class_device_attr_hca_type
+};
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+ int ret;
+ int i;
+
+ strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+ dev->ib_dev.node_type = IB_NODE_CA;
+ dev->ib_dev.phys_port_cnt = dev->limits.num_ports;
+ dev->ib_dev.dma_device = &dev->pdev->dev;
+ dev->ib_dev.class_dev.dev = &dev->pdev->dev;
+ dev->ib_dev.query_device = mthca_query_device;
+ dev->ib_dev.query_port = mthca_query_port;
+ dev->ib_dev.modify_port = mthca_modify_port;
+ dev->ib_dev.query_pkey = mthca_query_pkey;
+ dev->ib_dev.query_gid = mthca_query_gid;
+ dev->ib_dev.alloc_pd = mthca_alloc_pd;
+ dev->ib_dev.dealloc_pd = mthca_dealloc_pd;
+ dev->ib_dev.create_ah = mthca_ah_create;
+ dev->ib_dev.destroy_ah = mthca_ah_destroy;
+ dev->ib_dev.create_qp = mthca_create_qp;
+ dev->ib_dev.modify_qp = mthca_modify_qp;
+ dev->ib_dev.destroy_qp = mthca_destroy_qp;
+ dev->ib_dev.create_cq = mthca_create_cq;
+ dev->ib_dev.destroy_cq = mthca_destroy_cq;
+ dev->ib_dev.poll_cq = mthca_poll_cq;
+ dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
+ dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr;
+ dev->ib_dev.dereg_mr = mthca_dereg_mr;
+ dev->ib_dev.attach_mcast = mthca_multicast_attach;
+ dev->ib_dev.detach_mcast = mthca_multicast_detach;
+ dev->ib_dev.process_mad = mthca_process_mad;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+ dev->ib_dev.post_send = mthca_arbel_post_send;
+ dev->ib_dev.post_recv = mthca_arbel_post_receive;
+ } else {
+ dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+ dev->ib_dev.post_send = mthca_tavor_post_send;
+ dev->ib_dev.post_recv = mthca_tavor_post_receive;
+ }
+
+ init_MUTEX(&dev->cap_mask_mutex);
+
+ ret = ib_register_device(&dev->ib_dev);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(mthca_class_attributes); ++i) {
+ ret = class_device_create_file(&dev->ib_dev.class_dev,
+ mthca_class_attributes[i]);
+ if (ret) {
+ ib_unregister_device(&dev->ib_dev);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+ ib_unregister_device(&dev->ib_dev);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 00000000000..0598f3905d9
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_provider.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10)
+
+struct mthca_buf_list {
+ void *buf;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+struct mthca_uar {
+ unsigned long pfn;
+ int index;
+};
+
+struct mthca_mr {
+ struct ib_mr ibmr;
+ int order;
+ u32 first_seg;
+};
+
+struct mthca_pd {
+ struct ib_pd ibpd;
+ u32 pd_num;
+ atomic_t sqp_count;
+ struct mthca_mr ntmr;
+};
+
+struct mthca_eq {
+ struct mthca_dev *dev;
+ int eqn;
+ u32 eqn_mask;
+ u32 cons_index;
+ u16 msi_x_vector;
+ u16 msi_x_entry;
+ int have_irq;
+ int nent;
+ struct mthca_buf_list *page_list;
+ struct mthca_mr mr;
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {
+ MTHCA_AH_ON_HCA,
+ MTHCA_AH_PCI_POOL,
+ MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+ struct ib_ah ibah;
+ enum mthca_ah_type type;
+ u32 key;
+ struct mthca_av *av;
+ dma_addr_t avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table. Each
+ * struct mthca_cq/qp also has its own lock. An individual qp lock
+ * may be taken inside of an individual cq lock. Both cqs attached to
+ * a qp may be locked, with the send cq locked first. No other
+ * nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has an atomic_t ref count. The
+ * pointer from the cq/qp_table to the struct counts as one reference.
+ * This reference also is good for access through the consumer API, so
+ * modifying the CQ/QP etc doesn't need to take another reference.
+ * Access because of a completion being polled does need a reference.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
+ * - decrement ref count
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibilty to make sure that no QP
+ * operations (WQE posting or state modification) are pending when the
+ * QP is destroyed. Also, the consumer must make sure that calls to
+ * qp_modify are serialized.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ * indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ * send queue and one for the receive queue)
+ */
+
+struct mthca_cq {
+ struct ib_cq ibcq;
+ spinlock_t lock;
+ atomic_t refcount;
+ int cqn;
+ u32 cons_index;
+ int is_direct;
+
+ /* Next fields are Arbel only */
+ int set_ci_db_index;
+ u32 *set_ci_db;
+ int arm_db_index;
+ u32 *arm_db;
+ int arm_sn;
+
+ union {
+ struct mthca_buf_list direct;
+ struct mthca_buf_list *page_list;
+ } queue;
+ struct mthca_mr mr;
+ wait_queue_head_t wait;
+};
+
+struct mthca_wq {
+ spinlock_t lock;
+ int max;
+ unsigned next_ind;
+ unsigned last_comp;
+ unsigned head;
+ unsigned tail;
+ void *last;
+ int max_gs;
+ int wqe_shift;
+
+ int db_index; /* Arbel only */
+ u32 *db;
+};
+
+struct mthca_qp {
+ struct ib_qp ibqp;
+ atomic_t refcount;
+ u32 qpn;
+ int is_direct;
+ u8 transport;
+ u8 state;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+
+ struct mthca_mr mr;
+
+ struct mthca_wq rq;
+ struct mthca_wq sq;
+ enum ib_sig_type sq_policy;
+ int send_wqe_offset;
+
+ u64 *wrid;
+ union {
+ struct mthca_buf_list direct;
+ struct mthca_buf_list *page_list;
+ } queue;
+
+ wait_queue_head_t wait;
+};
+
+struct mthca_sqp {
+ struct mthca_qp qp;
+ int port;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ int header_buf_size;
+ void *header_buf;
+ dma_addr_t header_dma;
+};
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+ return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 00000000000..7e4bbbd31f0
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2056 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_qp.c 1355 2004-12-17 15:23:43Z roland $
+ */
+
+#include <linux/init.h>
+
+#include <ib_verbs.h>
+#include <ib_cache.h>
+#include <ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+ MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+ MTHCA_ACK_REQ_FREQ = 10,
+ MTHCA_FLIGHT_LIMIT = 9,
+ MTHCA_UD_HEADER_SIZE = 72 /* largest UD header possible */
+};
+
+enum {
+ MTHCA_QP_STATE_RST = 0,
+ MTHCA_QP_STATE_INIT = 1,
+ MTHCA_QP_STATE_RTR = 2,
+ MTHCA_QP_STATE_RTS = 3,
+ MTHCA_QP_STATE_SQE = 4,
+ MTHCA_QP_STATE_SQD = 5,
+ MTHCA_QP_STATE_ERR = 6,
+ MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+ MTHCA_QP_ST_RC = 0x0,
+ MTHCA_QP_ST_UC = 0x1,
+ MTHCA_QP_ST_RD = 0x2,
+ MTHCA_QP_ST_UD = 0x3,
+ MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+ MTHCA_QP_PM_MIGRATED = 0x3,
+ MTHCA_QP_PM_ARMED = 0x0,
+ MTHCA_QP_PM_REARM = 0x1
+};
+
+enum {
+ /* qp_context flags */
+ MTHCA_QP_BIT_DE = 1 << 8,
+ /* params1 */
+ MTHCA_QP_BIT_SRE = 1 << 15,
+ MTHCA_QP_BIT_SWE = 1 << 14,
+ MTHCA_QP_BIT_SAE = 1 << 13,
+ MTHCA_QP_BIT_SIC = 1 << 4,
+ MTHCA_QP_BIT_SSC = 1 << 3,
+ /* params2 */
+ MTHCA_QP_BIT_RRE = 1 << 15,
+ MTHCA_QP_BIT_RWE = 1 << 14,
+ MTHCA_QP_BIT_RAE = 1 << 13,
+ MTHCA_QP_BIT_RIC = 1 << 4,
+ MTHCA_QP_BIT_RSC = 1 << 3
+};
+
+struct mthca_qp_path {
+ u32 port_pkey;
+ u8 rnr_retry;
+ u8 g_mylmc;
+ u16 rlid;
+ u8 ackto;
+ u8 mgid_index;
+ u8 static_rate;
+ u8 hop_limit;
+ u32 sl_tclass_flowlabel;
+ u8 rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+ u32 flags;
+ u32 tavor_sched_queue; /* Reserved on Arbel */
+ u8 mtu_msgmax;
+ u8 rq_size_stride; /* Reserved on Tavor */
+ u8 sq_size_stride; /* Reserved on Tavor */
+ u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */
+ u32 usr_page;
+ u32 local_qpn;
+ u32 remote_qpn;
+ u32 reserved1[2];
+ struct mthca_qp_path pri_path;
+ struct mthca_qp_path alt_path;
+ u32 rdd;
+ u32 pd;
+ u32 wqe_base;
+ u32 wqe_lkey;
+ u32 params1;
+ u32 reserved2;
+ u32 next_send_psn;
+ u32 cqn_snd;
+ u32 snd_wqe_base_l; /* Next send WQE on Tavor */
+ u32 snd_db_index; /* (debugging only entries) */
+ u32 last_acked_psn;
+ u32 ssn;
+ u32 params2;
+ u32 rnr_nextrecvpsn;
+ u32 ra_buff_indx;
+ u32 cqn_rcv;
+ u32 rcv_wqe_base_l; /* Next recv WQE on Tavor */
+ u32 rcv_db_index; /* (debugging only entries) */
+ u32 qkey;
+ u32 srqn;
+ u32 rmsn;
+ u16 rq_wqe_counter; /* reserved on Tavor */
+ u16 sq_wqe_counter; /* reserved on Tavor */
+ u32 reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+ u32 opt_param_mask;
+ u32 reserved1;
+ struct mthca_qp_context context;
+ u32 reserved2[62];
+} __attribute__((packed));
+
+enum {
+ MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
+ MTHCA_QP_OPTPAR_RRE = 1 << 1,
+ MTHCA_QP_OPTPAR_RAE = 1 << 2,
+ MTHCA_QP_OPTPAR_RWE = 1 << 3,
+ MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4,
+ MTHCA_QP_OPTPAR_Q_KEY = 1 << 5,
+ MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6,
+ MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+ MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8,
+ MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9,
+ MTHCA_QP_OPTPAR_PM_STATE = 1 << 10,
+ MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11,
+ MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12,
+ MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13,
+ MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14,
+ MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15,
+ MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16
+};
+
+enum {
+ MTHCA_OPCODE_NOP = 0x00,
+ MTHCA_OPCODE_RDMA_WRITE = 0x08,
+ MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MTHCA_OPCODE_SEND = 0x0a,
+ MTHCA_OPCODE_SEND_IMM = 0x0b,
+ MTHCA_OPCODE_RDMA_READ = 0x10,
+ MTHCA_OPCODE_ATOMIC_CS = 0x11,
+ MTHCA_OPCODE_ATOMIC_FA = 0x12,
+ MTHCA_OPCODE_BIND_MW = 0x18,
+ MTHCA_OPCODE_INVALID = 0xff
+};
+
+enum {
+ MTHCA_NEXT_DBD = 1 << 7,
+ MTHCA_NEXT_FENCE = 1 << 6,
+ MTHCA_NEXT_CQ_UPDATE = 1 << 3,
+ MTHCA_NEXT_EVENT_GEN = 1 << 2,
+ MTHCA_NEXT_SOLICIT = 1 << 1,
+
+ MTHCA_MLX_VL15 = 1 << 17,
+ MTHCA_MLX_SLR = 1 << 16
+};
+
+struct mthca_next_seg {
+ u32 nda_op; /* [31:6] next WQE [4:0] next opcode */
+ u32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */
+ u32 flags; /* [3] CQ [2] Event [1] Solicit */
+ u32 imm; /* immediate data */
+};
+
+struct mthca_tavor_ud_seg {
+ u32 reserved1;
+ u32 lkey;
+ u64 av_addr;
+ u32 reserved2[4];
+ u32 dqpn;
+ u32 qkey;
+ u32 reserved3[2];
+};
+
+struct mthca_arbel_ud_seg {
+ u32 av[8];
+ u32 dqpn;
+ u32 qkey;
+ u32 reserved[2];
+};
+
+struct mthca_bind_seg {
+ u32 flags; /* [31] Atomic [30] rem write [29] rem read */
+ u32 reserved;
+ u32 new_rkey;
+ u32 lkey;
+ u64 addr;
+ u64 length;
+};
+
+struct mthca_raddr_seg {
+ u64 raddr;
+ u32 rkey;
+ u32 reserved;
+};
+
+struct mthca_atomic_seg {
+ u64 swap_add;
+ u64 compare;
+};
+
+struct mthca_data_seg {
+ u32 byte_count;
+ u32 lkey;
+ u64 addr;
+};
+
+struct mthca_mlx_seg {
+ u32 nda_op;
+ u32 nds;
+ u32 flags; /* [17] VL15 [16] SLR [14:12] static rate
+ [11:8] SL [3] C [2] E */
+ u16 rlid;
+ u16 vcrc;
+};
+
+static const u8 mthca_opcode[] = {
+ [IB_WR_SEND] = MTHCA_OPCODE_SEND,
+ [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
+ [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
+ [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+ else
+ return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+ ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift);
+ else
+ return qp->queue.page_list[(qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift)) >>
+ PAGE_SHIFT].buf +
+ ((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+ (PAGE_SIZE - 1));
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type)
+{
+ struct mthca_qp *qp;
+ struct ib_event event;
+
+ spin_lock(&dev->qp_table.lock);
+ qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+ if (qp)
+ atomic_inc(&qp->refcount);
+ spin_unlock(&dev->qp_table.lock);
+
+ if (!qp) {
+ mthca_warn(dev, "Async event for bogus QP %08x\n", qpn);
+ return;
+ }
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.qp = &qp->ibqp;
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+ case IB_QPS_INIT: return MTHCA_QP_STATE_INIT;
+ case IB_QPS_RTR: return MTHCA_QP_STATE_RTR;
+ case IB_QPS_RTS: return MTHCA_QP_STATE_RTS;
+ case IB_QPS_SQD: return MTHCA_QP_STATE_SQD;
+ case IB_QPS_SQE: return MTHCA_QP_STATE_SQE;
+ case IB_QPS_ERR: return MTHCA_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+ switch (transport) {
+ case RC: return MTHCA_QP_ST_RC;
+ case UC: return MTHCA_QP_ST_UC;
+ case UD: return MTHCA_QP_ST_UD;
+ case RD: return MTHCA_QP_ST_RD;
+ case MLX: return MTHCA_QP_ST_MLX;
+ default: return -1;
+ }
+}
+
+static const struct {
+ int trans;
+ u32 req_param[NUM_TRANS];
+ u32 opt_param[NUM_TRANS];
+} state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_INIT] = {
+ .trans = MTHCA_TRANS_RST2INIT,
+ .req_param = {
+ [UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [MLX] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ /* bug-for-bug compatibility with VAPI: */
+ .opt_param = {
+ [MLX] = IB_QP_PORT
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_INIT] = {
+ .trans = MTHCA_TRANS_INIT2INIT,
+ .opt_param = {
+ [UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [MLX] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .trans = MTHCA_TRANS_INIT2RTR,
+ .req_param = {
+ [RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [MLX] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = MTHCA_TRANS_RTR2RTS,
+ .req_param = {
+ [UD] = IB_QP_SQ_PSN,
+ [RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [MLX] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [MLX] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = MTHCA_TRANS_RTS2RTS,
+ .opt_param = {
+ [UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [MLX] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .trans = MTHCA_TRANS_RTS2SQD,
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = MTHCA_TRANS_SQD2RTS,
+ .opt_param = {
+ [UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [MLX] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .trans = MTHCA_TRANS_SQD2SQD,
+ .opt_param = {
+ [UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [MLX] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR },
+ [IB_QPS_RTS] = {
+ .trans = MTHCA_TRANS_SQERR2RTS,
+ .opt_param = {
+ [UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [RC] = (IB_QP_CUR_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [MLX] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .trans = MTHCA_TRANS_ANY2RST },
+ [IB_QPS_ERR] = { .trans = MTHCA_TRANS_ANY2ERR }
+ }
+};
+
+static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+ int err;
+ u8 status;
+ struct mthca_init_ib_param param;
+
+ memset(&param, 0, sizeof param);
+
+ param.enable_1x = 1;
+ param.enable_4x = 1;
+ param.vl_cap = dev->limits.vl_cap;
+ param.mtu_cap = dev->limits.mtu_cap;
+ param.gid_cap = dev->limits.gid_table_len;
+ param.pkey_cap = dev->limits.pkey_table_len;
+
+ err = mthca_INIT_IB(dev, &param, port, &status);
+ if (err)
+ mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+ if (status)
+ mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ void *mailbox = NULL;
+ struct mthca_qp_param *qp_param;
+ struct mthca_qp_context *qp_context;
+ u32 req_param, opt_param;
+ u8 status;
+ int err;
+
+ if (attr_mask & IB_QP_CUR_STATE) {
+ if (attr->cur_qp_state != IB_QPS_RTR &&
+ attr->cur_qp_state != IB_QPS_RTS &&
+ attr->cur_qp_state != IB_QPS_SQD &&
+ attr->cur_qp_state != IB_QPS_SQE)
+ return -EINVAL;
+ else
+ cur_state = attr->cur_qp_state;
+ } else {
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
+ cur_state = qp->state;
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
+ }
+
+ if (attr_mask & IB_QP_STATE) {
+ if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR)
+ return -EINVAL;
+ new_state = attr->qp_state;
+ } else
+ new_state = cur_state;
+
+ if (state_table[cur_state][new_state].trans == MTHCA_TRANS_INVALID) {
+ mthca_dbg(dev, "Illegal QP transition "
+ "%d->%d\n", cur_state, new_state);
+ return -EINVAL;
+ }
+
+ req_param = state_table[cur_state][new_state].req_param[qp->transport];
+ opt_param = state_table[cur_state][new_state].opt_param[qp->transport];
+
+ if ((req_param & attr_mask) != req_param) {
+ mthca_dbg(dev, "QP transition "
+ "%d->%d missing req attr 0x%08x\n",
+ cur_state, new_state,
+ req_param & ~attr_mask);
+ return -EINVAL;
+ }
+
+ if (attr_mask & ~(req_param | opt_param | IB_QP_STATE)) {
+ mthca_dbg(dev, "QP transition (transport %d) "
+ "%d->%d has extra attr 0x%08x\n",
+ qp->transport,
+ cur_state, new_state,
+ attr_mask & ~(req_param | opt_param |
+ IB_QP_STATE));
+ return -EINVAL;
+ }
+
+ mailbox = kmalloc(sizeof (*qp_param) + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
+ if (!mailbox)
+ return -ENOMEM;
+ qp_param = MAILBOX_ALIGN(mailbox);
+ qp_context = &qp_param->context;
+ memset(qp_param, 0, sizeof *qp_param);
+
+ qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) |
+ (to_mthca_st(qp->transport) << 16));
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE);
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ else {
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ /* leave tavor_sched_queue as 0 */
+
+ if (qp->transport == MLX || qp->transport == UD)
+ qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+ else if (attr_mask & IB_QP_PATH_MTU)
+ qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ qp_context->rq_size_stride =
+ ((ffs(qp->rq.max) - 1) << 3) | (qp->rq.wqe_shift - 4);
+ qp_context->sq_size_stride =
+ ((ffs(qp->sq.max) - 1) << 3) | (qp->sq.wqe_shift - 4);
+ }
+
+ /* leave arbel_sched_queue as 0 */
+
+ qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
+ qp_context->local_qpn = cpu_to_be32(qp->qpn);
+ if (attr_mask & IB_QP_DEST_QPN) {
+ qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+ }
+
+ if (qp->transport == MLX)
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(to_msqp(qp)->port << 24);
+ else {
+ if (attr_mask & IB_QP_PORT) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->port_num << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->pkey_index);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp_context->pri_path.rnr_retry = attr->rnr_retry << 5;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY);
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ qp_context->pri_path.g_mylmc = attr->ah_attr.src_path_bits & 0x7f;
+ qp_context->pri_path.rlid = cpu_to_be16(attr->ah_attr.dlid);
+ qp_context->pri_path.static_rate = (!!attr->ah_attr.static_rate) << 3;
+ if (attr->ah_attr.ah_flags & IB_AH_GRH) {
+ qp_context->pri_path.g_mylmc |= 1 << 7;
+ qp_context->pri_path.mgid_index = attr->ah_attr.grh.sgid_index;
+ qp_context->pri_path.hop_limit = attr->ah_attr.grh.hop_limit;
+ qp_context->pri_path.sl_tclass_flowlabel =
+ cpu_to_be32((attr->ah_attr.sl << 28) |
+ (attr->ah_attr.grh.traffic_class << 20) |
+ (attr->ah_attr.grh.flow_label));
+ memcpy(qp_context->pri_path.rgid,
+ attr->ah_attr.grh.dgid.raw, 16);
+ } else {
+ qp_context->pri_path.sl_tclass_flowlabel =
+ cpu_to_be32(attr->ah_attr.sl << 28);
+ }
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp_context->pri_path.ackto = attr->timeout;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+ }
+
+ /* XXX alt_path */
+
+ /* leave rdd as 0 */
+ qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+ /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+ qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey);
+ qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+ (MTHCA_FLIGHT_LIMIT << 24) |
+ MTHCA_QP_BIT_SRE |
+ MTHCA_QP_BIT_SWE |
+ MTHCA_QP_BIT_SAE);
+ if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+ qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ qp_context->params1 |= cpu_to_be32(min(attr->max_dest_rd_atomic ?
+ ffs(attr->max_dest_rd_atomic) - 1 : 0,
+ 7) << 21);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+ qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+ qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index);
+ }
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS) {
+ /*
+ * Only enable RDMA/atomics if we have responder
+ * resources set to a non-zero value.
+ */
+ if (qp->resp_depth) {
+ qp_context->params2 |=
+ cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE ?
+ MTHCA_QP_BIT_RWE : 0);
+ qp_context->params2 |=
+ cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_READ ?
+ MTHCA_QP_BIT_RRE : 0);
+ qp_context->params2 |=
+ cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC ?
+ MTHCA_QP_BIT_RAE : 0);
+ }
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+ MTHCA_QP_OPTPAR_RRE |
+ MTHCA_QP_OPTPAR_RAE);
+
+ qp->atomic_rd_en = attr->qp_access_flags;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ u8 rra_max;
+
+ if (qp->resp_depth && !attr->max_rd_atomic) {
+ /*
+ * Lowering our responder resources to zero.
+ * Turn off RDMA/atomics as responder.
+ * (RWE/RRE/RAE in params2 already zero)
+ */
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+ MTHCA_QP_OPTPAR_RRE |
+ MTHCA_QP_OPTPAR_RAE);
+ }
+
+ if (!qp->resp_depth && attr->max_rd_atomic) {
+ /*
+ * Increasing our responder resources from
+ * zero. Turn on RDMA/atomics as appropriate.
+ */
+ qp_context->params2 |=
+ cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_WRITE ?
+ MTHCA_QP_BIT_RWE : 0);
+ qp_context->params2 |=
+ cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_READ ?
+ MTHCA_QP_BIT_RRE : 0);
+ qp_context->params2 |=
+ cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_ATOMIC ?
+ MTHCA_QP_BIT_RAE : 0);
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+ MTHCA_QP_OPTPAR_RRE |
+ MTHCA_QP_OPTPAR_RAE);
+ }
+
+ for (rra_max = 0;
+ 1 << rra_max < attr->max_rd_atomic &&
+ rra_max < dev->qp_table.rdb_shift;
+ ++rra_max)
+ ; /* nothing */
+
+ qp_context->params2 |= cpu_to_be32(rra_max << 21);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+
+ qp->resp_depth = attr->max_rd_atomic;
+ }
+
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ qp_context->ra_buff_indx =
+ cpu_to_be32(dev->qp_table.rdb_base +
+ ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+ dev->qp_table.rdb_shift));
+
+ qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+ if (dev->hca_type == ARBEL_NATIVE)
+ qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index);
+
+ if (attr_mask & IB_QP_QKEY) {
+ qp_context->qkey = cpu_to_be32(attr->qkey);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+ }
+
+ err = mthca_MODIFY_QP(dev, state_table[cur_state][new_state].trans,
+ qp->qpn, 0, qp_param, 0, &status);
+ if (status) {
+ mthca_warn(dev, "modify QP %d returned status %02x.\n",
+ state_table[cur_state][new_state].trans, status);
+ err = -EINVAL;
+ }
+
+ if (!err)
+ qp->state = new_state;
+
+ kfree(mailbox);
+
+ if (is_sqp(dev, qp))
+ store_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we are moving QP0 to RTR, bring the IB link up; if we
+ * are moving QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR &&
+ new_state == IB_QPS_RTR)
+ init_port(dev, to_msqp(qp)->port);
+
+ if (cur_state != IB_QPS_RESET &&
+ cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET ||
+ new_state == IB_QPS_ERR))
+ mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status);
+ }
+
+ return err;
+}
+
+/*
+ * Allocate and register buffer for WQEs. qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int size;
+ int i;
+ int npages, shift;
+ dma_addr_t t;
+ u64 *dma_list = NULL;
+ int err = -ENOMEM;
+
+ size = sizeof (struct mthca_next_seg) +
+ qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+ for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+ qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ size = sizeof (struct mthca_next_seg) +
+ qp->sq.max_gs * sizeof (struct mthca_data_seg);
+ switch (qp->transport) {
+ case MLX:
+ size += 2 * sizeof (struct mthca_data_seg);
+ break;
+ case UD:
+ if (dev->hca_type == ARBEL_NATIVE)
+ size += sizeof (struct mthca_arbel_ud_seg);
+ else
+ size += sizeof (struct mthca_tavor_ud_seg);
+ break;
+ default:
+ /* bind seg is as big as atomic + raddr segs */
+ size += sizeof (struct mthca_bind_seg);
+ }
+
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+ qp->sq.wqe_shift++)
+ ; /* nothing */
+
+ qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+ 1 << qp->sq.wqe_shift);
+ size = PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift));
+
+ qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+ GFP_KERNEL);
+ if (!qp->wrid)
+ goto err_out;
+
+ if (size <= MTHCA_MAX_DIRECT_QP_SIZE) {
+ qp->is_direct = 1;
+ npages = 1;
+ shift = get_order(size) + PAGE_SHIFT;
+
+ if (0)
+ mthca_dbg(dev, "Creating direct QP of size %d (shift %d)\n",
+ size, shift);
+
+ qp->queue.direct.buf = pci_alloc_consistent(dev->pdev, size, &t);
+ if (!qp->queue.direct.buf)
+ goto err_out;
+
+ pci_unmap_addr_set(&qp->queue.direct, mapping, t);
+
+ memset(qp->queue.direct.buf, 0, size);
+
+ while (t & ((1 << shift) - 1)) {
+ --shift;
+ npages *= 2;
+ }
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out_free;
+
+ for (i = 0; i < npages; ++i)
+ dma_list[i] = t + i * (1 << shift);
+ } else {
+ qp->is_direct = 0;
+ npages = size / PAGE_SIZE;
+ shift = PAGE_SHIFT;
+
+ if (0)
+ mthca_dbg(dev, "Creating indirect QP with %d pages\n", npages);
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out;
+
+ qp->queue.page_list = kmalloc(npages *
+ sizeof *qp->queue.page_list,
+ GFP_KERNEL);
+ if (!qp->queue.page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i) {
+ qp->queue.page_list[i].buf =
+ pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
+ if (!qp->queue.page_list[i].buf)
+ goto err_out_free;
+
+ memset(qp->queue.page_list[i].buf, 0, PAGE_SIZE);
+
+ pci_unmap_addr_set(&qp->queue.page_list[i], mapping, t);
+ dma_list[i] = t;
+ }
+ }
+
+ err = mthca_mr_alloc_phys(dev, pd->pd_num, dma_list, shift,
+ npages, 0, size,
+ MTHCA_MPT_FLAG_LOCAL_READ,
+ &qp->mr);
+ if (err)
+ goto err_out_free;
+
+ kfree(dma_list);
+ return 0;
+
+ err_out_free:
+ if (qp->is_direct) {
+ pci_free_consistent(dev->pdev, size,
+ qp->queue.direct.buf,
+ pci_unmap_addr(&qp->queue.direct, mapping));
+ } else
+ for (i = 0; i < npages; ++i) {
+ if (qp->queue.page_list[i].buf)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ qp->queue.page_list[i].buf,
+ pci_unmap_addr(&qp->queue.page_list[i],
+ mapping));
+
+ }
+
+ err_out:
+ kfree(qp->wrid);
+ kfree(dma_list);
+ return err;
+}
+
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ int ret = 0;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+ if (ret)
+ return ret;
+
+ ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+ if (ret)
+ goto err_qpc;
+
+ qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+ qp->qpn, &qp->rq.db);
+ if (qp->rq.db_index < 0) {
+ ret = -ENOMEM;
+ goto err_eqpc;
+ }
+
+ qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+ qp->qpn, &qp->sq.db);
+ if (qp->sq.db_index < 0) {
+ ret = -ENOMEM;
+ goto err_rq_db;
+ }
+ }
+
+ return 0;
+
+err_rq_db:
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+
+err_eqpc:
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+ return ret;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+ }
+}
+
+static void mthca_wq_init(struct mthca_wq* wq)
+{
+ spin_lock_init(&wq->lock);
+ wq->next_ind = 0;
+ wq->last_comp = wq->max - 1;
+ wq->head = 0;
+ wq->tail = 0;
+ wq->last = NULL;
+}
+
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct mthca_qp *qp)
+{
+ struct mthca_next_seg *wqe;
+ int ret;
+ int i;
+
+ atomic_set(&qp->refcount, 1);
+ qp->state = IB_QPS_RESET;
+ qp->atomic_rd_en = 0;
+ qp->resp_depth = 0;
+ qp->sq_policy = send_policy;
+ mthca_wq_init(&qp->sq);
+ mthca_wq_init(&qp->rq);
+
+ ret = mthca_alloc_memfree(dev, qp);
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_wqe_buf(dev, pd, qp);
+ if (ret) {
+ mthca_free_memfree(dev, qp);
+ return ret;
+ }
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ for (i = 0; i < qp->rq.max; ++i) {
+ wqe = get_recv_wqe(qp, i);
+ wqe->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+ qp->rq.wqe_shift);
+ wqe->ee_nds = cpu_to_be32(1 << (qp->rq.wqe_shift - 4));
+ }
+
+ for (i = 0; i < qp->sq.max; ++i) {
+ wqe = get_send_wqe(qp, i);
+ wqe->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+ qp->sq.wqe_shift) +
+ qp->send_wqe_offset);
+ }
+ }
+
+ return 0;
+}
+
+static void mthca_align_qp_size(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ int i;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return;
+
+ for (i = 0; 1 << i < qp->rq.max; ++i)
+ ; /* nothing */
+
+ qp->rq.max = 1 << i;
+
+ for (i = 0; 1 << i < qp->sq.max; ++i)
+ ; /* nothing */
+
+ qp->sq.max = 1 << i;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct mthca_qp *qp)
+{
+ int err;
+
+ mthca_align_qp_size(dev, qp);
+
+ switch (type) {
+ case IB_QPT_RC: qp->transport = RC; break;
+ case IB_QPT_UC: qp->transport = UC; break;
+ case IB_QPT_UD: qp->transport = UD; break;
+ default: return -EINVAL;
+ }
+
+ qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+ if (qp->qpn == -1)
+ return -ENOMEM;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, qp);
+ if (err) {
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+ return err;
+ }
+
+ spin_lock_irq(&dev->qp_table.lock);
+ mthca_array_set(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1), qp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ return 0;
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp)
+{
+ int err = 0;
+ u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+
+ mthca_align_qp_size(dev, &sqp->qp);
+
+ sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+ sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ &sqp->header_dma, GFP_KERNEL);
+ if (!sqp->header_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&dev->qp_table.lock);
+ if (mthca_array_get(&dev->qp_table.qp, mqpn))
+ err = -EBUSY;
+ else
+ mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ if (err)
+ goto err_out;
+
+ sqp->port = port;
+ sqp->qp.qpn = mqpn;
+ sqp->qp.transport = MLX;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, &sqp->qp);
+ if (err)
+ goto err_out_free;
+
+ atomic_inc(&pd->sqp_count);
+
+ return 0;
+
+ err_out_free:
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ spin_lock_irq(&send_cq->lock);
+ if (send_cq != recv_cq)
+ spin_lock(&recv_cq->lock);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp, mqpn);
+ spin_unlock(&dev->qp_table.lock);
+
+ if (send_cq != recv_cq)
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+
+ err_out:
+ dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ sqp->header_buf, sqp->header_dma);
+
+ return err;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ u8 status;
+ int size;
+ int i;
+ struct mthca_cq *send_cq;
+ struct mthca_cq *recv_cq;
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ spin_lock_irq(&send_cq->lock);
+ if (send_cq != recv_cq)
+ spin_lock(&recv_cq->lock);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1));
+ spin_unlock(&dev->qp_table.lock);
+
+ if (send_cq != recv_cq)
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+
+ atomic_dec(&qp->refcount);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+ if (qp->state != IB_QPS_RESET)
+ mthca_MODIFY_QP(dev, MTHCA_TRANS_ANY2RST, qp->qpn, 0, NULL, 0, &status);
+
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn);
+ if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn);
+
+ mthca_free_mr(dev, &qp->mr);
+
+ size = PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift));
+
+ if (qp->is_direct) {
+ pci_free_consistent(dev->pdev, size,
+ qp->queue.direct.buf,
+ pci_unmap_addr(&qp->queue.direct, mapping));
+ } else {
+ for (i = 0; i < size / PAGE_SIZE; ++i) {
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ qp->queue.page_list[i].buf,
+ pci_unmap_addr(&qp->queue.page_list[i],
+ mapping));
+ }
+ }
+
+ kfree(qp->wrid);
+
+ mthca_free_memfree(dev, qp);
+
+ if (is_sqp(dev, qp)) {
+ atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+ dma_free_coherent(&dev->pdev->dev,
+ to_msqp(qp)->header_buf_size,
+ to_msqp(qp)->header_buf,
+ to_msqp(qp)->header_dma);
+ } else
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+ int ind, struct ib_send_wr *wr,
+ struct mthca_mlx_seg *mlx,
+ struct mthca_data_seg *data)
+{
+ int header_size;
+ int err;
+
+ ib_ud_header_init(256, /* assume a MAD */
+ sqp->ud_header.grh_present,
+ &sqp->ud_header);
+
+ err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+ if (err)
+ return err;
+ mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid == 0xffff ?
+ MTHCA_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ mlx->vcrc = 0;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == 0xffff)
+ sqp->ud_header.lrh.source_lid = 0xffff;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(&dev->ib_dev, sqp->port,
+ sqp->pkey_index,
+ &sqp->ud_header.bth.pkey);
+ else
+ ib_get_cached_pkey(&dev->ib_dev, sqp->port,
+ wr->wr.ud.pkey_index,
+ &sqp->ud_header.bth.pkey);
+ cpu_to_be16s(&sqp->ud_header.bth.pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header,
+ sqp->header_buf +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ data->byte_count = cpu_to_be32(header_size);
+ data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+ data->addr = cpu_to_be64(sqp->header_dma +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ return 0;
+}
+
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+ struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mthca_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max;
+}
+
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ int size0 = 0;
+ u32 f0 = 0;
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.next_ind;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case RC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ ((struct mthca_raddr_seg *) wqe)->raddr =
+ cpu_to_be64(wr->wr.atomic.remote_addr);
+ ((struct mthca_raddr_seg *) wqe)->rkey =
+ cpu_to_be32(wr->wr.atomic.rkey);
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;
+
+ wqe += sizeof (struct mthca_raddr_seg);
+
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ ((struct mthca_atomic_seg *) wqe)->swap_add =
+ cpu_to_be64(wr->wr.atomic.swap);
+ ((struct mthca_atomic_seg *) wqe)->compare =
+ cpu_to_be64(wr->wr.atomic.compare_add);
+ } else {
+ ((struct mthca_atomic_seg *) wqe)->swap_add =
+ cpu_to_be64(wr->wr.atomic.compare_add);
+ ((struct mthca_atomic_seg *) wqe)->compare = 0;
+ }
+
+ wqe += sizeof (struct mthca_atomic_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16 +
+ sizeof (struct mthca_atomic_seg);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ case IB_WR_RDMA_READ:
+ ((struct mthca_raddr_seg *) wqe)->raddr =
+ cpu_to_be64(wr->wr.rdma.remote_addr);
+ ((struct mthca_raddr_seg *) wqe)->rkey =
+ cpu_to_be32(wr->wr.rdma.rkey);
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UD:
+ ((struct mthca_tavor_ud_seg *) wqe)->lkey =
+ cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+ ((struct mthca_tavor_ud_seg *) wqe)->av_addr =
+ cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+ ((struct mthca_tavor_ud_seg *) wqe)->dqpn =
+ cpu_to_be32(wr->wr.ud.remote_qpn);
+ ((struct mthca_tavor_ud_seg *) wqe)->qkey =
+ cpu_to_be32(wr->wr.ud.remote_qkey);
+
+ wqe += sizeof (struct mthca_tavor_ud_seg);
+ size += sizeof (struct mthca_tavor_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (prev_wqe) {
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
+ }
+
+ if (!size0) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32(((qp->sq.next_ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) | f0 | op0);
+ doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+ wmb();
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ qp->sq.next_ind = ind;
+ qp->sq.head += nreq;
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ int size0 = 0;
+ int ind;
+ void *wqe;
+ void *prev_wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.next_ind;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+ prev_wqe = qp->rq.last;
+ qp->rq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;
+ ((struct mthca_next_seg *) wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD);
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ if (likely(prev_wqe)) {
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size);
+ }
+
+ if (!size0)
+ size0 = size;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
+ doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);
+
+ wmb();
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ qp->rq.next_ind = ind;
+ qp->rq.head += nreq;
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ int size0 = 0;
+ u32 f0 = 0;
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head & (qp->sq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case UD:
+ memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
+ to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+ ((struct mthca_arbel_ud_seg *) wqe)->dqpn =
+ cpu_to_be32(wr->wr.ud.remote_qpn);
+ ((struct mthca_arbel_ud_seg *) wqe)->qkey =
+ cpu_to_be32(wr->wr.ud.remote_qkey);
+
+ wqe += sizeof (struct mthca_arbel_ud_seg);
+ size += sizeof (struct mthca_arbel_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (likely(prev_wqe)) {
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size);
+ }
+
+ if (!size0) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32((nreq << 24) |
+ ((qp->sq.head & 0xffff) << 8) |
+ f0 | op0);
+ doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.head & (qp->rq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < qp->rq.max_gs) {
+ ((struct mthca_data_seg *) wqe)->byte_count = 0;
+ ((struct mthca_data_seg *) wqe)->lkey = cpu_to_be32(0x100);
+ ((struct mthca_data_seg *) wqe)->addr = 0;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+ }
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, u32 *new_wqe)
+{
+ struct mthca_next_seg *next;
+
+ if (is_send)
+ next = get_send_wqe(qp, index);
+ else
+ next = get_recv_wqe(qp, index);
+
+ if (dev->hca_type == ARBEL_NATIVE)
+ *dbd = 1;
+ else
+ *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+ if (next->ee_nds & cpu_to_be32(0x3f))
+ *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+ (next->ee_nds & cpu_to_be32(0x3f));
+ else
+ *new_wqe = 0;
+
+ return 0;
+}
+
+int __devinit mthca_init_qp_table(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+ int i;
+
+ spin_lock_init(&dev->qp_table.lock);
+
+ /*
+ * We reserve 2 extra QPs per port for the special QPs. The
+ * special QP for port 1 has to be even, so round up.
+ */
+ dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+ err = mthca_alloc_init(&dev->qp_table.alloc,
+ dev->limits.num_qps,
+ (1 << 24) - 1,
+ dev->qp_table.sqp_start +
+ MTHCA_MAX_PORTS * 2);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->qp_table.qp,
+ dev->limits.num_qps);
+ if (err) {
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+ return err;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+ dev->qp_table.sqp_start + i * 2,
+ &status);
+ if (err)
+ goto err_out;
+ if (status) {
+ mthca_warn(dev, "CONF_SPECIAL_QP returned "
+ "status %02x, aborting.\n",
+ status);
+ err = -EINVAL;
+ goto err_out;
+ }
+ }
+ return 0;
+
+ err_out:
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+ mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+ return err;
+}
+
+void __devexit mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+ int i;
+ u8 status;
+
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 00000000000..ce3fff7d02b
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mthca_reset.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+ int i;
+ int err = 0;
+ u32 *hca_header = NULL;
+ u32 *bridge_header = NULL;
+ struct pci_dev *bridge = NULL;
+
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE swab32(1)
+
+ /*
+ * Reset the chip. This is somewhat ugly because we have to
+ * save off the PCI header before reset and then restore it
+ * after the chip reboots. We skip config space offsets 22
+ * and 23 since those have a special meaning.
+ *
+ * To make matters worse, for Tavor (PCI-X HCA) we have to
+ * find the associated bridge device and save off its PCI
+ * header as well.
+ */
+
+ if (mdev->hca_type == TAVOR) {
+ /* Look for the bridge -- its device ID will be 2 more
+ than HCA's device ID. */
+ while ((bridge = pci_get_device(mdev->pdev->vendor,
+ mdev->pdev->device + 2,
+ bridge)) != NULL) {
+ if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
+ bridge->subordinate == mdev->pdev->bus) {
+ mthca_dbg(mdev, "Found bridge: %s (%s)\n",
+ pci_pretty_name(bridge), pci_name(bridge));
+ break;
+ }
+ }
+
+ if (!bridge) {
+ /*
+ * Didn't find a bridge for a Tavor device --
+ * assume we're in no-bridge mode and hope for
+ * the best.
+ */
+ mthca_warn(mdev, "No bridge found for %s (%s)\n",
+ pci_pretty_name(mdev->pdev), pci_name(mdev->pdev));
+ }
+
+ }
+
+ /* For Arbel do we need to save off the full 4K PCI Express header?? */
+ hca_header = kmalloc(256, GFP_KERNEL);
+ if (!hca_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+
+ if (bridge) {
+ bridge_header = kmalloc(256, GFP_KERNEL);
+ if (!bridge_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "bridge PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA bridge "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+ }
+
+ /* actually hit reset */
+ {
+ void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+ MTHCA_RESET_OFFSET, 4);
+
+ if (!reset) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't map HCA reset register, "
+ "aborting.\n");
+ goto out;
+ }
+
+ writel(MTHCA_RESET_VALUE, reset);
+ iounmap(reset);
+ }
+
+ /* Docs say to wait one second before accessing device */
+ msleep(1000);
+
+ /* Now wait for PCI device to start responding again */
+ {
+ u32 v;
+ int c = 0;
+
+ for (c = 0; c < 100; ++c) {
+ if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't access HCA after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+ if (v != 0xffffffff)
+ goto good;
+
+ msleep(100);
+ }
+
+ err = -ENODEV;
+ mthca_err(mdev, "PCI device did not come back after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+good:
+ /* Now restore the PCI headers */
+ if (bridge) {
+ /*
+ * Bridge control register is at 0x3e, so we'll
+ * naturally restore it last in this loop.
+ */
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(bridge, PCI_COMMAND,
+ bridge_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+ hca_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+
+out:
+ if (bridge)
+ pci_dev_put(bridge);
+ kfree(bridge_header);
+ kfree(hca_header);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 00000000000..1c8791ded6f
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ uar->index = mthca_alloc(&dev->uar_table.alloc);
+ if (uar->index == -1)
+ return -ENOMEM;
+
+ uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+ return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+ int ret;
+
+ ret = mthca_alloc_init(&dev->uar_table.alloc,
+ dev->limits.num_uars,
+ dev->limits.num_uars - 1,
+ dev->limits.reserved_uars);
+ if (ret)
+ return ret;
+
+ ret = mthca_init_db_tab(dev);
+ if (ret)
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+
+ return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+ mthca_cleanup_db_tab(dev);
+
+ /* XXX check if any UARs are still allocated? */
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/drivers/infiniband/include/ib_cache.h b/drivers/infiniband/include/ib_cache.h
new file mode 100644
index 00000000000..44ef6bb9b9d
--- /dev/null
+++ b/drivers/infiniband/include/ib_cache.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device *device_handle,
+ u8 port_num,
+ int index,
+ u16 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index);
+
+#endif /* _IB_CACHE_H */
diff --git a/drivers/infiniband/include/ib_fmr_pool.h b/drivers/infiniband/include/ib_fmr_pool.h
new file mode 100644
index 00000000000..e8769657cbb
--- /dev/null
+++ b/drivers/infiniband/include/ib_fmr_pool.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_fmr_pool.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr:Maximum number of pages per map request.
+ * @access:Access flags for FMRs in pool.
+ * @pool_size:Number of FMRs to allocate for pool.
+ * @dirty_watermark:Flush is triggered when @dirty_watermark dirty
+ * FMRs are present.
+ * @flush_function:Callback called when unmapped FMRs are flushed and
+ * more FMRs are possibly available for mapping
+ * @flush_arg:Context passed to user's flush function.
+ * @cache:If set, FMRs may be reused after unmapping for identical map
+ * requests.
+ */
+struct ib_fmr_pool_param {
+ int max_pages_per_fmr;
+ enum ib_access_flags access;
+ int pool_size;
+ int dirty_watermark;
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+ unsigned cache:1;
+};
+
+struct ib_pool_fmr {
+ struct ib_fmr *fmr;
+ struct ib_fmr_pool *pool;
+ struct list_head list;
+ struct hlist_node cache_node;
+ int ref_count;
+ int remap_count;
+ u64 io_virtual_address;
+ int page_list_len;
+ u64 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params);
+
+int ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 *io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
diff --git a/drivers/infiniband/include/ib_mad.h b/drivers/infiniband/include/ib_mad.h
new file mode 100644
index 00000000000..4a6bf6763a9
--- /dev/null
+++ b/drivers/infiniband/include/ib_mad.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_mad.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#if !defined( IB_MAD_H )
+#define IB_MAD_H
+
+#include <ib_verbs.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION 1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
+#define IB_MGMT_CLASS_SUBN_ADM 0x03
+#define IB_MGMT_CLASS_PERF_MGMT 0x04
+#define IB_MGMT_CLASS_BM 0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT 0x06
+#define IB_MGMT_CLASS_CM 0x07
+#define IB_MGMT_CLASS_SNMP 0x08
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET 0x01
+#define IB_MGMT_METHOD_SET 0x02
+#define IB_MGMT_METHOD_GET_RESP 0x81
+#define IB_MGMT_METHOD_SEND 0x03
+#define IB_MGMT_METHOD_TRAP 0x05
+#define IB_MGMT_METHOD_REPORT 0x06
+#define IB_MGMT_METHOD_REPORT_RESP 0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS 0x07
+
+#define IB_MGMT_METHOD_RESP 0x80
+
+#define IB_MGMT_MAX_METHODS 128
+
+#define IB_QP0 0
+#define IB_QP1 __constant_htonl(1)
+#define IB_QP1_QKEY 0x80010000
+
+struct ib_grh {
+ u32 version_tclass_flow;
+ u16 paylen;
+ u8 next_hdr;
+ u8 hop_limit;
+ union ib_gid sgid;
+ union ib_gid dgid;
+} __attribute__ ((packed));
+
+struct ib_mad_hdr {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ u16 status;
+ u16 class_specific;
+ u64 tid;
+ u16 attr_id;
+ u16 resv;
+ u32 attr_mod;
+} __attribute__ ((packed));
+
+struct ib_rmpp_hdr {
+ u8 rmpp_version;
+ u8 rmpp_type;
+ u8 rmpp_rtime_flags;
+ u8 rmpp_status;
+ u32 seg_num;
+ u32 paylen_newwin;
+} __attribute__ ((packed));
+
+struct ib_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[232];
+} __attribute__ ((packed));
+
+struct ib_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[220];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 reserved;
+ u8 oui[3];
+ u8 data[216];
+} __attribute__ ((packed));
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_wr: Work request information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD. Valid
+ * only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_wr
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_send_wr *send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user after the send operation completes. All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents. Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ * Unsolicited MADs sent by this client will have the upper 32-bits
+ * of their TID set to this value.
+ * @port_num: Port number on which QP is registered
+ */
+struct ib_mad_agent {
+ struct ib_device *device;
+ struct ib_qp *qp;
+ ib_mad_recv_handler recv_handler;
+ ib_mad_send_handler send_handler;
+ ib_mad_snoop_handler snoop_handler;
+ void *context;
+ u32 hi_tid;
+ u8 port_num;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @wr_id: Work request identifier associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ * request.
+ */
+struct ib_mad_send_wc {
+ u64 wr_id;
+ enum ib_wc_status status;
+ u32 vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ * The data refereced by this buffer is only valid if the GRH is
+ * valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+ struct list_head list;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For received response, the wr_id field of the wc is set to the wr_id
+ * for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+ struct ib_wc *wc;
+ struct ib_mad_recv_buf recv_buf;
+ int mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be receive
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ */
+struct ib_mad_reg_req {
+ u8 mgmt_class;
+ u8 mgmt_class_version;
+ u8 oui[3];
+ DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ * by the caller. This parameter may be NULL if the caller only
+ * wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+enum ib_mad_snoop_flags {
+ /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/
+ /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2),
+ /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+ IB_MAD_SNOOP_RECVS = (1<<4)
+ /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/
+ /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies information where snooping occurs.
+ * @send_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client.
+ * @mad_agent: Specifies the associated registration to post the send to.
+ * @send_wr: Specifies the information needed to send the MAD(s).
+ * @bad_send_wr: Specifies the MAD on which an error was encountered.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ */
+int ib_post_send_mad(struct ib_mad_agent *mad_agent,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+
+/**
+ * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ * @buf: User-provided data buffer to receive the coalesced buffers. The
+ * referenced buffer should be at least the size of the mad_len specified
+ * by @mad_recv_wc.
+ *
+ * This call copies a chain of received RMPP MADs into a single data buffer,
+ * removing duplicated headers.
+ */
+void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+ void *buf);
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD to the
+ * access layer.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @wr_id: Indicates the work request identifier of the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ u64 wr_id);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs. After calling this routine, users may send
+ * MADs on the specified QP by calling ib_mad_post_send.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ * MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ * MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request. If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc);
+
+#endif /* IB_MAD_H */
diff --git a/drivers/infiniband/include/ib_pack.h b/drivers/infiniband/include/ib_pack.h
new file mode 100644
index 00000000000..fe480f3e865
--- /dev/null
+++ b/drivers/infiniband/include/ib_pack.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <ib_verbs.h>
+
+enum {
+ IB_LRH_BYTES = 8,
+ IB_GRH_BYTES = 40,
+ IB_BTH_BYTES = 12,
+ IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+ size_t struct_offset_bytes;
+ size_t struct_size_bytes;
+ int offset_words;
+ int offset_bits;
+ int size_bits;
+ char *field_name;
+};
+
+#define RESERVED \
+ .field_name = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives
+ * the correct value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+*/
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+ /* transport types -- just used to define real constants */
+ IB_OPCODE_RC = 0x00,
+ IB_OPCODE_UC = 0x20,
+ IB_OPCODE_RD = 0x40,
+ IB_OPCODE_UD = 0x60,
+
+ /* operations -- just used to define real constants */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+
+ /* real constants follow -- see comment about above IB_OPCODE()
+ macro for more details */
+
+ /* RC */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+
+ /* UC */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+ IB_LNH_RAW = 0,
+ IB_LNH_IP = 1,
+ IB_LNH_IBA_LOCAL = 2,
+ IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+ u8 virtual_lane;
+ u8 link_version;
+ u8 service_level;
+ u8 link_next_header;
+ __be16 destination_lid;
+ __be16 packet_length;
+ __be16 source_lid;
+};
+
+struct ib_unpacked_grh {
+ u8 ip_version;
+ u8 traffic_class;
+ __be32 flow_label;
+ __be16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+ union ib_gid source_gid;
+ union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+ u8 opcode;
+ u8 solicited_event;
+ u8 mig_req;
+ u8 pad_count;
+ u8 transport_header_version;
+ __be16 pkey;
+ __be32 destination_qpn;
+ u8 ack_req;
+ __be32 psn;
+};
+
+struct ib_unpacked_deth {
+ __be32 qkey;
+ __be32 source_qpn;
+};
+
+struct ib_ud_header {
+ struct ib_unpacked_lrh lrh;
+ int grh_present;
+ struct ib_unpacked_grh grh;
+ struct ib_unpacked_bth bth;
+ struct ib_unpacked_deth deth;
+ int immediate_present;
+ __be32 immediate_data;
+};
+
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf);
+
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure);
+
+void ib_ud_header_init(int payload_bytes,
+ int grh_present,
+ struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf);
+
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header);
+
+#endif /* IB_PACK_H */
diff --git a/drivers/infiniband/include/ib_sa.h b/drivers/infiniband/include/ib_sa.h
new file mode 100644
index 00000000000..f4f747707b3
--- /dev/null
+++ b/drivers/infiniband/include/ib_sa.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_sa.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/compiler.h>
+
+#include <ib_verbs.h>
+#include <ib_mad.h>
+
+enum {
+ IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */
+
+ IB_SA_METHOD_DELETE = 0x15
+};
+
+enum ib_sa_selector {
+ IB_SA_GTE = 0,
+ IB_SA_LTE = 1,
+ IB_SA_EQ = 2,
+ /*
+ * The meaning of "best" depends on the attribute: for
+ * example, for MTU best will return the largest available
+ * MTU, while for packet life time, best will return the
+ * smallest available life time.
+ */
+ IB_SA_BEST = 3
+};
+
+enum ib_sa_rate {
+ IB_SA_RATE_2_5_GBPS = 2,
+ IB_SA_RATE_5_GBPS = 5,
+ IB_SA_RATE_10_GBPS = 3,
+ IB_SA_RATE_20_GBPS = 6,
+ IB_SA_RATE_30_GBPS = 4,
+ IB_SA_RATE_40_GBPS = 7,
+ IB_SA_RATE_60_GBPS = 8,
+ IB_SA_RATE_80_GBPS = 9,
+ IB_SA_RATE_120_GBPS = 10
+};
+
+static inline int ib_sa_rate_enum_to_int(enum ib_sa_rate rate)
+{
+ switch (rate) {
+ case IB_SA_RATE_2_5_GBPS: return 1;
+ case IB_SA_RATE_5_GBPS: return 2;
+ case IB_SA_RATE_10_GBPS: return 4;
+ case IB_SA_RATE_20_GBPS: return 8;
+ case IB_SA_RATE_30_GBPS: return 12;
+ case IB_SA_RATE_40_GBPS: return 16;
+ case IB_SA_RATE_60_GBPS: return 24;
+ case IB_SA_RATE_80_GBPS: return 32;
+ case IB_SA_RATE_120_GBPS: return 48;
+ default: return -1;
+ }
+}
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec." No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+/* reserved: 0 */
+/* reserved: 1 */
+#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6)
+/* reserved: 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13)
+/* reserved: 14 */
+#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+ /* reserved */
+ /* reserved */
+ union ib_gid dgid;
+ union ib_gid sgid;
+ u16 dlid;
+ u16 slid;
+ int raw_traffic;
+ /* reserved */
+ u32 flow_label;
+ u8 hop_limit;
+ u8 traffic_class;
+ int reversible;
+ u8 numb_path;
+ u16 pkey;
+ /* reserved */
+ u8 sl;
+ u8 mtu_selector;
+ enum ib_mtu mtu;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ u32 qkey;
+ u16 mlid;
+ u8 mtu_selector;
+ enum ib_mtu mtu;
+ u8 traffic_class;
+ u16 pkey;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 sl;
+ u32 flow_label;
+ u8 hop_limit;
+ u8 scope;
+ u8 join_state;
+ int proxy_join;
+};
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query);
+
+int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query);
+
+/**
+ * ib_sa_mcmember_rec_set - Start an MCMember set query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Set query to the SA (eg to join a multicast
+ * group). The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT is the query timed out, or
+ * -EIO if an error occurred sending the query. The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_set() is negative, it is
+ * an error code. Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_set(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query)
+{
+ return ib_sa_mcmember_rec_query(device, port_num,
+ IB_MGMT_METHOD_SET,
+ rec, comp_mask,
+ timeout_ms, gfp_mask, callback,
+ context, query);
+}
+
+/**
+ * ib_sa_mcmember_rec_delete - Start an MCMember delete query
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:MCMember Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send an MCMember Delete query to the SA (eg to leave a multicast
+ * group). The callback function will be called when the query
+ * completes (or fails); status is 0 for a successful response, -EINTR
+ * if the query is canceled, -ETIMEDOUT is the query timed out, or
+ * -EIO if an error occurred sending the query. The resp parameter of
+ * the callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_mcmember_rec_delete() is negative, it
+ * is an error code. Otherwise it is a query ID that can be used to
+ * cancel the query.
+ */
+static inline int
+ib_sa_mcmember_rec_delete(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query)
+{
+ return ib_sa_mcmember_rec_query(device, port_num,
+ IB_SA_METHOD_DELETE,
+ rec, comp_mask,
+ timeout_ms, gfp_mask, callback,
+ context, query);
+}
+
+
+#endif /* IB_SA_H */
diff --git a/drivers/infiniband/include/ib_smi.h b/drivers/infiniband/include/ib_smi.h
new file mode 100644
index 00000000000..ca821651496
--- /dev/null
+++ b/drivers/infiniband/include/ib_smi.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#if !defined( IB_SMI_H )
+#define IB_SMI_H
+
+#include <ib_mad.h>
+
+#define IB_LID_PERMISSIVE 0xFFFF
+
+#define IB_SMP_DATA_SIZE 64
+#define IB_SMP_MAX_PATH_HOPS 64
+
+struct ib_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ u16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ u64 tid;
+ u16 attr_id;
+ u16 resv;
+ u32 attr_mod;
+ u64 mkey;
+ u16 dr_slid;
+ u16 dr_dlid;
+ u8 reserved[28];
+ u8 data[IB_SMP_DATA_SIZE];
+ u8 initial_path[IB_SMP_MAX_PATH_HOPS];
+ u8 return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));
+
+#define IB_SMP_DIRECTION __constant_htons(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE __constant_htons(0x0002)
+#define IB_SMP_ATTR_NODE_DESC __constant_htons(0x0010)
+#define IB_SMP_ATTR_NODE_INFO __constant_htons(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO __constant_htons(0x0012)
+#define IB_SMP_ATTR_GUID_INFO __constant_htons(0x0014)
+#define IB_SMP_ATTR_PORT_INFO __constant_htons(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE __constant_htons(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE __constant_htons(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE __constant_htons(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE __constant_htons(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE __constant_htons(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE __constant_htons(0x001B)
+#define IB_SMP_ATTR_SM_INFO __constant_htons(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG __constant_htons(0x0030)
+#define IB_SMP_ATTR_LED_INFO __constant_htons(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK __constant_htons(0xFF00)
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+ return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+#endif /* IB_SMI_H */
diff --git a/drivers/infiniband/include/ib_user_mad.h b/drivers/infiniband/include/ib_user_mad.h
new file mode 100644
index 00000000000..06ad4a6075f
--- /dev/null
+++ b/drivers/infiniband/include/ib_user_mad.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_mad.h 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION 2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad - MAD packet
+ * @data - Contents of MAD
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ * received (transaction ID in data[] will be set to TID of original
+ * request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ *
+ * All multi-byte quantities are stored in network (big endian) byte order.
+ */
+struct ib_user_mad {
+ __u8 data[256];
+ __u32 id;
+ __u32 status;
+ __u32 timeout_ms;
+ __u32 qpn;
+ __u32 qkey;
+ __u16 lid;
+ __u8 sl;
+ __u8 path_bits;
+ __u8 grh_present;
+ __u8 gid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 gid[16];
+ __u32 flow_label;
+};
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be receive
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ */
+struct ib_user_mad_reg_req {
+ __u32 id;
+ __u32 method_mask[4];
+ __u8 qpn;
+ __u8 mgmt_class;
+ __u8 mgmt_class_version;
+ __u8 oui[3];
+};
+
+#define IB_IOCTL_MAGIC 0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \
+ struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32)
+
+#endif /* IB_USER_MAD_H */
diff --git a/drivers/infiniband/include/ib_verbs.h b/drivers/infiniband/include/ib_verbs.h
new file mode 100644
index 00000000000..cf01f044a22
--- /dev/null
+++ b/drivers/infiniband/include/ib_verbs.h
@@ -0,0 +1,1252 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <asm/atomic.h>
+
+union ib_gid {
+ u8 raw[16];
+ struct {
+ u64 subnet_prefix;
+ u64 interface_id;
+ } global;
+};
+
+enum ib_node_type {
+ IB_NODE_CA = 1,
+ IB_NODE_SWITCH,
+ IB_NODE_ROUTER
+};
+
+enum ib_device_cap_flags {
+ IB_DEVICE_RESIZE_MAX_WR = 1,
+ IB_DEVICE_BAD_PKEY_CNTR = (1<<1),
+ IB_DEVICE_BAD_QKEY_CNTR = (1<<2),
+ IB_DEVICE_RAW_MULTI = (1<<3),
+ IB_DEVICE_AUTO_PATH_MIG = (1<<4),
+ IB_DEVICE_CHANGE_PHY_PORT = (1<<5),
+ IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6),
+ IB_DEVICE_CURR_QP_STATE_MOD = (1<<7),
+ IB_DEVICE_SHUTDOWN_PORT = (1<<8),
+ IB_DEVICE_INIT_TYPE = (1<<9),
+ IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10),
+ IB_DEVICE_SYS_IMAGE_GUID = (1<<11),
+ IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
+ IB_DEVICE_SRQ_RESIZE = (1<<13),
+ IB_DEVICE_N_NOTIFY_CQ = (1<<14),
+};
+
+enum ib_atomic_cap {
+ IB_ATOMIC_NONE,
+ IB_ATOMIC_HCA,
+ IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+ u64 fw_ver;
+ u64 node_guid;
+ u64 sys_image_guid;
+ u64 max_mr_size;
+ u64 page_size_cap;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ int max_qp;
+ int max_qp_wr;
+ int device_cap_flags;
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_qp_rd_atom;
+ int max_ee_rd_atom;
+ int max_res_rd_atom;
+ int max_qp_init_rd_atom;
+ int max_ee_init_rd_atom;
+ enum ib_atomic_cap atomic_cap;
+ int max_ee;
+ int max_rdd;
+ int max_mw;
+ int max_raw_ipv6_qp;
+ int max_raw_ethy_qp;
+ int max_mcast_grp;
+ int max_mcast_qp_attach;
+ int max_total_mcast_qp_attach;
+ int max_ah;
+ int max_fmr;
+ int max_map_per_fmr;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+ u16 max_pkeys;
+ u8 local_ca_ack_delay;
+};
+
+enum ib_mtu {
+ IB_MTU_256 = 1,
+ IB_MTU_512 = 2,
+ IB_MTU_1024 = 3,
+ IB_MTU_2048 = 4,
+ IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+ switch (mtu) {
+ case IB_MTU_256: return 256;
+ case IB_MTU_512: return 512;
+ case IB_MTU_1024: return 1024;
+ case IB_MTU_2048: return 2048;
+ case IB_MTU_4096: return 4096;
+ default: return -1;
+ }
+}
+
+enum ib_port_state {
+ IB_PORT_NOP = 0,
+ IB_PORT_DOWN = 1,
+ IB_PORT_INIT = 2,
+ IB_PORT_ARMED = 3,
+ IB_PORT_ACTIVE = 4,
+ IB_PORT_ACTIVE_DEFER = 5
+};
+
+enum ib_port_cap_flags {
+ IB_PORT_SM = 1 << 1,
+ IB_PORT_NOTICE_SUP = 1 << 2,
+ IB_PORT_TRAP_SUP = 1 << 3,
+ IB_PORT_OPT_IPD_SUP = 1 << 4,
+ IB_PORT_AUTO_MIGR_SUP = 1 << 5,
+ IB_PORT_SL_MAP_SUP = 1 << 6,
+ IB_PORT_MKEY_NVRAM = 1 << 7,
+ IB_PORT_PKEY_NVRAM = 1 << 8,
+ IB_PORT_LED_INFO_SUP = 1 << 9,
+ IB_PORT_SM_DISABLED = 1 << 10,
+ IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11,
+ IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
+ IB_PORT_CM_SUP = 1 << 16,
+ IB_PORT_SNMP_TUNNEL_SUP = 1 << 17,
+ IB_PORT_REINIT_SUP = 1 << 18,
+ IB_PORT_DEVICE_MGMT_SUP = 1 << 19,
+ IB_PORT_VENDOR_CLASS_SUP = 1 << 20,
+ IB_PORT_DR_NOTICE_SUP = 1 << 21,
+ IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22,
+ IB_PORT_BOOT_MGMT_SUP = 1 << 23,
+ IB_PORT_LINK_LATENCY_SUP = 1 << 24,
+ IB_PORT_CLIENT_REG_SUP = 1 << 25
+};
+
+enum ib_port_width {
+ IB_WIDTH_1X = 1,
+ IB_WIDTH_4X = 2,
+ IB_WIDTH_8X = 4,
+ IB_WIDTH_12X = 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+ switch (width) {
+ case IB_WIDTH_1X: return 1;
+ case IB_WIDTH_4X: return 4;
+ case IB_WIDTH_8X: return 8;
+ case IB_WIDTH_12X: return 12;
+ default: return -1;
+ }
+}
+
+struct ib_port_attr {
+ enum ib_port_state state;
+ enum ib_mtu max_mtu;
+ enum ib_mtu active_mtu;
+ int gid_tbl_len;
+ u32 port_cap_flags;
+ u32 max_msg_sz;
+ u32 bad_pkey_cntr;
+ u32 qkey_viol_cntr;
+ u16 pkey_tbl_len;
+ u16 lid;
+ u16 sm_lid;
+ u8 lmc;
+ u8 max_vl_num;
+ u8 sm_sl;
+ u8 subnet_timeout;
+ u8 init_type_reply;
+ u8 active_width;
+ u8 active_speed;
+ u8 phys_state;
+};
+
+enum ib_device_modify_flags {
+ IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1
+};
+
+struct ib_device_modify {
+ u64 sys_image_guid;
+};
+
+enum ib_port_modify_flags {
+ IB_PORT_SHUTDOWN = 1,
+ IB_PORT_INIT_TYPE = (1<<2),
+ IB_PORT_RESET_QKEY_CNTR = (1<<3)
+};
+
+struct ib_port_modify {
+ u32 set_port_cap_mask;
+ u32 clr_port_cap_mask;
+ u8 init_type;
+};
+
+enum ib_event_type {
+ IB_EVENT_CQ_ERR,
+ IB_EVENT_QP_FATAL,
+ IB_EVENT_QP_REQ_ERR,
+ IB_EVENT_QP_ACCESS_ERR,
+ IB_EVENT_COMM_EST,
+ IB_EVENT_SQ_DRAINED,
+ IB_EVENT_PATH_MIG,
+ IB_EVENT_PATH_MIG_ERR,
+ IB_EVENT_DEVICE_FATAL,
+ IB_EVENT_PORT_ACTIVE,
+ IB_EVENT_PORT_ERR,
+ IB_EVENT_LID_CHANGE,
+ IB_EVENT_PKEY_CHANGE,
+ IB_EVENT_SM_CHANGE
+};
+
+struct ib_event {
+ struct ib_device *device;
+ union {
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ u8 port_num;
+ } element;
+ enum ib_event_type event;
+};
+
+struct ib_event_handler {
+ struct ib_device *device;
+ void (*handler)(struct ib_event_handler *, struct ib_event *);
+ struct list_head list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
+ do { \
+ (_ptr)->device = _device; \
+ (_ptr)->handler = _handler; \
+ INIT_LIST_HEAD(&(_ptr)->list); \
+ } while (0)
+
+struct ib_global_route {
+ union ib_gid dgid;
+ u32 flow_label;
+ u8 sgid_index;
+ u8 hop_limit;
+ u8 traffic_class;
+};
+
+enum {
+ IB_MULTICAST_QPN = 0xffffff
+};
+
+enum ib_ah_flags {
+ IB_AH_GRH = 1
+};
+
+struct ib_ah_attr {
+ struct ib_global_route grh;
+ u16 dlid;
+ u8 sl;
+ u8 src_path_bits;
+ u8 static_rate;
+ u8 ah_flags;
+ u8 port_num;
+};
+
+enum ib_wc_status {
+ IB_WC_SUCCESS,
+ IB_WC_LOC_LEN_ERR,
+ IB_WC_LOC_QP_OP_ERR,
+ IB_WC_LOC_EEC_OP_ERR,
+ IB_WC_LOC_PROT_ERR,
+ IB_WC_WR_FLUSH_ERR,
+ IB_WC_MW_BIND_ERR,
+ IB_WC_BAD_RESP_ERR,
+ IB_WC_LOC_ACCESS_ERR,
+ IB_WC_REM_INV_REQ_ERR,
+ IB_WC_REM_ACCESS_ERR,
+ IB_WC_REM_OP_ERR,
+ IB_WC_RETRY_EXC_ERR,
+ IB_WC_RNR_RETRY_EXC_ERR,
+ IB_WC_LOC_RDD_VIOL_ERR,
+ IB_WC_REM_INV_RD_REQ_ERR,
+ IB_WC_REM_ABORT_ERR,
+ IB_WC_INV_EECN_ERR,
+ IB_WC_INV_EEC_STATE_ERR,
+ IB_WC_FATAL_ERR,
+ IB_WC_RESP_TIMEOUT_ERR,
+ IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+ IB_WC_SEND,
+ IB_WC_RDMA_WRITE,
+ IB_WC_RDMA_READ,
+ IB_WC_COMP_SWAP,
+ IB_WC_FETCH_ADD,
+ IB_WC_BIND_MW,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+ IB_WC_RECV = 1 << 7,
+ IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+ IB_WC_GRH = 1,
+ IB_WC_WITH_IMM = (1<<1)
+};
+
+struct ib_wc {
+ u64 wr_id;
+ enum ib_wc_status status;
+ enum ib_wc_opcode opcode;
+ u32 vendor_err;
+ u32 byte_len;
+ __be32 imm_data;
+ u32 qp_num;
+ u32 src_qp;
+ int wc_flags;
+ u16 pkey_index;
+ u16 slid;
+ u8 sl;
+ u8 dlid_path_bits;
+ u8 port_num; /* valid only for DR SMPs on switches */
+};
+
+enum ib_cq_notify {
+ IB_CQ_SOLICITED,
+ IB_CQ_NEXT_COMP
+};
+
+struct ib_qp_cap {
+ u32 max_send_wr;
+ u32 max_recv_wr;
+ u32 max_send_sge;
+ u32 max_recv_sge;
+ u32 max_inline_data;
+};
+
+enum ib_sig_type {
+ IB_SIGNAL_ALL_WR,
+ IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ IB_QPT_SMI,
+ IB_QPT_GSI,
+
+ IB_QPT_RC,
+ IB_QPT_UC,
+ IB_QPT_UD,
+ IB_QPT_RAW_IPV6,
+ IB_QPT_RAW_ETY
+};
+
+struct ib_qp_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_qp_cap cap;
+ enum ib_sig_type sq_sig_type;
+ enum ib_qp_type qp_type;
+ u8 port_num; /* special QP types only */
+};
+
+enum ib_rnr_timeout {
+ IB_RNR_TIMER_655_36 = 0,
+ IB_RNR_TIMER_000_01 = 1,
+ IB_RNR_TIMER_000_02 = 2,
+ IB_RNR_TIMER_000_03 = 3,
+ IB_RNR_TIMER_000_04 = 4,
+ IB_RNR_TIMER_000_06 = 5,
+ IB_RNR_TIMER_000_08 = 6,
+ IB_RNR_TIMER_000_12 = 7,
+ IB_RNR_TIMER_000_16 = 8,
+ IB_RNR_TIMER_000_24 = 9,
+ IB_RNR_TIMER_000_32 = 10,
+ IB_RNR_TIMER_000_48 = 11,
+ IB_RNR_TIMER_000_64 = 12,
+ IB_RNR_TIMER_000_96 = 13,
+ IB_RNR_TIMER_001_28 = 14,
+ IB_RNR_TIMER_001_92 = 15,
+ IB_RNR_TIMER_002_56 = 16,
+ IB_RNR_TIMER_003_84 = 17,
+ IB_RNR_TIMER_005_12 = 18,
+ IB_RNR_TIMER_007_68 = 19,
+ IB_RNR_TIMER_010_24 = 20,
+ IB_RNR_TIMER_015_36 = 21,
+ IB_RNR_TIMER_020_48 = 22,
+ IB_RNR_TIMER_030_72 = 23,
+ IB_RNR_TIMER_040_96 = 24,
+ IB_RNR_TIMER_061_44 = 25,
+ IB_RNR_TIMER_081_92 = 26,
+ IB_RNR_TIMER_122_88 = 27,
+ IB_RNR_TIMER_163_84 = 28,
+ IB_RNR_TIMER_245_76 = 29,
+ IB_RNR_TIMER_327_68 = 30,
+ IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+ IB_QP_STATE = 1,
+ IB_QP_CUR_STATE = (1<<1),
+ IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2),
+ IB_QP_ACCESS_FLAGS = (1<<3),
+ IB_QP_PKEY_INDEX = (1<<4),
+ IB_QP_PORT = (1<<5),
+ IB_QP_QKEY = (1<<6),
+ IB_QP_AV = (1<<7),
+ IB_QP_PATH_MTU = (1<<8),
+ IB_QP_TIMEOUT = (1<<9),
+ IB_QP_RETRY_CNT = (1<<10),
+ IB_QP_RNR_RETRY = (1<<11),
+ IB_QP_RQ_PSN = (1<<12),
+ IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ IB_QP_ALT_PATH = (1<<14),
+ IB_QP_MIN_RNR_TIMER = (1<<15),
+ IB_QP_SQ_PSN = (1<<16),
+ IB_QP_MAX_DEST_RD_ATOMIC = (1<<17),
+ IB_QP_PATH_MIG_STATE = (1<<18),
+ IB_QP_CAP = (1<<19),
+ IB_QP_DEST_QPN = (1<<20)
+};
+
+enum ib_qp_state {
+ IB_QPS_RESET,
+ IB_QPS_INIT,
+ IB_QPS_RTR,
+ IB_QPS_RTS,
+ IB_QPS_SQD,
+ IB_QPS_SQE,
+ IB_QPS_ERR
+};
+
+enum ib_mig_state {
+ IB_MIG_MIGRATED,
+ IB_MIG_REARM,
+ IB_MIG_ARMED
+};
+
+struct ib_qp_attr {
+ enum ib_qp_state qp_state;
+ enum ib_qp_state cur_qp_state;
+ enum ib_mtu path_mtu;
+ enum ib_mig_state path_mig_state;
+ u32 qkey;
+ u32 rq_psn;
+ u32 sq_psn;
+ u32 dest_qp_num;
+ int qp_access_flags;
+ struct ib_qp_cap cap;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ u16 pkey_index;
+ u16 alt_pkey_index;
+ u8 en_sqd_async_notify;
+ u8 sq_draining;
+ u8 max_rd_atomic;
+ u8 max_dest_rd_atomic;
+ u8 min_rnr_timer;
+ u8 port_num;
+ u8 timeout;
+ u8 retry_cnt;
+ u8 rnr_retry;
+ u8 alt_port_num;
+ u8 alt_timeout;
+};
+
+enum ib_wr_opcode {
+ IB_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND,
+ IB_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ib_send_flags {
+ IB_SEND_FENCE = 1,
+ IB_SEND_SIGNALED = (1<<1),
+ IB_SEND_SOLICITED = (1<<2),
+ IB_SEND_INLINE = (1<<3)
+};
+
+struct ib_sge {
+ u64 addr;
+ u32 length;
+ u32 lkey;
+};
+
+struct ib_send_wr {
+ struct ib_send_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+ enum ib_wr_opcode opcode;
+ int send_flags;
+ u32 imm_data;
+ union {
+ struct {
+ u64 remote_addr;
+ u32 rkey;
+ } rdma;
+ struct {
+ u64 remote_addr;
+ u64 compare_add;
+ u64 swap;
+ u32 rkey;
+ } atomic;
+ struct {
+ struct ib_ah *ah;
+ struct ib_mad_hdr *mad_hdr;
+ u32 remote_qpn;
+ u32 remote_qkey;
+ int timeout_ms; /* valid for MADs only */
+ u16 pkey_index; /* valid for GSI only */
+ u8 port_num; /* valid for DR SMPs on switch only */
+ } ud;
+ } wr;
+};
+
+struct ib_recv_wr {
+ struct ib_recv_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+};
+
+enum ib_access_flags {
+ IB_ACCESS_LOCAL_WRITE = 1,
+ IB_ACCESS_REMOTE_WRITE = (1<<1),
+ IB_ACCESS_REMOTE_READ = (1<<2),
+ IB_ACCESS_REMOTE_ATOMIC = (1<<3),
+ IB_ACCESS_MW_BIND = (1<<4)
+};
+
+struct ib_phys_buf {
+ u64 addr;
+ u64 size;
+};
+
+struct ib_mr_attr {
+ struct ib_pd *pd;
+ u64 device_virt_addr;
+ u64 size;
+ int mr_access_flags;
+ u32 lkey;
+ u32 rkey;
+};
+
+enum ib_mr_rereg_flags {
+ IB_MR_REREG_TRANS = 1,
+ IB_MR_REREG_PD = (1<<1),
+ IB_MR_REREG_ACCESS = (1<<2)
+};
+
+struct ib_mw_bind {
+ struct ib_mr *mr;
+ u64 wr_id;
+ u64 addr;
+ u32 length;
+ int send_flags;
+ int mw_access_flags;
+};
+
+struct ib_fmr_attr {
+ int max_pages;
+ int max_maps;
+ u8 page_size;
+};
+
+struct ib_pd {
+ struct ib_device *device;
+ atomic_t usecnt; /* count all resources */
+};
+
+struct ib_ah {
+ struct ib_device *device;
+ struct ib_pd *pd;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+struct ib_cq {
+ struct ib_device *device;
+ ib_comp_handler comp_handler;
+ void (*event_handler)(struct ib_event *, void *);
+ void * cq_context;
+ int cqe;
+ atomic_t usecnt; /* count number of work queues */
+};
+
+struct ib_srq {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ void *srq_context;
+ atomic_t usecnt;
+};
+
+struct ib_qp {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+};
+
+struct ib_mr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ u32 lkey;
+ u32 rkey;
+ atomic_t usecnt; /* count number of MWs */
+};
+
+struct ib_mw {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ u32 rkey;
+};
+
+struct ib_fmr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct list_head list;
+ u32 lkey;
+ u32 rkey;
+};
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {
+ IB_MAD_IGNORE_MKEY = 1,
+ IB_MAD_IGNORE_BKEY = 2,
+ IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+ IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */
+ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */
+ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */
+ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64
+
+struct ib_cache {
+ rwlock_t lock;
+ struct ib_event_handler event_handler;
+ struct ib_pkey_cache **pkey_cache;
+ struct ib_gid_cache **gid_cache;
+};
+
+struct ib_device {
+ struct device *dma_device;
+
+ char name[IB_DEVICE_NAME_MAX];
+
+ struct list_head event_handler_list;
+ spinlock_t event_handler_lock;
+
+ struct list_head core_list;
+ struct list_head client_data_list;
+ spinlock_t client_data_lock;
+
+ struct ib_cache cache;
+
+ u32 flags;
+
+ int (*query_device)(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+ int (*query_port)(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr);
+ int (*query_gid)(struct ib_device *device,
+ u8 port_num, int index,
+ union ib_gid *gid);
+ int (*query_pkey)(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+ int (*modify_device)(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+ int (*modify_port)(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+ struct ib_pd * (*alloc_pd)(struct ib_device *device);
+ int (*dealloc_pd)(struct ib_pd *pd);
+ struct ib_ah * (*create_ah)(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr);
+ int (*modify_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*query_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*destroy_ah)(struct ib_ah *ah);
+ struct ib_qp * (*create_qp)(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+ int (*modify_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
+ int (*query_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+ int (*destroy_qp)(struct ib_qp *qp);
+ int (*post_send)(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+ int (*post_recv)(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+ struct ib_cq * (*create_cq)(struct ib_device *device,
+ int cqe);
+ int (*destroy_cq)(struct ib_cq *cq);
+ int (*resize_cq)(struct ib_cq *cq, int *cqe);
+ int (*poll_cq)(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc);
+ int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+ int (*req_notify_cq)(struct ib_cq *cq,
+ enum ib_cq_notify cq_notify);
+ int (*req_ncomp_notif)(struct ib_cq *cq,
+ int wc_cnt);
+ struct ib_mr * (*get_dma_mr)(struct ib_pd *pd,
+ int mr_access_flags);
+ struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ int (*query_mr)(struct ib_mr *mr,
+ struct ib_mr_attr *mr_attr);
+ int (*dereg_mr)(struct ib_mr *mr);
+ int (*rereg_phys_mr)(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ struct ib_mw * (*alloc_mw)(struct ib_pd *pd);
+ int (*bind_mw)(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+ int (*dealloc_mw)(struct ib_mw *mw);
+ struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+ int (*map_phys_fmr)(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova);
+ int (*unmap_fmr)(struct list_head *fmr_list);
+ int (*dealloc_fmr)(struct ib_fmr *fmr);
+ int (*attach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*detach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*process_mad)(struct ib_device *device,
+ int process_mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+
+ struct class_device class_dev;
+ struct kobject ports_parent;
+ struct list_head port_list;
+
+ enum {
+ IB_DEV_UNINITIALIZED,
+ IB_DEV_REGISTERED,
+ IB_DEV_UNREGISTERED
+ } reg_state;
+
+ u8 node_type;
+ u8 phys_port_cnt;
+};
+
+struct ib_client {
+ char *name;
+ void (*add) (struct ib_device *);
+ void (*remove)(struct ib_device *);
+
+ struct list_head list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device (struct ib_device *device);
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data);
+
+int ib_register_event_handler (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
+
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+ u8 port_num, struct ib_port_attr *port_attr);
+
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ * handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ * address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ * handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ * handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ * transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ * specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ return qp->device->post_send(qp, send_wr, bad_send_wr);
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe);
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc)
+{
+ return cq->device->poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ * on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @cq_notify: If set to %IB_CQ_SOLICITED, completion notification will
+ * occur on the next solicited event. If set to %IB_CQ_NEXT_COMP,
+ * notification will occur on the next completion.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify cq_notify)
+{
+ return cq->device->req_notify_cq(cq, cq_notify);
+}
+
+/**
+ * ib_req_ncomp_notif - Request completion notification when there are
+ * at least the specified number of unreaped completions on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: The number of unreaped completions that should be on the
+ * CQ before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+ return cq->device->req_ncomp_notif ?
+ cq->device->req_ncomp_notif(cq, wc_cnt) :
+ -ENOSYS;
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ * usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ * by an HCA.
+ * @pd: The protection domain associated assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ * memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ * Conceptually, this call performs the functions deregister memory region
+ * followed by register physical memory region. Where possible,
+ * resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ * properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ * the new protection domain to associated with the memory region,
+ * otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies a list of physical buffers to use in the new
+ * translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies the size of the phys_buf_array, otherwise, this
+ * parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ * field specifies the new memory access rights, otherwise, this
+ * parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ * HCA translation table.
+ * @mr: The memory region to deregister.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ * QP, which binds the memory window to the given address range and
+ * remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ * its address range, remote access rights, and associated memory region.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ /* XXX reference counting in corresponding MR? */
+ return mw->device->bind_mw ?
+ mw->device->bind_mw(qp, mw, mw_bind) :
+ -ENOSYS;
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates a unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova)
+{
+ return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group. The QP must be type
+ * IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately. The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+#endif /* IB_VERBS_H */
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 00000000000..8d2e04cac68
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,33 @@
+config INFINIBAND_IPOIB
+ tristate "IP-over-InfiniBand"
+ depends on INFINIBAND && NETDEVICES && INET
+ ---help---
+ Support for the IP-over-InfiniBand protocol (IPoIB). This
+ transports IP packets over InfiniBand so you can use your IB
+ device as a fancy NIC.
+
+ The IPoIB protocol is defined by the IETF ipoib working
+ group: <http://www.ietf.org/html.charters/ipoib-charter.html>.
+
+config INFINIBAND_IPOIB_DEBUG
+ bool "IP-over-InfiniBand debugging"
+ depends on INFINIBAND_IPOIB
+ ---help---
+ This option causes debugging code to be compiled into the
+ IPoIB driver. The output can be turned on via the
+ debug_level and mcast_debug_level module parameters (which
+ can also be set after the driver is loaded through sysfs).
+
+ This option also creates an "ipoib_debugfs," which can be
+ mounted to expose debugging information about IB multicast
+ groups used by the IPoIB driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+ bool "IP-over-InfiniBand data path debugging"
+ depends on INFINIBAND_IPOIB_DEBUG
+ ---help---
+ This option compiles debugging code into the the data path
+ of the IPoIB driver. The output can be turned on via the
+ data_debug_level module parameter; however, even with output
+ turned off, this debugging code will have some performance
+ impact.
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 00000000000..394bc08abc6
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,11 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+
+obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o
+
+ib_ipoib-y := ipoib_main.o \
+ ipoib_ib.o \
+ ipoib_multicast.o \
+ ipoib_verbs.o \
+ ipoib_vlan.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 00000000000..04c98f54e9c
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/pci.h>
+#include <linux/config.h>
+#include <linux/kref.h>
+#include <linux/if_infiniband.h>
+
+#include <net/neighbour.h>
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+#include <ib_verbs.h>
+#include <ib_pack.h>
+#include <ib_sa.h>
+
+/* constants */
+
+enum {
+ IPOIB_PACKET_SIZE = 2048,
+ IPOIB_BUF_SIZE = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
+
+ IPOIB_ENCAP_LEN = 4,
+
+ IPOIB_RX_RING_SIZE = 128,
+ IPOIB_TX_RING_SIZE = 64,
+
+ IPOIB_NUM_WC = 4,
+
+ IPOIB_MAX_PATH_REC_QUEUE = 3,
+ IPOIB_MAX_MCAST_QUEUE = 3,
+
+ IPOIB_FLAG_OPER_UP = 0,
+ IPOIB_FLAG_ADMIN_UP = 1,
+ IPOIB_PKEY_ASSIGNED = 2,
+ IPOIB_PKEY_STOP = 3,
+ IPOIB_FLAG_SUBINTERFACE = 4,
+ IPOIB_MCAST_RUN = 5,
+ IPOIB_STOP_REAPER = 6,
+
+ IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+ IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
+ IPOIB_MCAST_FLAG_SENDONLY = 1,
+ IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
+ IPOIB_MCAST_FLAG_ATTACHED = 3,
+};
+
+/* structs */
+
+struct ipoib_header {
+ u16 proto;
+ u16 reserved;
+};
+
+struct ipoib_pseudoheader {
+ u8 hwaddr[INFINIBAND_ALEN];
+};
+
+struct ipoib_mcast;
+
+struct ipoib_buf {
+ struct sk_buff *skb;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+/*
+ * Device private locking: tx_lock protects members used in TX fast
+ * path (and we use LLTX so upper layers don't do extra locking).
+ * lock protects everything else. lock nests inside of tx_lock (ie
+ * tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+ spinlock_t lock;
+
+ struct net_device *dev;
+
+ unsigned long flags;
+
+ struct semaphore mcast_mutex;
+ struct semaphore vlan_mutex;
+
+ struct rb_root path_tree;
+ struct list_head path_list;
+
+ struct ipoib_mcast *broadcast;
+ struct list_head multicast_list;
+ struct rb_root multicast_tree;
+
+ struct work_struct pkey_task;
+ struct work_struct mcast_task;
+ struct work_struct flush_task;
+ struct work_struct restart_task;
+ struct work_struct ah_reap_task;
+
+ struct ib_device *ca;
+ u8 port;
+ u16 pkey;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ u32 qkey;
+
+ union ib_gid local_gid;
+ u16 local_lid;
+ u8 local_rate;
+
+ unsigned int admin_mtu;
+ unsigned int mcast_mtu;
+
+ struct ipoib_buf *rx_ring;
+
+ spinlock_t tx_lock;
+ struct ipoib_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ struct ib_sge tx_sge;
+ struct ib_send_wr tx_wr;
+
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+
+ struct list_head dead_ahs;
+
+ struct ib_event_handler event_handler;
+
+ struct net_device_stats stats;
+
+ struct net_device *parent;
+ struct list_head child_intfs;
+ struct list_head list;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+ struct list_head fs_list;
+ struct dentry *mcg_dentry;
+#endif
+};
+
+struct ipoib_ah {
+ struct net_device *dev;
+ struct ib_ah *ah;
+ struct list_head list;
+ struct kref ref;
+ unsigned last_send;
+};
+
+struct ipoib_path {
+ struct net_device *dev;
+ struct ib_sa_path_rec pathrec;
+ struct ipoib_ah *ah;
+ struct sk_buff_head queue;
+
+ struct list_head neigh_list;
+
+ int query_id;
+ struct ib_sa_query *query;
+ struct completion done;
+
+ struct rb_node rb_node;
+ struct list_head list;
+};
+
+struct ipoib_neigh {
+ struct ipoib_ah *ah;
+ struct sk_buff_head queue;
+
+ struct neighbour *neighbour;
+
+ struct list_head list;
+};
+
+static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh)
+{
+ return (struct ipoib_neigh **) (neigh->ha + 24 -
+ (offsetof(struct neighbour, ha) & 4));
+}
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/* functions */
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+ struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+ kref_put(&ah->ref, ipoib_free_ah);
+}
+
+int ipoib_add_pkey_attr(struct net_device *dev);
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(void *dev_ptr);
+
+void ipoib_flush_paths(struct net_device *dev);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_ib_dev_flush(void *dev);
+void ipoib_ib_dev_cleanup(struct net_device *dev);
+
+int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_up(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct net_device *dev);
+
+void ipoib_mcast_join_task(void *dev_ptr);
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+ struct sk_buff *skb);
+
+void ipoib_mcast_restart_task(void *dev_ptr);
+int ipoib_mcast_start_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev);
+
+void ipoib_mcast_dev_down(struct net_device *dev);
+void ipoib_mcast_dev_flush(struct net_device *dev);
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *gid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only);
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
+ union ib_gid *mgid);
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid,
+ union ib_gid *mgid);
+
+int ipoib_qp_create(struct net_device *dev);
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct net_device *dev);
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+
+void ipoib_pkey_poll(void *dev);
+int ipoib_pkey_dev_delay_open(struct net_device *dev);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_create_debug_file(struct net_device *dev);
+void ipoib_delete_debug_file(struct net_device *dev);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline int ipoib_create_debug_file(struct net_device *dev) { return 0; }
+static inline void ipoib_delete_debug_file(struct net_device *dev) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+
+#define ipoib_printk(level, priv, format, arg...) \
+ printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
+#define ipoib_warn(priv, format, arg...) \
+ ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...) \
+ do { \
+ if (ipoib_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { \
+ if (mcast_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { \
+ if (data_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+
+#define IPOIB_GID_FMT "%x:%x:%x:%x:%x:%x:%x:%x"
+
+#define IPOIB_GID_ARG(gid) be16_to_cpup((__be16 *) ((gid).raw + 0)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 2)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 4)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 6)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 8)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 10)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 12)), \
+ be16_to_cpup((__be16 *) ((gid).raw + 14))
+
+#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 00000000000..044f2c78ef1
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $
+ */
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include "ipoib.h"
+
+enum {
+ IPOIB_MAGIC = 0x49504942 /* "IPIB" */
+};
+
+static DECLARE_MUTEX(ipoib_fs_mutex);
+static struct dentry *ipoib_root;
+static struct super_block *ipoib_sb;
+static LIST_HEAD(ipoib_device_list);
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter;
+ loff_t n = *pos;
+
+ iter = ipoib_mcast_iter_init(file->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (ipoib_mcast_iter_next(iter)) {
+ ipoib_mcast_iter_free(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+ loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (ipoib_mcast_iter_next(iter)) {
+ ipoib_mcast_iter_free(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+ char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+ union ib_gid mgid;
+ int i, n;
+ unsigned long created;
+ unsigned int queuelen, complete, send_only;
+
+ if (iter) {
+ ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+ &complete, &send_only);
+
+ for (n = 0, i = 0; i < sizeof mgid / 2; ++i) {
+ n += sprintf(gid_buf + n, "%x",
+ be16_to_cpu(((u16 *)mgid.raw)[i]));
+ if (i < sizeof mgid / 2 - 1)
+ gid_buf[n++] = ':';
+ }
+ }
+
+ seq_printf(file, "GID: %*s", -(1 + (int) sizeof gid_buf), gid_buf);
+
+ seq_printf(file,
+ " created: %10ld queuelen: %4d complete: %d send_only: %d\n",
+ created, queuelen, complete, send_only);
+
+ return 0;
+}
+
+static struct seq_operations ipoib_seq_ops = {
+ .start = ipoib_mcg_seq_start,
+ .next = ipoib_mcg_seq_next,
+ .stop = ipoib_mcg_seq_stop,
+ .show = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &ipoib_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->u.generic_ip;
+
+ return 0;
+}
+
+static struct file_operations ipoib_fops = {
+ .owner = THIS_MODULE,
+ .open = ipoib_mcg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static struct inode *ipoib_get_inode(void)
+{
+ struct inode *inode = new_inode(ipoib_sb);
+
+ if (inode) {
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_fop = &ipoib_fops;
+ }
+
+ return inode;
+}
+
+static int __ipoib_create_debug_file(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct dentry *dentry;
+ struct inode *inode;
+ char name[IFNAMSIZ + sizeof "_mcg"];
+
+ snprintf(name, sizeof name, "%s_mcg", dev->name);
+
+ dentry = d_alloc_name(ipoib_root, name);
+ if (!dentry)
+ return -ENOMEM;
+
+ inode = ipoib_get_inode();
+ if (!inode) {
+ dput(dentry);
+ return -ENOMEM;
+ }
+
+ inode->u.generic_ip = dev;
+ priv->mcg_dentry = dentry;
+
+ d_add(dentry, inode);
+
+ return 0;
+}
+
+int ipoib_create_debug_file(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ down(&ipoib_fs_mutex);
+
+ list_add_tail(&priv->fs_list, &ipoib_device_list);
+
+ if (!ipoib_sb) {
+ up(&ipoib_fs_mutex);
+ return 0;
+ }
+
+ up(&ipoib_fs_mutex);
+
+ return __ipoib_create_debug_file(dev);
+}
+
+void ipoib_delete_debug_file(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ down(&ipoib_fs_mutex);
+ list_del(&priv->fs_list);
+ if (!ipoib_sb) {
+ up(&ipoib_fs_mutex);
+ return;
+ }
+ up(&ipoib_fs_mutex);
+
+ if (priv->mcg_dentry) {
+ d_drop(priv->mcg_dentry);
+ simple_unlink(ipoib_root->d_inode, priv->mcg_dentry);
+ }
+}
+
+static int ipoib_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr ipoib_files[] = {
+ { "" }
+ };
+ struct ipoib_dev_priv *priv;
+ int ret;
+
+ ret = simple_fill_super(sb, IPOIB_MAGIC, ipoib_files);
+ if (ret)
+ return ret;
+
+ ipoib_root = sb->s_root;
+
+ down(&ipoib_fs_mutex);
+
+ ipoib_sb = sb;
+
+ list_for_each_entry(priv, &ipoib_device_list, fs_list) {
+ ret = __ipoib_create_debug_file(priv->dev);
+ if (ret)
+ break;
+ }
+
+ up(&ipoib_fs_mutex);
+
+ return ret;
+}
+
+static struct super_block *ipoib_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return get_sb_single(fs_type, flags, data, ipoib_fill_super);
+}
+
+static void ipoib_kill_sb(struct super_block *sb)
+{
+ down(&ipoib_fs_mutex);
+ ipoib_sb = NULL;
+ up(&ipoib_fs_mutex);
+
+ kill_litter_super(sb);
+}
+
+static struct file_system_type ipoib_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ipoib_debugfs",
+ .get_sb = ipoib_get_sb,
+ .kill_sb = ipoib_kill_sb,
+};
+
+int ipoib_register_debugfs(void)
+{
+ return register_filesystem(&ipoib_fs_type);
+}
+
+void ipoib_unregister_debugfs(void)
+{
+ unregister_filesystem(&ipoib_fs_type);
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 00000000000..c5a1d45e0ac
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
+ */
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+ "Enable data path debug tracing if > 0");
+#endif
+
+#define IPOIB_OP_RECV (1ul << 31)
+
+static DECLARE_MUTEX(pkey_sem);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+ struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+ struct ipoib_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_KERNEL);
+ if (!ah)
+ return NULL;
+
+ ah->dev = dev;
+ ah->last_send = 0;
+ kref_init(&ah->ref);
+
+ ah->ah = ib_create_ah(pd, attr);
+ if (IS_ERR(ah->ah)) {
+ kfree(ah);
+ ah = NULL;
+ } else
+ ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+
+ return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+ struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+ struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
+
+ unsigned long flags;
+
+ if (ah->last_send <= priv->tx_tail) {
+ ipoib_dbg(priv, "Freeing ah %p\n", ah->ah);
+ ib_destroy_ah(ah->ah);
+ kfree(ah);
+ } else {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_add_tail(&ah->list, &priv->dead_ahs);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+}
+
+static inline int ipoib_ib_receive(struct ipoib_dev_priv *priv,
+ unsigned int wr_id,
+ dma_addr_t addr)
+{
+ struct ib_sge list = {
+ .addr = addr,
+ .length = IPOIB_BUF_SIZE,
+ .lkey = priv->mr->lkey,
+ };
+ struct ib_recv_wr param = {
+ .wr_id = wr_id | IPOIB_OP_RECV,
+ .sg_list = &list,
+ .num_sge = 1,
+ };
+ struct ib_recv_wr *bad_wr;
+
+ return ib_post_recv(priv->qp, &param, &bad_wr);
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sk_buff *skb;
+ dma_addr_t addr;
+ int ret;
+
+ skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
+ if (!skb) {
+ ipoib_warn(priv, "failed to allocate receive buffer\n");
+
+ priv->rx_ring[id].skb = NULL;
+ return -ENOMEM;
+ }
+ skb_reserve(skb, 4); /* 16 byte align IP header */
+ priv->rx_ring[id].skb = skb;
+ addr = dma_map_single(priv->ca->dma_device,
+ skb->data, IPOIB_BUF_SIZE,
+ DMA_FROM_DEVICE);
+ pci_unmap_addr_set(&priv->rx_ring[id], mapping, addr);
+
+ ret = ipoib_ib_receive(priv, id, addr);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_receive failed for buf %d (%d)\n",
+ id, ret);
+ dma_unmap_single(priv->ca->dma_device, addr,
+ IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+ dev_kfree_skb_any(skb);
+ priv->rx_ring[id].skb = NULL;
+ }
+
+ return ret;
+}
+
+static int ipoib_ib_post_receives(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+ if (ipoib_ib_post_receive(dev, i)) {
+ ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static void ipoib_ib_handle_wc(struct net_device *dev,
+ struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id;
+
+ ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+ wr_id, wc->opcode, wc->status);
+
+ if (wr_id & IPOIB_OP_RECV) {
+ wr_id &= ~IPOIB_OP_RECV;
+
+ if (wr_id < IPOIB_RX_RING_SIZE) {
+ struct sk_buff *skb = priv->rx_ring[wr_id].skb;
+
+ priv->rx_ring[wr_id].skb = NULL;
+
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(&priv->rx_ring[wr_id],
+ mapping),
+ IPOIB_BUF_SIZE,
+ DMA_FROM_DEVICE);
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed recv event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ skb_put(skb, wc->byte_len);
+ skb_pull(skb, IB_GRH_BYTES);
+
+ if (wc->slid != priv->local_lid ||
+ wc->src_qp != priv->qp->qp_num) {
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+
+ skb_pull(skb, IPOIB_ENCAP_LEN);
+
+ dev->last_rx = jiffies;
+ ++priv->stats.rx_packets;
+ priv->stats.rx_bytes += skb->len;
+
+ skb->dev = dev;
+ /* XXX get correct PACKET_ type here */
+ skb->pkt_type = PACKET_HOST;
+ netif_rx_ni(skb);
+ } else {
+ ipoib_dbg_data(priv, "dropping loopback packet\n");
+ dev_kfree_skb_any(skb);
+ }
+
+ /* repost receive */
+ if (ipoib_ib_post_receive(dev, wr_id))
+ ipoib_warn(priv, "ipoib_ib_post_receive failed "
+ "for buf %d\n", wr_id);
+ } else
+ ipoib_warn(priv, "completion event with wrid %d\n",
+ wr_id);
+
+ } else {
+ struct ipoib_buf *tx_req;
+ unsigned long flags;
+
+ if (wr_id >= IPOIB_TX_RING_SIZE) {
+ ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+ wr_id, IPOIB_TX_RING_SIZE);
+ return;
+ }
+
+ ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+
+ tx_req = &priv->tx_ring[wr_id];
+
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(tx_req, mapping),
+ tx_req->skb->len,
+ DMA_TO_DEVICE);
+
+ ++priv->stats.tx_packets;
+ priv->stats.tx_bytes += tx_req->skb->len;
+
+ dev_kfree_skb_any(tx_req->skb);
+
+ spin_lock_irqsave(&priv->tx_lock, flags);
+ ++priv->tx_tail;
+ if (netif_queue_stopped(dev) &&
+ priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 2)
+ netif_wake_queue(dev);
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ }
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+ struct net_device *dev = (struct net_device *) dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int n, i;
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ do {
+ n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+ for (i = 0; i < n; ++i)
+ ipoib_ib_handle_wc(dev, priv->ibwc + i);
+ } while (n == IPOIB_NUM_WC);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+ unsigned int wr_id,
+ struct ib_ah *address, u32 qpn,
+ dma_addr_t addr, int len)
+{
+ struct ib_send_wr *bad_wr;
+
+ priv->tx_sge.addr = addr;
+ priv->tx_sge.length = len;
+
+ priv->tx_wr.wr_id = wr_id;
+ priv->tx_wr.wr.ud.remote_qpn = qpn;
+ priv->tx_wr.wr.ud.ah = address;
+
+ return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_ah *address, u32 qpn)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_buf *tx_req;
+ dma_addr_t addr;
+
+ if (skb->len > dev->mtu + INFINIBAND_ALEN) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ skb->len, dev->mtu + INFINIBAND_ALEN);
+ ++priv->stats.tx_dropped;
+ ++priv->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+ skb->len, address, qpn);
+
+ /*
+ * We put the skb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)];
+ tx_req->skb = skb;
+ addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len,
+ DMA_TO_DEVICE);
+ pci_unmap_addr_set(tx_req, mapping, addr);
+
+ if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 1),
+ address->ah, qpn, addr, skb->len))) {
+ ipoib_warn(priv, "post_send failed\n");
+ ++priv->stats.tx_errors;
+ dma_unmap_single(priv->ca->dma_device, addr, skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(skb);
+ } else {
+ dev->trans_start = jiffies;
+
+ address->last_send = priv->tx_head;
+ ++priv->tx_head;
+
+ if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {
+ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+ netif_stop_queue(dev);
+ }
+ }
+}
+
+static void __ipoib_reap_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_ah *ah, *tah;
+ LIST_HEAD(remove_list);
+
+ spin_lock_irq(&priv->lock);
+ list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+ if (ah->last_send <= priv->tx_tail) {
+ list_del(&ah->list);
+ list_add_tail(&ah->list, &remove_list);
+ }
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(ah, tah, &remove_list, list) {
+ ipoib_dbg(priv, "Reaping ah %p\n", ah->ah);
+ ib_destroy_ah(ah->ah);
+ kfree(ah);
+ }
+}
+
+void ipoib_reap_ah(void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ __ipoib_reap_ah(dev);
+
+ if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+}
+
+int ipoib_ib_dev_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ ret = ipoib_qp_create(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_qp_create returned %d\n", ret);
+ return -1;
+ }
+
+ ret = ipoib_ib_post_receives(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+ return -1;
+ }
+
+ clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+
+ return 0;
+}
+
+int ipoib_ib_dev_up(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+ return ipoib_mcast_start_thread(dev);
+}
+
+int ipoib_ib_dev_down(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "downing ib_dev\n");
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+ netif_carrier_off(dev);
+
+ /* Shutdown the P_Key thread if still active */
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ down(&pkey_sem);
+ set_bit(IPOIB_PKEY_STOP, &priv->flags);
+ cancel_delayed_work(&priv->pkey_task);
+ up(&pkey_sem);
+ flush_workqueue(ipoib_workqueue);
+ }
+
+ ipoib_mcast_stop_thread(dev);
+
+ /*
+ * Flush the multicast groups first so we stop any multicast joins. The
+ * completion thread may have already died and we may deadlock waiting
+ * for the completion thread to finish some multicast joins.
+ */
+ ipoib_mcast_dev_flush(dev);
+
+ /* Delete broadcast and local addresses since they will be recreated */
+ ipoib_mcast_dev_down(dev);
+
+ ipoib_flush_paths(dev);
+
+ return 0;
+}
+
+static int recvs_pending(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int pending = 0;
+ int i;
+
+ for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+ if (priv->rx_ring[i].skb)
+ ++pending;
+
+ return pending;
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+ unsigned long begin;
+ struct ipoib_buf *tx_req;
+ int i;
+
+ /* Kill the existing QP and allocate a new one */
+ qp_attr.qp_state = IB_QPS_ERR;
+ attr_mask = IB_QP_STATE;
+ if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+ ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+ /* Wait for all sends and receives to complete */
+ begin = jiffies;
+
+ while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+ priv->tx_head - priv->tx_tail, recvs_pending(dev));
+
+ /*
+ * assume the HW is wedged and just free up
+ * all our pending work requests.
+ */
+ while (priv->tx_tail < priv->tx_head) {
+ tx_req = &priv->tx_ring[priv->tx_tail &
+ (IPOIB_TX_RING_SIZE - 1)];
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(tx_req, mapping),
+ tx_req->skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(tx_req->skb);
+ ++priv->tx_tail;
+ }
+
+ for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+ if (priv->rx_ring[i].skb) {
+ dma_unmap_single(priv->ca->dma_device,
+ pci_unmap_addr(&priv->rx_ring[i],
+ mapping),
+ IPOIB_BUF_SIZE,
+ DMA_FROM_DEVICE);
+ dev_kfree_skb_any(priv->rx_ring[i].skb);
+ priv->rx_ring[i].skb = NULL;
+ }
+
+ goto timeout;
+ }
+
+ msleep(1);
+ }
+
+ ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+ qp_attr.qp_state = IB_QPS_RESET;
+ attr_mask = IB_QP_STATE;
+ if (ib_modify_qp(priv->qp, &qp_attr, attr_mask))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+ /* Wait for all AHs to be reaped */
+ set_bit(IPOIB_STOP_REAPER, &priv->flags);
+ cancel_delayed_work(&priv->ah_reap_task);
+ flush_workqueue(ipoib_workqueue);
+
+ begin = jiffies;
+
+ while (!list_empty(&priv->dead_ahs)) {
+ __ipoib_reap_ah(dev);
+
+ if (time_after(jiffies, begin + HZ)) {
+ ipoib_warn(priv, "timing out; will leak address handles\n");
+ break;
+ }
+
+ msleep(1);
+ }
+
+ return 0;
+}
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ priv->ca = ca;
+ priv->port = port;
+ priv->qp = NULL;
+
+ if (ipoib_transport_dev_init(dev, ca)) {
+ printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+ return -ENODEV;
+ }
+
+ if (dev->flags & IFF_UP) {
+ if (ipoib_ib_dev_open(dev)) {
+ ipoib_transport_dev_cleanup(dev);
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+void ipoib_ib_dev_flush(void *_dev)
+{
+ struct net_device *dev = (struct net_device *)_dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv;
+
+ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ return;
+
+ ipoib_dbg(priv, "flushing\n");
+
+ ipoib_ib_dev_down(dev);
+
+ /*
+ * The device could have been brought down between the start and when
+ * we get here, don't bring it back up if it's not configured up
+ */
+ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ ipoib_ib_dev_up(dev);
+
+ /* Flush any child interfaces too */
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ ipoib_ib_dev_flush(&cpriv->dev);
+}
+
+void ipoib_ib_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "cleaning up ib_dev\n");
+
+ ipoib_mcast_stop_thread(dev);
+
+ /* Delete the broadcast address and the local address */
+ ipoib_mcast_dev_down(dev);
+
+ ipoib_transport_dev_cleanup(dev);
+}
+
+/*
+ * Delayed P_Key Assigment Interim Support
+ *
+ * The following is initial implementation of delayed P_Key assigment
+ * mechanism. It is using the same approach implemented for the multicast
+ * group join. The single goal of this implementation is to quickly address
+ * Bug #2507. This implementation will probably be removed when the P_Key
+ * change async notification is available.
+ */
+int ipoib_open(struct net_device *dev);
+
+static void ipoib_pkey_dev_check_presence(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ u16 pkey_index = 0;
+
+ if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ else
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+void ipoib_pkey_poll(void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_pkey_dev_check_presence(dev);
+
+ if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ ipoib_open(dev);
+ else {
+ down(&pkey_sem);
+ if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->pkey_task,
+ HZ);
+ up(&pkey_sem);
+ }
+}
+
+int ipoib_pkey_dev_delay_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* Look for the interface pkey value in the IB Port P_Key table and */
+ /* set the interface pkey assigment flag */
+ ipoib_pkey_dev_check_presence(dev);
+
+ /* P_Key value not assigned yet - start polling */
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ down(&pkey_sem);
+ clear_bit(IPOIB_PKEY_STOP, &priv->flags);
+ queue_delayed_work(ipoib_workqueue,
+ &priv->pkey_task,
+ HZ);
+ up(&pkey_sem);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 00000000000..5a3b5c6a449
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
+ */
+
+#include "ipoib.h"
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h> /* For ARPHRD_xxx */
+
+#include <linux/ip.h>
+#include <linux/in.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+static const u8 ipv4_bcast_addr[] = {
+ 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+
+static struct ib_client ipoib_client = {
+ .name = "ipoib",
+ .add = ipoib_add_one,
+ .remove = ipoib_remove_one
+};
+
+int ipoib_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "bringing up interface\n");
+
+ set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ if (ipoib_pkey_dev_delay_open(dev))
+ return 0;
+
+ if (ipoib_ib_dev_open(dev))
+ return -EINVAL;
+
+ if (ipoib_ib_dev_up(dev))
+ return -EINVAL;
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring up any child interfaces too */
+ down(&priv->vlan_mutex);
+ list_for_each_entry(cpriv, &priv->child_intfs, list) {
+ int flags;
+
+ flags = cpriv->dev->flags;
+ if (flags & IFF_UP)
+ continue;
+
+ dev_change_flags(cpriv->dev, flags | IFF_UP);
+ }
+ up(&priv->vlan_mutex);
+ }
+
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+static int ipoib_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "stopping interface\n");
+
+ clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ netif_stop_queue(dev);
+
+ ipoib_ib_dev_down(dev);
+ ipoib_ib_dev_stop(dev);
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring down any child interfaces too */
+ down(&priv->vlan_mutex);
+ list_for_each_entry(cpriv, &priv->child_intfs, list) {
+ int flags;
+
+ flags = cpriv->dev->flags;
+ if (!(flags & IFF_UP))
+ continue;
+
+ dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+ }
+ up(&priv->vlan_mutex);
+ }
+
+ return 0;
+}
+
+static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+ return -EINVAL;
+
+ priv->admin_mtu = new_mtu;
+
+ dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+ return 0;
+}
+
+static struct ipoib_path *__path_find(struct net_device *dev,
+ union ib_gid *gid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node *n = priv->path_tree.rb_node;
+ struct ipoib_path *path;
+ int ret;
+
+ while (n) {
+ path = rb_entry(n, struct ipoib_path, rb_node);
+
+ ret = memcmp(gid->raw, path->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return path;
+ }
+
+ return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node **n = &priv->path_tree.rb_node;
+ struct rb_node *pn = NULL;
+ struct ipoib_path *tpath;
+ int ret;
+
+ while (*n) {
+ pn = *n;
+ tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+ ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&path->rb_node, pn, n);
+ rb_insert_color(&path->rb_node, &priv->path_tree);
+
+ list_add_tail(&path->list, &priv->path_list);
+
+ return 0;
+}
+
+static void path_free(struct net_device *dev, struct ipoib_path *path)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh, *tn;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ while ((skb = __skb_dequeue(&path->queue)))
+ dev_kfree_skb_irq(skb);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+ /*
+ * It's safe to call ipoib_put_ah() inside priv->lock
+ * here, because we know that path->ah will always
+ * hold one more reference, so ipoib_put_ah() will
+ * never do more than decrement the ref count.
+ */
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ *to_ipoib_neigh(neigh->neighbour) = NULL;
+ neigh->neighbour->ops->destructor = NULL;
+ kfree(neigh);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ if (path->ah)
+ ipoib_put_ah(path->ah);
+
+ kfree(path);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path, *tp;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_splice(&priv->path_list, &remove_list);
+ INIT_LIST_HEAD(&priv->path_list);
+
+ list_for_each_entry(path, &remove_list, list)
+ rb_erase(&path->rb_node, &priv->path_tree);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ list_for_each_entry_safe(path, tp, &remove_list, list) {
+ if (path->query)
+ ib_sa_cancel_query(path->query_id, path->query);
+ wait_for_completion(&path->done);
+ path_free(dev, path);
+ }
+}
+
+static void path_rec_completion(int status,
+ struct ib_sa_path_rec *pathrec,
+ void *path_ptr)
+{
+ struct ipoib_path *path = path_ptr;
+ struct net_device *dev = path->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_ah *ah = NULL;
+ struct ipoib_neigh *neigh;
+ struct sk_buff_head skqueue;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ if (pathrec)
+ ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n",
+ be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
+ else
+ ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n",
+ status, IPOIB_GID_ARG(path->pathrec.dgid));
+
+ skb_queue_head_init(&skqueue);
+
+ if (!status) {
+ struct ib_ah_attr av = {
+ .dlid = be16_to_cpu(pathrec->dlid),
+ .sl = pathrec->sl,
+ .port_num = priv->port
+ };
+
+ if (ib_sa_rate_enum_to_int(pathrec->rate) > 0)
+ av.static_rate = (2 * priv->local_rate -
+ ib_sa_rate_enum_to_int(pathrec->rate) - 1) /
+ (priv->local_rate ? priv->local_rate : 1);
+
+ ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n",
+ av.static_rate, priv->local_rate,
+ ib_sa_rate_enum_to_int(pathrec->rate));
+
+ ah = ipoib_create_ah(dev, priv->pd, &av);
+ }
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ path->ah = ah;
+
+ if (ah) {
+ path->pathrec = *pathrec;
+
+ ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+ ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+ while ((skb = __skb_dequeue(&path->queue)))
+ __skb_queue_tail(&skqueue, skb);
+
+ list_for_each_entry(neigh, &path->neigh_list, list) {
+ kref_get(&path->ah->ref);
+ neigh->ah = path->ah;
+
+ while ((skb = __skb_dequeue(&neigh->queue)))
+ __skb_queue_tail(&skqueue, skb);
+ }
+ } else
+ path->query = NULL;
+
+ complete(&path->done);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ while ((skb = __skb_dequeue(&skqueue))) {
+ skb->dev = dev;
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+}
+
+static struct ipoib_path *path_rec_create(struct net_device *dev,
+ union ib_gid *gid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+
+ path = kmalloc(sizeof *path, GFP_ATOMIC);
+ if (!path)
+ return NULL;
+
+ path->dev = dev;
+ path->pathrec.dlid = 0;
+ path->ah = NULL;
+
+ skb_queue_head_init(&path->queue);
+
+ INIT_LIST_HEAD(&path->neigh_list);
+ path->query = NULL;
+ init_completion(&path->done);
+
+ memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid));
+ path->pathrec.sgid = priv->local_gid;
+ path->pathrec.pkey = cpu_to_be16(priv->pkey);
+ path->pathrec.numb_path = 1;
+
+ return path;
+}
+
+static int path_rec_start(struct net_device *dev,
+ struct ipoib_path *path)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(path->pathrec.dgid));
+
+ path->query_id =
+ ib_sa_path_rec_get(priv->ca, priv->port,
+ &path->pathrec,
+ IB_SA_PATH_REC_DGID |
+ IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_PKEY,
+ 1000, GFP_ATOMIC,
+ path_rec_completion,
+ path, &path->query);
+ if (path->query_id < 0) {
+ ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
+ path->query = NULL;
+ return path->query_id;
+ }
+
+ return 0;
+}
+
+static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+ struct ipoib_neigh *neigh;
+
+ neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+ if (!neigh) {
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ skb_queue_head_init(&neigh->queue);
+ neigh->neighbour = skb->dst->neighbour;
+ *to_ipoib_neigh(skb->dst->neighbour) = neigh;
+
+ /*
+ * We can only be called from ipoib_start_xmit, so we're
+ * inside tx_lock -- no need to save/restore flags.
+ */
+ spin_lock(&priv->lock);
+
+ path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4));
+ if (!path) {
+ path = path_rec_create(dev,
+ (union ib_gid *) (skb->dst->neighbour->ha + 4));
+ if (!path)
+ goto err;
+
+ __path_add(dev, path);
+ }
+
+ list_add_tail(&neigh->list, &path->neigh_list);
+
+ if (path->pathrec.dlid) {
+ kref_get(&path->ah->ref);
+ neigh->ah = path->ah;
+
+ ipoib_send(dev, skb, path->ah,
+ be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+ } else {
+ neigh->ah = NULL;
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ __skb_queue_tail(&neigh->queue, skb);
+ } else {
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ if (!path->query && path_rec_start(dev, path))
+ goto err;
+ }
+
+ spin_unlock(&priv->lock);
+ return;
+
+err:
+ *to_ipoib_neigh(skb->dst->neighbour) = NULL;
+ list_del(&neigh->list);
+ neigh->neighbour->ops->destructor = NULL;
+ kfree(neigh);
+
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+
+ spin_unlock(&priv->lock);
+}
+
+static void path_lookup(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
+
+ /* Look up path record for unicasts */
+ if (skb->dst->neighbour->ha[4] != 0xff) {
+ neigh_add_path(skb, dev);
+ return;
+ }
+
+ /* Add in the P_Key for multicasts */
+ skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
+ skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
+ ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb);
+}
+
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+ struct ipoib_pseudoheader *phdr)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+
+ /*
+ * We can only be called from ipoib_start_xmit, so we're
+ * inside tx_lock -- no need to save/restore flags.
+ */
+ spin_lock(&priv->lock);
+
+ path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4));
+ if (!path) {
+ path = path_rec_create(dev,
+ (union ib_gid *) (phdr->hwaddr + 4));
+ if (path) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof *phdr);
+ __skb_queue_tail(&path->queue, skb);
+
+ if (path_rec_start(dev, path)) {
+ spin_unlock(&priv->lock);
+ path_free(dev, path);
+ return;
+ } else
+ __path_add(dev, path);
+ } else {
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ spin_unlock(&priv->lock);
+ return;
+ }
+
+ if (path->pathrec.dlid) {
+ ipoib_dbg(priv, "Send unicast ARP to %04x\n",
+ be16_to_cpu(path->pathrec.dlid));
+
+ ipoib_send(dev, skb, path->ah,
+ be32_to_cpup((__be32 *) phdr->hwaddr));
+ } else if ((path->query || !path_rec_start(dev, path)) &&
+ skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof *phdr);
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ spin_unlock(&priv->lock);
+}
+
+static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (!spin_trylock(&priv->tx_lock)) {
+ local_irq_restore(flags);
+ return NETDEV_TX_LOCKED;
+ }
+
+ /*
+ * Check if our queue is stopped. Since we have the LLTX bit
+ * set, we can't rely on netif_stop_queue() preventing our
+ * xmit function from being called with a full queue.
+ */
+ if (unlikely(netif_queue_stopped(dev))) {
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+ return NETDEV_TX_BUSY;
+ }
+
+ if (skb->dst && skb->dst->neighbour) {
+ if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
+ path_lookup(skb, dev);
+ goto out;
+ }
+
+ neigh = *to_ipoib_neigh(skb->dst->neighbour);
+
+ if (likely(neigh->ah)) {
+ ipoib_send(dev, skb, neigh->ah,
+ be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
+ goto out;
+ }
+
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ spin_lock(&priv->lock);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock(&priv->lock);
+ } else {
+ ++priv->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+ } else {
+ struct ipoib_pseudoheader *phdr =
+ (struct ipoib_pseudoheader *) skb->data;
+ skb_pull(skb, sizeof *phdr);
+
+ if (phdr->hwaddr[4] == 0xff) {
+ /* Add in the P_Key for multicast*/
+ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ phdr->hwaddr[9] = priv->pkey & 0xff;
+
+ ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb);
+ } else {
+ /* unicast GID -- should be ARP reply */
+
+ if (be16_to_cpup((u16 *) skb->data) != ETH_P_ARP) {
+ ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
+ IPOIB_GID_FMT "\n",
+ skb->dst ? "neigh" : "dst",
+ be16_to_cpup((u16 *) skb->data),
+ be32_to_cpup((u32 *) phdr->hwaddr),
+ IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4)));
+ dev_kfree_skb_any(skb);
+ ++priv->stats.tx_dropped;
+ goto out;
+ }
+
+ unicast_arp_send(skb, dev, phdr);
+ }
+ }
+
+out:
+ spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+ return NETDEV_TX_OK;
+}
+
+static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ return &priv->stats;
+}
+
+static void ipoib_timeout(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_warn(priv, "transmit timeout: latency %ld\n",
+ jiffies - dev->trans_start);
+ /* XXX reset QP, etc. */
+}
+
+static int ipoib_hard_header(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned short type,
+ void *daddr, void *saddr, unsigned len)
+{
+ struct ipoib_header *header;
+
+ header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+ header->proto = htons(type);
+ header->reserved = 0;
+
+ /*
+ * If we don't have a neighbour structure, stuff the
+ * destination address onto the front of the skb so we can
+ * figure out where to send the packet later.
+ */
+ if (!skb->dst || !skb->dst->neighbour) {
+ struct ipoib_pseudoheader *phdr =
+ (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
+ memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
+ }
+
+ return 0;
+}
+
+static void ipoib_set_mcast_list(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ schedule_work(&priv->restart_task);
+}
+
+static void ipoib_neigh_destructor(struct neighbour *n)
+{
+ struct ipoib_neigh *neigh;
+ struct ipoib_dev_priv *priv = netdev_priv(n->dev);
+ unsigned long flags;
+ struct ipoib_ah *ah = NULL;
+
+ ipoib_dbg(priv,
+ "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
+ be32_to_cpup((__be32 *) n->ha),
+ IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4))));
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ neigh = *to_ipoib_neigh(n);
+ if (neigh) {
+ if (neigh->ah)
+ ah = neigh->ah;
+ list_del(&neigh->list);
+ *to_ipoib_neigh(n) = NULL;
+ kfree(neigh);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ if (ah)
+ ipoib_put_ah(ah);
+}
+
+static int ipoib_neigh_setup(struct neighbour *neigh)
+{
+ /*
+ * Is this kosher? I can't find anybody in the kernel that
+ * sets neigh->destructor, so we should be able to set it here
+ * without trouble.
+ */
+ neigh->ops->destructor = ipoib_neigh_destructor;
+
+ return 0;
+}
+
+static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
+{
+ parms->neigh_setup = ipoib_neigh_setup;
+
+ return 0;
+}
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* Allocate RX/TX "rings" to hold queued skbs */
+
+ priv->rx_ring = kmalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf),
+ GFP_KERNEL);
+ if (!priv->rx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+ ca->name, IPOIB_RX_RING_SIZE);
+ goto out;
+ }
+ memset(priv->rx_ring, 0,
+ IPOIB_RX_RING_SIZE * sizeof (struct ipoib_buf));
+
+ priv->tx_ring = kmalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf),
+ GFP_KERNEL);
+ if (!priv->tx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+ ca->name, IPOIB_TX_RING_SIZE);
+ goto out_rx_ring_cleanup;
+ }
+ memset(priv->tx_ring, 0,
+ IPOIB_TX_RING_SIZE * sizeof (struct ipoib_buf));
+
+ /* priv->tx_head & tx_tail are already 0 */
+
+ if (ipoib_ib_dev_init(dev, ca, port))
+ goto out_tx_ring_cleanup;
+
+ return 0;
+
+out_tx_ring_cleanup:
+ kfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+ kfree(priv->rx_ring);
+
+out:
+ return -ENOMEM;
+}
+
+void ipoib_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+
+ ipoib_delete_debug_file(dev);
+
+ /* Delete any child interfaces first */
+ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+ unregister_netdev(cpriv->dev);
+ ipoib_dev_cleanup(cpriv->dev);
+ free_netdev(cpriv->dev);
+ }
+
+ ipoib_ib_dev_cleanup(dev);
+
+ if (priv->rx_ring) {
+ kfree(priv->rx_ring);
+ priv->rx_ring = NULL;
+ }
+
+ if (priv->tx_ring) {
+ kfree(priv->tx_ring);
+ priv->tx_ring = NULL;
+ }
+}
+
+static void ipoib_setup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ dev->open = ipoib_open;
+ dev->stop = ipoib_stop;
+ dev->change_mtu = ipoib_change_mtu;
+ dev->hard_start_xmit = ipoib_start_xmit;
+ dev->get_stats = ipoib_get_stats;
+ dev->tx_timeout = ipoib_timeout;
+ dev->hard_header = ipoib_hard_header;
+ dev->set_multicast_list = ipoib_set_mcast_list;
+ dev->neigh_setup = ipoib_neigh_setup_dev;
+
+ dev->watchdog_timeo = HZ;
+
+ dev->rebuild_header = NULL;
+ dev->set_mac_address = NULL;
+ dev->header_cache_update = NULL;
+
+ dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
+
+ /*
+ * We add in INFINIBAND_ALEN to allow for the destination
+ * address "pseudoheader" for skbs without neighbour struct.
+ */
+ dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
+ dev->addr_len = INFINIBAND_ALEN;
+ dev->type = ARPHRD_INFINIBAND;
+ dev->tx_queue_len = IPOIB_TX_RING_SIZE * 2;
+ dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+
+ /* MTU will be reset when mcast join happens */
+ dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+ priv->mcast_mtu = priv->admin_mtu = dev->mtu;
+
+ memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+ netif_carrier_off(dev);
+
+ SET_MODULE_OWNER(dev);
+
+ priv->dev = dev;
+
+ spin_lock_init(&priv->lock);
+ spin_lock_init(&priv->tx_lock);
+
+ init_MUTEX(&priv->mcast_mutex);
+ init_MUTEX(&priv->vlan_mutex);
+
+ INIT_LIST_HEAD(&priv->path_list);
+ INIT_LIST_HEAD(&priv->child_intfs);
+ INIT_LIST_HEAD(&priv->dead_ahs);
+ INIT_LIST_HEAD(&priv->multicast_list);
+
+ INIT_WORK(&priv->pkey_task, ipoib_pkey_poll, priv->dev);
+ INIT_WORK(&priv->mcast_task, ipoib_mcast_join_task, priv->dev);
+ INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush, priv->dev);
+ INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev);
+ INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah, priv->dev);
+}
+
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
+{
+ struct net_device *dev;
+
+ dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
+ ipoib_setup);
+ if (!dev)
+ return NULL;
+
+ return netdev_priv(dev);
+}
+
+static ssize_t show_pkey(struct class_device *cdev, char *buf)
+{
+ struct ipoib_dev_priv *priv =
+ netdev_priv(container_of(cdev, struct net_device, class_dev));
+
+ return sprintf(buf, "0x%04x\n", priv->pkey);
+}
+static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+
+static ssize_t create_child(struct class_device *cdev,
+ const char *buf, size_t count)
+{
+ int pkey;
+ int ret;
+
+ if (sscanf(buf, "%i", &pkey) != 1)
+ return -EINVAL;
+
+ if (pkey < 0 || pkey > 0xffff)
+ return -EINVAL;
+
+ ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev),
+ pkey);
+
+ return ret ? ret : count;
+}
+static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);
+
+static ssize_t delete_child(struct class_device *cdev,
+ const char *buf, size_t count)
+{
+ int pkey;
+ int ret;
+
+ if (sscanf(buf, "%i", &pkey) != 1)
+ return -EINVAL;
+
+ if (pkey < 0 || pkey > 0xffff)
+ return -EINVAL;
+
+ ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev),
+ pkey);
+
+ return ret ? ret : count;
+
+}
+static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);
+
+int ipoib_add_pkey_attr(struct net_device *dev)
+{
+ return class_device_create_file(&dev->class_dev,
+ &class_device_attr_pkey);
+}
+
+static struct net_device *ipoib_add_port(const char *format,
+ struct ib_device *hca, u8 port)
+{
+ struct ipoib_dev_priv *priv;
+ int result = -ENOMEM;
+
+ priv = ipoib_intf_alloc(format);
+ if (!priv)
+ goto alloc_mem_failed;
+
+ SET_NETDEV_DEV(priv->dev, hca->dma_device);
+
+ result = ib_query_pkey(hca, port, 0, &priv->pkey);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto alloc_mem_failed;
+ }
+
+ priv->dev->broadcast[8] = priv->pkey >> 8;
+ priv->dev->broadcast[9] = priv->pkey & 0xff;
+
+ result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto alloc_mem_failed;
+ } else
+ memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+
+ result = ipoib_dev_init(priv->dev, hca, port);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+
+ INIT_IB_EVENT_HANDLER(&priv->event_handler,
+ priv->ca, ipoib_event);
+ result = ib_register_event_handler(&priv->event_handler);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+ "port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto event_failed;
+ }
+
+ result = register_netdev(priv->dev);
+ if (result) {
+ printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
+ hca->name, port, result);
+ goto register_failed;
+ }
+
+ if (ipoib_create_debug_file(priv->dev))
+ goto debug_failed;
+
+ if (ipoib_add_pkey_attr(priv->dev))
+ goto sysfs_failed;
+ if (class_device_create_file(&priv->dev->class_dev,
+ &class_device_attr_create_child))
+ goto sysfs_failed;
+ if (class_device_create_file(&priv->dev->class_dev,
+ &class_device_attr_delete_child))
+ goto sysfs_failed;
+
+ return priv->dev;
+
+sysfs_failed:
+ ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+ unregister_netdev(priv->dev);
+
+register_failed:
+ ib_unregister_event_handler(&priv->event_handler);
+
+event_failed:
+ ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+ free_netdev(priv->dev);
+
+alloc_mem_failed:
+ return ERR_PTR(result);
+}
+
+static void ipoib_add_one(struct ib_device *device)
+{
+ struct list_head *dev_list;
+ struct net_device *dev;
+ struct ipoib_dev_priv *priv;
+ int s, e, p;
+
+ dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+ if (!dev_list)
+ return;
+
+ INIT_LIST_HEAD(dev_list);
+
+ if (device->node_type == IB_NODE_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ for (p = s; p <= e; ++p) {
+ dev = ipoib_add_port("ib%d", device, p);
+ if (!IS_ERR(dev)) {
+ priv = netdev_priv(dev);
+ list_add_tail(&priv->list, dev_list);
+ }
+ }
+
+ ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void ipoib_remove_one(struct ib_device *device)
+{
+ struct ipoib_dev_priv *priv, *tmp;
+ struct list_head *dev_list;
+
+ dev_list = ib_get_client_data(device, &ipoib_client);
+
+ list_for_each_entry_safe(priv, tmp, dev_list, list) {
+ ib_unregister_event_handler(&priv->event_handler);
+
+ unregister_netdev(priv->dev);
+ ipoib_dev_cleanup(priv->dev);
+ free_netdev(priv->dev);
+ }
+}
+
+static int __init ipoib_init_module(void)
+{
+ int ret;
+
+ ret = ipoib_register_debugfs();
+ if (ret)
+ return ret;
+
+ /*
+ * We create our own workqueue mainly because we want to be
+ * able to flush it when devices are being removed. We can't
+ * use schedule_work()/flush_scheduled_work() because both
+ * unregister_netdev() and linkwatch_event take the rtnl lock,
+ * so flush_scheduled_work() can deadlock during device
+ * removal.
+ */
+ ipoib_workqueue = create_singlethread_workqueue("ipoib");
+ if (!ipoib_workqueue) {
+ ret = -ENOMEM;
+ goto err_fs;
+ }
+
+ ret = ib_register_client(&ipoib_client);
+ if (ret)
+ goto err_wq;
+
+ return 0;
+
+err_fs:
+ ipoib_unregister_debugfs();
+
+err_wq:
+ destroy_workqueue(ipoib_workqueue);
+
+ return ret;
+}
+
+static void __exit ipoib_cleanup_module(void)
+{
+ ipoib_unregister_debugfs();
+ ib_unregister_client(&ipoib_client);
+ destroy_workqueue(ipoib_workqueue);
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 00000000000..f46932dc81c
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $
+ */
+
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+ "Enable multicast debug tracing if > 0");
+#endif
+
+static DECLARE_MUTEX(mcast_mutex);
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+ struct ib_sa_mcmember_rec mcmember;
+ struct ipoib_ah *ah;
+
+ struct rb_node rb_node;
+ struct list_head list;
+ struct completion done;
+
+ int query_id;
+ struct ib_sa_query *query;
+
+ unsigned long created;
+ unsigned long backoff;
+
+ unsigned long flags;
+ unsigned char logcount;
+
+ struct list_head neigh_list;
+
+ struct sk_buff_head pkt_queue;
+
+ struct net_device *dev;
+};
+
+struct ipoib_mcast_iter {
+ struct net_device *dev;
+ union ib_gid mgid;
+ unsigned long created;
+ unsigned int queuelen;
+ unsigned int complete;
+ unsigned int send_only;
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh, *tmp;
+ unsigned long flags;
+ LIST_HEAD(ah_list);
+ struct ipoib_ah *ah, *tah;
+
+ ipoib_dbg_mcast(netdev_priv(dev),
+ "deleting multicast group " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) {
+ if (neigh->ah)
+ list_add_tail(&neigh->ah->list, &ah_list);
+ *to_ipoib_neigh(neigh->neighbour) = NULL;
+ neigh->neighbour->ops->destructor = NULL;
+ kfree(neigh);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ list_for_each_entry_safe(ah, tah, &ah_list, list)
+ ipoib_put_ah(ah);
+
+ if (mcast->ah)
+ ipoib_put_ah(mcast->ah);
+
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+ skb->dev = dev;
+ dev_kfree_skb_any(skb);
+ }
+
+ kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+ int can_sleep)
+{
+ struct ipoib_mcast *mcast;
+
+ mcast = kmalloc(sizeof (*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+ if (!mcast)
+ return NULL;
+
+ memset(mcast, 0, sizeof (*mcast));
+
+ init_completion(&mcast->done);
+
+ mcast->dev = dev;
+ mcast->created = jiffies;
+ mcast->backoff = HZ;
+ mcast->logcount = 0;
+
+ INIT_LIST_HEAD(&mcast->list);
+ INIT_LIST_HEAD(&mcast->neigh_list);
+ skb_queue_head_init(&mcast->pkt_queue);
+
+ mcast->ah = NULL;
+ mcast->query = NULL;
+
+ return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node *n = priv->multicast_tree.rb_node;
+
+ while (n) {
+ struct ipoib_mcast *mcast;
+ int ret;
+
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return mcast;
+ }
+
+ return NULL;
+}
+
+static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+ while (*n) {
+ struct ipoib_mcast *tmcast;
+ int ret;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+ return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+ struct ib_sa_mcmember_rec *mcmember)
+{
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ mcast->mcmember = *mcmember;
+
+ /* Set the cached Q_Key before we attach if it's the broadcast group */
+ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+ sizeof (union ib_gid))) {
+ priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+ priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ }
+
+ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ ipoib_warn(priv, "multicast group " IPOIB_GID_FMT
+ " already attached\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ return 0;
+ }
+
+ ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
+ &mcast->mcmember.mgid);
+ if (ret < 0) {
+ ipoib_warn(priv, "couldn't attach QP to multicast group "
+ IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+ return ret;
+ }
+ }
+
+ {
+ struct ib_ah_attr av = {
+ .dlid = be16_to_cpu(mcast->mcmember.mlid),
+ .port_num = priv->port,
+ .sl = mcast->mcmember.sl,
+ .ah_flags = IB_AH_GRH,
+ .grh = {
+ .flow_label = be32_to_cpu(mcast->mcmember.flow_label),
+ .hop_limit = mcast->mcmember.hop_limit,
+ .sgid_index = 0,
+ .traffic_class = mcast->mcmember.traffic_class
+ }
+ };
+
+ av.grh.dgid = mcast->mcmember.mgid;
+
+ if (ib_sa_rate_enum_to_int(mcast->mcmember.rate) > 0)
+ av.static_rate = (2 * priv->local_rate -
+ ib_sa_rate_enum_to_int(mcast->mcmember.rate) - 1) /
+ (priv->local_rate ? priv->local_rate : 1);
+
+ ipoib_dbg_mcast(priv, "static_rate %d for local port %dX, mcmember %dX\n",
+ av.static_rate, priv->local_rate,
+ ib_sa_rate_enum_to_int(mcast->mcmember.rate));
+
+ mcast->ah = ipoib_create_ah(dev, priv->pd, &av);
+ if (!mcast->ah) {
+ ipoib_warn(priv, "ib_address_create failed\n");
+ } else {
+ ipoib_dbg_mcast(priv, "MGID " IPOIB_GID_FMT
+ " AV %p, LID 0x%04x, SL %d\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid),
+ mcast->ah->ah,
+ be16_to_cpu(mcast->mcmember.mlid),
+ mcast->mcmember.sl);
+ }
+ }
+
+ /* actually send any queued packets */
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+ skb->dev = dev;
+
+ if (!skb->dst || !skb->dst->neighbour) {
+ /* put pseudoheader back on for next time */
+ skb_push(skb, sizeof (struct ipoib_pseudoheader));
+ }
+
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+ }
+
+ return 0;
+}
+
+static void
+ipoib_mcast_sendonly_join_complete(int status,
+ struct ib_sa_mcmember_rec *mcmember,
+ void *mcast_ptr)
+{
+ struct ipoib_mcast *mcast = mcast_ptr;
+ struct net_device *dev = mcast->dev;
+
+ if (!status)
+ ipoib_mcast_join_finish(mcast, mcmember);
+ else {
+ if (mcast->logcount++ < 20)
+ ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for "
+ IPOIB_GID_FMT ", status %d\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+ /* Flush out any queued packets */
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+ skb->dev = dev;
+
+ dev_kfree_skb_any(skb);
+ }
+
+ /* Clear the busy flag so we try again */
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ }
+
+ complete(&mcast->done);
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sa_mcmember_rec rec = {
+#if 0 /* Some SMs don't support send-only yet */
+ .join_state = 4
+#else
+ .join_state = 1
+#endif
+ };
+ int ret = 0;
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+ ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+ return -ENODEV;
+ }
+
+ if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+ return -EBUSY;
+ }
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = be16_to_cpu(priv->pkey);
+
+ ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 1000, GFP_ATOMIC,
+ ipoib_mcast_sendonly_join_complete,
+ mcast, &mcast->query);
+ if (ret < 0) {
+ ipoib_warn(priv, "ib_sa_mcmember_rec_set failed (ret = %d)\n",
+ ret);
+ } else {
+ ipoib_dbg_mcast(priv, "no multicast record for " IPOIB_GID_FMT
+ ", starting join\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ mcast->query_id = ret;
+ }
+
+ return ret;
+}
+
+static void ipoib_mcast_join_complete(int status,
+ struct ib_sa_mcmember_rec *mcmember,
+ void *mcast_ptr)
+{
+ struct ipoib_mcast *mcast = mcast_ptr;
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg_mcast(priv, "join completion for " IPOIB_GID_FMT
+ " (status %d)\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid), status);
+
+ if (!status && !ipoib_mcast_join_finish(mcast, mcmember)) {
+ mcast->backoff = HZ;
+ down(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_work(ipoib_workqueue, &priv->mcast_task);
+ up(&mcast_mutex);
+ complete(&mcast->done);
+ return;
+ }
+
+ if (status == -EINTR) {
+ complete(&mcast->done);
+ return;
+ }
+
+ if (status && mcast->logcount++ < 20) {
+ if (status == -ETIMEDOUT || status == -EINTR) {
+ ipoib_dbg_mcast(priv, "multicast join failed for " IPOIB_GID_FMT
+ ", status %d\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid),
+ status);
+ } else {
+ ipoib_warn(priv, "multicast join failed for "
+ IPOIB_GID_FMT ", status %d\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid),
+ status);
+ }
+ }
+
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+ mcast->query = NULL;
+
+ down(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) {
+ if (status == -ETIMEDOUT)
+ queue_work(ipoib_workqueue, &priv->mcast_task);
+ else
+ queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+ mcast->backoff * HZ);
+ } else
+ complete(&mcast->done);
+ up(&mcast_mutex);
+
+ return;
+}
+
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+ int create)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sa_mcmember_rec rec = {
+ .join_state = 1
+ };
+ ib_sa_comp_mask comp_mask;
+ int ret = 0;
+
+ ipoib_dbg_mcast(priv, "joining MGID " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = be16_to_cpu(priv->pkey);
+
+ comp_mask =
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ if (create) {
+ comp_mask |=
+ IB_SA_MCMEMBER_REC_QKEY |
+ IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ rec.qkey = priv->broadcast->mcmember.qkey;
+ rec.sl = priv->broadcast->mcmember.sl;
+ rec.flow_label = priv->broadcast->mcmember.flow_label;
+ rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+ }
+
+ ret = ib_sa_mcmember_rec_set(priv->ca, priv->port, &rec, comp_mask,
+ mcast->backoff * 1000, GFP_ATOMIC,
+ ipoib_mcast_join_complete,
+ mcast, &mcast->query);
+
+ if (ret < 0) {
+ ipoib_warn(priv, "ib_sa_mcmember_rec_set failed, status %d\n", ret);
+
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+ down(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->mcast_task,
+ mcast->backoff);
+ up(&mcast_mutex);
+ } else
+ mcast->query_id = ret;
+}
+
+void ipoib_mcast_join_task(void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ return;
+
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+ else
+ memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+ {
+ struct ib_port_attr attr;
+
+ if (!ib_query_port(priv->ca, priv->port, &attr)) {
+ priv->local_lid = attr.lid;
+ priv->local_rate = attr.active_speed *
+ ib_width_enum_to_int(attr.active_width);
+ } else
+ ipoib_warn(priv, "ib_query_port failed\n");
+ }
+
+ if (!priv->broadcast) {
+ priv->broadcast = ipoib_mcast_alloc(dev, 1);
+ if (!priv->broadcast) {
+ ipoib_warn(priv, "failed to allocate broadcast group\n");
+ down(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->mcast_task, HZ);
+ up(&mcast_mutex);
+ return;
+ }
+
+ memcpy(priv->broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+ sizeof (union ib_gid));
+
+ spin_lock_irq(&priv->lock);
+ __ipoib_mcast_add(dev, priv->broadcast);
+ spin_unlock_irq(&priv->lock);
+ }
+
+ if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+ ipoib_mcast_join(dev, priv->broadcast, 0);
+ return;
+ }
+
+ while (1) {
+ struct ipoib_mcast *mcast = NULL;
+
+ spin_lock_irq(&priv->lock);
+ list_for_each_entry(mcast, &priv->multicast_list, list) {
+ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+ && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+ && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ /* Found the next unjoined group */
+ break;
+ }
+ }
+ spin_unlock_irq(&priv->lock);
+
+ if (&mcast->list == &priv->multicast_list) {
+ /* All done */
+ break;
+ }
+
+ ipoib_mcast_join(dev, mcast, 1);
+ return;
+ }
+
+ priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
+ IPOIB_ENCAP_LEN;
+ dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+ ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+ clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+ netif_carrier_on(dev);
+}
+
+int ipoib_mcast_start_thread(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg_mcast(priv, "starting multicast thread\n");
+
+ down(&mcast_mutex);
+ if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_work(ipoib_workqueue, &priv->mcast_task);
+ up(&mcast_mutex);
+
+ return 0;
+}
+
+int ipoib_mcast_stop_thread(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_mcast *mcast;
+
+ ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+ down(&mcast_mutex);
+ clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+ cancel_delayed_work(&priv->mcast_task);
+ up(&mcast_mutex);
+
+ flush_workqueue(ipoib_workqueue);
+
+ if (priv->broadcast && priv->broadcast->query) {
+ ib_sa_cancel_query(priv->broadcast->query_id, priv->broadcast->query);
+ priv->broadcast->query = NULL;
+ ipoib_dbg_mcast(priv, "waiting for bcast\n");
+ wait_for_completion(&priv->broadcast->done);
+ }
+
+ list_for_each_entry(mcast, &priv->multicast_list, list) {
+ if (mcast->query) {
+ ib_sa_cancel_query(mcast->query_id, mcast->query);
+ mcast->query = NULL;
+ ipoib_dbg_mcast(priv, "waiting for MGID " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+ wait_for_completion(&mcast->done);
+ }
+ }
+
+ return 0;
+}
+
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sa_mcmember_rec rec = {
+ .join_state = 1
+ };
+ int ret = 0;
+
+ if (!test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags))
+ return 0;
+
+ ipoib_dbg_mcast(priv, "leaving MGID " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = be16_to_cpu(priv->pkey);
+
+ /* Remove ourselves from the multicast group */
+ ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid),
+ &mcast->mcmember.mgid);
+ if (ret)
+ ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret);
+
+ /*
+ * Just make one shot at leaving and don't wait for a reply;
+ * if we fail, too bad.
+ */
+ ret = ib_sa_mcmember_rec_delete(priv->ca, priv->port, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 0, GFP_ATOMIC, NULL,
+ mcast, &mcast->query);
+ if (ret < 0)
+ ipoib_warn(priv, "ib_sa_mcmember_rec_delete failed "
+ "for leave (result = %d)\n", ret);
+
+ return 0;
+}
+
+void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid,
+ struct sk_buff *skb)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_mcast *mcast;
+
+ /*
+ * We can only be called from ipoib_start_xmit, so we're
+ * inside tx_lock -- no need to save/restore flags.
+ */
+ spin_lock(&priv->lock);
+
+ mcast = __ipoib_mcast_find(dev, mgid);
+ if (!mcast) {
+ /* Let's create a new send only group now */
+ ipoib_dbg_mcast(priv, "setting up send only multicast group for "
+ IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid));
+
+ mcast = ipoib_mcast_alloc(dev, 0);
+ if (!mcast) {
+ ipoib_warn(priv, "unable to allocate memory for "
+ "multicast structure\n");
+ dev_kfree_skb_any(skb);
+ goto out;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+ mcast->mcmember.mgid = *mgid;
+ __ipoib_mcast_add(dev, mcast);
+ list_add_tail(&mcast->list, &priv->multicast_list);
+ }
+
+ if (!mcast->ah) {
+ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+ skb_queue_tail(&mcast->pkt_queue, skb);
+ else
+ dev_kfree_skb_any(skb);
+
+ if (mcast->query)
+ ipoib_dbg_mcast(priv, "no address vector, "
+ "but multicast join already started\n");
+ else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ ipoib_mcast_sendonly_join(mcast);
+
+ /*
+ * If lookup completes between here and out:, don't
+ * want to send packet twice.
+ */
+ mcast = NULL;
+ }
+
+out:
+ if (mcast && mcast->ah) {
+ if (skb->dst &&
+ skb->dst->neighbour &&
+ !*to_ipoib_neigh(skb->dst->neighbour)) {
+ struct ipoib_neigh *neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
+
+ if (neigh) {
+ kref_get(&mcast->ah->ref);
+ neigh->ah = mcast->ah;
+ neigh->neighbour = skb->dst->neighbour;
+ *to_ipoib_neigh(skb->dst->neighbour) = neigh;
+ list_add_tail(&neigh->list, &mcast->neigh_list);
+ }
+ }
+
+ ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+ }
+
+ spin_unlock(&priv->lock);
+}
+
+void ipoib_mcast_dev_flush(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ LIST_HEAD(remove_list);
+ struct ipoib_mcast *mcast, *tmcast, *nmcast;
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ nmcast = ipoib_mcast_alloc(dev, 0);
+ if (nmcast) {
+ nmcast->flags =
+ mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY);
+
+ nmcast->mcmember.mgid = mcast->mcmember.mgid;
+
+ /* Add the new group in before the to-be-destroyed group */
+ list_add_tail(&nmcast->list, &mcast->list);
+ list_del_init(&mcast->list);
+
+ rb_replace_node(&mcast->rb_node, &nmcast->rb_node,
+ &priv->multicast_tree);
+
+ list_add_tail(&mcast->list, &remove_list);
+ } else {
+ ipoib_warn(priv, "could not reallocate multicast group "
+ IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+ }
+ }
+
+ if (priv->broadcast) {
+ nmcast = ipoib_mcast_alloc(dev, 0);
+ if (nmcast) {
+ nmcast->mcmember.mgid = priv->broadcast->mcmember.mgid;
+
+ rb_replace_node(&priv->broadcast->rb_node,
+ &nmcast->rb_node,
+ &priv->multicast_tree);
+
+ list_add_tail(&priv->broadcast->list, &remove_list);
+ }
+
+ priv->broadcast = nmcast;
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
+}
+
+void ipoib_mcast_dev_down(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned long flags;
+
+ /* Delete broadcast since it will be recreated */
+ if (priv->broadcast) {
+ ipoib_dbg_mcast(priv, "deleting broadcast group\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+ rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_mcast_leave(dev, priv->broadcast);
+ ipoib_mcast_free(priv->broadcast);
+ priv->broadcast = NULL;
+ }
+}
+
+void ipoib_mcast_restart_task(void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct dev_mc_list *mclist;
+ struct ipoib_mcast *mcast, *tmcast;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "restarting multicast task\n");
+
+ ipoib_mcast_stop_thread(dev);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ /*
+ * Unfortunately, the networking core only gives us a list of all of
+ * the multicast hardware addresses. We need to figure out which ones
+ * are new and which ones have been removed
+ */
+
+ /* Clear out the found flag */
+ list_for_each_entry(mcast, &priv->multicast_list, list)
+ clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+ /* Mark all of the entries that are found or don't exist */
+ for (mclist = dev->mc_list; mclist; mclist = mclist->next) {
+ union ib_gid mgid;
+
+ memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
+
+ /* Add in the P_Key */
+ mgid.raw[4] = (priv->pkey >> 8) & 0xff;
+ mgid.raw[5] = priv->pkey & 0xff;
+
+ mcast = __ipoib_mcast_find(dev, &mgid);
+ if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ struct ipoib_mcast *nmcast;
+
+ /* Not found or send-only group, let's add a new entry */
+ ipoib_dbg_mcast(priv, "adding multicast entry for mgid "
+ IPOIB_GID_FMT "\n", IPOIB_GID_ARG(mgid));
+
+ nmcast = ipoib_mcast_alloc(dev, 0);
+ if (!nmcast) {
+ ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+ continue;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+ nmcast->mcmember.mgid = mgid;
+
+ if (mcast) {
+ /* Destroy the send only entry */
+ list_del(&mcast->list);
+ list_add_tail(&mcast->list, &remove_list);
+
+ rb_replace_node(&mcast->rb_node,
+ &nmcast->rb_node,
+ &priv->multicast_tree);
+ } else
+ __ipoib_mcast_add(dev, nmcast);
+
+ list_add_tail(&nmcast->list, &priv->multicast_list);
+ }
+
+ if (mcast)
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+ }
+
+ /* Remove all of the entries don't exist anymore */
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+ !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "deleting multicast group " IPOIB_GID_FMT "\n",
+ IPOIB_GID_ARG(mcast->mcmember.mgid));
+
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+ /* Move to the remove list */
+ list_del(&mcast->list);
+ list_add_tail(&mcast->list, &remove_list);
+ }
+ }
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ /* We have to cancel outside of the spinlock */
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(mcast->dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
+
+ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ ipoib_mcast_start_thread(dev);
+}
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
+{
+ struct ipoib_mcast_iter *iter;
+
+ iter = kmalloc(sizeof *iter, GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ memset(iter->mgid.raw, 0, sizeof iter->mgid);
+
+ if (ipoib_mcast_iter_next(iter)) {
+ ipoib_mcast_iter_free(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+void ipoib_mcast_iter_free(struct ipoib_mcast_iter *iter)
+{
+ kfree(iter);
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+ struct rb_node *n;
+ struct ipoib_mcast *mcast;
+ int ret = 1;
+
+ spin_lock_irq(&priv->lock);
+
+ n = rb_first(&priv->multicast_tree);
+
+ while (n) {
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid)) < 0) {
+ iter->mgid = mcast->mcmember.mgid;
+ iter->created = mcast->created;
+ iter->queuelen = skb_queue_len(&mcast->pkt_queue);
+ iter->complete = !!mcast->ah;
+ iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+ ret = 0;
+
+ break;
+ }
+
+ n = rb_next(n);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *mgid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only)
+{
+ *mgid = iter->mgid;
+ *created = iter->created;
+ *queuelen = iter->queuelen;
+ *complete = iter->complete;
+ *send_only = iter->send_only;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 00000000000..4933edf062c
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <ib_cache.h>
+
+#include "ipoib.h"
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr *qp_attr;
+ int attr_mask;
+ int ret;
+ u16 pkey_index;
+
+ ret = -ENOMEM;
+ qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+ if (!qp_attr)
+ goto out;
+
+ if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ret = -ENXIO;
+ goto out;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ /* set correct QKey for QP */
+ qp_attr->qkey = priv->qkey;
+ attr_mask = IB_QP_QKEY;
+ ret = ib_modify_qp(priv->qp, qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+ goto out;
+ }
+
+ /* attach QP to multicast group */
+ down(&priv->mcast_mutex);
+ ret = ib_attach_mcast(priv->qp, mgid, mlid);
+ up(&priv->mcast_mutex);
+ if (ret)
+ ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+ kfree(qp_attr);
+ return ret;
+}
+
+int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ down(&priv->mcast_mutex);
+ ret = ib_detach_mcast(priv->qp, mgid, mlid);
+ up(&priv->mcast_mutex);
+ if (ret)
+ ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+
+ return ret;
+}
+
+int ipoib_qp_create(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+ u16 pkey_index;
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+
+ /*
+ * Search through the port P_Key table for the requested pkey value.
+ * The port has to be assigned to the respective IB partition in
+ * advance.
+ */
+ ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
+ if (ret) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ return ret;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = 0;
+ qp_attr.port_num = priv->port;
+ qp_attr.pkey_index = pkey_index;
+ attr_mask =
+ IB_QP_QKEY |
+ IB_QP_PORT |
+ IB_QP_PKEY_INDEX |
+ IB_QP_STATE;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ /* Can't set this in a INIT->RTR transition */
+ attr_mask &= ~IB_QP_PORT;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ attr_mask |= IB_QP_SQ_PSN;
+ attr_mask &= ~IB_QP_PKEY_INDEX;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ return 0;
+
+out_fail:
+ ib_destroy_qp(priv->qp);
+ priv->qp = NULL;
+
+ return -EINVAL;
+}
+
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr init_attr = {
+ .cap = {
+ .max_send_wr = IPOIB_TX_RING_SIZE,
+ .max_recv_wr = IPOIB_RX_RING_SIZE,
+ .max_send_sge = 1,
+ .max_recv_sge = 1
+ },
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_UD
+ };
+
+ priv->pd = ib_alloc_pd(priv->ca);
+ if (IS_ERR(priv->pd)) {
+ printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+ return -ENODEV;
+ }
+
+ priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
+ IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 1);
+ if (IS_ERR(priv->cq)) {
+ printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+ goto out_free_pd;
+ }
+
+ if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
+ goto out_free_cq;
+
+ priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(priv->mr)) {
+ printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+ goto out_free_cq;
+ }
+
+ init_attr.send_cq = priv->cq;
+ init_attr.recv_cq = priv->cq,
+
+ priv->qp = ib_create_qp(priv->pd, &init_attr);
+ if (IS_ERR(priv->qp)) {
+ printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+ goto out_free_mr;
+ }
+
+ priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
+ priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff;
+ priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff;
+
+ priv->tx_sge.lkey = priv->mr->lkey;
+
+ priv->tx_wr.opcode = IB_WR_SEND;
+ priv->tx_wr.sg_list = &priv->tx_sge;
+ priv->tx_wr.num_sge = 1;
+ priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+
+ return 0;
+
+out_free_mr:
+ ib_dereg_mr(priv->mr);
+
+out_free_cq:
+ ib_destroy_cq(priv->cq);
+
+out_free_pd:
+ ib_dealloc_pd(priv->pd);
+ return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (priv->qp) {
+ if (ib_destroy_qp(priv->qp))
+ ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+ priv->qp = NULL;
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ }
+
+ if (ib_dereg_mr(priv->mr))
+ ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+ if (ib_destroy_cq(priv->cq))
+ ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+ if (ib_dealloc_pd(priv->pd))
+ ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(handler, struct ipoib_dev_priv, event_handler);
+
+ if (record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE ||
+ record->event == IB_EVENT_SM_CHANGE) {
+ ipoib_dbg(priv, "Port active event\n");
+ schedule_work(&priv->flush_task);
+ }
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 00000000000..94b8ea812fe
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct class_device *class_dev, char *buf)
+{
+ struct net_device *dev =
+ container_of(class_dev, struct net_device, class_dev);
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ return sprintf(buf, "%s\n", priv->parent->name);
+}
+static CLASS_DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv;
+ char intf_name[IFNAMSIZ];
+ int result;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = netdev_priv(pdev);
+
+ down(&ppriv->vlan_mutex);
+
+ /*
+ * First ensure this isn't a duplicate. We check the parent device and
+ * then all of the child interfaces to make sure the Pkey doesn't match.
+ */
+ if (ppriv->pkey == pkey) {
+ result = -ENOTUNIQ;
+ goto err;
+ }
+
+ list_for_each_entry(priv, &ppriv->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ result = -ENOTUNIQ;
+ goto err;
+ }
+ }
+
+ snprintf(intf_name, sizeof intf_name, "%s.%04x",
+ ppriv->dev->name, pkey);
+ priv = ipoib_intf_alloc(intf_name);
+ if (!priv) {
+ result = -ENOMEM;
+ goto err;
+ }
+
+ set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+ priv->pkey = pkey;
+
+ memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+ priv->dev->broadcast[8] = pkey >> 8;
+ priv->dev->broadcast[9] = pkey & 0xff;
+
+ result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+ if (result < 0) {
+ ipoib_warn(ppriv, "failed to initialize subinterface: "
+ "device %s, port %d",
+ ppriv->ca->name, ppriv->port);
+ goto device_init_failed;
+ }
+
+ result = register_netdev(priv->dev);
+ if (result) {
+ ipoib_warn(priv, "failed to initialize; error %i", result);
+ goto register_failed;
+ }
+
+ priv->parent = ppriv->dev;
+
+ if (ipoib_create_debug_file(priv->dev))
+ goto debug_failed;
+
+ if (ipoib_add_pkey_attr(priv->dev))
+ goto sysfs_failed;
+
+ if (class_device_create_file(&priv->dev->class_dev,
+ &class_device_attr_parent))
+ goto sysfs_failed;
+
+ list_add_tail(&priv->list, &ppriv->child_intfs);
+
+ up(&ppriv->vlan_mutex);
+
+ return 0;
+
+sysfs_failed:
+ ipoib_delete_debug_file(priv->dev);
+
+debug_failed:
+ unregister_netdev(priv->dev);
+
+register_failed:
+ ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+ free_netdev(priv->dev);
+
+err:
+ up(&ppriv->vlan_mutex);
+ return result;
+}
+
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+ int ret = -ENOENT;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = netdev_priv(pdev);
+
+ down(&ppriv->vlan_mutex);
+ list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ unregister_netdev(priv->dev);
+ ipoib_dev_cleanup(priv->dev);
+
+ list_del(&priv->list);
+
+ kfree(priv);
+
+ ret = 0;
+ break;
+ }
+ }
+ up(&ppriv->vlan_mutex);
+
+ return ret;
+}