RDMA/cxgb4: Support on-chip SQs

T4 support on-chip SQs to reduce latency.  This patch adds support for
this in iw_cxgb4:

 - Manage ocqp memory like other adapter mem resources.
 - Allocate user mode SQs from ocqp mem if available.
 - Map ocqp mem to user process using write combining.
 - Map PCIE_MA_SYNC reg to user process.

Bump uverbs ABI.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 2851bf8..986cfd7 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -364,7 +364,14 @@
 		printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
 		goto err3;
 	}
+	err = c4iw_ocqp_pool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
+		goto err4;
+	}
 	return 0;
+err4:
+	c4iw_rqtpool_destroy(rdev);
 err3:
 	c4iw_pblpool_destroy(rdev);
 err2:
@@ -391,6 +398,7 @@
 	idr_destroy(&dev->cqidr);
 	idr_destroy(&dev->qpidr);
 	idr_destroy(&dev->mmidr);
+	iounmap(dev->rdev.oc_mw_kva);
 	ib_dealloc_device(&dev->ibdev);
 }
 
@@ -406,6 +414,17 @@
 	}
 	devp->rdev.lldi = *infop;
 
+	devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) +
+		(pci_resource_len(devp->rdev.lldi.pdev, 2) -
+		 roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size));
+	devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa,
+					       devp->rdev.lldi.vr->ocq.size);
+
+	printk(KERN_INFO MOD "ocq memory: "
+	       "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n",
+	       devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size,
+	       devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva);
+
 	mutex_lock(&dev_mutex);
 
 	ret = c4iw_rdev_open(&devp->rdev);