IB/mthca: Optimize large messages on Sinai HCAs

Sinai (one-port PCI Express) HCAs get improved throughput for messages
bigger than 80 KB in DDR mode if memory keys are formatted in a
specific way.  The enhancement only works if the memory key table is
smaller than 2^24 entries.  For larger tables, the enhancement is off
and a warning is printed (to avoid silent performance loss).

Signed-off-by: Eli Cohen <eli@mellanox.co.il>
Signed-off-by: Michael Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 948a286..343eca5 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -1277,7 +1277,8 @@
 	int err;
 
 #define INIT_HCA_IN_SIZE             	 0x200
-#define INIT_HCA_FLAGS_OFFSET        	 0x014
+#define INIT_HCA_FLAGS1_OFFSET           0x00c
+#define INIT_HCA_FLAGS2_OFFSET           0x014
 #define INIT_HCA_QPC_OFFSET          	 0x020
 #define  INIT_HCA_QPC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x10)
 #define  INIT_HCA_LOG_QP_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x17)
@@ -1320,15 +1321,18 @@
 
 	memset(inbox, 0, INIT_HCA_IN_SIZE);
 
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET);
+
 #if defined(__LITTLE_ENDIAN)
-	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
 #elif defined(__BIG_ENDIAN)
-	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1);
 #else
 #error Host endianness not defined
 #endif
 	/* Check port for UD address vector: */
-	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1);
 
 	/* We leave wqe_quota, responder_exu, etc as 0 (default) */
 
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index ea48c89..ad52edb 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -64,7 +64,8 @@
 	MTHCA_FLAG_NO_LAM     = 1 << 5,
 	MTHCA_FLAG_FMR        = 1 << 6,
 	MTHCA_FLAG_MEMFREE    = 1 << 7,
-	MTHCA_FLAG_PCIE       = 1 << 8
+	MTHCA_FLAG_PCIE       = 1 << 8,
+	MTHCA_FLAG_SINAI_OPT  = 1 << 9
 };
 
 enum {
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 9c849d2..d94837f 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -935,13 +935,19 @@
 
 static struct {
 	u64 latest_fw;
-	int is_memfree;
-	int is_pcie;
+	u32 flags;
 } mthca_hca_table[] = {
-	[TAVOR]        = { .latest_fw = MTHCA_FW_VER(3, 3, 3), .is_memfree = 0, .is_pcie = 0 },
-	[ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 7, 0), .is_memfree = 0, .is_pcie = 1 },
-	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 1, 0), .is_memfree = 1, .is_pcie = 1 },
-	[SINAI]        = { .latest_fw = MTHCA_FW_VER(1, 0, 1), .is_memfree = 1, .is_pcie = 1 }
+	[TAVOR]        = { .latest_fw = MTHCA_FW_VER(3, 3, 3),
+			   .flags     = 0 },
+	[ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 7, 0),
+			   .flags     = MTHCA_FLAG_PCIE },
+	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 1, 0),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE },
+	[SINAI]        = { .latest_fw = MTHCA_FW_VER(1, 0, 1),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE    |
+					MTHCA_FLAG_SINAI_OPT }
 };
 
 static int __devinit mthca_init_one(struct pci_dev *pdev,
@@ -1031,12 +1037,9 @@
 
 	mdev->pdev = pdev;
 
+	mdev->mthca_flags = mthca_hca_table[id->driver_data].flags;
 	if (ddr_hidden)
 		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
-	if (mthca_hca_table[id->driver_data].is_memfree)
-		mdev->mthca_flags |= MTHCA_FLAG_MEMFREE;
-	if (mthca_hca_table[id->driver_data].is_pcie)
-		mdev->mthca_flags |= MTHCA_FLAG_PCIE;
 
 	/*
 	 * Now reset the HCA before we touch the PCI capabilities or
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 0b48048..698b621 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -76,6 +76,8 @@
 #define MTHCA_MPT_STATUS_SW 0xF0
 #define MTHCA_MPT_STATUS_HW 0x00
 
+#define SINAI_FMR_KEY_INC 0x1000000
+
 /*
  * Buddy allocator for MTT segments (currently not very efficient
  * since it doesn't keep a free list and just searches linearly
@@ -330,6 +332,14 @@
 		return tavor_key_to_hw_index(key);
 }
 
+static inline u32 adjust_key(struct mthca_dev *dev, u32 key)
+{
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		return ((key << 20) & 0x800000) | (key & 0x7fffff);
+	else
+		return key;
+}
+
 int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
 		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr)
 {
@@ -345,6 +355,7 @@
 	key = mthca_alloc(&dev->mr_table.mpt_alloc);
 	if (key == -1)
 		return -ENOMEM;
+	key = adjust_key(dev, key);
 	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
 
 	if (mthca_is_memfree(dev)) {
@@ -504,6 +515,7 @@
 	key = mthca_alloc(&dev->mr_table.mpt_alloc);
 	if (key == -1)
 		return -ENOMEM;
+	key = adjust_key(dev, key);
 
 	idx = key & (dev->limits.num_mpts - 1);
 	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
@@ -687,7 +699,10 @@
 	++fmr->maps;
 
 	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
-	key += dev->limits.num_mpts;
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		key += SINAI_FMR_KEY_INC;
+	else
+		key += dev->limits.num_mpts;
 	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
 
 	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
@@ -760,6 +775,9 @@
 	else
 		dev->mthca_flags |= MTHCA_FLAG_FMR;
 
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		mthca_dbg(dev, "Memory key throughput optimization activated.\n");
+
 	err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
 			       fls(dev->limits.num_mtt_segs - 1));
 
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
index 08a9093..58d44aa 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.c
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -152,7 +152,7 @@
 		}
 		if (total_size > mem_avail) {
 			mthca_err(dev, "Profile requires 0x%llx bytes; "
-				  "won't in 0x%llx bytes of context memory.\n",
+				  "won't fit in 0x%llx bytes of context memory.\n",
 				  (unsigned long long) total_size,
 				  (unsigned long long) mem_avail);
 			kfree(profile);
@@ -262,6 +262,14 @@
 	 */
 	dev->limits.num_pds = MTHCA_NUM_PDS;
 
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT &&
+	    init_hca->log_mpt_sz > 23) {
+		mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n",
+			   init_hca->log_mpt_sz);
+		mthca_warn(dev, "Disabling memory key throughput optimization.\n");
+		dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT;
+	}
+
 	/*
 	 * For Tavor, FMRs use ioremapped PCI memory. For 32 bit
 	 * systems it may use too much vmalloc space to map all MTT