scsi: hisi_sas: optimise DMA slot memory

Currently we allocate 3 separate DMA buffers, each from its own pool,
for each slot. This is inefficient in terms of memory usage (each
buffer is less than 1 page in size, so we lose memory to alignment),
and also in terms of time, since we do 3 allocations + de-allocations
per slot instead of 1.

To optimise, combine the 3 DMA buffers into a single buffer from a
single pool.

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: John Garry <john.garry@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h
index 22dd48b..a722f2b 100644
--- a/drivers/scsi/hisi_sas/hisi_sas.h
+++ b/drivers/scsi/hisi_sas/hisi_sas.h
@@ -34,10 +34,24 @@
 #define HISI_SAS_MAX_DEVICES HISI_SAS_MAX_ITCT_ENTRIES
 #define HISI_SAS_RESET_BIT	0
 
-#define HISI_SAS_STATUS_BUF_SZ \
-		(sizeof(struct hisi_sas_err_record) + 1024)
-#define HISI_SAS_COMMAND_TABLE_SZ \
-		(((sizeof(union hisi_sas_command_table)+3)/4)*4)
+#define HISI_SAS_STATUS_BUF_SZ (sizeof(struct hisi_sas_status_buffer))
+#define HISI_SAS_COMMAND_TABLE_SZ (sizeof(union hisi_sas_command_table))
+
+#define hisi_sas_status_buf_addr(buf) \
+	((buf) + offsetof(struct hisi_sas_slot_buf_table, status_buffer))
+#define hisi_sas_status_buf_addr_mem(slot) hisi_sas_status_buf_addr((slot)->buf)
+#define hisi_sas_status_buf_addr_dma(slot) \
+	hisi_sas_status_buf_addr((slot)->buf_dma)
+
+#define hisi_sas_cmd_hdr_addr(buf) \
+	((buf) + offsetof(struct hisi_sas_slot_buf_table, command_header))
+#define hisi_sas_cmd_hdr_addr_mem(slot) hisi_sas_cmd_hdr_addr((slot)->buf)
+#define hisi_sas_cmd_hdr_addr_dma(slot) hisi_sas_cmd_hdr_addr((slot)->buf_dma)
+
+#define hisi_sas_sge_addr(buf) \
+	((buf) + offsetof(struct hisi_sas_slot_buf_table, sge_page))
+#define hisi_sas_sge_addr_mem(slot) hisi_sas_sge_addr((slot)->buf)
+#define hisi_sas_sge_addr_dma(slot) hisi_sas_sge_addr((slot)->buf_dma)
 
 #define HISI_SAS_MAX_SSP_RESP_SZ (sizeof(struct ssp_frame_hdr) + 1024)
 #define HISI_SAS_MAX_SMP_RESP_SZ 1028
@@ -139,14 +153,10 @@
 	int	cmplt_queue_slot;
 	int	idx;
 	int	abort;
+	void	*buf;
+	dma_addr_t buf_dma;
 	void	*cmd_hdr;
 	dma_addr_t cmd_hdr_dma;
-	void	*status_buffer;
-	dma_addr_t status_buffer_dma;
-	void *command_table;
-	dma_addr_t command_table_dma;
-	struct hisi_sas_sge_page *sge_page;
-	dma_addr_t sge_page_dma;
 	struct work_struct abort_slot;
 	struct timer_list internal_abort_timer;
 };
@@ -232,10 +242,8 @@
 
 	int	queue_count;
 
-	struct dma_pool *sge_page_pool;
+	struct dma_pool *buffer_pool;
 	struct hisi_sas_device	devices[HISI_SAS_MAX_DEVICES];
-	struct dma_pool *command_table_pool;
-	struct dma_pool *status_buffer_pool;
 	struct hisi_sas_cmd_hdr	*cmd_hdr[HISI_SAS_MAX_QUEUES];
 	dma_addr_t cmd_hdr_dma[HISI_SAS_MAX_QUEUES];
 	void *complete_hdr[HISI_SAS_MAX_QUEUES];
@@ -347,7 +355,7 @@
 #define HISI_SAS_SGE_PAGE_CNT SG_CHUNK_SIZE
 struct hisi_sas_sge_page {
 	struct hisi_sas_sge sge[HISI_SAS_SGE_PAGE_CNT];
-};
+}  __aligned(16);
 
 struct hisi_sas_command_table_ssp {
 	struct ssp_frame_hdr hdr;
@@ -366,6 +374,17 @@
 	struct hisi_sas_command_table_ssp ssp;
 	struct hisi_sas_command_table_smp smp;
 	struct hisi_sas_command_table_stp stp;
+}  __aligned(16);
+
+struct hisi_sas_status_buffer {	/* sizeof() gives HISI_SAS_STATUS_BUF_SZ */
+	struct hisi_sas_err_record err;
+	u8	iu[1024];	/* 1024 bytes following the error record */
+}  __aligned(16);
+
+struct hisi_sas_slot_buf_table {	/* layout of the combined slot buffer */
+	struct hisi_sas_status_buffer status_buffer;
+	union hisi_sas_command_table command_header;	/* command table */
+	struct hisi_sas_sge_page sge_page;
 };
 
 extern struct scsi_transport_template *hisi_sas_stt;