target: Remove extra se_device->execute_task_lock access in fast path

This patch makes __transport_execute_tasks() perform the addition of
tasks to dev->execute_task_list via __transport_add_tasks_from_cmd()
while holding dev->execute_task_lock during normal I/O fast path
submission.

It effectively removes the unnecessary re-acquire of dev->execute_task_lock
during transport_execute_tasks() -> transport_add_tasks_from_cmd() ahead
of calling  __transport_execute_tasks() to queue tasks for the passed
*se_cmd descriptor.

(v2: Re-add goto check_depth usage for multi-task submission for now..)

Cc: Christoph Hellwig <hch@lst.de>
Cc: Roland Dreier <roland@purestorage.com>
Cc: Joern Engel <joern@logfs.org>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 7c2def7..46e2511 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -68,7 +68,7 @@
 
 static int transport_generic_write_pending(struct se_cmd *);
 static int transport_processing_thread(void *param);
-static int __transport_execute_tasks(struct se_device *dev);
+static int __transport_execute_tasks(struct se_device *dev, struct se_cmd *);
 static void transport_complete_task_attr(struct se_cmd *cmd);
 static void transport_handle_queue_full(struct se_cmd *cmd,
 		struct se_device *dev);
@@ -851,13 +851,11 @@
 	spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 }
 
-static void transport_add_tasks_from_cmd(struct se_cmd *cmd)
+static void __transport_add_tasks_from_cmd(struct se_cmd *cmd)
 {
 	struct se_device *dev = cmd->se_dev;
 	struct se_task *task, *task_prev = NULL;
-	unsigned long flags;
 
-	spin_lock_irqsave(&dev->execute_task_lock, flags);
 	list_for_each_entry(task, &cmd->t_task_list, t_list) {
 		if (!list_empty(&task->t_execute_list))
 			continue;
@@ -868,6 +866,15 @@
 		__transport_add_task_to_execute_queue(task, task_prev, dev);
 		task_prev = task;
 	}
+}
+
+static void transport_add_tasks_from_cmd(struct se_cmd *cmd)
+{
+	unsigned long flags;
+	struct se_device *dev = cmd->se_dev;
+
+	spin_lock_irqsave(&dev->execute_task_lock, flags);
+	__transport_add_tasks_from_cmd(cmd);
 	spin_unlock_irqrestore(&dev->execute_task_lock, flags);
 }
 
@@ -2075,19 +2082,16 @@
 		if (!add_tasks)
 			goto execute_tasks;
 		/*
-		 * This calls transport_add_tasks_from_cmd() to handle
-		 * HEAD_OF_QUEUE ordering for SAM Task Attribute emulation
-		 * (if enabled) in __transport_add_task_to_execute_queue() and
-		 * transport_add_task_check_sam_attr().
+		 * __transport_execute_tasks() -> __transport_add_tasks_from_cmd()
+		 * adds associated se_tasks while holding dev->execute_task_lock
+		 * before I/O dispath to avoid a double spinlock access.
 		 */
-		transport_add_tasks_from_cmd(cmd);
+		__transport_execute_tasks(se_dev, cmd);
+		return 0;
 	}
-	/*
-	 * Kick the execution queue for the cmd associated struct se_device
-	 * storage object.
-	 */
+
 execute_tasks:
-	__transport_execute_tasks(se_dev);
+	__transport_execute_tasks(se_dev, NULL);
 	return 0;
 }
 
@@ -2097,7 +2101,7 @@
  *
  * Called from transport_processing_thread()
  */
-static int __transport_execute_tasks(struct se_device *dev)
+static int __transport_execute_tasks(struct se_device *dev, struct se_cmd *new_cmd)
 {
 	int error;
 	struct se_cmd *cmd = NULL;
@@ -2106,6 +2110,9 @@
 
 check_depth:
 	spin_lock_irq(&dev->execute_task_lock);
+	if (new_cmd != NULL)
+		__transport_add_tasks_from_cmd(new_cmd);
+
 	if (list_empty(&dev->execute_task_list)) {
 		spin_unlock_irq(&dev->execute_task_lock);
 		return 0;
@@ -2139,6 +2146,7 @@
 		transport_generic_request_failure(cmd);
 	}
 
+	new_cmd = NULL;
 	goto check_depth;
 
 	return 0;
@@ -4647,7 +4655,7 @@
 			goto out;
 
 get_cmd:
-		__transport_execute_tasks(dev);
+		__transport_execute_tasks(dev, NULL);
 
 		cmd = transport_get_cmd_from_queue(&dev->dev_queue_obj);
 		if (!cmd)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index cd4caf3..a74cb2be 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -781,7 +781,6 @@
 	u32			dev_port_count;
 	/* See transport_device_status_table */
 	u32			dev_status;
-	u32			dev_tcq_window_closed;
 	/* Physical device queue depth */
 	u32			queue_depth;
 	/* Used for SPC-2 reservations enforce of ISIDs */