cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes The current logic suffers from a slow response time to disable user DB usage, and also fails to avoid DB FIFO drops under heavy load. This commit fixes these deficiencies and makes the avoidance logic more optimal. This is done by more efficiently notifying the ULDs of potential DB problems, and implements a smoother flow control algorithm in iw_cxgb4, which is the ULD that puts the most load on the DB fifo. Design: cxgb4: Direct ULD callback from the DB FULL/DROP interrupt handler. This allows the ULD to stop doing user DB writes as quickly as possible. While user DB usage is disabled, the LLD will accumulate DB write events for its queues. Then once DB usage is reenabled, a single DB write is done for each queue with its accumulated write count. This reduces the load put on the DB fifo when reenabling. iw_cxgb4: Instead of marking each qp to indicate DB writes are disabled, we create a device-global status page that each user process maps. This allows iw_cxgb4 to only set this single bit to disable all DB writes for all user QPs vs traversing the idr of all the active QPs. If the libcxgb4 doesn't support this, then we fall back to the old approach of marking each QP. Thus we allow the new driver to work with an older libcxgb4. When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes via the status page and transition the DB state to STOPPED. As user processes see that DB writes are disabled, they call into iw_cxgb4 to submit their DB write events. Since the DB state is in STOPPED, the QP trying to write gets enqueued on a new DB "flow control" list. As subsequent DB writes are submitted for this flow controlled QP, the amount of writes are accumulated for each QP on the flow control list. So all the user QPs that are actively ringing the DB get put on this list and the number of writes they request are accumulated. When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq context, we change the DB state to FLOW_CONTROL, and begin resuming all the QPs that are on the flow control list. This logic runs on until the flow control list is empty or we exit FLOW_CONTROL mode (due to a DB DROP upcall, for example). QPs are removed from this list, and their accumulated DB write counts written to the DB FIFO. Sets of QPs, called chunks in the code, are removed at one time. The chunk size is 64. So 64 QPs are resumed at a time, and before the next chunk is resumed, the logic waits (blocks) for the DB FIFO to drain. This prevents resuming to quickly and overflowing the FIFO. Once the flow control list is empty, the db state transitions back to NORMAL and user QPs are again allowed to write directly to the user DB register. The algorithm is designed such that if the DB write load is high enough, then all the DB writes get submitted by the kernel using this flow controlled approach to avoid DB drops. As the load lightens though, we resume to normal DB writes directly by user applications. Signed-off-by: Steve Wise <swise@opengridcomputing.com> Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 05eb23893c2cf9502a9cec0c32e7f1d1ed2895c8 [log] [tgz]
author: Steve Wise <swise@opengridcomputing.com> Fri Mar 14 21:52:08 2014 +0530
committer: David S. Miller <davem@davemloft.net> Fri Mar 14 22:44:11 2014 -0400
tree: b7552ee535f565d5a83039ed976a442c7f869402
parent: 7a2cea2aaae2d5eb5c00c49c52180c7c2c66130a [diff] [blame]
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 4a03385..ba7335f 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c

@@ -64,6 +64,10 @@
 static LIST_HEAD(uld_ctx_list);
 static DEFINE_MUTEX(dev_mutex);
 
+#define DB_FC_RESUME_SIZE 64
+#define DB_FC_RESUME_DELAY 1
+#define DB_FC_DRAIN_THRESH 0
+
 static struct dentry *c4iw_debugfs_root;
 
 struct c4iw_debugfs_data {
@@ -282,7 +286,7 @@
 	.llseek  = default_llseek,
 };
 
-static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY"};
+static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"};
 
 static int stats_show(struct seq_file *seq, void *v)
 {
@@ -311,9 +315,10 @@
 	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);
 	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
 	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop);
-	seq_printf(seq, " DB State: %s Transitions %llu\n",
+	seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n",
 		   db_state_str[dev->db_state],
-		   dev->rdev.stats.db_state_transitions);
+		   dev->rdev.stats.db_state_transitions,
+		   dev->rdev.stats.db_fc_interruptions);
 	seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
 	seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",
 		   dev->rdev.stats.act_ofld_conn_fails);
@@ -643,6 +648,12 @@
 		printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
 		goto err4;
 	}
+	rdev->status_page = (struct t4_dev_status_page *)
+			    __get_free_page(GFP_KERNEL);
+	if (!rdev->status_page) {
+		pr_err(MOD "error allocating status page\n");
+		goto err4;
+	}
 	return 0;
 err4:
 	c4iw_rqtpool_destroy(rdev);
@@ -656,6 +667,7 @@
 
 static void c4iw_rdev_close(struct c4iw_rdev *rdev)
 {
+	free_page((unsigned long)rdev->status_page);
 	c4iw_pblpool_destroy(rdev);
 	c4iw_rqtpool_destroy(rdev);
 	c4iw_destroy_resource(&rdev->resource);
@@ -703,18 +715,6 @@
 		pr_info("%s: On-Chip Queues not supported on this device.\n",
 			pci_name(infop->pdev));
 
-	if (!is_t4(infop->adapter_type)) {
-		if (!allow_db_fc_on_t5) {
-			db_fc_threshold = 100000;
-			pr_info("DB Flow Control Disabled.\n");
-		}
-
-		if (!allow_db_coalescing_on_t5) {
-			db_coalescing_threshold = -1;
-			pr_info("DB Coalescing Disabled.\n");
-		}
-	}
-
 	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
 	if (!devp) {
 		printk(KERN_ERR MOD "Cannot allocate ib device\n");
@@ -749,6 +749,7 @@
 	spin_lock_init(&devp->lock);
 	mutex_init(&devp->rdev.stats.lock);
 	mutex_init(&devp->db_mutex);
+	INIT_LIST_HEAD(&devp->db_fc_list);
 
 	if (c4iw_debugfs_root) {
 		devp->debugfs_root = debugfs_create_dir(
@@ -977,13 +978,16 @@
 
 static void stop_queues(struct uld_ctx *ctx)
 {
-	spin_lock_irq(&ctx->dev->lock);
-	if (ctx->dev->db_state == NORMAL) {
-		ctx->dev->rdev.stats.db_state_transitions++;
-		ctx->dev->db_state = FLOW_CONTROL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->dev->lock, flags);
+	ctx->dev->rdev.stats.db_state_transitions++;
+	ctx->dev->db_state = STOPPED;
+	if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)
 		idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
-	}
-	spin_unlock_irq(&ctx->dev->lock);
+	else
+		ctx->dev->rdev.status_page->db_off = 1;
+	spin_unlock_irqrestore(&ctx->dev->lock, flags);
 }
 
 static int enable_qp_db(int id, void *p, void *data)
@@ -994,15 +998,70 @@
 	return 0;
 }
 
+static void resume_rc_qp(struct c4iw_qp *qp)
+{
+	spin_lock(&qp->lock);
+	t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc);
+	qp->wq.sq.wq_pidx_inc = 0;
+	t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc);
+	qp->wq.rq.wq_pidx_inc = 0;
+	spin_unlock(&qp->lock);
+}
+
+static void resume_a_chunk(struct uld_ctx *ctx)
+{
+	int i;
+	struct c4iw_qp *qp;
+
+	for (i = 0; i < DB_FC_RESUME_SIZE; i++) {
+		qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp,
+				      db_fc_entry);
+		list_del_init(&qp->db_fc_entry);
+		resume_rc_qp(qp);
+		if (list_empty(&ctx->dev->db_fc_list))
+			break;
+	}
+}
+
 static void resume_queues(struct uld_ctx *ctx)
 {
 	spin_lock_irq(&ctx->dev->lock);
-	if (ctx->dev->qpcnt <= db_fc_threshold &&
-	    ctx->dev->db_state == FLOW_CONTROL) {
-		ctx->dev->db_state = NORMAL;
-		ctx->dev->rdev.stats.db_state_transitions++;
-		idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
+	if (ctx->dev->db_state != STOPPED)
+		goto out;
+	ctx->dev->db_state = FLOW_CONTROL;
+	while (1) {
+		if (list_empty(&ctx->dev->db_fc_list)) {
+			WARN_ON(ctx->dev->db_state != FLOW_CONTROL);
+			ctx->dev->db_state = NORMAL;
+			ctx->dev->rdev.stats.db_state_transitions++;
+			if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
+				idr_for_each(&ctx->dev->qpidr, enable_qp_db,
+					     NULL);
+			} else {
+				ctx->dev->rdev.status_page->db_off = 0;
+			}
+			break;
+		} else {
+			if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1)
+			    < (ctx->dev->rdev.lldi.dbfifo_int_thresh <<
+			       DB_FC_DRAIN_THRESH)) {
+				resume_a_chunk(ctx);
+			}
+			if (!list_empty(&ctx->dev->db_fc_list)) {
+				spin_unlock_irq(&ctx->dev->lock);
+				if (DB_FC_RESUME_DELAY) {
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					schedule_timeout(DB_FC_RESUME_DELAY);
+				}
+				spin_lock_irq(&ctx->dev->lock);
+				if (ctx->dev->db_state != FLOW_CONTROL)
+					break;
+			}
+		}
 	}
+out:
+	if (ctx->dev->db_state != NORMAL)
+		ctx->dev->rdev.stats.db_fc_interruptions++;
 	spin_unlock_irq(&ctx->dev->lock);
 }
 
@@ -1028,12 +1087,12 @@
 	return 0;
 }
 
-static void deref_qps(struct qp_list qp_list)
+static void deref_qps(struct qp_list *qp_list)
 {
 	int idx;
 
-	for (idx = 0; idx < qp_list.idx; idx++)
-		c4iw_qp_rem_ref(&qp_list.qps[idx]->ibqp);
+	for (idx = 0; idx < qp_list->idx; idx++)
+		c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp);
 }
 
 static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
@@ -1044,17 +1103,22 @@
 	for (idx = 0; idx < qp_list->idx; idx++) {
 		struct c4iw_qp *qp = qp_list->qps[idx];
 
+		spin_lock_irq(&qp->rhp->lock);
+		spin_lock(&qp->lock);
 		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
 					  qp->wq.sq.qid,
 					  t4_sq_host_wq_pidx(&qp->wq),
 					  t4_sq_wq_size(&qp->wq));
 		if (ret) {
-			printk(KERN_ERR MOD "%s: Fatal error - "
+			pr_err(KERN_ERR MOD "%s: Fatal error - "
 			       "DB overflow recovery failed - "
 			       "error syncing SQ qid %u\n",
 			       pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
+			spin_unlock(&qp->lock);
+			spin_unlock_irq(&qp->rhp->lock);
 			return;
 		}
+		qp->wq.sq.wq_pidx_inc = 0;
 
 		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
 					  qp->wq.rq.qid,
@@ -1062,12 +1126,17 @@
 					  t4_rq_wq_size(&qp->wq));
 
 		if (ret) {
-			printk(KERN_ERR MOD "%s: Fatal error - "
+			pr_err(KERN_ERR MOD "%s: Fatal error - "
 			       "DB overflow recovery failed - "
 			       "error syncing RQ qid %u\n",
 			       pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
+			spin_unlock(&qp->lock);
+			spin_unlock_irq(&qp->rhp->lock);
 			return;
 		}
+		qp->wq.rq.wq_pidx_inc = 0;
+		spin_unlock(&qp->lock);
+		spin_unlock_irq(&qp->rhp->lock);
 
 		/* Wait for the dbfifo to drain */
 		while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
@@ -1083,36 +1152,22 @@
 	struct qp_list qp_list;
 	int ret;
 
-	/* lock out kernel db ringers */
-	mutex_lock(&ctx->dev->db_mutex);
-
-	/* put all queues in to recovery mode */
-	spin_lock_irq(&ctx->dev->lock);
-	ctx->dev->db_state = RECOVERY;
-	ctx->dev->rdev.stats.db_state_transitions++;
-	idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
-	spin_unlock_irq(&ctx->dev->lock);
-
 	/* slow everybody down */
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	schedule_timeout(usecs_to_jiffies(1000));
 
-	/* Wait for the dbfifo to completely drain. */
-	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(usecs_to_jiffies(10));
-	}
-
 	/* flush the SGE contexts */
 	ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]);
 	if (ret) {
 		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
 		       pci_name(ctx->lldi.pdev));
-		goto out;
+		return;
 	}
 
 	/* Count active queues so we can build a list of queues to recover */
 	spin_lock_irq(&ctx->dev->lock);
+	WARN_ON(ctx->dev->db_state != STOPPED);
+	ctx->dev->db_state = RECOVERY;
 	idr_for_each(&ctx->dev->qpidr, count_qps, &count);
 
 	qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC);
@@ -1120,7 +1175,7 @@
 		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
 		       pci_name(ctx->lldi.pdev));
 		spin_unlock_irq(&ctx->dev->lock);
-		goto out;
+		return;
 	}
 	qp_list.idx = 0;
 
@@ -1133,29 +1188,13 @@
 	recover_lost_dbs(ctx, &qp_list);
 
 	/* we're almost done!  deref the qps and clean up */
-	deref_qps(qp_list);
+	deref_qps(&qp_list);
 	kfree(qp_list.qps);
 
-	/* Wait for the dbfifo to completely drain again */
-	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(usecs_to_jiffies(10));
-	}
-
-	/* resume the queues */
 	spin_lock_irq(&ctx->dev->lock);
-	if (ctx->dev->qpcnt > db_fc_threshold)
-		ctx->dev->db_state = FLOW_CONTROL;
-	else {
-		ctx->dev->db_state = NORMAL;
-		idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
-	}
-	ctx->dev->rdev.stats.db_state_transitions++;
+	WARN_ON(ctx->dev->db_state != RECOVERY);
+	ctx->dev->db_state = STOPPED;
 	spin_unlock_irq(&ctx->dev->lock);
-
-out:
-	/* start up kernel db ringers again */
-	mutex_unlock(&ctx->dev->db_mutex);
 }
 
 static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
@@ -1165,9 +1204,7 @@
 	switch (control) {
 	case CXGB4_CONTROL_DB_FULL:
 		stop_queues(ctx);
-		mutex_lock(&ctx->dev->rdev.stats.lock);
 		ctx->dev->rdev.stats.db_full++;
-		mutex_unlock(&ctx->dev->rdev.stats.lock);
 		break;
 	case CXGB4_CONTROL_DB_EMPTY:
 		resume_queues(ctx);
commit	05eb23893c2cf9502a9cec0c32e7f1d1ed2895c8	[log] [tgz]
author	Steve Wise <swise@opengridcomputing.com>	Fri Mar 14 21:52:08 2014 +0530
committer	David S. Miller <davem@davemloft.net>	Fri Mar 14 22:44:11 2014 -0400
tree	b7552ee535f565d5a83039ed976a442c7f869402
parent	7a2cea2aaae2d5eb5c00c49c52180c7c2c66130a [diff] [blame]