blk-mq: save memory by freeing requests on unused hardware queues Depending on the topology of the machine and the number of queues exposed by a device, we can end up in a situation where some of the hardware queues are unused (as in, they don't map to any software queues). For this case, free up the memory used by the request map, as we will not use it. This can be a substantial amount of memory, depending on the number of queues vs CPUs and the queue depth of the device. Signed-off-by: Jens Axboe <axboe@fb.com>

commit: 484b4061e6683e0e6a09c7455f80781128dc8a6b [log] [tgz]
author: Jens Axboe <axboe@fb.com> Wed May 21 14:01:15 2014 -0600
committer: Jens Axboe <axboe@fb.com> Wed May 21 14:01:15 2014 -0600
tree: 614cc76c04c6a7e17278839d28b3e26608a63f82
parent: e814e71ba4a6e1d7509b0f4b1928365ea650cace [diff] [blame]
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5a3683f..103aa1d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c

@@ -597,8 +597,16 @@
 	unsigned long next = 0;
 	int i, next_set = 0;
 
-	queue_for_each_hw_ctx(q, hctx, i)
+	queue_for_each_hw_ctx(q, hctx, i) {
+		/*
+		 * If not software queues are currently mapped to this
+		 * hardware queue, there's nothing to check
+		 */
+		if (!hctx->nr_ctx || !hctx->tags)
+			continue;
+
 		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+	}
 
 	if (next_set) {
 		next = blk_rq_timeout(round_jiffies_up(next));
@@ -1196,53 +1204,6 @@
 }
 EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
 
-static int blk_mq_hctx_notify(void *data, unsigned long action,
-			      unsigned int cpu)
-{
-	struct blk_mq_hw_ctx *hctx = data;
-	struct request_queue *q = hctx->queue;
-	struct blk_mq_ctx *ctx;
-	LIST_HEAD(tmp);
-
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-		return NOTIFY_OK;
-
-	/*
-	 * Move ctx entries to new CPU, if this one is going away.
-	 */
-	ctx = __blk_mq_get_ctx(q, cpu);
-
-	spin_lock(&ctx->lock);
-	if (!list_empty(&ctx->rq_list)) {
-		list_splice_init(&ctx->rq_list, &tmp);
-		blk_mq_hctx_clear_pending(hctx, ctx);
-	}
-	spin_unlock(&ctx->lock);
-
-	if (list_empty(&tmp))
-		return NOTIFY_OK;
-
-	ctx = blk_mq_get_ctx(q);
-	spin_lock(&ctx->lock);
-
-	while (!list_empty(&tmp)) {
-		struct request *rq;
-
-		rq = list_first_entry(&tmp, struct request, queuelist);
-		rq->mq_ctx = ctx;
-		list_move_tail(&rq->queuelist, &ctx->rq_list);
-	}
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-	blk_mq_hctx_mark_pending(hctx, ctx);
-
-	spin_unlock(&ctx->lock);
-
-	blk_mq_run_hw_queue(hctx, true);
-	blk_mq_put_ctx(ctx);
-	return NOTIFY_OK;
-}
-
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
 		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
@@ -1384,6 +1345,77 @@
 	return 0;
 }
 
+static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_ctx *ctx;
+	LIST_HEAD(tmp);
+
+	/*
+	 * Move ctx entries to new CPU, if this one is going away.
+	 */
+	ctx = __blk_mq_get_ctx(q, cpu);
+
+	spin_lock(&ctx->lock);
+	if (!list_empty(&ctx->rq_list)) {
+		list_splice_init(&ctx->rq_list, &tmp);
+		blk_mq_hctx_clear_pending(hctx, ctx);
+	}
+	spin_unlock(&ctx->lock);
+
+	if (list_empty(&tmp))
+		return NOTIFY_OK;
+
+	ctx = blk_mq_get_ctx(q);
+	spin_lock(&ctx->lock);
+
+	while (!list_empty(&tmp)) {
+		struct request *rq;
+
+		rq = list_first_entry(&tmp, struct request, queuelist);
+		rq->mq_ctx = ctx;
+		list_move_tail(&rq->queuelist, &ctx->rq_list);
+	}
+
+	hctx = q->mq_ops->map_queue(q, ctx->cpu);
+	blk_mq_hctx_mark_pending(hctx, ctx);
+
+	spin_unlock(&ctx->lock);
+
+	blk_mq_run_hw_queue(hctx, true);
+	blk_mq_put_ctx(ctx);
+	return NOTIFY_OK;
+}
+
+static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+	struct request_queue *q = hctx->queue;
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	if (set->tags[hctx->queue_num])
+		return NOTIFY_OK;
+
+	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
+	if (!set->tags[hctx->queue_num])
+		return NOTIFY_STOP;
+
+	hctx->tags = set->tags[hctx->queue_num];
+	return NOTIFY_OK;
+}
+
+static int blk_mq_hctx_notify(void *data, unsigned long action,
+			      unsigned int cpu)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+		return blk_mq_hctx_cpu_offline(hctx, cpu);
+	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	return NOTIFY_OK;
+}
+
 static int blk_mq_init_hw_queues(struct request_queue *q,
 		struct blk_mq_tag_set *set)
 {
@@ -1513,6 +1545,24 @@
 	}
 
 	queue_for_each_hw_ctx(q, hctx, i) {
+		/*
+		 * If not software queues are mapped to this hardware queue,
+		 * disable it and free the request entries
+		 */
+		if (!hctx->nr_ctx) {
+			struct blk_mq_tag_set *set = q->tag_set;
+
+			if (set->tags[i]) {
+				blk_mq_free_rq_map(set, set->tags[i], i);
+				set->tags[i] = NULL;
+				hctx->tags = NULL;
+			}
+			continue;
+		}
+
+		/*
+		 * Initialize batch roundrobin counts
+		 */
 		hctx->next_cpu = cpumask_first(hctx->cpumask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
@@ -1645,14 +1695,14 @@
 	if (blk_mq_init_hw_queues(q, set))
 		goto err_flush_rq;
 
-	blk_mq_map_swqueue(q);
-
 	mutex_lock(&all_q_mutex);
 	list_add_tail(&q->all_q_node, &all_q_list);
 	mutex_unlock(&all_q_mutex);
 
 	blk_mq_add_queue_tag_set(set, q);
 
+	blk_mq_map_swqueue(q);
+
 	return q;
 
 err_flush_rq:
@@ -1790,8 +1840,11 @@
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++)
-		blk_mq_free_rq_map(set, set->tags[i], i);
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		if (set->tags[i])
+			blk_mq_free_rq_map(set, set->tags[i], i);
+	}
+
 	kfree(set->tags);
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
commit	484b4061e6683e0e6a09c7455f80781128dc8a6b	[log] [tgz]
author	Jens Axboe <axboe@fb.com>	Wed May 21 14:01:15 2014 -0600
committer	Jens Axboe <axboe@fb.com>	Wed May 21 14:01:15 2014 -0600
tree	614cc76c04c6a7e17278839d28b3e26608a63f82
parent	e814e71ba4a6e1d7509b0f4b1928365ea650cace [diff] [blame]