blk-mq: switch ctx pending map to the sparser blk_align_bitmap

Each hardware queue has a bitmap of software queues with pending
requests. When new IO is queued on a software queue, the bit is
set, and when IO is pruned on a hardware queue run, the bit is
cleared. This causes a lot of traffic. Switch this from the regular
BITS_PER_LONG bitmap to a sparser layout, similarly to what was
done for blk-mq tagging.

20% performance increase was observed for single threaded IO, and
about 15% performanc increase on multiple threads driving the
same device.

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f83d15f..952e558 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -11,6 +11,12 @@
 	void (*notify)(void *data, unsigned long action, unsigned int cpu);
 };
 
+struct blk_mq_ctxmap {
+	unsigned int map_size;
+	unsigned int bits_per_word;
+	struct blk_align_bitmap *map;
+};
+
 struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
@@ -31,8 +37,8 @@
 
 	void			*driver_data;
 
-	unsigned int 		nr_ctx_map;
-	unsigned long		*ctx_map;
+	struct blk_mq_ctxmap	ctx_map;
+
 	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;