blk-mq: add basic round-robin of what CPU to queue workqueue work on

Right now we just pick the first CPU in the mask, but that can
easily overload that one. Add some basic batching and round-robin
all the entries in the mask instead.

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3b561d6..5bd677e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -21,6 +21,8 @@
 	struct delayed_work	run_work;
 	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
+	int			next_cpu;
+	int			next_cpu_batch;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
@@ -126,6 +128,8 @@
 	BLK_MQ_S_STOPPED	= 0,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
+
+	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);