block: Export I/O topology for block devices and partitions
To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment. This patch adds support for exposing
I/O topology characteristics as devices are stacked.
logical_block_size is the smallest unit the device can address.
physical_block_size indicates the smallest I/O the device can write
without incurring a read-modify-write penalty.
The io_min parameter is the smallest preferred I/O size reported by
the device. In many cases this is the same as the physical block
size. However, the io_min parameter can be scaled up when stacking
(RAID5 chunk size > physical block size).
The io_opt characteristic indicates the optimal I/O size reported by
the device. This is usually the stripe width for arrays.
The alignment_offset parameter indicates the number of bytes the start
of the device/partition is offset from the device's natural alignment.
Partition tools and MD/DM utilities can use this to pad their offsets
so filesystems start on proper boundaries.
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b0f547c..5649f34 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -309,9 +309,94 @@
void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
{
q->limits.logical_block_size = size;
+
+ if (q->limits.physical_block_size < size)
+ q->limits.physical_block_size = size;
+
+ if (q->limits.io_min < q->limits.physical_block_size)
+ q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_logical_block_size);
+/**
+ * blk_queue_physical_block_size - set physical block size for the queue
+ * @q: the request queue for the device
+ * @size: the physical block size, in bytes
+ *
+ * Description:
+ * This should be set to the lowest possible sector size that the
+ * hardware can operate on without reverting to read-modify-write
+ * operations.
+ */
+void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+{
+ q->limits.physical_block_size = size;
+
+ if (q->limits.physical_block_size < q->limits.logical_block_size)
+ q->limits.physical_block_size = q->limits.logical_block_size;
+
+ if (q->limits.io_min < q->limits.physical_block_size)
+ q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_physical_block_size);
+
+/**
+ * blk_queue_alignment_offset - set physical block alignment offset
+ * @q: the request queue for the device
+ * @alignment: alignment offset in bytes
+ *
+ * Description:
+ * Some devices are naturally misaligned to compensate for things like
+ * the legacy DOS partition table 63-sector offset. Low-level drivers
+ * should call this function for devices whose first sector is not
+ * naturally aligned.
+ */
+void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
+{
+ q->limits.alignment_offset =
+ offset & (q->limits.physical_block_size - 1);
+ q->limits.misaligned = 0;
+}
+EXPORT_SYMBOL(blk_queue_alignment_offset);
+
+/**
+ * blk_queue_io_min - set minimum request size for the queue
+ * @q: the request queue for the device
+ * @io_min: smallest I/O size in bytes
+ *
+ * Description:
+ * Some devices have an internal block size bigger than the reported
+ * hardware sector size. This function can be used to signal the
+ * smallest I/O the device can perform without incurring a performance
+ * penalty.
+ */
+void blk_queue_io_min(struct request_queue *q, unsigned int min)
+{
+ q->limits.io_min = min;
+
+ if (q->limits.io_min < q->limits.logical_block_size)
+ q->limits.io_min = q->limits.logical_block_size;
+
+ if (q->limits.io_min < q->limits.physical_block_size)
+ q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_io_min);
+
+/**
+ * blk_queue_io_opt - set optimal request size for the queue
+ * @q: the request queue for the device
+ * @io_opt: optimal request size in bytes
+ *
+ * Description:
+ * Drivers can call this function to set the preferred I/O request
+ * size for devices that report such a value.
+ */
+void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
+{
+ q->limits.io_opt = opt;
+}
+EXPORT_SYMBOL(blk_queue_io_opt);
+
/*
* Returns the minimum that is _not_ zero, unless both are zero.
*/
@@ -358,6 +443,107 @@
EXPORT_SYMBOL(blk_queue_stack_limits);
/**
+ * blk_stack_limits - adjust queue_limits for stacked devices
+ * @t: the stacking driver limits (top)
+ * @bdev: the underlying queue limits (bottom)
+ * @offset: offset to beginning of data within component device
+ *
+ * Description:
+ * Merges two queue_limit structs. Returns 0 if alignment didn't
+ * change. Returns -1 if adding the bottom device caused
+ * misalignment.
+ */
+int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+ sector_t offset)
+{
+ t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+ t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+
+ t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+ b->seg_boundary_mask);
+
+ t->max_phys_segments = min_not_zero(t->max_phys_segments,
+ b->max_phys_segments);
+
+ t->max_hw_segments = min_not_zero(t->max_hw_segments,
+ b->max_hw_segments);
+
+ t->max_segment_size = min_not_zero(t->max_segment_size,
+ b->max_segment_size);
+
+ t->logical_block_size = max(t->logical_block_size,
+ b->logical_block_size);
+
+ t->physical_block_size = max(t->physical_block_size,
+ b->physical_block_size);
+
+ t->io_min = max(t->io_min, b->io_min);
+ t->no_cluster |= b->no_cluster;
+
+ /* Bottom device offset aligned? */
+ if (offset &&
+ (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
+ t->misaligned = 1;
+ return -1;
+ }
+
+ /* If top has no alignment offset, inherit from bottom */
+ if (!t->alignment_offset)
+ t->alignment_offset =
+ b->alignment_offset & (b->physical_block_size - 1);
+
+ /* Top device aligned on logical block boundary? */
+ if (t->alignment_offset & (t->logical_block_size - 1)) {
+ t->misaligned = 1;
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * disk_stack_limits - adjust queue limits for stacked drivers
+ * @t: MD/DM gendisk (top)
+ * @bdev: the underlying block device (bottom)
+ * @offset: offset to beginning of data within component device
+ *
+ * Description:
+ * Merges the limits for two queues. Returns 0 if alignment
+ * didn't change. Returns -1 if adding the bottom device caused
+ * misalignment.
+ */
+void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+ sector_t offset)
+{
+ struct request_queue *t = disk->queue;
+ struct request_queue *b = bdev_get_queue(bdev);
+
+ offset += get_start_sect(bdev) << 9;
+
+ if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+ char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
+
+ disk_name(disk, 0, top);
+ bdevname(bdev, bottom);
+
+ printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
+ top, bottom);
+ }
+
+ if (!t->queue_lock)
+ WARN_ON_ONCE(1);
+ else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(t->queue_lock, flags);
+ if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+ queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
+ spin_unlock_irqrestore(t->queue_lock, flags);
+ }
+}
+EXPORT_SYMBOL(disk_stack_limits);
+
+/**
* blk_queue_dma_pad - set pad mask
* @q: the request queue for the device
* @mask: pad mask
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ccdadb..9337e17 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,21 @@
return queue_var_show(queue_logical_block_size(q), page);
}
+static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_physical_block_size(q), page);
+}
+
+static ssize_t queue_io_min_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_io_min(q), page);
+}
+
+static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_io_opt(q), page);
+}
+
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -257,6 +272,21 @@
.show = queue_logical_block_size_show,
};
+static struct queue_sysfs_entry queue_physical_block_size_entry = {
+ .attr = {.name = "physical_block_size", .mode = S_IRUGO },
+ .show = queue_physical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_io_min_entry = {
+ .attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+ .show = queue_io_min_show,
+};
+
+static struct queue_sysfs_entry queue_io_opt_entry = {
+ .attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+ .show = queue_io_opt_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_nonrot_show,
@@ -289,6 +319,9 @@
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
+ &queue_physical_block_size_entry.attr,
+ &queue_io_min_entry.attr,
+ &queue_io_opt_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 1a4916e..fe7ccc0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,11 +852,21 @@
return sprintf(buf, "%x\n", disk->flags);
}
+static ssize_t disk_alignment_offset_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
+}
+
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -875,6 +885,7 @@
&dev_attr_removable.attr,
&dev_attr_ro.attr,
&dev_attr_size.attr,
+ &dev_attr_alignment_offset.attr,
&dev_attr_capability.attr,
&dev_attr_stat.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST