block: Block layer data integrity support
Some block devices support verifying the integrity of requests by way
of checksums or other protection information that is submitted along
with the I/O.
This patch implements support for generating and verifying integrity
metadata, as well as correctly merging, splitting and cloning bios and
requests that have this extra information attached.
See Documentation/block/data-integrity.txt for more information.
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/Kconfig b/block/Kconfig
index 3e97f2b..1ab7c15 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -81,6 +81,18 @@
If unsure, say N.
+config BLK_DEV_INTEGRITY
+ bool "Block layer data integrity support"
+ ---help---
+ Some storage devices allow extra information to be
+ stored/retrieved to help protect the data. The block layer
+ data integrity option provides hooks which can be used by
+ filesystems to ensure better data integrity.
+
+ Say yes here if you have a storage device that provides the
+ T10/SCSI Data Integrity Field or the T13/ATA External Path
+ Protection. If in doubt, say N.
+
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 5a43c7d..045f7b6 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -14,3 +14,4 @@
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 1905aab..e0fb0bc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -143,6 +143,10 @@
bio->bi_size -= nbytes;
bio->bi_sector += (nbytes >> 9);
+
+ if (bio_integrity(bio))
+ bio_integrity_advance(bio, nbytes);
+
if (bio->bi_size == 0)
bio_endio(bio, error);
} else {
@@ -1381,6 +1385,9 @@
*/
blk_partition_remap(bio);
+ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+ goto end_io;
+
if (old_sector != -1)
blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
old_sector);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
new file mode 100644
index 0000000..65f23ef
--- /dev/null
+++ b/block/blk-integrity.c
@@ -0,0 +1,382 @@
+/*
+ * blk-integrity.c - Block layer data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static struct kmem_cache *integrity_cachep;
+
+/**
+ * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
+ * @rq: request with integrity metadata attached
+ *
+ * Description: Returns the number of elements required in a
+ * scatterlist corresponding to the integrity metadata in a request.
+ */
+int blk_rq_count_integrity_sg(struct request *rq)
+{
+ struct bio_vec *iv, *ivprv;
+ struct req_iterator iter;
+ unsigned int segments;
+
+ ivprv = NULL;
+ segments = 0;
+
+ rq_for_each_integrity_segment(iv, rq, iter) {
+
+ if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+ segments++;
+
+ ivprv = iv;
+ }
+
+ return segments;
+}
+EXPORT_SYMBOL(blk_rq_count_integrity_sg);
+
+/**
+ * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
+ * @rq: request with integrity metadata attached
+ * @sglist: target scatterlist
+ *
+ * Description: Map the integrity vectors in request into a
+ * scatterlist. The scatterlist must be big enough to hold all
+ * elements. I.e. sized using blk_rq_count_integrity_sg().
+ */
+int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+{
+ struct bio_vec *iv, *ivprv;
+ struct req_iterator iter;
+ struct scatterlist *sg;
+ unsigned int segments;
+
+ ivprv = NULL;
+ sg = NULL;
+ segments = 0;
+
+ rq_for_each_integrity_segment(iv, rq, iter) {
+
+ if (ivprv) {
+ if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
+ goto new_segment;
+
+ sg->length += iv->bv_len;
+ } else {
+new_segment:
+ if (!sg)
+ sg = sglist;
+ else {
+ sg->page_link &= ~0x02;
+ sg = sg_next(sg);
+ }
+
+ sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset);
+ segments++;
+ }
+
+ ivprv = iv;
+ }
+
+ if (sg)
+ sg_mark_end(sg);
+
+ return segments;
+}
+EXPORT_SYMBOL(blk_rq_map_integrity_sg);
+
+/**
+ * blk_integrity_compare - Compare integrity profile of two block devices
+ * @b1: Device to compare
+ * @b2: Device to compare
+ *
+ * Description: Meta-devices like DM and MD need to verify that all
+ * sub-devices use the same integrity format before advertising to
+ * upper layers that they can send/receive integrity metadata. This
+ * function can be used to check whether two block devices have
+ * compatible integrity formats.
+ */
+int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2)
+{
+ struct blk_integrity *b1 = bd1->bd_disk->integrity;
+ struct blk_integrity *b2 = bd2->bd_disk->integrity;
+
+ BUG_ON(bd1->bd_disk == NULL);
+ BUG_ON(bd2->bd_disk == NULL);
+
+ if (!b1 || !b2)
+ return 0;
+
+ if (b1->sector_size != b2->sector_size) {
+ printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+ b1->sector_size, b2->sector_size);
+ return -1;
+ }
+
+ if (b1->tuple_size != b2->tuple_size) {
+ printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+ b1->tuple_size, b2->tuple_size);
+ return -1;
+ }
+
+ if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
+ printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+ b1->tag_size, b2->tag_size);
+ return -1;
+ }
+
+ if (strcmp(b1->name, b2->name)) {
+ printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
+ bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+ b1->name, b2->name);
+ return -1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(blk_integrity_compare);
+
+struct integrity_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_integrity *, char *);
+ ssize_t (*store)(struct blk_integrity *, const char *, size_t);
+};
+
+static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *page)
+{
+ struct blk_integrity *bi =
+ container_of(kobj, struct blk_integrity, kobj);
+ struct integrity_sysfs_entry *entry =
+ container_of(attr, struct integrity_sysfs_entry, attr);
+
+ return entry->show(bi, page);
+}
+
+static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t count)
+{
+ struct blk_integrity *bi =
+ container_of(kobj, struct blk_integrity, kobj);
+ struct integrity_sysfs_entry *entry =
+ container_of(attr, struct integrity_sysfs_entry, attr);
+ ssize_t ret = 0;
+
+ if (entry->store)
+ ret = entry->store(bi, page, count);
+
+ return ret;
+}
+
+static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
+{
+ if (bi != NULL && bi->name != NULL)
+ return sprintf(page, "%s\n", bi->name);
+ else
+ return sprintf(page, "none\n");
+}
+
+static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
+{
+ if (bi != NULL)
+ return sprintf(page, "%u\n", bi->tag_size);
+ else
+ return sprintf(page, "0\n");
+}
+
+static ssize_t integrity_read_store(struct blk_integrity *bi,
+ const char *page, size_t count)
+{
+ char *p = (char *) page;
+ unsigned long val = simple_strtoul(p, &p, 10);
+
+ if (val)
+ set_bit(INTEGRITY_FLAG_READ, &bi->flags);
+ else
+ clear_bit(INTEGRITY_FLAG_READ, &bi->flags);
+
+ return count;
+}
+
+static ssize_t integrity_read_show(struct blk_integrity *bi, char *page)
+{
+ return sprintf(page, "%d\n",
+ test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0);
+}
+
+static ssize_t integrity_write_store(struct blk_integrity *bi,
+ const char *page, size_t count)
+{
+ char *p = (char *) page;
+ unsigned long val = simple_strtoul(p, &p, 10);
+
+ if (val)
+ set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+ else
+ clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+
+ return count;
+}
+
+static ssize_t integrity_write_show(struct blk_integrity *bi, char *page)
+{
+ return sprintf(page, "%d\n",
+ test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0);
+}
+
+static struct integrity_sysfs_entry integrity_format_entry = {
+ .attr = { .name = "format", .mode = S_IRUGO },
+ .show = integrity_format_show,
+};
+
+static struct integrity_sysfs_entry integrity_tag_size_entry = {
+ .attr = { .name = "tag_size", .mode = S_IRUGO },
+ .show = integrity_tag_size_show,
+};
+
+static struct integrity_sysfs_entry integrity_read_entry = {
+ .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
+ .show = integrity_read_show,
+ .store = integrity_read_store,
+};
+
+static struct integrity_sysfs_entry integrity_write_entry = {
+ .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
+ .show = integrity_write_show,
+ .store = integrity_write_store,
+};
+
+static struct attribute *integrity_attrs[] = {
+ &integrity_format_entry.attr,
+ &integrity_tag_size_entry.attr,
+ &integrity_read_entry.attr,
+ &integrity_write_entry.attr,
+ NULL,
+};
+
+static struct sysfs_ops integrity_ops = {
+ .show = &integrity_attr_show,
+ .store = &integrity_attr_store,
+};
+
+static int __init blk_dev_integrity_init(void)
+{
+ integrity_cachep = kmem_cache_create("blkdev_integrity",
+ sizeof(struct blk_integrity),
+ 0, SLAB_PANIC, NULL);
+ return 0;
+}
+subsys_initcall(blk_dev_integrity_init);
+
+static void blk_integrity_release(struct kobject *kobj)
+{
+ struct blk_integrity *bi =
+ container_of(kobj, struct blk_integrity, kobj);
+
+ kmem_cache_free(integrity_cachep, bi);
+}
+
+static struct kobj_type integrity_ktype = {
+ .default_attrs = integrity_attrs,
+ .sysfs_ops = &integrity_ops,
+ .release = blk_integrity_release,
+};
+
+/**
+ * blk_integrity_register - Register a gendisk as being integrity-capable
+ * @disk: struct gendisk pointer to make integrity-aware
+ * @template: integrity profile
+ *
+ * Description: When a device needs to advertise itself as being able
+ * to send/receive integrity metadata it must use this function to
+ * register the capability with the block layer. The template is a
+ * blk_integrity struct with values appropriate for the underlying
+ * hardware. See Documentation/block/data-integrity.txt.
+ */
+int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
+{
+ struct blk_integrity *bi;
+
+ BUG_ON(disk == NULL);
+ BUG_ON(template == NULL);
+
+ if (disk->integrity == NULL) {
+ bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO);
+ if (!bi)
+ return -1;
+
+ if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
+ &disk->dev.kobj, "%s", "integrity")) {
+ kmem_cache_free(integrity_cachep, bi);
+ return -1;
+ }
+
+ kobject_uevent(&bi->kobj, KOBJ_ADD);
+
+ set_bit(INTEGRITY_FLAG_READ, &bi->flags);
+ set_bit(INTEGRITY_FLAG_WRITE, &bi->flags);
+ bi->sector_size = disk->queue->hardsect_size;
+ disk->integrity = bi;
+ } else
+ bi = disk->integrity;
+
+ /* Use the provided profile as template */
+ bi->name = template->name;
+ bi->generate_fn = template->generate_fn;
+ bi->verify_fn = template->verify_fn;
+ bi->tuple_size = template->tuple_size;
+ bi->set_tag_fn = template->set_tag_fn;
+ bi->get_tag_fn = template->get_tag_fn;
+ bi->tag_size = template->tag_size;
+
+ return 0;
+}
+EXPORT_SYMBOL(blk_integrity_register);
+
+/**
+ * blk_integrity_unregister - Remove block integrity profile
+ * @disk: disk whose integrity profile to deallocate
+ *
+ * Description: This function frees all memory used by the block
+ * integrity profile. To be called at device teardown.
+ */
+void blk_integrity_unregister(struct gendisk *disk)
+{
+ struct blk_integrity *bi;
+
+ if (!disk || !disk->integrity)
+ return;
+
+ bi = disk->integrity;
+
+ kobject_uevent(&bi->kobj, KOBJ_REMOVE);
+ kobject_del(&bi->kobj);
+ kobject_put(&disk->dev.kobj);
+ kmem_cache_free(integrity_cachep, bi);
+}
+EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 651136a..5efc9e7 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -441,6 +441,9 @@
|| next->special)
return 0;
+ if (blk_integrity_rq(req) != blk_integrity_rq(next))
+ return 0;
+
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
diff --git a/block/blk.h b/block/blk.h
index 59776ab..c79f30e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,4 +51,12 @@
return q->nr_congestion_off;
}
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+
+#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
+ __rq_for_each_bio(_iter.bio, _rq) \
+ bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
+
+#endif /* BLK_DEV_INTEGRITY */
+
#endif
diff --git a/block/elevator.c b/block/elevator.c
index 902dd13..1f5bfe6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -86,6 +86,12 @@
if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
return 0;
+ /*
+ * only merge integrity protected bio into ditto rq
+ */
+ if (bio_integrity(bio) != blk_integrity_rq(rq))
+ return 0;
+
if (!elv_iosched_allow_merge(rq, bio))
return 0;