Btrfs: New data=ordered implementation
The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk. This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.
The new code changes the way data allocations and extents work:
* When delayed allocation is filled, data extents are reserved, and
the extent bit EXTENT_ORDERED is set on the entire range of the extent.
A struct btrfs_ordered_extent is allocated an inserted into a per-inode
rbtree to track the pending extents.
* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
to that page.
* When all of the bytes corresponding to a single struct btrfs_ordered_extent
are written, The previously reserved extent is inserted into the FS
btree and into the extent allocation trees. The checksums for the file
data are also updated.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b01b3f4..4a5ebaf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,11 @@
end_io_wq->error = err;
end_io_wq->work.func = end_workqueue_fn;
end_io_wq->work.flags = 0;
- btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+ if (bio->bi_rw & (1 << BIO_RW))
+ btrfs_queue_worker(&fs_info->endio_write_workers,
+ &end_io_wq->work);
+ else
+ btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
return 0;
@@ -1286,6 +1290,7 @@
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
+ init_waitqueue_head(&fs_info->transaction_throttle);
#if 0
ret = add_hasher(fs_info, "crc32c");
@@ -1325,9 +1330,13 @@
btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+ btrfs_init_workers(&fs_info->endio_write_workers,
+ fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_write_workers,
+ fs_info->thread_pool_size);
err = -EINVAL;
if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
@@ -1447,6 +1456,7 @@
extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
iput(fs_info->btree_inode);
@@ -1702,6 +1712,7 @@
btrfs_stop_workers(&fs_info->workers);
btrfs_stop_workers(&fs_info->endio_workers);
+ btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
iput(fs_info->btree_inode);