Btrfs: avoid tree log commit when there are no changes
rpm has a habit of running fdatasync when the file hasn't
changed. We already detect if a file hasn't been changed
in the current transaction but it might have been sent to
the tree-log in this transaction and not changed since
the last call to fsync.
In this case, we want to avoid a tree log sync, which includes
a number of synchronous writes and barriers. This commit
extends the existing tracking of the last transaction to change
a file to also track the last sub-transaction.
The end result is that rpm -ivh and -Uvh are roughly twice as fast,
and on par with ext3.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c71abec..f6783a4 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
+
+ /*
+ * log transid when this inode was last modified
+ */
+ u64 last_sub_trans;
+
/*
* transid that last logged this inode
*/
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36a19cd..d0cede5f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1009,6 +1009,7 @@
atomic_t log_writers;
atomic_t log_commit[2];
unsigned long log_transid;
+ unsigned long last_log_commit;
unsigned long log_batch;
pid_t log_start_pid;
bool log_multiple_pids;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ac8927b..d4132aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -919,6 +919,7 @@
atomic_set(&root->log_writers, 0);
root->log_batch = 0;
root->log_transid = 0;
+ root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1089,6 +1090,7 @@
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->last_log_commit = 0;
return 0;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53fb1c9..4599113 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1087,8 +1087,10 @@
btrfs_end_transaction(trans, root);
else
btrfs_commit_transaction(trans, root);
- } else {
+ } else if (ret != BTRFS_NO_LOG_SYNC) {
btrfs_commit_transaction(trans, root);
+ } else {
+ btrfs_end_transaction(trans, root);
}
}
if (file->f_flags & O_DIRECT) {
@@ -1138,6 +1140,13 @@
int ret = 0;
struct btrfs_trans_handle *trans;
+
+ /* we wait first, since the writeback may change the inode */
+ root->log_batch++;
+ /* the VFS called filemap_fdatawrite for us */
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ root->log_batch++;
+
/*
* check the transaction that last modified this inode
* and see if its already been committed
@@ -1145,6 +1154,11 @@
if (!BTRFS_I(inode)->last_trans)
goto out;
+ /*
+ * if the last transaction that changed this file was before
+ * the current transaction, we can bail out now without any
+ * syncing
+ */
mutex_lock(&root->fs_info->trans_mutex);
if (BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
@@ -1154,13 +1168,6 @@
}
mutex_unlock(&root->fs_info->trans_mutex);
- root->log_batch++;
- filemap_fdatawrite(inode->i_mapping);
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- root->log_batch++;
-
- if (datasync && !(inode->i_state & I_DIRTY_PAGES))
- goto out;
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
@@ -1189,14 +1196,18 @@
*/
mutex_unlock(&dentry->d_inode->i_mutex);
- if (ret > 0) {
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- ret = btrfs_end_transaction(trans, root);
- else
+ if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
+ } else {
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
+ }
+ } else {
+ ret = btrfs_end_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ef399a7..5b9567c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3480,6 +3480,7 @@
bi->generation = 0;
bi->sequence = 0;
bi->last_trans = 0;
+ bi->last_sub_trans = 0;
bi->logged_trans = 0;
bi->delalloc_bytes = 0;
bi->reserved_bytes = 0;
@@ -4980,7 +4981,9 @@
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
@@ -5100,6 +5103,7 @@
if (!ei)
return NULL;
ei->last_trans = 0;
+ ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->outstanding_extents = 0;
ei->reserved_extents = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c674..f68cbbe 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@
struct inode *inode)
{
BTRFS_I(inode)->last_trans = trans->transaction->transid;
+ BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d9ec28..0a1bde2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1980,6 +1980,7 @@
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+ u64 log_transid = 0;
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
@@ -2018,6 +2019,7 @@
btrfs_set_root_node(&log->root_item, log->node);
root->log_batch = 0;
+ log_transid = root->log_transid;
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
@@ -2095,6 +2097,11 @@
write_ctree_super(trans, root->fs_info->tree_root, 1);
ret = 0;
+ mutex_lock(&root->log_mutex);
+ if (root->last_log_commit < log_transid)
+ root->last_log_commit = log_transid;
+ mutex_unlock(&root->log_mutex);
+
out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
@@ -2862,6 +2869,21 @@
return ret;
}
+static int inode_in_log(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret = 0;
+
+ mutex_lock(&root->log_mutex);
+ if (BTRFS_I(inode)->logged_trans == trans->transid &&
+ BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+ ret = 1;
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -2901,6 +2923,11 @@
if (ret)
goto end_no_trans;
+ if (inode_in_log(trans, inode)) {
+ ret = BTRFS_NO_LOG_SYNC;
+ goto end_no_trans;
+ }
+
start_log_trans(trans, root);
ret = btrfs_log_inode(trans, root, inode, inode_only);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c760..0776eac 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);