Btrfs: transaction ioctls
These ioctls let a user application hold a transaction open while it
performs a series of operations. A final ioctl does a sync on the fs
(closing the current transaction). This is the main requirement for
Ceph's OSD to be able to keep the data it's storing in a btrfs volume
consistent, and AFAICS it works just fine. The application would do
something like
fd = ::open("some/file", O_RDONLY);
::ioctl(fd, BTRFS_IOC_TRANS_START);
/* do a bunch of stuff */
::ioctl(fd, BTRFS_IOC_TRANS_END);
or just
::close(fd);
And to ensure it commits to disk,
::ioctl(fd, BTRFS_IOC_SYNC);
When a transaction is held open, the trans_handle is attached to the
struct file (via private_data) so that it will get cleaned up if the
process dies unexpectedly. A held transaction is also ended on fsync() to
avoid a deadlock.
A misbehaving application could also deliberately hold a transaction open,
effectively locking up the FS, so it may make sense to restrict something
like this to root or something.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad4eacc..1dcf4fb5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1575,6 +1575,7 @@
void btrfs_destroy_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
struct btrfs_root *root);
@@ -1595,6 +1596,8 @@
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_release_file(struct inode *inode, struct file *file);
+
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int cache_only);
@@ -1615,4 +1618,5 @@
u64 btrfs_parse_size(char *str);
int btrfs_parse_options(char *options, struct btrfs_root *root,
char **subvol_name);
+int btrfs_sync_fs(struct super_block *sb, int wait);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c4fa466..73c6d08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -978,9 +978,11 @@
return num_written ? num_written : err;
}
-static int btrfs_release_file (struct inode * inode, struct file * filp)
+int btrfs_release_file(struct inode * inode, struct file * filp)
{
btrfs_del_ordered_inode(inode);
+ if (filp->private_data)
+ btrfs_ioctl_trans_end(filp);
return 0;
}
@@ -1011,6 +1013,9 @@
/*
* ok we haven't committed the transaction yet, lets do a commit
*/
+ if (file->private_data)
+ btrfs_ioctl_trans_end(file);
+
trans = btrfs_start_transaction(root, 1);
if (!trans) {
ret = -ENOMEM;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 31aa4ba..0f14697b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3336,6 +3336,61 @@
return ret;
}
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_start(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ if (file->private_data) {
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ trans = btrfs_start_transaction(root, 0);
+ if (trans)
+ file->private_data = trans;
+ else
+ ret = -ENOMEM;
+ /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+out:
+ mutex_unlock(&root->fs_info->fs_mutex);
+ return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks. They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+
+ mutex_lock(&root->fs_info->fs_mutex);
+ trans = file->private_data;
+ if (!trans) {
+ ret = -EINVAL;
+ goto out;
+ }
+ btrfs_end_transaction(trans, root);
+ file->private_data = 0;
+out:
+ mutex_unlock(&root->fs_info->fs_mutex);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3356,6 +3411,13 @@
return btrfs_balance(root->fs_info->dev_root);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg);
+ case BTRFS_IOC_TRANS_START:
+ return btrfs_ioctl_trans_start(file);
+ case BTRFS_IOC_TRANS_END:
+ return btrfs_ioctl_trans_end(file);
+ case BTRFS_IOC_SYNC:
+ btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return 0;
}
return -ENOTTY;
@@ -3679,6 +3741,7 @@
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .release = btrfs_release_file,
};
static struct extent_io_ops btrfs_extent_io_ops = {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b0e73f5..85ed35a 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -36,6 +36,14 @@
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+
#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
struct btrfs_ioctl_vol_args)
@@ -43,4 +51,5 @@
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
struct btrfs_ioctl_vol_args)
+
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 77f4449..39bb869 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -293,7 +293,7 @@
return err;
}
-static int btrfs_sync_fs(struct super_block *sb, int wait)
+int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root;