btrfs: fix deadlock while running replace and defrag concurrently
This can be reproduced by fstests: btrfs/070
The scenario is like the following:
replace worker thread                 defrag thread
---------------------                 -------------
copy_nocow_pages_worker               btrfs_defrag_file
  copy_nocow_pages_for_inode            ...
                                        btrfs_writepages
  |A| lock_extent_bits                    extent_write_cache_pages
                                      |B|   lock_page
                                              __extent_writepage
  ...                                           writepage_delalloc
                                                  find_lock_delalloc_range
                                      |B|           lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page
This leads to an ABBA deadlock. To fix it:
o Change the locking to an AABB pattern, i.e. call @unlock_extent_bits()
before @lock_page(). As a result, @extent_read_full_page_nolock() is no
longer called in a locked context, so change it back to
@extent_read_full_page() to regain protection.
o Since we now call @unlock_extent_bits() earlier, by the time we reach
@write_page_nocow() the extent may no longer point at the physical block
we want, so we have to check it again before writing.
Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa0831..4325bb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3310,6 +3310,50 @@
scrub_pending_trans_workers_dec(sctx);
}
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0; /* 0: extent still covers @logical, 1: caller should skip, <0: error */
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1; /* the lookup found an ordered extent in the range */
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore;
+ * return 1 so the caller skips this range instead of copying it.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -3318,13 +3362,10 @@
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
- struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
- u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@@ -3356,31 +3397,14 @@
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
}
- em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock;
- }
-
- /*
- * This extent does not actually cover the logical extent anymore,
- * move on to the next inode.
- */
- if (em->block_start > nocow_ctx->logical ||
- em->block_start + em->block_len < nocow_ctx->logical + len) {
- free_extent_map(em);
- goto out_unlock;
- }
- free_extent_map(em);
-
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
again:
@@ -3396,7 +3420,7 @@
goto next_page;
} else {
ClearPageError(page);
- err = extent_read_full_page_nolock(io_tree, page,
+ err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@@ -3421,6 +3445,14 @@
goto next_page;
}
}
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@@ -3434,12 +3466,10 @@
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
-out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);