md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete. For externally managed metadata this causes a
deadlock as userspace needs to take the lock to communicate that the metadata
update has completed.
Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN. The expected side effects while waiting for
userspace to write 'active' to 'array_state' are holding off reshape (code
currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to
fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and
cause updates to 'raid_disks' to fail. Except for 'stripe_cache_size' changes
these failures can be mitigated by coordinating with mdmon.
md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.
[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index df1230a..43d033d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4172,9 +4172,11 @@
char *ptr, *buf = NULL;
int err = -ENOMEM;
- md_allow_write(mddev);
+ if (md_allow_write(mddev))
+ file = kmalloc(sizeof(*file), GFP_NOIO);
+ else
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
- file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file)
goto out;
@@ -5667,15 +5669,18 @@
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
*/
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
{
if (!mddev->pers)
- return;
+ return 0;
if (mddev->ro)
- return;
+ return 0;
if (!mddev->pers->sync_request)
- return;
+ return 0;
spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@
mddev->safemode = 1;
spin_unlock_irq(&mddev->write_lock);
md_update_sb(mddev, 0);
-
sysfs_notify(&mddev->kobj, NULL, "array_state");
- /* wait for the dirty state to be recorded in the metadata */
- wait_event(mddev->sb_wait,
- !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
- !test_bit(MD_CHANGE_PENDING, &mddev->flags));
} else
spin_unlock_irq(&mddev->write_lock);
+
+ if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+ return -EAGAIN;
+ else
+ return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f05d598..491dc2d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2136,7 +2136,7 @@
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
unsigned long flags;
- int d, d2;
+ int d, d2, err;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@
return -EINVAL;
}
- md_allow_write(mddev);
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
raid_disks = mddev->raid_disks + mddev->delta_disks;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4426220..8f4c70a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -911,14 +911,16 @@
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err = 0;
+ int err;
struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
- md_allow_write(conf->mddev);
+ err = md_allow_write(conf->mddev);
+ if (err)
+ return err;
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@
{
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long new;
+ int err;
+
if (len >= PAGE_SIZE)
return -EINVAL;
if (!conf)
@@ -3858,7 +3862,9 @@
else
break;
}
- md_allow_write(mddev);
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf))
conf->max_nr_stripes++;