fs: Use rename lock and RCU for multi-step operations
The remaining usages for dcache_lock is to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree.
Also, to walk in the leaf->root direction in the tree where we don't have
a natural d_lock ordering.
This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.
Solve this instead by using the rename seqlock for multi-step read-side
operations, retry in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against. Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
when dropping locks so also need to check and retry for that.
We can also use the rename lock in cases where livelock is a worry (and it
is introduced in subsequent patch).
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c
index 8ec83d2..bbe42f4 100644
--- a/drivers/staging/pohmelfs/path_entry.c
+++ b/drivers/staging/pohmelfs/path_entry.c
@@ -83,10 +83,11 @@
int pohmelfs_path_length(struct pohmelfs_inode *pi)
{
struct dentry *d, *root, *first;
- int len = 1; /* Root slash */
+ int len;
+ unsigned seq;
- first = d = d_find_alias(&pi->vfs_inode);
- if (!d) {
+ first = d_find_alias(&pi->vfs_inode);
+ if (!first) {
dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode);
return -ENOENT;
}
@@ -95,6 +96,11 @@
root = dget(current->fs->root.dentry);
spin_unlock(¤t->fs->lock);
+rename_retry:
+ len = 1; /* Root slash */
+ d = first;
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
spin_lock(&dcache_lock);
if (!IS_ROOT(d) && d_unhashed(d))
@@ -105,6 +111,9 @@
d = d->d_parent;
}
spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
dput(root);
dput(first);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375..4be8f77 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -186,16 +186,25 @@
{
struct dentry *root = sbi->sb->s_root;
struct dentry *tmp;
- char *buf = *name;
+ char *buf;
char *p;
- int len = 0;
+ int len;
+ unsigned seq;
+rename_retry:
+ buf = *name;
+ len = 0;
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
spin_lock(&dcache_lock);
for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
len += tmp->d_name.len + 1;
if (!len || --len > NAME_MAX) {
spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
return 0;
}
@@ -209,6 +218,9 @@
strncpy(p, tmp->d_name.name, tmp->d_name.len);
}
spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
return len;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index a09f077..a9bc4ec 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -80,6 +80,7 @@
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+EXPORT_SYMBOL(rename_lock);
EXPORT_SYMBOL(dcache_inode_lock);
EXPORT_SYMBOL(dcache_lock);
@@ -243,6 +244,7 @@
__releases(dcache_inode_lock)
__releases(dcache_lock)
{
+ dentry->d_parent = NULL;
list_del(&dentry->d_u.d_child);
if (parent)
spin_unlock(&parent->d_lock);
@@ -1017,11 +1019,15 @@
* Return true if the parent or its subdirectories contain
* a mount point
*/
-
int have_submounts(struct dentry *parent)
{
- struct dentry *this_parent = parent;
+ struct dentry *this_parent;
struct list_head *next;
+ unsigned seq;
+
+rename_retry:
+ this_parent = parent;
+ seq = read_seqbegin(&rename_lock);
spin_lock(&dcache_lock);
if (d_mountpoint(parent))
@@ -1055,17 +1061,37 @@
* All done at this level ... ascend and resume the search.
*/
if (this_parent != parent) {
- next = this_parent->d_u.d_child.next;
+ struct dentry *tmp;
+ struct dentry *child;
+
+ tmp = this_parent->d_parent;
+ rcu_read_lock();
spin_unlock(&this_parent->d_lock);
- this_parent = this_parent->d_parent;
+ child = this_parent;
+ this_parent = tmp;
spin_lock(&this_parent->d_lock);
+ /* might go back up the wrong parent if we have had a rename
+ * or deletion */
+ if (this_parent != child->d_parent ||
+ read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ goto rename_retry;
+ }
+ rcu_read_unlock();
+ next = child->d_u.d_child.next;
goto resume;
}
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
return 0; /* No mount points found in tree */
positive:
spin_unlock(&dcache_lock);
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
return 1;
}
EXPORT_SYMBOL(have_submounts);
@@ -1086,10 +1112,15 @@
*/
static int select_parent(struct dentry * parent)
{
- struct dentry *this_parent = parent;
+ struct dentry *this_parent;
struct list_head *next;
+ unsigned seq;
int found = 0;
+rename_retry:
+ this_parent = parent;
+ seq = read_seqbegin(&rename_lock);
+
spin_lock(&dcache_lock);
spin_lock(&this_parent->d_lock);
repeat:
@@ -1099,7 +1130,6 @@
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
- BUG_ON(this_parent == dentry);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@@ -1142,17 +1172,32 @@
*/
if (this_parent != parent) {
struct dentry *tmp;
- next = this_parent->d_u.d_child.next;
+ struct dentry *child;
+
tmp = this_parent->d_parent;
+ rcu_read_lock();
spin_unlock(&this_parent->d_lock);
- BUG_ON(tmp == this_parent);
+ child = this_parent;
this_parent = tmp;
spin_lock(&this_parent->d_lock);
+ /* might go back up the wrong parent if we have had a rename
+ * or deletion */
+ if (this_parent != child->d_parent ||
+ read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ goto rename_retry;
+ }
+ rcu_read_unlock();
+ next = child->d_u.d_child.next;
goto resume;
}
out:
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
return found;
}
@@ -1654,7 +1699,7 @@
struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
{
struct dentry * dentry = NULL;
- unsigned long seq;
+ unsigned seq;
do {
seq = read_seqbegin(&rename_lock);
@@ -2290,7 +2335,7 @@
* @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length
*
- * Caller holds the dcache_lock.
+ * Caller holds the rename_lock.
*
* If path is not reachable from the supplied root, then the value of
* root is changed (without modifying refcounts).
@@ -2377,7 +2422,9 @@
prepend(&res, &buflen, "\0", 1);
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
error = prepend_path(path, root, &res, &buflen);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
if (error)
@@ -2441,10 +2488,12 @@
get_fs_root(current->fs, &root);
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
tmp = root;
error = path_with_deleted(path, &tmp, &res, &buflen);
if (error)
res = ERR_PTR(error);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
path_put(&root);
return res;
@@ -2472,10 +2521,12 @@
get_fs_root(current->fs, &root);
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
tmp = root;
error = path_with_deleted(path, &tmp, &res, &buflen);
if (!error && !path_equal(&tmp, &root))
error = prepend_unreachable(&res, &buflen);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
path_put(&root);
if (error)
@@ -2544,7 +2595,9 @@
char *retval;
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
retval = __dentry_path(dentry, buf, buflen);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
return retval;
@@ -2557,6 +2610,7 @@
char *retval;
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
if (d_unlinked(dentry)) {
p = buf + buflen;
if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2564,6 +2618,7 @@
buflen++;
}
retval = __dentry_path(dentry, buf, buflen);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
if (!IS_ERR(retval) && p)
*p = '/'; /* restore '/' overriden with '\0' */
@@ -2604,6 +2659,7 @@
error = -ENOENT;
spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
if (!d_unlinked(pwd.dentry)) {
unsigned long len;
struct path tmp = root;
@@ -2612,6 +2668,7 @@
prepend(&cwd, &buflen, "\0", 1);
error = prepend_path(&pwd, &tmp, &cwd, &buflen);
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
if (error)
@@ -2631,8 +2688,10 @@
if (copy_to_user(buf, cwd, len))
error = -EFAULT;
}
- } else
+ } else {
+ write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock);
+ }
out:
path_put(&pwd);
@@ -2660,25 +2719,25 @@
int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
int result;
- unsigned long seq;
+ unsigned seq;
if (new_dentry == old_dentry)
return 1;
- /*
- * Need rcu_readlock to protect against the d_parent trashing
- * due to d_move
- */
- rcu_read_lock();
do {
/* for restarting inner loop in case of seq retry */
seq = read_seqbegin(&rename_lock);
+ /*
+ * Need rcu_readlock to protect against the d_parent trashing
+ * due to d_move
+ */
+ rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry))
result = 1;
else
result = 0;
+ rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq));
- rcu_read_unlock();
return result;
}
@@ -2710,9 +2769,13 @@
void d_genocide(struct dentry *root)
{
- struct dentry *this_parent = root;
+ struct dentry *this_parent;
struct list_head *next;
+ unsigned seq;
+rename_retry:
+ this_parent = root;
+ seq = read_seqbegin(&rename_lock);
spin_lock(&dcache_lock);
spin_lock(&this_parent->d_lock);
repeat:
@@ -2722,6 +2785,7 @@
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
+
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (d_unhashed(dentry) || !dentry->d_inode) {
spin_unlock(&dentry->d_lock);
@@ -2734,19 +2798,43 @@
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
- dentry->d_count--;
+ if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+ dentry->d_flags |= DCACHE_GENOCIDE;
+ dentry->d_count--;
+ }
spin_unlock(&dentry->d_lock);
}
if (this_parent != root) {
- next = this_parent->d_u.d_child.next;
- this_parent->d_count--;
+ struct dentry *tmp;
+ struct dentry *child;
+
+ tmp = this_parent->d_parent;
+ if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
+ this_parent->d_flags |= DCACHE_GENOCIDE;
+ this_parent->d_count--;
+ }
+ rcu_read_lock();
spin_unlock(&this_parent->d_lock);
- this_parent = this_parent->d_parent;
+ child = this_parent;
+ this_parent = tmp;
spin_lock(&this_parent->d_lock);
+ /* might go back up the wrong parent if we have had a rename
+ * or deletion */
+ if (this_parent != child->d_parent ||
+ read_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
+ spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ goto rename_retry;
+ }
+ rcu_read_unlock();
+ next = child->d_u.d_child.next;
goto resume;
}
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
}
/**
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index db6aa36..78c0ebb 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -49,11 +49,17 @@
const struct dentry *dentry,
char *buffer, ssize_t buflen)
{
- char *end = buffer+buflen;
+ char *end;
int namelen;
+ unsigned seq;
+rename_retry:
+ end = buffer+buflen;
*--end = '\0';
buflen--;
+
+ seq = read_seqbegin(&rename_lock);
+ rcu_read_lock();
spin_lock(&dcache_lock);
while (!IS_ROOT(dentry) && dentry != droot) {
namelen = dentry->d_name.len;
@@ -66,6 +72,9 @@
dentry = dentry->d_parent;
}
spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
if (*end != '/') {
if (--buflen < 0)
goto Elong;
@@ -83,6 +92,9 @@
return end;
Elong_unlock:
spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ if (read_seqretry(&rename_lock, seq))
+ goto rename_retry;
Elong:
return ERR_PTR(-ENAMETOOLONG);
}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index bda5ec0b0..c963eba 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -180,6 +180,7 @@
#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
#define DCACHE_CANT_MOUNT 0x0100
+#define DCACHE_GENOCIDE 0x0200
extern spinlock_t dcache_inode_lock;
extern spinlock_t dcache_lock;