[PATCH] NFS: Introduce the use of inode->i_lock to protect fields in nfsi
Down the road we want to eliminate the use of the global kernel lock entirely
from the NFS client. To do this, we need to protect the fields in the
nfs_inode structure adequately. Start by serializing updates to the
"cache_validity" field.
Note this change addresses an SMP hang found by njw@osdl.org, where processes
deadlock because nfs_end_data_update and nfs_revalidate_mapping update the
"cache_validity" field without proper serialization.
Test plan:
Millions of fsx ops on SMP clients. Run Nick Wilson's breaknfs program on
large SMP clients.
Signed-off-by: Chuck Lever <cel@netapp.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 27cf557..147cbf9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -189,7 +189,9 @@
goto error;
}
SetPageUptodate(page);
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+ spin_unlock(&inode->i_lock);
/* Ensure consistent page alignment of the data.
* Note: assumes we have exclusive access to this mapping either
* through inode->i_sem or some other mechanism.
@@ -462,7 +464,9 @@
page,
NFS_SERVER(inode)->dtsize,
desc->plus);
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+ spin_unlock(&inode->i_lock);
desc->page = page;
desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
if (desc->error >= 0) {
@@ -1596,7 +1600,10 @@
put_rpccred(cache->cred);
cache->cred = get_rpccred(set->cred);
}
+ /* FIXME: replace current access_cache BKL reliance with inode->i_lock */
+ spin_lock(&inode->i_lock);
nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+ spin_unlock(&inode->i_lock);
cache->jiffies = set->jiffies;
cache->mask = set->mask;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ee27578..541b418 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -615,6 +615,8 @@
struct nfs_inode *nfsi = NFS_I(inode);
int mode = inode->i_mode;
+ spin_lock(&inode->i_lock);
+
NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
@@ -623,6 +625,8 @@
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
else
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+
+ spin_unlock(&inode->i_lock);
}
static void nfs_zap_acl_cache(struct inode *inode)
@@ -632,7 +636,9 @@
clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
if (clear_acl_cache != NULL)
clear_acl_cache(inode);
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+ spin_unlock(&inode->i_lock);
}
/*
@@ -841,7 +847,9 @@
inode->i_uid = attr->ia_uid;
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ spin_unlock(&inode->i_lock);
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
inode->i_size = attr->ia_size;
@@ -1082,6 +1090,7 @@
(long long)NFS_FILEID(inode), status);
goto out;
}
+ spin_lock(&inode->i_lock);
cache_validity = nfsi->cache_validity;
nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
@@ -1091,6 +1100,7 @@
*/
if (verifier == nfsi->cache_change_attribute)
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+ spin_unlock(&inode->i_lock);
nfs_revalidate_mapping(inode, inode->i_mapping);
@@ -1149,12 +1159,16 @@
nfs_wb_all(inode);
}
invalidate_inode_pages2(mapping);
+
+ spin_lock(&inode->i_lock);
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
if (S_ISDIR(inode->i_mode)) {
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
/* This ensures we revalidate child dentries */
nfsi->cache_change_attribute++;
}
+ spin_unlock(&inode->i_lock);
+
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode));
@@ -1184,10 +1198,12 @@
if (!nfs_have_delegation(inode, FMODE_READ)) {
/* Mark the attribute cache for revalidation */
+ spin_lock(&inode->i_lock);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
/* Directories and symlinks: invalidate page cache too */
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
}
nfsi->cache_change_attribute ++;
atomic_dec(&nfsi->data_updates);
@@ -1212,6 +1228,8 @@
if (nfs_have_delegation(inode, FMODE_READ))
return 0;
+ spin_lock(&inode->i_lock);
+
/* Are we in the process of updating data on the server? */
data_unstable = nfs_caches_unstable(inode);
@@ -1226,13 +1244,17 @@
}
}
- if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
+ spin_unlock(&inode->i_lock);
return 0;
+ }
/* Has the inode gone and changed behind our back? */
if (nfsi->fileid != fattr->fileid
- || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+ spin_unlock(&inode->i_lock);
return -EIO;
+ }
cur_size = i_size_read(inode);
new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1271,6 +1293,7 @@
nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
nfsi->read_cache_jiffies = fattr->timestamp;
+ spin_unlock(&inode->i_lock);
return 0;
}
@@ -1309,11 +1332,15 @@
goto out_err;
}
+ spin_lock(&inode->i_lock);
+
/*
* Make sure the inode's type hasn't changed.
*/
- if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+ spin_unlock(&inode->i_lock);
goto out_changed;
+ }
/*
* Update the read time so we don't revalidate too often.
@@ -1406,6 +1433,7 @@
if (!nfs_have_delegation(inode, FMODE_READ))
nfsi->cache_validity |= invalid;
+ spin_unlock(&inode->i_lock);
return 0;
out_changed:
/*
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index a020e65..6a5bbc0 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -308,7 +308,9 @@
nfs_begin_data_update(inode);
status = rpc_call(server->client_acl, ACLPROC3_SETACL,
&args, &fattr, 0);
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
+ spin_unlock(&inode->i_lock);
nfs_end_data_update(inode);
dprintk("NFS reply setacl: %d\n", status);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 90df050..6ceb1d4 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -140,7 +140,9 @@
if (rdata->res.eof != 0 || result == 0)
break;
} while (count);
+ spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+ spin_unlock(&inode->i_lock);
if (count)
memclear_highpage_flush(page, rdata->args.pgbase, count);
@@ -473,7 +475,9 @@
}
task->tk_status = -EIO;
}
+ spin_lock(&data->inode->i_lock);
NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+ spin_unlock(&data->inode->i_lock);
data->complete(data, status);
}