ocfs2: zero tail of sparse files on truncate
Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and the end of its cluster. Otherwise a
subsequent extend could expose bad data.
This introduces a new helper, which can be used in ocfs2_write().
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9a40603..98694a1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -34,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -3342,6 +3344,228 @@
return status;
}
+/*
+ * walk_page_buffers() callback used when ocfs2_should_order_data() is
+ * false: flags the buffer as uptodate and dirty. The handle is not
+ * used. Always returns 0.
+ */
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	int status = 0;
+
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+
+	return status;
+}
+
+/*
+ * walk_page_buffers() callback used when ocfs2_should_order_data() is
+ * true: flags the buffer as uptodate and dirty, then passes it to
+ * ocfs2_journal_dirty_data() and returns that call's result.
+ */
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	int status;
+
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+
+	status = ocfs2_journal_dirty_data(handle, bh);
+	return status;
+}
+
+/*
+ * Zero the in-memory contents of the given pages from isize onwards,
+ * then flag the zeroed buffers uptodate and dirty.
+ *
+ * inode:    inode whose tail is being zeroed
+ * isize:    byte offset at which zeroing starts; only its offset
+ *           within the first page is used
+ * pages:    pages to zero; every entry is unlocked and released
+ *           before returning, whether or not zeroing took place
+ * numpages: number of entries in pages
+ * phys:     block number handed to ocfs2_map_page_blocks(), which
+ *           receives it by pointer
+ * handle:   passed through to the walk_page_buffers() callbacks
+ *
+ * Failures of the helpers are logged with mlog_errno() but are not
+ * reported to the caller.
+ */
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+				     struct page **pages, int numpages,
+				     u64 phys, handle_t *handle)
+{
+	int i, ret, partial;
+	void *kaddr;
+	struct page *page;
+	unsigned int from, to = PAGE_CACHE_SIZE;
+	struct super_block *sb = inode->i_sb;
+	int (*zero_func)(handle_t *handle, struct buffer_head *bh);
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	if (numpages == 0)
+		goto out;
+
+	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+		/*
+		 * Since 'from' has been capped to a value below page
+		 * size, this calculation won't be able to overflow
+		 * 'to'
+		 */
+		to = ocfs2_align_bytes_to_clusters(sb, from);
+
+		/*
+		 * The truncate tail in this case should never contain
+		 * more than one page at maximum. The loop below also
+		 * assumes this.
+		 */
+		BUG_ON(numpages != 1);
+	}
+
+	for (i = 0; i < numpages; i++) {
+		page = pages[i];
+
+		/*
+		 * 'partial' is an out-parameter of walk_page_buffers().
+		 * Start every page from 0 so that a value left behind
+		 * by an earlier page cannot suppress SetPageUptodate()
+		 * on a later one.
+		 */
+		partial = 0;
+
+		BUG_ON(from > PAGE_CACHE_SIZE);
+		BUG_ON(to > PAGE_CACHE_SIZE);
+
+		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+		if (ret)
+			mlog_errno(ret);
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from, 0, to - from);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		/*
+		 * Need to set the buffers we zero'd into uptodate
+		 * here if they aren't - ocfs2_map_page_blocks()
+		 * might've skipped some
+		 */
+		if (ocfs2_should_order_data(inode))
+			zero_func = ocfs2_ordered_zero_func;
+		else
+			zero_func = ocfs2_writeback_zero_func;
+
+		ret = walk_page_buffers(handle, page_buffers(page),
+					from, to, &partial, zero_func);
+		if (ret < 0)
+			mlog_errno(ret);
+
+		if (!partial)
+			SetPageUptodate(page);
+
+		flush_dcache_page(page);
+
+		/*
+		 * Every page after the 1st one should be completely zero'd.
+		 */
+		from = 0;
+	}
+out:
+	if (pages) {
+		for (i = 0; i < numpages; i++) {
+			page = pages[i];
+			unlock_page(page);
+			mark_page_accessed(page);
+			page_cache_release(page);
+		}
+	}
+}
+
+/*
+ * Grab the page cache pages from the one containing isize up to the
+ * end of the cluster containing isize.
+ *
+ * inode: inode whose mapping the pages are taken from
+ * isize: byte offset whose cluster tail is wanted
+ * pages: array filled with the grabbed pages
+ * num:   set to the number of pages stored in pages; 0 when isize is
+ *        cluster aligned, when the block at isize is a hole, or on
+ *        error
+ * phys:  filled by ocfs2_extent_map_get_blocks() for the block
+ *        containing isize; left untouched when isize is cluster
+ *        aligned
+ *
+ * Returns 0 on success. On failure every page grabbed so far is
+ * unlocked and released and the error code is returned.
+ */
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+				int *num, u64 *phys)
+{
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+	int i, grabbed = 0, ret = 0;
+	unsigned long index;
+	u64 end_index;
+	struct page *page;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	/* Cluster boundary, so we don't need to grab any pages. */
+	if (!(isize & (csize - 1)))
+		goto out;
+
+	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+					  phys, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Tail is a hole. */
+	if (*phys == 0)
+		goto out;
+
+	end_index = ocfs2_align_bytes_to_clusters(inode->i_sb, isize) >>
+		    PAGE_CACHE_SHIFT;
+	index = isize >> PAGE_CACHE_SHIFT;
+
+	/* At least one page is always grabbed before the bound is tested. */
+	for (;;) {
+		page = grab_cache_page(mapping, index);
+		pages[grabbed] = page;
+		if (!page) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		grabbed++;
+		index++;
+
+		if (index >= end_index)
+			break;
+	}
+
+out:
+	if (ret != 0) {
+		if (pages) {
+			for (i = 0; i < grabbed; i++) {
+				if (!pages[i])
+					continue;
+
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
+			}
+		}
+		grabbed = 0;
+	}
+
+	*num = grabbed;
+
+	return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ *
+ * inode:      inode being truncated
+ * handle:     handle passed on to ocfs2_zero_cluster_pages()
+ * new_i_size: size the inode is about to be truncated to
+ *
+ * Returns 0 when there was nothing to zero or writeout was started
+ * successfully; otherwise -ENOMEM or the error returned by
+ * ocfs2_grab_eof_pages() or filemap_fdatawrite().
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size)
+{
+	int ret, numpages;
+	struct page **pages;
+	u64 phys;
+
+	/*
+	 * File systems which don't support sparse files zero on every
+	 * extend.
+	 */
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+			sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * No pages were grabbed: new_i_size sits on a cluster boundary
+	 * or the tail is a hole - nothing more to do.
+	 */
+	if (numpages == 0)
+		goto out;
+
+	/* This call unlocks and releases every page in the array. */
+	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+				 handle);
+
+	/*
+	 * Initiate writeout of the pages we zero'd here. We don't
+	 * wait on them - the truncate_inode_pages() call later will
+	 * do that for us.
+	 */
+	ret = filemap_fdatawrite(inode->i_mapping);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	/* kfree() accepts NULL, so the allocation-failure path needs no guard. */
+	kfree(pages);
+
+	return ret;
+}
+
/*
* It is expected, that by the time you call this function,
* inode->i_size and fe->i_size have been adjusted.
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index bff2a16..3cb39cd 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -71,6 +71,8 @@
struct buffer_head *tc_last_eb_bh;
};
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size);
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index acf8f00..605c82a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -308,13 +308,13 @@
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
* their fixes when they happen) --Mark */
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -654,9 +654,9 @@
*
* This will also skip zeroing, which is handled externally.
*/
-static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
- struct inode *inode, unsigned int from,
- unsigned int to, int new)
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new)
{
int ret = 0;
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
@@ -675,8 +675,7 @@
* Ignore blocks outside of our i/o range -
* they may belong to unallocated clusters.
*/
- if (block_start >= to ||
- (block_start + bsize) <= from) {
+ if (block_start >= to || block_end <= from) {
if (PageUptodate(page))
set_buffer_uptodate(bh);
continue;
@@ -971,7 +970,6 @@
u64 v_blkno, p_blkno;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
unsigned long index, start;
struct page **cpages;
@@ -979,13 +977,11 @@
/*
* Figure out how many pages we'll be manipulating here. For
- * non-allocating write, or any writes where cluster size is
- * less than page size, we only need one page. Otherwise,
- * allocating writes of cluster size larger than page size
- * need cluster size pages.
+ * non allocating write, we just change the one
+ * page. Otherwise, we'll need a whole clusters worth.
*/
- if (new && !wc->w_large_pages)
- numpages = (1 << cbits) / PAGE_SIZE;
+ if (new)
+ numpages = ocfs2_pages_per_cluster(inode->i_sb);
cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
if (!cpages) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index eeb2c42..7d94071 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,6 +30,18 @@
unsigned from,
unsigned to);
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+ struct inode *inode, unsigned int from,
+ unsigned int to, int new);
+
+int walk_page_buffers( handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)( handle_t *handle,
+ struct buffer_head *bh));
+
struct ocfs2_write_ctxt;
typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
u64 *, unsigned int *, unsigned int *);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 667e5a8..5fd49ec 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -262,6 +262,7 @@
{
int status;
handle_t *handle;
+ struct ocfs2_dinode *di;
mlog_entry_void();
@@ -275,12 +276,39 @@
goto out;
}
- status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ /*
+ * Do this before setting i_size.
+ */
+ status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+ if (status) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+
+ i_size_write(inode, new_i_size);
+ inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+ di = (struct ocfs2_dinode *) fe_bh->b_data;
+ di->i_size = cpu_to_le64(new_i_size);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ status = ocfs2_journal_dirty(handle, fe_bh);
if (status < 0)
mlog_errno(status);
+out_commit:
ocfs2_commit_trans(osb, handle);
out:
+
mlog_exit(status);
return status;
}
@@ -343,7 +371,6 @@
mlog_errno(status);
goto bail;
}
- ocfs2_data_unlock(inode, 1);
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
@@ -352,22 +379,25 @@
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_unlock_data;
}
/* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+ ocfs2_data_unlock(inode, 1);
+
bail:
mlog_exit(status);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 0bd86a1..78c99b5 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -489,12 +489,38 @@
int status = 0;
struct ocfs2_truncate_context *tc = NULL;
struct ocfs2_dinode *fe;
+ handle_t *handle = NULL;
mlog_entry_void();
fe = (struct ocfs2_dinode *) fe_bh->b_data;
if (fe->i_clusters) {
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ i_size_write(inode, 0);
+
+ status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+
+ ocfs2_commit_trans(osb, handle);
+ handle = NULL;
+
status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
if (status < 0) {
mlog_errno(status);
@@ -507,8 +533,10 @@
goto out;
}
}
-out:
+out:
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 2699f7ca..82cc92d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -495,6 +495,17 @@
return index;
}
+/*
+ * Number of page cache pages covered by one cluster. Evaluates to 1
+ * whenever the cluster size bits do not exceed PAGE_CACHE_SHIFT.
+ */
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (cbits <= PAGE_CACHE_SHIFT)
+		return 1;
+
+	return 1 << (cbits - PAGE_CACHE_SHIFT);
+}
+
#define ocfs2_set_bit ext2_set_bit
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit