ocfs2: support writing of unwritten extents

Update the write code to detect when the user is asking to write to an
unwritten extent. Like writing to a hole, we must zero the region between
the write and the cluster boundaries. Most of the existing cluster zeroing
logic can be re-used with some additional checks for the unwritten flag on
extent records.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 077583b..8af9233 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -782,8 +782,14 @@
 	 * filled.
 	 */
 	unsigned	c_new;
+	unsigned	c_unwritten;
 };
 
+static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
+{
+	return d->c_new || d->c_unwritten;
+}
+
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32				w_cpos;
@@ -829,6 +835,8 @@
 	handle_t			*w_handle;
 
 	struct buffer_head		*w_di_bh;
+
+	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 };
 
 static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
@@ -868,6 +876,8 @@
 	else
 		wc->w_large_pages = 0;
 
+	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
 	*wcp = wc;
 
 	return 0;
@@ -1103,16 +1113,19 @@
  * Prepare a single cluster for write one cluster into the file.
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
-			       u32 phys, struct ocfs2_alloc_context *data_ac,
+			       u32 phys, unsigned int unwritten,
+			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new;
+	int ret, i, new, should_zero = 0;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 
 	new = phys == 0 ? 1 : 0;
+	if (new || unwritten)
+		should_zero = 1;
 
 	if (new) {
 		u32 tmp_pos;
@@ -1142,12 +1155,21 @@
 			mlog_errno(ret);
 			goto out;
 		}
-
-		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
-	} else {
-		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+	} else if (unwritten) {
+		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+						wc->w_handle, cpos, 1, phys,
+						meta_ac, &wc->w_dealloc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
 	}
 
+	if (should_zero)
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+	else
+		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+
 	/*
 	 * The only reason this should fail is due to an inability to
 	 * find the extent added.
@@ -1169,7 +1191,8 @@
 
 		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
 						      wc->w_pages[i], cpos,
-						      user_pos, user_len, new);
+						      user_pos, user_len,
+						      should_zero);
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
@@ -1200,8 +1223,9 @@
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 
-		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
-					  meta_ac, wc, desc->c_cpos, pos, len);
+		ret = ocfs2_write_cluster(mapping, desc->c_phys,
+					  desc->c_unwritten, data_ac, meta_ac,
+					  wc, desc->c_cpos, pos, len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1242,19 +1266,19 @@
 	if (wc->w_large_pages) {
 		/*
 		 * We only care about the 1st and last cluster within
-		 * our range and whether they are holes or not. Either
+		 * our range and whether they should be zero'd or not. Either
 		 * value may be extended out to the start/end of a
 		 * newly allocated cluster.
 		 */
 		desc = &wc->w_desc[0];
-		if (desc->c_new)
+		if (ocfs2_should_zero_cluster(desc))
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							&wc->w_target_from,
 							NULL);
 
 		desc = &wc->w_desc[wc->w_clen - 1];
-		if (desc->c_new)
+		if (ocfs2_should_zero_cluster(desc))
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							NULL,
@@ -1268,28 +1292,52 @@
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
+ *
+ * On success, *clusters_to_alloc holds the number of clusters that must
+ * be allocated and *extents_to_split holds a worst-case estimate of the
+ * extent records created when writing to an unwritten region.
  */
 static int ocfs2_populate_write_desc(struct inode *inode,
 				     struct ocfs2_write_ctxt *wc,
-				     unsigned int *clusters_to_alloc)
+				     unsigned int *clusters_to_alloc,
+				     unsigned int *extents_to_split)
 {
 	int ret;
 	struct ocfs2_write_cluster_desc *desc;
 	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
 	u32 phys = 0;
 	int i;
 
+	*clusters_to_alloc = 0;
+	*extents_to_split = 0;
+
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 		desc->c_cpos = wc->w_cpos + i;
 
 		if (num_clusters == 0) {
+			/*
+			 * Need to look up the next extent record.
+			 */
 			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
-						 &num_clusters, NULL);
+						 &num_clusters, &ext_flags);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			/*
+			 * Assume worst case - that we're writing in
+			 * the middle of the extent.
+			 *
+			 * We can assume that the write proceeds from
+			 * left to right, in which case the extent
+			 * insert code is smart enough to coalesce the
+			 * next splits into the previous records created.
+			 */
+			if (ext_flags & OCFS2_EXT_UNWRITTEN)
+				*extents_to_split = *extents_to_split + 2;
 		} else if (phys) {
 			/*
 			 * Only increment phys if it doesn't describe
@@ -1303,6 +1351,8 @@
 			desc->c_new = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
+		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+			desc->c_unwritten = 1;
 
 		num_clusters--;
 	}
@@ -1318,7 +1368,7 @@
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
 	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
-	unsigned int clusters_to_alloc = 0;
+	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1333,7 +1383,8 @@
 		return ret;
 	}
 
-	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc);
+	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+					&extents_to_split);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1347,14 +1398,14 @@
 	 * write out. An allocation requires that we write the entire
 	 * cluster range.
 	 */
-	if (clusters_to_alloc > 0) {
+	if (clusters_to_alloc || extents_to_split) {
 		/*
 		 * XXX: We are stretching the limits of
-		 * ocfs2_lock_allocators(). It greately over-estimates
+		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
 		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    &data_ac, &meta_ac);
+					    extents_to_split, &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1365,7 +1416,8 @@
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
+	ocfs2_set_target_boundaries(osb, wc, pos, len,
+				    clusters_to_alloc + extents_to_split);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1393,7 +1445,8 @@
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc, mmap_page);
+					 clusters_to_alloc + extents_to_split,
+					 mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1538,11 +1591,12 @@
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
 
+	ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
 	ocfs2_free_write_ctxt(wc);
 
 	return copied;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a80f317..6745086 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -527,20 +527,21 @@
  * understand sparse inodes.
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
 	int ret, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
 	*data_ac = NULL;
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
+	     "clusters_to_add = %u, extents_to_split = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add);
+	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
 	if (num_free_extents < 0) {
@@ -558,9 +559,12 @@
 	 *
 	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
+	 *
+	 * Also count the records needed to split unwritten extents -
+	 * we might want to add blocks during a split.
 	 */
 	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
 		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -641,7 +645,7 @@
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa..54df3c4 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -47,7 +47,7 @@
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);