Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ryusuke/nilfs2

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ryusuke/nilfs2: (49 commits)
  nilfs2: separate wait function from nilfs_segctor_write
  nilfs2: add iterator for segment buffers
  nilfs2: hide nilfs_write_info struct in segment buffer code
  nilfs2: relocate io status variables to segment buffer
  nilfs2: do not return io error for bio allocation failure
  nilfs2: use list_splice_tail or list_splice_tail_init
  nilfs2: replace mark_inode_dirty as nilfs_mark_inode_dirty
  nilfs2: delete mark_inode_dirty in nilfs_delete_entry
  nilfs2: delete mark_inode_dirty in nilfs_commit_chunk
  nilfs2: change return type of nilfs_commit_chunk
  nilfs2: split nilfs_unlink as nilfs_do_unlink and nilfs_unlink
  nilfs2: delete redundant mark_inode_dirty
  nilfs2: expand inode_inc_link_count and inode_dec_link_count
  nilfs2: delete mark_inode_dirty from nilfs_set_link
  nilfs2: delete mark_inode_dirty in nilfs_new_inode
  nilfs2: add norecovery mount option
  nilfs2: add helper to get if volume is in a valid state
  nilfs2: move recovery completion into load_nilfs function
  nilfs2: apply readahead for recovery on mount
  nilfs2: clean up get/put function of a segment usage
  ...
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 01539f4..4949fca 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -49,8 +49,7 @@
 NILFS2 supports the following mount options:
 (*) == default
 
-barrier=on(*)		This enables/disables barriers. barrier=off disables
-			it, barrier=on enables it.
+nobarrier		Disables barriers.
 errors=continue(*)	Keep going on a filesystem error.
 errors=remount-ro	Remount the filesystem read-only on an error.
 errors=panic		Panic and halt the machine if an error occurs.
@@ -71,6 +70,10 @@
 			blocks.  That means, it is guaranteed that no
 			overtaking of events occurs in the recovered file
 			system after a crash.
+norecovery		Disable recovery of the filesystem on mount.
+			This disables every write access on the device for
+			read-only mounts or snapshots.  This option will fail
+			for r/w mounts on an unclean volume.
 
 NILFS2 usage
 ============
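
As a usage note for the options documented above: "norecovery" is intended for read-only
mounts and snapshots, and the same intent can be expressed through the mount(2) syscall.
A minimal sketch follows; the device path and mount point are placeholders, and the
equivalent shell form would be "mount -t nilfs2 -o ro,norecovery /dev/sdb1 /mnt/nilfs".

/* Minimal sketch: read-only NILFS2 mount that skips roll-forward recovery.
 * "/dev/sdb1" and "/mnt/nilfs" are placeholder paths. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "norecovery" avoids any write to the device; per the documentation
	 * above, it is rejected for read/write mounts of an unclean volume.
	 * "nobarrier" could be passed the same way to disable barriers. */
	if (mount("/dev/sdb1", "/mnt/nilfs", "nilfs2", MS_RDONLY,
		  "norecovery") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
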
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d69e6ae..3f959f1 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -142,29 +142,75 @@
 	}
 }
 
+static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
+				  int create,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *),
+				  struct buffer_head **bhp,
+				  struct nilfs_bh_assoc *prev,
+				  spinlock_t *lock)
+{
+	int ret;
+
+	spin_lock(lock);
+	if (prev->bh && blkoff == prev->blkoff) {
+		get_bh(prev->bh);
+		*bhp = prev->bh;
+		spin_unlock(lock);
+		return 0;
+	}
+	spin_unlock(lock);
+
+	ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp);
+	if (!ret) {
+		spin_lock(lock);
+		/*
+		 * The following code must be safe for change of the
+		 * cache contents during the get block call.
+		 */
+		brelse(prev->bh);
+		get_bh(*bhp);
+		prev->bh = *bhp;
+		prev->blkoff = blkoff;
+		spin_unlock(lock);
+	}
+	return ret;
+}
+
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_desc_blkoff(inode, group),
-				   create, nilfs_palloc_desc_block_init, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_desc_blkoff(inode, group),
+				      create, nilfs_palloc_desc_block_init,
+				      bhp, &cache->prev_desc, &cache->lock);
 }
 
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode,
-				   nilfs_palloc_bitmap_blkoff(inode, group),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_bitmap_blkoff(inode, group),
+				      create, NULL, bhp,
+				      &cache->prev_bitmap, &cache->lock);
 }
 
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
-	return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
-				   create, NULL, bhp);
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	return nilfs_palloc_get_block(inode,
+				      nilfs_palloc_entry_blkoff(inode, nr),
+				      create, NULL, bhp,
+				      &cache->prev_entry, &cache->lock);
 }
 
 static struct nilfs_palloc_group_desc *
@@ -176,13 +222,6 @@
 		group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
-static unsigned char *
-nilfs_palloc_block_get_bitmap(const struct inode *inode,
-			      const struct buffer_head *bh, void *kaddr)
-{
-	return (unsigned char *)(kaddr + bh_offset(bh));
-}
-
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -289,8 +328,7 @@
 				if (ret < 0)
 					goto out_desc;
 				bitmap_kaddr = kmap(bitmap_bh->b_page);
-				bitmap = nilfs_palloc_block_get_bitmap(
-					inode, bitmap_bh, bitmap_kaddr);
+				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 				pos = nilfs_palloc_find_available_slot(
 					inode, group, group_offset, bitmap,
 					entries_per_group);
@@ -351,8 +389,7 @@
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
@@ -385,8 +422,7 @@
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
 	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
-	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
-					       bitmap_kaddr);
+	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
@@ -472,8 +508,7 @@
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		bitmap_kaddr = kmap(bitmap_bh->b_page);
-		bitmap = nilfs_palloc_block_get_bitmap(
-			inode, bitmap_bh, bitmap_kaddr);
+		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
@@ -502,3 +537,30 @@
 	}
 	return 0;
 }
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache)
+{
+	NILFS_MDT(inode)->mi_palloc_cache = cache;
+	spin_lock_init(&cache->lock);
+}
+
+void nilfs_palloc_clear_cache(struct inode *inode)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+	spin_lock(&cache->lock);
+	brelse(cache->prev_desc.bh);
+	brelse(cache->prev_bitmap.bh);
+	brelse(cache->prev_entry.bh);
+	cache->prev_desc.bh = NULL;
+	cache->prev_bitmap.bh = NULL;
+	cache->prev_entry.bh = NULL;
+	spin_unlock(&cache->lock);
+}
+
+void nilfs_palloc_destroy_cache(struct inode *inode)
+{
+	nilfs_palloc_clear_cache(inode);
+	NILFS_MDT(inode)->mi_palloc_cache = NULL;
+}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4ace5475..f4543ac 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -69,4 +69,25 @@
 #define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit	ext2_find_next_zero_bit
 
+/*
+ * persistent object allocator cache
+ */
+
+struct nilfs_bh_assoc {
+	unsigned long blkoff;
+	struct buffer_head *bh;
+};
+
+struct nilfs_palloc_cache {
+	spinlock_t lock;
+	struct nilfs_bh_assoc prev_desc;
+	struct nilfs_bh_assoc prev_bitmap;
+	struct nilfs_bh_assoc prev_entry;
+};
+
+void nilfs_palloc_setup_cache(struct inode *inode,
+			      struct nilfs_palloc_cache *cache);
+void nilfs_palloc_clear_cache(struct inode *inode);
+void nilfs_palloc_destroy_cache(struct inode *inode);
+
 #endif	/* _NILFS_ALLOC_H */
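
The cache declared above is embedded by each metadata file that uses the persistent
object allocator.  For orientation, the adoption pattern (the same one the dat and
ifile changes later in this diff follow) can be sketched as below; "example_info",
"example_new" and the ino parameter are illustrative names, not part of the series.

/* Sketch of the allocator-cache lifecycle; mirrors nilfs_dat_new() and
 * nilfs_ifile_new() further down in this diff. */
struct example_info {
	struct nilfs_mdt_info mi;	/* must be the first member */
	struct nilfs_palloc_cache palloc_cache;
};

static struct inode *example_new(struct the_nilfs *nilfs, ino_t ino,
				 size_t entry_size)
{
	struct inode *inode;
	struct example_info *ei;

	/* reserve room for the private info struct behind the mdt info */
	inode = nilfs_mdt_new(nilfs, NULL, ino, sizeof(*ei));
	if (!inode)
		return NULL;

	if (nilfs_palloc_init_blockgroup(inode, entry_size)) {
		/* cache not attached yet; nilfs_mdt_destroy() skips it */
		nilfs_mdt_destroy(inode);
		return NULL;
	}

	ei = (struct example_info *)NILFS_MDT(inode);
	/* Attach the lookup cache.  nilfs_palloc_clear_cache() drops the
	 * cached buffer heads (e.g. when a GC copy is discarded), and
	 * nilfs_palloc_destroy_cache() is invoked from nilfs_mdt_destroy()
	 * once mi_palloc_cache is set. */
	nilfs_palloc_setup_cache(inode, &ei->palloc_cache);
	return inode;
}
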
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 08834df..f4a14ea 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -402,19 +402,11 @@
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 {
 	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-	if (NILFS_MDT(bmap->b_inode))
-		nilfs_mdt_mark_dirty(bmap->b_inode);
-	else
-		mark_inode_dirty(bmap->b_inode);
 }
 
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 84c2538..471e269 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,9 +68,34 @@
 	truncate_inode_pages(btnc, 0);
 }
 
+struct buffer_head *
+nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
+{
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	struct buffer_head *bh;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return NULL;
+
+	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+		     buffer_dirty(bh))) {
+		brelse(bh);
+		BUG();
+	}
+	memset(bh->b_data, 0, 1 << inode->i_blkbits);
+	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_blocknr = blocknr;
+	set_buffer_mapped(bh);
+	set_buffer_uptodate(bh);
+
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return bh;
+}
+
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
-			      sector_t pblocknr, struct buffer_head **pbh,
-			      int newblk)
+			      sector_t pblocknr, struct buffer_head **pbh)
 {
 	struct buffer_head *bh;
 	struct inode *inode = NILFS_BTNC_I(btnc);
@@ -81,19 +106,6 @@
 		return -ENOMEM;
 
 	err = -EEXIST; /* internal code */
-	if (newblk) {
-		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
-			     buffer_dirty(bh))) {
-			brelse(bh);
-			BUG();
-		}
-		memset(bh->b_data, 0, 1 << inode->i_blkbits);
-		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
-		bh->b_blocknr = blocknr;
-		set_buffer_mapped(bh);
-		set_buffer_uptodate(bh);
-		goto found;
-	}
 
 	if (buffer_uptodate(bh) || buffer_dirty(bh))
 		goto found;
@@ -135,27 +147,6 @@
 	return err;
 }
 
-int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
-		     sector_t pblocknr, struct buffer_head **pbh, int newblk)
-{
-	struct buffer_head *bh;
-	int err;
-
-	err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
-	if (err == -EEXIST) /* internal code (cache hit) */
-		return 0;
-	if (unlikely(err))
-		return err;
-
-	bh = *pbh;
-	wait_on_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		brelse(bh);
-		return -EIO;
-	}
-	return 0;
-}
-
 /**
  * nilfs_btnode_delete - delete B-tree node buffer
  * @bh: buffer to be deleted
@@ -244,12 +235,13 @@
 		unlock_page(obh->b_page);
 	}
 
-	err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
-	if (likely(!err)) {
-		BUG_ON(nbh == obh);
-		ctxt->newbh = nbh;
-	}
-	return err;
+	nbh = nilfs_btnode_create_block(btnc, newkey);
+	if (!nbh)
+		return -ENOMEM;
+
+	BUG_ON(nbh == obh);
+	ctxt->newbh = nbh;
+	return 0;
 
  failed_unlock:
 	unlock_page(obh->b_page);
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 3e22751..07da83f 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -40,10 +40,10 @@
 void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
+struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
+					      __u64 blocknr);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
-			      struct buffer_head **, int);
-int nilfs_btnode_get(struct address_space *, __u64, sector_t,
-		     struct buffer_head **, int);
+			      struct buffer_head **);
 void nilfs_btnode_delete(struct buffer_head *);
 int nilfs_btnode_prepare_change_key(struct address_space *,
 				    struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index e25b507..7cdd98b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -114,7 +114,18 @@
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+	int err;
+
+	err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
+	if (err)
+		return err == -EEXIST ? 0 : err;
+
+	wait_on_buffer(*bhp);
+	if (!buffer_uptodate(*bhp)) {
+		brelse(*bhp);
+		return -EIO;
+	}
+	return 0;
 }
 
 static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
@@ -122,12 +133,15 @@
 {
 	struct address_space *btnc =
 		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
-	int ret;
+	struct buffer_head *bh;
 
-	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
-	if (!ret)
-		set_buffer_nilfs_volatile(*bhp);
-	return ret;
+	bh = nilfs_btnode_create_block(btnc, ptr);
+	if (!bh)
+		return -ENOMEM;
+
+	set_buffer_nilfs_volatile(bh);
+	*bhp = bh;
+	return 0;
 }
 
 static inline int
@@ -444,6 +458,18 @@
 		nilfs_btree_get_nonroot_node(path, level);
 }
 
+static inline int
+nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+{
+	if (unlikely(nilfs_btree_node_get_level(node) != level)) {
+		dump_stack();
+		printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
+		       nilfs_btree_node_get_level(node), level);
+		return 1;
+	}
+	return 0;
+}
+
 static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 				 struct nilfs_btree_path *path,
 				 __u64 key, __u64 *ptrp, int minlevel)
@@ -467,7 +493,8 @@
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		if (!found)
 			found = nilfs_btree_node_lookup(node, key, &index);
 		else
@@ -512,7 +539,8 @@
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(path, level);
-		BUG_ON(level != nilfs_btree_node_get_level(node));
+		if (nilfs_btree_bad_node(node, level))
+			return -EINVAL;
 		index = nilfs_btree_node_get_nchildren(node) - 1;
 		ptr = nilfs_btree_node_get_ptr(btree, node, index);
 		path[level].bp_index = index;
@@ -638,13 +666,11 @@
 {
 	if (level < nilfs_btree_height(btree) - 1) {
 		do {
-			lock_buffer(path[level].bp_bh);
 			nilfs_btree_node_set_key(
 				nilfs_btree_get_nonroot_node(path, level),
 				path[level].bp_index, key);
 			if (!buffer_dirty(path[level].bp_bh))
 				nilfs_btnode_mark_dirty(path[level].bp_bh);
-			unlock_buffer(path[level].bp_bh);
 		} while ((path[level].bp_index == 0) &&
 			 (++level < nilfs_btree_height(btree) - 1));
 	}
@@ -663,13 +689,11 @@
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
@@ -689,9 +713,6 @@
 	struct nilfs_btree_node *node, *left;
 	int nchildren, lnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -712,9 +733,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -740,9 +758,6 @@
 	struct nilfs_btree_node *node, *right;
 	int nchildren, rnchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -763,9 +778,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(right, 0));
@@ -794,9 +806,6 @@
 	__u64 newptr;
 	int nchildren, n, move;
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -815,9 +824,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	newkey = nilfs_btree_node_get_key(right, 0);
 	newptr = path[level].bp_newreq.bpr_ptr;
 
@@ -852,8 +858,6 @@
 	struct nilfs_btree_node *root, *child;
 	int n;
 
-	lock_buffer(path[level].bp_sib_bh);
-
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_sib_node(path, level);
 
@@ -865,8 +869,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
 
@@ -1023,11 +1025,9 @@
 
 		stats->bs_nblocks++;
 
-		lock_buffer(bh);
 		nilfs_btree_node_init(btree,
 				      (struct nilfs_btree_node *)bh->b_data,
 				      0, level, 0, NULL, NULL);
-		unlock_buffer(bh);
 		path[level].bp_sib_bh = bh;
 		path[level].bp_op = nilfs_btree_split;
 	}
@@ -1052,10 +1052,8 @@
 	if (ret < 0)
 		goto err_out_curr_node;
 
-	lock_buffer(bh);
 	nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
 			      0, level, 0, NULL, NULL);
-	unlock_buffer(bh);
 	path[level].bp_sib_bh = bh;
 	path[level].bp_op = nilfs_btree_grow;
 
@@ -1154,13 +1152,11 @@
 	struct nilfs_btree_node *node;
 
 	if (level < nilfs_btree_height(btree) - 1) {
-		lock_buffer(path[level].bp_bh);
 		node = nilfs_btree_get_nonroot_node(path, level);
 		nilfs_btree_node_delete(btree, node, keyp, ptrp,
 					path[level].bp_index);
 		if (!buffer_dirty(path[level].bp_bh))
 			nilfs_btnode_mark_dirty(path[level].bp_bh);
-		unlock_buffer(path[level].bp_bh);
 		if (path[level].bp_index == 0)
 			nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
@@ -1180,9 +1176,6 @@
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1197,9 +1190,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(node, 0));
 
@@ -1217,9 +1207,6 @@
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 	nchildren = nilfs_btree_node_get_nchildren(node);
@@ -1234,9 +1221,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	path[level + 1].bp_index++;
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(right, 0));
@@ -1255,9 +1239,6 @@
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	left = nilfs_btree_get_sib_node(path, level);
 
@@ -1268,9 +1249,6 @@
 	if (!buffer_dirty(path[level].bp_sib_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
@@ -1286,9 +1264,6 @@
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
-	lock_buffer(path[level].bp_sib_bh);
-
 	node = nilfs_btree_get_nonroot_node(path, level);
 	right = nilfs_btree_get_sib_node(path, level);
 
@@ -1299,9 +1274,6 @@
 	if (!buffer_dirty(path[level].bp_bh))
 		nilfs_btnode_mark_dirty(path[level].bp_bh);
 
-	unlock_buffer(path[level].bp_bh);
-	unlock_buffer(path[level].bp_sib_bh);
-
 	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level + 1].bp_index++;
@@ -1316,7 +1288,6 @@
 
 	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
 
-	lock_buffer(path[level].bp_bh);
 	root = nilfs_btree_get_root(btree);
 	child = nilfs_btree_get_nonroot_node(path, level);
 
@@ -1324,7 +1295,6 @@
 	nilfs_btree_node_set_level(root, level);
 	n = nilfs_btree_node_get_nchildren(child);
 	nilfs_btree_node_move_left(btree, root, child, n);
-	unlock_buffer(path[level].bp_bh);
 
 	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
@@ -1699,7 +1669,6 @@
 		nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
 
 		/* create child node at level 1 */
-		lock_buffer(bh);
 		node = (struct nilfs_btree_node *)bh->b_data;
 		nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
 		nilfs_btree_node_insert(btree, node,
@@ -1709,7 +1678,6 @@
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
-		unlock_buffer(bh);
 		brelse(bh);
 
 		/* create root node at level 2 */
@@ -2050,7 +2018,7 @@
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
 	     level < NILFS_BTREE_LEVEL_MAX;
 	     level++)
-		list_splice(&lists[level], listp->prev);
+		list_splice_tail(&lists[level], listp);
 }
 
 static int nilfs_btree_assign_p(struct nilfs_btree *btree,
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 0e72bbb..4b82d84 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@
 struct nilfs_btree_path;
 
 /**
- * struct nilfs_btree_node - B-tree node
- * @bn_flags: flags
- * @bn_level: level
- * @bn_nchildren: number of children
- * @bn_pad: padding
- */
-struct nilfs_btree_node {
-	__u8 bn_flags;
-	__u8 bn_level;
-	__le16 bn_nchildren;
-	__le32 bn_pad;
-};
-
-/* flags */
-#define NILFS_BTREE_NODE_ROOT	0x01
-
-/* level */
-#define NILFS_BTREE_LEVEL_DATA		0
-#define NILFS_BTREE_LEVEL_NODE_MIN	(NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX		14
-
-/**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
  */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 3f5d5d0..d5ad54e 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -926,3 +926,29 @@
 	up_read(&NILFS_MDT(cpfile)->mi_sem);
 	return ret;
 }
+
+/**
+ * nilfs_cpfile_read - read cpfile inode
+ * @cpfile: cpfile inode
+ * @raw_inode: on-disk cpfile inode
+ */
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode)
+{
+	return nilfs_read_inode_common(cpfile, raw_inode);
+}
+
+/**
+ * nilfs_cpfile_new - create cpfile
+ * @nilfs: nilfs object
+ * @cpsize: size of a checkpoint entry
+ */
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize)
+{
+	struct inode *cpfile;
+
+	cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO, 0);
+	if (cpfile)
+		nilfs_mdt_set_entry_size(cpfile, cpsize,
+					 sizeof(struct nilfs_cpfile_header));
+	return cpfile;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index debea89..bc0809e 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -40,4 +40,7 @@
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
 				size_t);
 
+int nilfs_cpfile_read(struct inode *cpfile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_cpfile_new(struct the_nilfs *nilfs, size_t cpsize);
+
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 1ff8e15..187dd07 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -33,6 +33,16 @@
 #define NILFS_CNO_MIN	((__u64)1)
 #define NILFS_CNO_MAX	(~(__u64)0)
 
+struct nilfs_dat_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
+{
+	return (struct nilfs_dat_info *)NILFS_MDT(dat);
+}
+
 static int nilfs_dat_prepare_entry(struct inode *dat,
 				   struct nilfs_palloc_req *req, int create)
 {
@@ -425,3 +435,40 @@
 
 	return nvi;
 }
+
+/**
+ * nilfs_dat_read - read dat inode
+ * @dat: dat inode
+ * @raw_inode: on-disk dat inode
+ */
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode)
+{
+	return nilfs_read_inode_common(dat, raw_inode);
+}
+
+/**
+ * nilfs_dat_new - create dat file
+ * @nilfs: nilfs object
+ * @entry_size: size of a dat entry
+ */
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size)
+{
+	static struct lock_class_key dat_lock_key;
+	struct inode *dat;
+	struct nilfs_dat_info *di;
+	int err;
+
+	dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO, sizeof(*di));
+	if (dat) {
+		err = nilfs_palloc_init_blockgroup(dat, entry_size);
+		if (unlikely(err)) {
+			nilfs_mdt_destroy(dat);
+			return NULL;
+		}
+
+		di = NILFS_DAT_I(dat);
+		lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
+		nilfs_palloc_setup_cache(dat, &di->palloc_cache);
+	}
+	return dat;
+}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index 406070d..d31c3aa 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -53,4 +53,7 @@
 int nilfs_dat_move(struct inode *, __u64, sector_t);
 ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
+int nilfs_dat_read(struct inode *dat, struct nilfs_inode *raw_inode);
+struct inode *nilfs_dat_new(struct the_nilfs *nilfs, size_t entry_size);
+
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e097099..76d803e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -99,9 +99,9 @@
 				 NULL, nilfs_get_block);
 }
 
-static int nilfs_commit_chunk(struct page *page,
-			      struct address_space *mapping,
-			      unsigned from, unsigned to)
+static void nilfs_commit_chunk(struct page *page,
+			       struct address_space *mapping,
+			       unsigned from, unsigned to)
 {
 	struct inode *dir = mapping->host;
 	struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
@@ -112,15 +112,13 @@
 
 	nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
 	copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
-	if (pos + copied > dir->i_size) {
+	if (pos + copied > dir->i_size)
 		i_size_write(dir, pos + copied);
-		mark_inode_dirty(dir);
-	}
 	if (IS_DIRSYNC(dir))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 	err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+	WARN_ON(err); /* should never happen */
 	unlock_page(page);
-	return err;
 }
 
 static void nilfs_check_page(struct page *page)
@@ -455,11 +453,10 @@
 	BUG_ON(err);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
-	err = nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_commit_chunk(page, mapping, from, to);
 	nilfs_put_page(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(dir);
 }
 
 /*
@@ -548,10 +545,10 @@
 	memcpy(de->name, name, namelen);
 	de->inode = cpu_to_le64(inode->i_ino);
 	nilfs_set_de_type(de, inode);
-	err = nilfs_commit_chunk(page, page->mapping, from, to);
+	nilfs_commit_chunk(page, page->mapping, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 /*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(dir);
+	nilfs_mark_inode_dirty(dir);
 	/* OFFSET_CACHE */
 out_put:
 	nilfs_put_page(page);
@@ -595,10 +592,9 @@
 	if (pde)
 		pde->rec_len = cpu_to_le16(to - from);
 	dir->inode = 0;
-	err = nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 /*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
-	mark_inode_dirty(inode);
 out:
 	nilfs_put_page(page);
 	return err;
@@ -640,7 +636,7 @@
 	memcpy(de->name, "..\0", 4);
 	nilfs_set_de_type(de, inode);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+	nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
 	page_cache_release(page);
 	return err;
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
index 93383c5..dd5f7e0 100644
--- a/fs/nilfs2/gcdat.c
+++ b/fs/nilfs2/gcdat.c
@@ -61,6 +61,8 @@
 
 	nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
 
+	nilfs_palloc_clear_cache(dat);
+	nilfs_palloc_clear_cache(gcdat);
 	nilfs_clear_dirty_pages(mapping);
 	nilfs_copy_back_pages(mapping, gmapping);
 	/* note: mdt dirty flags should be cleared by segctor. */
@@ -79,6 +81,7 @@
 	gcdat->i_state = I_CLEAR;
 	gii->i_flags = 0;
 
+	nilfs_palloc_clear_cache(gcdat);
 	truncate_inode_pages(gcdat->i_mapping, 0);
 	truncate_inode_pages(&gii->i_btnode_cache, 0);
 }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e6de0a2..e16a666 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -149,7 +149,7 @@
 				   __u64 vbn, struct buffer_head **out_bh)
 {
 	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh, 0);
+					    vbn ? : pbn, pbn, out_bh);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -212,9 +212,10 @@
 static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
 				   __u64 cno)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+	struct inode *inode;
 	struct nilfs_inode_info *ii;
 
+	inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS, 0);
 	if (!inode)
 		return NULL;
 
@@ -265,7 +266,6 @@
  */
 void nilfs_clear_gcinode(struct inode *inode)
 {
-	nilfs_mdt_clear(inode);
 	nilfs_mdt_destroy(inode);
 }
 
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index de86401..922d9dd 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -29,6 +29,17 @@
 #include "alloc.h"
 #include "ifile.h"
 
+
+struct nilfs_ifile_info {
+	struct nilfs_mdt_info mi;
+	struct nilfs_palloc_cache palloc_cache;
+};
+
+static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile)
+{
+	return (struct nilfs_ifile_info *)NILFS_MDT(ifile);
+}
+
 /**
  * nilfs_ifile_create_inode - create a new disk inode
  * @ifile: ifile inode
@@ -148,3 +159,27 @@
 	}
 	return err;
 }
+
+/**
+ * nilfs_ifile_new - create inode file
+ * @sbi: nilfs_sb_info struct
+ * @inode_size: size of an inode
+ */
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size)
+{
+	struct inode *ifile;
+	int err;
+
+	ifile = nilfs_mdt_new(sbi->s_nilfs, sbi->s_super, NILFS_IFILE_INO,
+			      sizeof(struct nilfs_ifile_info));
+	if (ifile) {
+		err = nilfs_palloc_init_blockgroup(ifile, inode_size);
+		if (unlikely(err)) {
+			nilfs_mdt_destroy(ifile);
+			return NULL;
+		}
+		nilfs_palloc_setup_cache(ifile,
+					 &NILFS_IFILE_I(ifile)->palloc_cache);
+	}
+	return ifile;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index ecc3ba76..cbca32e 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,4 +49,6 @@
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+struct inode *nilfs_ifile_new(struct nilfs_sb_info *sbi, size_t inode_size);
+
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2a0a5a3..7868cc1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -97,6 +97,7 @@
 			nilfs_transaction_abort(inode->i_sb);
 			goto out;
 		}
+		nilfs_mark_inode_dirty(inode);
 		nilfs_transaction_commit(inode->i_sb); /* never fails */
 		/* Error handling should be detailed */
 		set_buffer_new(bh_result);
@@ -322,7 +323,6 @@
 				    nilfs_init_acl(), proper cancellation of
 				    above jobs should be considered */
 
-	mark_inode_dirty(inode);
 	return inode;
 
  failed_acl:
@@ -525,7 +525,6 @@
 
 	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
 
-	/* The buffer is guarded with lock_buffer() by the caller */
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
 		memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
 	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
@@ -599,6 +598,7 @@
 	if (IS_SYNC(inode))
 		nilfs_set_transaction_flag(NILFS_TI_SYNC);
 
+	nilfs_mark_inode_dirty(inode);
 	nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
 	nilfs_transaction_commit(sb);
 	/* May construct a logical segment and may fail in sync mode.
@@ -623,6 +623,7 @@
 		truncate_inode_pages(&inode->i_data, 0);
 
 	nilfs_truncate_bmap(ii, 0);
+	nilfs_mark_inode_dirty(inode);
 	nilfs_free_inode(inode);
 	/* nilfs_free_inode() marks inode buffer dirty */
 	if (IS_SYNC(inode))
@@ -745,9 +746,7 @@
 			      "failed to reget inode block.\n");
 		return err;
 	}
-	lock_buffer(ibh);
 	nilfs_update_inode(inode, ibh);
-	unlock_buffer(ibh);
 	nilfs_mdt_mark_buffer_dirty(ibh);
 	nilfs_mdt_mark_dirty(sbi->s_ifile);
 	brelse(ibh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f632611..06713ff 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -186,7 +186,7 @@
 }
 
 static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
-				struct buffer_head **out_bh)
+				int readahead, struct buffer_head **out_bh)
 {
 	struct buffer_head *first_bh, *bh;
 	unsigned long blkoff;
@@ -200,16 +200,18 @@
 	if (unlikely(err))
 		goto failed;
 
-	blkoff = block + 1;
-	for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
-		err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
-		if (likely(!err || err == -EEXIST))
-			brelse(bh);
-		else if (err != -EBUSY)
-			break; /* abort readahead if bmap lookup failed */
-
-		if (!buffer_locked(first_bh))
-			goto out_no_wait;
+	if (readahead) {
+		blkoff = block + 1;
+		for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+			err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+			if (likely(!err || err == -EEXIST))
+				brelse(bh);
+			else if (err != -EBUSY)
+				/* abort readahead if bmap lookup failed */
+				break;
+			if (!buffer_locked(first_bh))
+				goto out_no_wait;
+		}
 	}
 
 	wait_on_buffer(first_bh);
@@ -263,7 +265,7 @@
 
 	/* Should be rewritten with merging nilfs_mdt_read_block() */
  retry:
-	ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
+	ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh);
 	if (!create || ret != -ENOENT)
 		return ret;
 
@@ -371,7 +373,7 @@
 	struct buffer_head *bh;
 	int err;
 
-	err = nilfs_mdt_read_block(inode, block, &bh);
+	err = nilfs_mdt_read_block(inode, block, 0, &bh);
 	if (unlikely(err))
 		return err;
 	nilfs_mark_buffer_dirty(bh);
@@ -445,9 +447,17 @@
  * longer than those of the super block structs; they may continue for
  * several consecutive mounts/umounts.  This would need discussions.
  */
+/**
+ * nilfs_mdt_new_common - allocate a pseudo inode for metadata file
+ * @nilfs: nilfs object
+ * @sb: super block instance the metadata file belongs to
+ * @ino: inode number
+ * @gfp_mask: gfp mask for data pages
+ * @objsz: size of the private object attached to inode->i_private
+ */
 struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
-		     ino_t ino, gfp_t gfp_mask)
+		     ino_t ino, gfp_t gfp_mask, size_t objsz)
 {
 	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
@@ -455,8 +465,9 @@
 		return NULL;
 	else {
 		struct address_space * const mapping = &inode->i_data;
-		struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
+		struct nilfs_mdt_info *mi;
 
+		mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS);
 		if (!mi) {
 			nilfs_destroy_inode(inode);
 			return NULL;
@@ -513,11 +524,11 @@
 }
 
 struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
-			    ino_t ino)
+			    ino_t ino, size_t objsz)
 {
-	struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
-						   NILFS_MDT_GFP);
+	struct inode *inode;
 
+	inode = nilfs_mdt_new_common(nilfs, sb, ino, NILFS_MDT_GFP, objsz);
 	if (!inode)
 		return NULL;
 
@@ -544,14 +555,15 @@
 		&NILFS_I(orig)->i_btnode_cache;
 }
 
-void nilfs_mdt_clear(struct inode *inode)
+static void nilfs_mdt_clear(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
 	invalidate_mapping_pages(inode->i_mapping, 0, -1);
 	truncate_inode_pages(inode->i_mapping, 0);
 
-	nilfs_bmap_clear(ii->i_bmap);
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
 	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
 }
 
@@ -559,6 +571,10 @@
 {
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
 
+	if (mdi->mi_palloc_cache)
+		nilfs_palloc_destroy_cache(inode);
+	nilfs_mdt_clear(inode);
+
 	kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
 	kfree(mdi);
 	nilfs_destroy_inode(inode);
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index 4315997..6c4bbb0 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -36,6 +36,7 @@
  * @mi_entry_size: size of an entry
  * @mi_first_entry_offset: offset to the first entry
  * @mi_entries_per_block: number of entries in a block
+ * @mi_palloc_cache: persistent object allocator cache
  * @mi_blocks_per_group: number of blocks in a group
  * @mi_blocks_per_desc_block: number of blocks per descriptor block
  */
@@ -46,6 +47,7 @@
 	unsigned		mi_entry_size;
 	unsigned		mi_first_entry_offset;
 	unsigned long		mi_entries_per_block;
+	struct nilfs_palloc_cache *mi_palloc_cache;
 	unsigned long		mi_blocks_per_group;
 	unsigned long		mi_blocks_per_desc_block;
 };
@@ -74,11 +76,11 @@
 int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
 int nilfs_mdt_fetch_dirty(struct inode *);
 
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+			    size_t);
 struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
-				   ino_t, gfp_t);
+				   ino_t, gfp_t, size_t);
 void nilfs_mdt_destroy(struct inode *);
-void nilfs_mdt_clear(struct inode *);
 void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
 void nilfs_mdt_set_shadow(struct inode *, struct inode *);
 
@@ -104,21 +106,4 @@
 #define nilfs_mdt_bgl_lock(inode, bg) \
 	(&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
 
-
-static inline int
-nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
-			    unsigned n)
-{
-	return nilfs_read_inode_common(
-		inode, (struct nilfs_inode *)(bh->b_data + n));
-}
-
-static inline void
-nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
-			     unsigned n)
-{
-	nilfs_write_inode_common(
-		inode, (struct nilfs_inode *)(bh->b_data + n), 1);
-}
-
 #endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ed02e88..07ba838 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,7 +120,7 @@
 		inode->i_op = &nilfs_file_inode_operations;
 		inode->i_fop = &nilfs_file_operations;
 		inode->i_mapping->a_ops = &nilfs_aops;
-		mark_inode_dirty(inode);
+		nilfs_mark_inode_dirty(inode);
 		err = nilfs_add_nondir(dentry, inode);
 	}
 	if (!err)
@@ -148,7 +148,7 @@
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-		mark_inode_dirty(inode);
+		nilfs_mark_inode_dirty(inode);
 		err = nilfs_add_nondir(dentry, inode);
 	}
 	if (!err)
@@ -188,7 +188,7 @@
 		goto out_fail;
 
 	/* mark_inode_dirty(inode); */
-	/* nilfs_new_inode() and page_symlink() do this */
+	/* page_symlink() does this */
 
 	err = nilfs_add_nondir(dentry, inode);
 out:
@@ -200,7 +200,8 @@
 	return err;
 
 out_fail:
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
 	iput(inode);
 	goto out;
 }
@@ -245,7 +246,7 @@
 	if (err)
 		return err;
 
-	inode_inc_link_count(dir);
+	inc_nlink(dir);
 
 	inode = nilfs_new_inode(dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
@@ -256,7 +257,7 @@
 	inode->i_fop = &nilfs_dir_operations;
 	inode->i_mapping->a_ops = &nilfs_aops;
 
-	inode_inc_link_count(inode);
+	inc_nlink(inode);
 
 	err = nilfs_make_empty(inode, dir);
 	if (err)
@@ -266,6 +267,7 @@
 	if (err)
 		goto out_fail;
 
+	nilfs_mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
 out:
 	if (!err)
@@ -276,26 +278,23 @@
 	return err;
 
 out_fail:
-	inode_dec_link_count(inode);
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
+	drop_nlink(inode);
+	nilfs_mark_inode_dirty(inode);
 	iput(inode);
 out_dir:
-	inode_dec_link_count(dir);
+	drop_nlink(dir);
+	nilfs_mark_inode_dirty(dir);
 	goto out;
 }
 
-static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode;
 	struct nilfs_dir_entry *de;
 	struct page *page;
-	struct nilfs_transaction_info ti;
 	int err;
 
-	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
-	if (err)
-		return err;
-
 	err = -ENOENT;
 	de = nilfs_find_entry(dir, dentry, &page);
 	if (!de)
@@ -317,12 +316,28 @@
 		goto out;
 
 	inode->i_ctime = dir->i_ctime;
-	inode_dec_link_count(inode);
+	drop_nlink(inode);
 	err = 0;
 out:
-	if (!err)
+	return err;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+	if (err)
+		return err;
+
+	err = nilfs_do_unlink(dir, dentry);
+
+	if (!err) {
+		nilfs_mark_inode_dirty(dir);
+		nilfs_mark_inode_dirty(dentry->d_inode);
 		err = nilfs_transaction_commit(dir->i_sb);
-	else
+	} else
 		nilfs_transaction_abort(dir->i_sb);
 
 	return err;
@@ -340,11 +355,13 @@
 
 	err = -ENOTEMPTY;
 	if (nilfs_empty_dir(inode)) {
-		err = nilfs_unlink(dir, dentry);
+		err = nilfs_do_unlink(dir, dentry);
 		if (!err) {
 			inode->i_size = 0;
-			inode_dec_link_count(inode);
-			inode_dec_link_count(dir);
+			drop_nlink(inode);
+			nilfs_mark_inode_dirty(inode);
+			drop_nlink(dir);
+			nilfs_mark_inode_dirty(dir);
 		}
 	}
 	if (!err)
@@ -395,42 +412,48 @@
 		new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
+		inc_nlink(old_inode);
 		nilfs_set_link(new_dir, new_de, new_page, old_inode);
+		nilfs_mark_inode_dirty(new_dir);
 		new_inode->i_ctime = CURRENT_TIME;
 		if (dir_de)
 			drop_nlink(new_inode);
-		inode_dec_link_count(new_inode);
+		drop_nlink(new_inode);
+		nilfs_mark_inode_dirty(new_inode);
 	} else {
 		if (dir_de) {
 			err = -EMLINK;
 			if (new_dir->i_nlink >= NILFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
+		inc_nlink(old_inode);
 		err = nilfs_add_link(new_dentry, old_inode);
 		if (err) {
-			inode_dec_link_count(old_inode);
+			drop_nlink(old_inode);
+			nilfs_mark_inode_dirty(old_inode);
 			goto out_dir;
 		}
-		if (dir_de)
-			inode_inc_link_count(new_dir);
+		if (dir_de) {
+			inc_nlink(new_dir);
+			nilfs_mark_inode_dirty(new_dir);
+		}
 	}
 
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME;
 
 	nilfs_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	drop_nlink(old_inode);
 
 	if (dir_de) {
 		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
-		inode_dec_link_count(old_dir);
+		drop_nlink(old_dir);
 	}
+	nilfs_mark_inode_dirty(old_dir);
+	nilfs_mark_inode_dirty(old_inode);
 
 	err = nilfs_transaction_commit(old_dir->i_sb);
 	return err;
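
For context on the link-count changes above: inode_inc_link_count() and
inode_dec_link_count() are thin VFS wrappers, roughly the following (see
include/linux/fs.h of this era), so open-coding them as inc_nlink()/drop_nlink()
lets nilfs2 decide where the explicit nilfs_mark_inode_dirty() calls go instead
of relying on the generic mark_inode_dirty().

static inline void inode_inc_link_count(struct inode *inode)
{
	inc_nlink(inode);
	mark_inode_dirty(inode);
}

static inline void inode_dec_link_count(struct inode *inode)
{
	drop_nlink(inode);
	mark_inode_dirty(inode);
}
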
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 6dc8359..c9c96c7 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -770,14 +770,8 @@
 		nilfs_finish_roll_forward(nilfs, sbi, ri);
 	}
 
-	nilfs_detach_checkpoint(sbi);
-	return 0;
-
  failed:
 	nilfs_detach_checkpoint(sbi);
-	nilfs_mdt_clear(nilfs->ns_cpfile);
-	nilfs_mdt_clear(nilfs->ns_sufile);
-	nilfs_mdt_clear(nilfs->ns_dat);
 	return err;
 }
 
@@ -804,6 +798,7 @@
 	struct nilfs_segsum_info ssi;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end; /* range of full segment (block number) */
+	sector_t b, end;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -819,6 +814,11 @@
 	/* Calculate range of segment */
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
+	/* Read ahead segment */
+	b = seg_start;
+	while (b <= seg_end)
+		sb_breadahead(sbi->s_super, b++);
+
 	for (;;) {
 		/* Load segment summary */
 		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
@@ -841,14 +841,20 @@
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
+		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+			/* This will never happen because a superblock
+			   (last_segment) always points to a pseg
+			   having a super root. */
+			ret = NILFS_SEG_FAIL_CONSISTENCY;
+			goto failed;
+		}
+
+		if (pseg_start == seg_start) {
+			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
+			while (b <= end)
+				sb_breadahead(sbi->s_super, b++);
+		}
 		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!scan_newer) {
-				/* This will never happen because a superblock
-				   (last_segment) always points to a pseg
-				   having a super root. */
-				ret = NILFS_SEG_FAIL_CONSISTENCY;
-				goto failed;
-			}
 			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
@@ -919,7 +925,7 @@
 
  super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
-	list_splice(&segments, ri->ri_used_segments.prev);
+	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
 	nilfs->ns_last_cno = ri->ri_cno;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e6d9e37..645c786 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -24,10 +24,22 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/crc32.h>
+#include <linux/backing-dev.h>
 #include "page.h"
 #include "segbuf.h"
 
 
+struct nilfs_write_info {
+	struct the_nilfs       *nilfs;
+	struct bio	       *bio;
+	int 			start, end; /* The region to be submitted */
+	int			rest_blocks;
+	int			max_pages;
+	int			nr_vecs;
+	sector_t		blocknr;
+};
+
+
 static struct kmem_cache *nilfs_segbuf_cachep;
 
 static void nilfs_segbuf_init_once(void *obj)
@@ -63,6 +75,11 @@
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+
+	init_completion(&segbuf->sb_bio_event);
+	atomic_set(&segbuf->sb_err, 0);
+	segbuf->sb_nbio = 0;
+
 	return segbuf;
 }
 
@@ -83,6 +100,22 @@
 		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
 }
 
+/**
+ * nilfs_segbuf_map_cont - map a new log behind a given log
+ * @segbuf: new segment buffer
+ * @prev: segment buffer containing a log to be continued
+ */
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev)
+{
+	segbuf->sb_segnum = prev->sb_segnum;
+	segbuf->sb_fseg_start = prev->sb_fseg_start;
+	segbuf->sb_fseg_end = prev->sb_fseg_end;
+	segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks;
+	segbuf->sb_rest_blocks =
+		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
+
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
 				  __u64 nextnum, struct the_nilfs *nilfs)
 {
@@ -132,8 +165,6 @@
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
-
-	segbuf->sb_io_error = 0;
 	return 0;
 }
 
@@ -219,7 +250,7 @@
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
 
-void nilfs_release_buffers(struct list_head *list)
+static void nilfs_release_buffers(struct list_head *list)
 {
 	struct buffer_head *bh, *n;
 
@@ -241,13 +272,56 @@
 	}
 }
 
+static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+{
+	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+}
+
+/*
+ * Iterators for segment buffers
+ */
+void nilfs_clear_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, logs, sb_list)
+		nilfs_segbuf_clear(segbuf);
+}
+
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last)
+{
+	struct nilfs_segment_buffer *n, *segbuf;
+
+	segbuf = list_prepare_entry(last, logs, sb_list);
+	list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) {
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_clear(segbuf);
+		nilfs_segbuf_free(segbuf);
+	}
+}
+
+int nilfs_wait_on_logs(struct list_head *logs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int err;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		err = nilfs_segbuf_wait(segbuf);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 /*
  * BIO operations
  */
 static void nilfs_end_bio_write(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct nilfs_write_info *wi = bio->bi_private;
+	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
@@ -256,21 +330,22 @@
 	}
 
 	if (!uptodate)
-		atomic_inc(&wi->err);
+		atomic_inc(&segbuf->sb_err);
 
 	bio_put(bio);
-	complete(&wi->bio_event);
+	complete(&segbuf->sb_bio_event);
 }
 
-static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
+static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
+				   struct nilfs_write_info *wi, int mode)
 {
 	struct bio *bio = wi->bio;
 	int err;
 
-	if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
-		wait_for_completion(&wi->bio_event);
-		wi->nbio--;
-		if (unlikely(atomic_read(&wi->err))) {
+	if (segbuf->sb_nbio > 0 && bdi_write_congested(wi->nilfs->ns_bdi)) {
+		wait_for_completion(&segbuf->sb_bio_event);
+		segbuf->sb_nbio--;
+		if (unlikely(atomic_read(&segbuf->sb_err))) {
 			bio_put(bio);
 			err = -EIO;
 			goto failed;
@@ -278,7 +353,7 @@
 	}
 
 	bio->bi_end_io = nilfs_end_bio_write;
-	bio->bi_private = wi;
+	bio->bi_private = segbuf;
 	bio_get(bio);
 	submit_bio(mode, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
@@ -286,7 +361,7 @@
 		err = -EOPNOTSUPP;
 		goto failed;
 	}
-	wi->nbio++;
+	segbuf->sb_nbio++;
 	bio_put(bio);
 
 	wi->bio = NULL;
@@ -301,17 +376,15 @@
 }
 
 /**
- * nilfs_alloc_seg_bio - allocate a bio for writing segment.
- * @sb: super block
- * @start: beginning disk block number of this BIO.
+ * nilfs_alloc_seg_bio - allocate a new bio for writing log
+ * @nilfs: nilfs object
+ * @start: start block number of the bio
  * @nr_vecs: request size of page vector.
  *
- * alloc_seg_bio() allocates a new BIO structure and initialize it.
- *
  * Return Value: On success, pointer to the struct bio is returned.
  * On error, NULL is returned.
  */
-static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
+static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
 				       int nr_vecs)
 {
 	struct bio *bio;
@@ -322,36 +395,33 @@
 			bio = bio_alloc(GFP_NOIO, nr_vecs);
 	}
 	if (likely(bio)) {
-		bio->bi_bdev = sb->s_bdev;
-		bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
+		bio->bi_bdev = nilfs->ns_bdev;
+		bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
 	}
 	return bio;
 }
 
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
-				struct nilfs_write_info *wi)
+static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
+				       struct nilfs_write_info *wi)
 {
 	wi->bio = NULL;
 	wi->rest_blocks = segbuf->sb_sum.nblocks;
-	wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
+	wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
 	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
 	wi->start = wi->end = 0;
-	wi->nbio = 0;
 	wi->blocknr = segbuf->sb_pseg_start;
-
-	atomic_set(&wi->err, 0);
-	init_completion(&wi->bio_event);
 }
 
-static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
-			   int mode)
+static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
+				  struct nilfs_write_info *wi,
+				  struct buffer_head *bh, int mode)
 {
 	int len, err;
 
 	BUG_ON(wi->nr_vecs <= 0);
  repeat:
 	if (!wi->bio) {
-		wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
+		wi->bio = nilfs_alloc_seg_bio(wi->nilfs, wi->blocknr + wi->end,
 					      wi->nr_vecs);
 		if (unlikely(!wi->bio))
 			return -ENOMEM;
@@ -363,76 +433,83 @@
 		return 0;
 	}
 	/* bio is FULL */
-	err = nilfs_submit_seg_bio(wi, mode);
+	err = nilfs_segbuf_submit_bio(segbuf, wi, mode);
 	/* never submit current bh */
 	if (likely(!err))
 		goto repeat;
 	return err;
 }
 
+/**
+ * nilfs_segbuf_write - submit write requests of a log
+ * @segbuf: buffer storing a log to be written
+ * @nilfs: nilfs object
+ *
+ * Return Value: On Success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
 int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct nilfs_write_info *wi)
+		       struct the_nilfs *nilfs)
 {
+	struct nilfs_write_info wi;
 	struct buffer_head *bh;
-	int res, rw = WRITE;
+	int res = 0, rw = WRITE;
+
+	wi.nilfs = nilfs;
+	nilfs_segbuf_prepare_write(segbuf, &wi);
 
 	list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
-		res = nilfs_submit_bh(wi, bh, rw);
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
 		if (unlikely(res))
 			goto failed_bio;
 	}
 
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		res = nilfs_submit_bh(wi, bh, rw);
+		res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw);
 		if (unlikely(res))
 			goto failed_bio;
 	}
 
-	if (wi->bio) {
+	if (wi.bio) {
 		/*
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
 		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
-		res = nilfs_submit_seg_bio(wi, rw);
-		if (unlikely(res))
-			goto failed_bio;
+		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
-	res = 0;
- out:
-	return res;
-
  failed_bio:
-	atomic_inc(&wi->err);
-	goto out;
+	return res;
 }
 
 /**
  * nilfs_segbuf_wait - wait for completion of requested BIOs
- * @wi: nilfs_write_info
+ * @segbuf: segment buffer
  *
  * Return Value: On Success, 0 is returned. On Error, one of the following
  * negative error code is returned.
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
-		      struct nilfs_write_info *wi)
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
-	if (!wi->nbio)
+	if (!segbuf->sb_nbio)
 		return 0;
 
 	do {
-		wait_for_completion(&wi->bio_event);
-	} while (--wi->nbio > 0);
+		wait_for_completion(&segbuf->sb_bio_event);
+	} while (--segbuf->sb_nbio > 0);
 
-	if (unlikely(atomic_read(&wi->err) > 0)) {
+	if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
 		printk(KERN_ERR "NILFS: IO error writing segment\n");
 		err = -EIO;
-		segbuf->sb_io_error = 1;
 	}
 	return err;
 }
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 0c3076f..6af1630 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -27,7 +27,6 @@
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
 #include <linux/completion.h>
-#include <linux/backing-dev.h>
 
 /**
  * struct nilfs_segsum_info - On-memory segment summary
@@ -77,7 +76,9 @@
  * @sb_rest_blocks: Number of residual blocks in the current segment
  * @sb_segsum_buffers: List of buffers for segment summaries
  * @sb_payload_buffers: List of buffers for segment payload
- * @sb_io_error: I/O error status
+ * @sb_nbio: Number of flying bio requests
+ * @sb_err: I/O error status
+ * @sb_bio_event: Completion event of log writing
  */
 struct nilfs_segment_buffer {
 	struct super_block     *sb_super;
@@ -96,7 +97,9 @@
 	struct list_head	sb_payload_buffers; /* including super root */
 
 	/* io status */
-	int			sb_io_error;
+	int			sb_nbio;
+	atomic_t		sb_err;
+	struct completion	sb_bio_event;
 };
 
 #define NILFS_LIST_SEGBUF(head)  \
@@ -125,6 +128,8 @@
 void nilfs_segbuf_free(struct nilfs_segment_buffer *);
 void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
 		      struct the_nilfs *);
+void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
+			   struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
@@ -161,41 +166,18 @@
 	segbuf->sb_sum.nfileblk++;
 }
 
-void nilfs_release_buffers(struct list_head *);
+int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+		       struct the_nilfs *nilfs);
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+void nilfs_clear_logs(struct list_head *logs);
+void nilfs_truncate_logs(struct list_head *logs,
+			 struct nilfs_segment_buffer *last);
+int nilfs_wait_on_logs(struct list_head *logs);
+
+static inline void nilfs_destroy_logs(struct list_head *logs)
 {
-	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
-	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+	nilfs_truncate_logs(logs, NULL);
 }
 
-struct nilfs_write_info {
-	struct bio	       *bio;
-	int 			start, end; /* The region to be submitted */
-	int			rest_blocks;
-	int			max_pages;
-	int			nr_vecs;
-	sector_t		blocknr;
-
-	int			nbio;
-	atomic_t		err;
-	struct completion	bio_event;
-				/* completion event of segment write */
-
-	/*
-	 * The following fields must be set explicitly
-	 */
-	struct super_block     *sb;
-	struct backing_dev_info *bdi; /* backing dev info */
-	struct buffer_head     *bh_sr;
-};
-
-
-void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
-				struct nilfs_write_info *);
-int nilfs_segbuf_write(struct nilfs_segment_buffer *,
-		       struct nilfs_write_info *);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
-		      struct nilfs_write_info *);
-
 #endif /* _NILFS_SEGBUF_H */
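
The new list-level helpers above operate on a whole chain of logs;
nilfs_destroy_logs() is simply a truncation down to nothing.  A sketch of
the shape nilfs_wait_on_logs() is expected to take (assumption: its body
lives in segbuf.c and is not shown in this hunk):

	int nilfs_wait_on_logs(struct list_head *logs)
	{
		struct nilfs_segment_buffer *segbuf;
		int err;

		list_for_each_entry(segbuf, logs, sb_list) {
			err = nilfs_segbuf_wait(segbuf);
			if (err)
				return err;
		}
		return 0;
	}
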
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6eff66a..17584c5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -974,12 +974,12 @@
 			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
 	raw_sr->sr_flags = 0;
 
-	nilfs_mdt_write_inode_direct(
-		nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
-	nilfs_mdt_write_inode_direct(
-		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
-	nilfs_mdt_write_inode_direct(
-		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
+	nilfs_write_inode_common(nilfs_dat_inode(nilfs), (void *)raw_sr +
+				 NILFS_SR_DAT_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
+				 NILFS_SR_CPFILE_OFFSET(isz), 1);
+	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
+				 NILFS_SR_SUFILE_OFFSET(isz), 1);
 }
 
 static void nilfs_redirty_inodes(struct list_head *head)
@@ -1273,73 +1273,75 @@
 	return err;
 }
 
-static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
-{
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
-	int err;
-
-	err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
-	if (unlikely(err))
-		return err;
-	nilfs_mdt_mark_buffer_dirty(bh_su);
-	nilfs_mdt_mark_dirty(sufile);
-	nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
-	return 0;
-}
-
+/**
+ * nilfs_segctor_begin_construction - setup segment buffer to make a new log
+ * @sci: nilfs_sc_info
+ * @nilfs: nilfs object
+ */
 static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
 					    struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf, *n;
+	struct nilfs_segment_buffer *segbuf, *prev;
 	__u64 nextnum;
-	int err;
+	int err, alloc = 0;
 
-	if (list_empty(&sci->sc_segbufs)) {
-		segbuf = nilfs_segbuf_new(sci->sc_super);
-		if (unlikely(!segbuf))
-			return -ENOMEM;
-		list_add(&segbuf->sb_list, &sci->sc_segbufs);
-	} else
-		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	segbuf = nilfs_segbuf_new(sci->sc_super);
+	if (unlikely(!segbuf))
+		return -ENOMEM;
 
-	nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
-			 nilfs);
+	if (list_empty(&sci->sc_write_logs)) {
+		nilfs_segbuf_map(segbuf, nilfs->ns_segnum,
+				 nilfs->ns_pseg_offset, nilfs);
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_shift_to_next_segment(nilfs);
+			nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+		}
 
-	if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
-		nilfs_shift_to_next_segment(nilfs);
-		nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
-	}
-	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
-
-	err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
-	if (unlikely(err))
-		return err;
-
-	if (nilfs->ns_segnum == nilfs->ns_nextnum) {
-		/* Start from the head of a new full segment */
-		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
-		if (unlikely(err))
-			return err;
-	} else
+		segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
 		nextnum = nilfs->ns_nextnum;
 
-	segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+		if (nilfs->ns_segnum == nilfs->ns_nextnum)
+			/* Start from the head of a new full segment */
+			alloc++;
+	} else {
+		/* Continue logs */
+		prev = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
+		nilfs_segbuf_map_cont(segbuf, prev);
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq;
+		nextnum = prev->sb_nextnum;
+
+		if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+			nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+			segbuf->sb_sum.seg_seq++;
+			alloc++;
+		}
+	}
+
+	err = nilfs_sufile_mark_dirty(nilfs->ns_sufile, segbuf->sb_segnum);
+	if (err)
+		goto failed;
+
+	if (alloc) {
+		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
+		if (err)
+			goto failed;
+	}
 	nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
 
-	/* truncating segment buffers */
-	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
-					  sb_list) {
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
-	}
+	BUG_ON(!list_empty(&sci->sc_segbufs));
+	list_add_tail(&segbuf->sb_list, &sci->sc_segbufs);
+	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
 	return 0;
+
+ failed:
+	nilfs_segbuf_free(segbuf);
+	return err;
 }
 
 static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
 					 struct the_nilfs *nilfs, int nadd)
 {
-	struct nilfs_segment_buffer *segbuf, *prev, *n;
+	struct nilfs_segment_buffer *segbuf, *prev;
 	struct inode *sufile = nilfs->ns_sufile;
 	__u64 nextnextnum;
 	LIST_HEAD(list);
@@ -1352,7 +1354,7 @@
 	 * not be dirty.  The following call ensures that the buffer is dirty
 	 * and will pin the buffer on memory until the sufile is written.
 	 */
-	err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+	err = nilfs_sufile_mark_dirty(sufile, prev->sb_nextnum);
 	if (unlikely(err))
 		return err;
 
@@ -1378,33 +1380,33 @@
 		list_add_tail(&segbuf->sb_list, &list);
 		prev = segbuf;
 	}
-	list_splice(&list, sci->sc_segbufs.prev);
+	list_splice_tail(&list, &sci->sc_segbufs);
 	return 0;
 
  failed_segbuf:
 	nilfs_segbuf_free(segbuf);
  failed:
-	list_for_each_entry_safe(segbuf, n, &list, sb_list) {
+	list_for_each_entry(segbuf, &list, sb_list) {
 		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret); /* never fails */
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
 	}
+	nilfs_destroy_logs(&list);
 	return err;
 }
 
-static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
-						   struct the_nilfs *nilfs)
+static void nilfs_free_incomplete_logs(struct list_head *logs,
+				       struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf;
-	int ret, done = 0;
+	struct nilfs_segment_buffer *segbuf, *prev;
+	struct inode *sufile = nilfs->ns_sufile;
+	int ret;
 
-	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	segbuf = NILFS_FIRST_SEGBUF(logs);
 	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
-		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret); /* never fails */
 	}
-	if (segbuf->sb_io_error) {
+	if (atomic_read(&segbuf->sb_err)) {
 		/* Case 1: The first segment failed */
 		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
 			/* Case 1a:  Partial segment appended into an existing
@@ -1413,106 +1415,54 @@
 						segbuf->sb_fseg_end);
 		else /* Case 1b:  New full segment */
 			set_nilfs_discontinued(nilfs);
-		done++;
 	}
 
-	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
-		WARN_ON(ret); /* never fails */
-		if (!done && segbuf->sb_io_error) {
-			if (segbuf->sb_segnum != nilfs->ns_nextnum)
-				/* Case 2: extended segment (!= next) failed */
-				nilfs_sufile_set_error(nilfs->ns_sufile,
-						       segbuf->sb_segnum);
-			done++;
+	prev = segbuf;
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		if (prev->sb_nextnum != segbuf->sb_nextnum) {
+			ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+			WARN_ON(ret); /* never fails */
 		}
+		if (atomic_read(&segbuf->sb_err) &&
+		    segbuf->sb_segnum != nilfs->ns_nextnum)
+			/* Case 2: extended segment (!= next) failed */
+			nilfs_sufile_set_error(sufile, segbuf->sb_segnum);
+		prev = segbuf;
 	}
 }
 
-static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
-		nilfs_segbuf_clear(segbuf);
-	sci->sc_super_root = NULL;
-}
-
-static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
-{
-	struct nilfs_segment_buffer *segbuf;
-
-	while (!list_empty(&sci->sc_segbufs)) {
-		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
-		list_del_init(&segbuf->sb_list);
-		nilfs_segbuf_free(segbuf);
-	}
-	/* sci->sc_curseg = NULL; */
-}
-
-static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
-					   struct the_nilfs *nilfs, int err)
-{
-	if (unlikely(err)) {
-		nilfs_segctor_free_incomplete_segments(sci, nilfs);
-		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
-			int ret;
-
-			ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
-							sci->sc_freesegs,
-							sci->sc_nfreesegs,
-							NULL);
-			WARN_ON(ret); /* do not happen */
-		}
-	}
-	nilfs_segctor_clear_segment_buffers(sci);
-}
-
 static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
 					  struct inode *sufile)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
 	unsigned long live_blocks;
 	int ret;
 
 	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-						     &raw_su, &bh_su);
-		WARN_ON(ret); /* always succeed because bh_su is dirty */
 		live_blocks = segbuf->sb_sum.nblocks +
 			(segbuf->sb_pseg_start - segbuf->sb_fseg_start);
-		raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
-		raw_su->su_nblocks = cpu_to_le32(live_blocks);
-		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
-					       bh_su);
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     live_blocks,
+						     sci->sc_seg_ctime);
+		WARN_ON(ret); /* always succeed because the segusage is dirty */
 	}
 }
 
-static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
-					  struct inode *sufile)
+static void nilfs_cancel_segusage(struct list_head *logs, struct inode *sufile)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct buffer_head *bh_su;
-	struct nilfs_segment_usage *raw_su;
 	int ret;
 
-	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
-	ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-					     &raw_su, &bh_su);
-	WARN_ON(ret); /* always succeed because bh_su is dirty */
-	raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
-					 segbuf->sb_fseg_start);
-	nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
+	segbuf = NILFS_FIRST_SEGBUF(logs);
+	ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+					     segbuf->sb_pseg_start -
+					     segbuf->sb_fseg_start, 0);
+	WARN_ON(ret); /* always succeed because the segusage is dirty */
 
-	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
-						     &raw_su, &bh_su);
+	list_for_each_entry_continue(segbuf, logs, sb_list) {
+		ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum,
+						     0, 0);
 		WARN_ON(ret); /* always succeed */
-		raw_su->su_nblocks = 0;
-		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
-					       bh_su);
 	}
 }
 
@@ -1520,17 +1470,15 @@
 					    struct nilfs_segment_buffer *last,
 					    struct inode *sufile)
 {
-	struct nilfs_segment_buffer *segbuf = last, *n;
+	struct nilfs_segment_buffer *segbuf = last;
 	int ret;
 
-	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
-					  sb_list) {
-		list_del_init(&segbuf->sb_list);
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
 		sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
 		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
 		WARN_ON(ret);
-		nilfs_segbuf_free(segbuf);
 	}
+	nilfs_truncate_logs(&sci->sc_segbufs, last);
 }
 
 
@@ -1569,7 +1517,7 @@
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_segctor_clear_segment_buffers(sci);
+		nilfs_clear_logs(&sci->sc_segbufs);
 
 		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
 		if (unlikely(err))
@@ -1814,26 +1762,18 @@
 }
 
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
-			       struct backing_dev_info *bdi)
+			       struct the_nilfs *nilfs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	struct nilfs_write_info wi;
-	int err, res;
-
-	wi.sb = sci->sc_super;
-	wi.bh_sr = sci->sc_super_root;
-	wi.bdi = bdi;
+	int ret = 0;
 
 	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		nilfs_segbuf_prepare_write(segbuf, &wi);
-		err = nilfs_segbuf_write(segbuf, &wi);
-
-		res = nilfs_segbuf_wait(segbuf, &wi);
-		err = err ? : res;
-		if (err)
-			return err;
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
 	}
-	return 0;
+	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
+	return ret;
 }
 
 static void __nilfs_end_page_io(struct page *page, int err)
@@ -1911,15 +1851,17 @@
 	}
 }
 
-static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
-				      struct page *failed_page, int err)
+static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
+			     struct buffer_head *bh_sr, int err)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
+	struct buffer_head *bh;
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		struct buffer_head *bh;
+	if (list_empty(logs))
+		return;
 
+	list_for_each_entry(segbuf, logs, sb_list) {
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
 				    b_assoc_buffers) {
 			if (bh->b_page != bd_page) {
@@ -1931,7 +1873,7 @@
 
 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
 				    b_assoc_buffers) {
-			if (bh == sci->sc_super_root) {
+			if (bh == bh_sr) {
 				if (bh->b_page != bd_page) {
 					end_page_writeback(bd_page);
 					bd_page = bh->b_page;
@@ -1941,7 +1883,7 @@
 			if (bh->b_page != fs_page) {
 				nilfs_end_page_io(fs_page, err);
 				if (fs_page && fs_page == failed_page)
-					goto done;
+					return;
 				fs_page = bh->b_page;
 			}
 		}
@@ -1950,8 +1892,34 @@
 		end_page_writeback(bd_page);
 
 	nilfs_end_page_io(fs_page, err);
- done:
+}
+
+static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs, int err)
+{
+	LIST_HEAD(logs);
+	int ret;
+
+	list_splice_tail_init(&sci->sc_write_logs, &logs);
+	ret = nilfs_wait_on_logs(&logs);
+	if (ret)
+		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+
+	list_splice_tail_init(&sci->sc_segbufs, &logs);
+	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
+	nilfs_free_incomplete_logs(&logs, nilfs);
 	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
+
+	if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+		ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						sci->sc_freesegs,
+						sci->sc_nfreesegs,
+						NULL);
+		WARN_ON(ret); /* do not happen */
+	}
+
+	nilfs_destroy_logs(&logs);
+	sci->sc_super_root = NULL;
 }
 
 static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1973,7 +1941,7 @@
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int update_sr = (sci->sc_super_root != NULL);
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
 		struct buffer_head *bh;
 
 		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
@@ -2046,7 +2014,7 @@
 
 	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
 
-	segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	segbuf = NILFS_LAST_SEGBUF(&sci->sc_write_logs);
 	nilfs_set_next_segment(nilfs, segbuf);
 
 	if (update_sr) {
@@ -2057,10 +2025,23 @@
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 		set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+		nilfs_segctor_clear_metadata_dirty(sci);
 	} else
 		clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
 }
 
+static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
+{
+	int ret;
+
+	ret = nilfs_wait_on_logs(&sci->sc_write_logs);
+	if (!ret) {
+		nilfs_segctor_complete_write(sci);
+		nilfs_destroy_logs(&sci->sc_write_logs);
+	}
+	return ret;
+}
+
 static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 					struct nilfs_sb_info *sbi)
 {
@@ -2173,7 +2154,7 @@
 		/* Avoid empty segment */
 		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
 		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
-			nilfs_segctor_end_construction(sci, nilfs, 1);
+			nilfs_segctor_abort_construction(sci, nilfs, 1);
 			goto out;
 		}
 
@@ -2187,7 +2168,7 @@
 		if (has_sr) {
 			err = nilfs_segctor_fill_in_checkpoint(sci);
 			if (unlikely(err))
-				goto failed_to_make_up;
+				goto failed_to_write;
 
 			nilfs_segctor_fill_in_super_root(sci, nilfs);
 		}
@@ -2195,42 +2176,46 @@
 
 		/* Write partial segments */
 		err = nilfs_segctor_prepare_write(sci, &failed_page);
-		if (unlikely(err))
+		if (err) {
+			nilfs_abort_logs(&sci->sc_segbufs, failed_page,
+					 sci->sc_super_root, err);
 			goto failed_to_write;
-
+		}
 		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
 
-		err = nilfs_segctor_write(sci, nilfs->ns_bdi);
+		err = nilfs_segctor_write(sci, nilfs);
 		if (unlikely(err))
 			goto failed_to_write;
 
-		nilfs_segctor_complete_write(sci);
-
-		/* Commit segments */
-		if (has_sr)
-			nilfs_segctor_clear_metadata_dirty(sci);
-
-		nilfs_segctor_end_construction(sci, nilfs, 0);
-
+		if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+		    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+			/*
+			 * At this point, we avoid double buffering
+			 * for blocksize < pagesize because page dirty
+			 * flag is turned off during write and dirty
+			 * buffers are not properly collected for
+			 * pages crossing over segments.
+			 */
+			err = nilfs_segctor_wait(sci);
+			if (err)
+				goto failed_to_write;
+		}
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
+	sci->sc_super_root = NULL;
+
  out:
-	nilfs_segctor_destroy_segment_buffers(sci);
 	nilfs_segctor_check_out_files(sci, sbi);
 	return err;
 
  failed_to_write:
-	nilfs_segctor_abort_write(sci, failed_page, err);
-	nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
-
- failed_to_make_up:
 	if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
 		nilfs_redirty_inodes(&sci->sc_dirty_files);
 
  failed:
 	if (nilfs_doing_gc())
 		nilfs_redirty_inodes(&sci->sc_gc_inodes);
-	nilfs_segctor_end_construction(sci, nilfs, err);
+	nilfs_segctor_abort_construction(sci, nilfs, err);
 	goto out;
 }
 
@@ -2559,7 +2544,7 @@
 
 	sci->sc_freesegs = kbufs[4];
 	sci->sc_nfreesegs = argv[4].v_nmembs;
-	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
+	list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
 
 	for (;;) {
 		nilfs_segctor_accept(sci, &req);
@@ -2788,6 +2773,7 @@
 	spin_lock_init(&sci->sc_state_lock);
 	INIT_LIST_HEAD(&sci->sc_dirty_files);
 	INIT_LIST_HEAD(&sci->sc_segbufs);
+	INIT_LIST_HEAD(&sci->sc_write_logs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 
@@ -2855,6 +2841,7 @@
 	}
 
 	WARN_ON(!list_empty(&sci->sc_segbufs));
+	WARN_ON(!list_empty(&sci->sc_write_logs));
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
 
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 0d2a475..3d3ab2f 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -97,6 +97,7 @@
  * @sc_dsync_start: start byte offset of data pages
  * @sc_dsync_end: end byte offset of data pages (inclusive)
  * @sc_segbufs: List of segment buffers
+ * @sc_write_logs: List of segment buffers to hold logs under writing
  * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
  * @sc_curseg: Current segment buffer
  * @sc_super_root: Pointer to the super root buffer
@@ -143,6 +144,7 @@
 
 	/* Segment buffers */
 	struct list_head	sc_segbufs;
+	struct list_head	sc_write_logs;
 	unsigned long		sc_segbuf_nblocks;
 	struct nilfs_segment_buffer *sc_curseg;
 	struct buffer_head     *sc_super_root;
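
The segment constructor now keeps two lists: sc_segbufs collects buffers
while a log is being built, and sc_write_logs holds logs whose BIOs are in
flight, letting construction of the next log overlap the wait.  A condensed
sketch of the hand-off, using the static helpers added in segment.c above
(flush_logs is a hypothetical wrapper, shown only to illustrate the flow):

	static int flush_logs(struct nilfs_sc_info *sci, struct the_nilfs *nilfs)
	{
		int err;

		/* submit BIOs; buffers are spliced from sc_segbufs to sc_write_logs */
		err = nilfs_segctor_write(sci, nilfs);
		if (err)
			return err;

		/* wait on sc_write_logs, complete the write and free the logs */
		return nilfs_segctor_wait(sci);
	}
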
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 37994d4..b6c36d0 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -31,6 +31,16 @@
 #include "sufile.h"
 
 
+struct nilfs_sufile_info {
+	struct nilfs_mdt_info mi;
+	unsigned long ncleansegs;
+};
+
+static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
+{
+	return (struct nilfs_sufile_info *)NILFS_MDT(sufile);
+}
+
 static inline unsigned long
 nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
 {
@@ -62,14 +72,6 @@
 		     max - curr + 1);
 }
 
-static inline struct nilfs_sufile_header *
-nilfs_sufile_block_get_header(const struct inode *sufile,
-			      struct buffer_head *bh,
-			      void *kaddr)
-{
-	return kaddr + bh_offset(bh);
-}
-
 static struct nilfs_segment_usage *
 nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
 				     struct buffer_head *bh, void *kaddr)
@@ -110,6 +112,15 @@
 }
 
 /**
+ * nilfs_sufile_get_ncleansegs - return the number of clean segments
+ * @sufile: inode of segment usage file
+ */
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile)
+{
+	return NILFS_SUI(sufile)->ncleansegs;
+}
+
+/**
  * nilfs_sufile_updatev - modify multiple segment usages at a time
  * @sufile: inode of segment usage file
  * @segnumv: array of segment numbers
@@ -270,7 +281,7 @@
 	if (ret < 0)
 		goto out_sem;
 	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	header = kaddr + bh_offset(header_bh);
 	ncleansegs = le64_to_cpu(header->sh_ncleansegs);
 	last_alloc = le64_to_cpu(header->sh_last_alloc);
 	kunmap_atomic(kaddr, KM_USER0);
@@ -302,13 +313,13 @@
 			kunmap_atomic(kaddr, KM_USER0);
 
 			kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-			header = nilfs_sufile_block_get_header(
-				sufile, header_bh, kaddr);
+			header = kaddr + bh_offset(header_bh);
 			le64_add_cpu(&header->sh_ncleansegs, -1);
 			le64_add_cpu(&header->sh_ndirtysegs, 1);
 			header->sh_last_alloc = cpu_to_le64(segnum);
 			kunmap_atomic(kaddr, KM_USER0);
 
+			NILFS_SUI(sufile)->ncleansegs--;
 			nilfs_mdt_mark_buffer_dirty(header_bh);
 			nilfs_mdt_mark_buffer_dirty(su_bh);
 			nilfs_mdt_mark_dirty(sufile);
@@ -351,6 +362,8 @@
 	kunmap_atomic(kaddr, KM_USER0);
 
 	nilfs_sufile_mod_counter(header_bh, -1, 1);
+	NILFS_SUI(sufile)->ncleansegs--;
+
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -380,6 +393,8 @@
 	kunmap_atomic(kaddr, KM_USER0);
 
 	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+	NILFS_SUI(sufile)->ncleansegs -= clean;
+
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -409,58 +424,61 @@
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 
 	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+	NILFS_SUI(sufile)->ncleansegs++;
+
 	nilfs_mdt_mark_dirty(sufile);
 }
 
 /**
- * nilfs_sufile_get_segment_usage - get a segment usage
+ * nilfs_sufile_mark_dirty - mark the buffer having a segment usage dirty
  * @sufile: inode of segment usage file
  * @segnum: segment number
- * @sup: pointer to segment usage
- * @bhp: pointer to buffer head
- *
- * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
- * specified by @segnum.
- *
- * Return Value: On success, 0 is returned, and the segment usage and the
- * buffer head of the buffer on which the segment usage is located are stored
- * in the place pointed by @sup and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-EINVAL - Invalid segment usage number.
  */
-int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
-				   struct nilfs_segment_usage **sup,
-				   struct buffer_head **bhp)
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
+{
+	struct buffer_head *bh;
+	int ret;
+
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
+	if (!ret) {
+		nilfs_mdt_mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(sufile);
+		brelse(bh);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_sufile_set_segment_usage - set usage of a segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ * @nblocks: number of live blocks in the segment
+ * @modtime: modification time (optional)
+ */
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_usage *su;
 	void *kaddr;
 	int ret;
 
-	/* segnum is 0 origin */
-	if (segnum >= nilfs_sufile_get_nsegments(sufile))
-		return -EINVAL;
 	down_write(&NILFS_MDT(sufile)->mi_sem);
-	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap(bh->b_page);
-	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
-	if (nilfs_segment_usage_error(su)) {
-		kunmap(bh->b_page);
-		brelse(bh);
-		ret = -EINVAL;
-		goto out_sem;
-	}
 
-	if (sup != NULL)
-		*sup = su;
-	*bhp = bh;
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	WARN_ON(nilfs_segment_usage_error(su));
+	if (modtime)
+		su->su_lastmod = cpu_to_le64(modtime);
+	su->su_nblocks = cpu_to_le32(nblocks);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(sufile);
+	brelse(bh);
 
  out_sem:
 	up_write(&NILFS_MDT(sufile)->mi_sem);
@@ -468,23 +486,6 @@
 }
 
 /**
- * nilfs_sufile_put_segment_usage - put a segment usage
- * @sufile: inode of segment usage file
- * @segnum: segment number
- * @bh: buffer head
- *
- * Description: nilfs_sufile_put_segment_usage() releases the segment usage
- * specified by @segnum. @bh must be the buffer head which have been returned
- * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
- */
-void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
-				    struct buffer_head *bh)
-{
-	kunmap(bh->b_page);
-	brelse(bh);
-}
-
-/**
  * nilfs_sufile_get_stat - get segment usage statistics
  * @sufile: inode of segment usage file
  * @stat: pointer to a structure of segment usage statistics
@@ -515,7 +516,7 @@
 		goto out_sem;
 
 	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
-	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	header = kaddr + bh_offset(header_bh);
 	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
 	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
 	sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
@@ -532,33 +533,6 @@
 	return ret;
 }
 
-/**
- * nilfs_sufile_get_ncleansegs - get the number of clean segments
- * @sufile: inode of segment usage file
- * @nsegsp: pointer to the number of clean segments
- *
- * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
- * segments.
- *
- * Return Value: On success, 0 is returned and the number of clean segments is
- * stored in the place pointed by @nsegsp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
-{
-	struct nilfs_sustat sustat;
-	int ret;
-
-	ret = nilfs_sufile_get_stat(sufile, &sustat);
-	if (ret == 0)
-		*nsegsp = sustat.ss_ncleansegs;
-	return ret;
-}
-
 void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 			       struct buffer_head *header_bh,
 			       struct buffer_head *su_bh)
@@ -577,8 +551,10 @@
 	nilfs_segment_usage_set_error(su);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	if (suclean)
+	if (suclean) {
 		nilfs_sufile_mod_counter(header_bh, -1, 0);
+		NILFS_SUI(sufile)->ncleansegs--;
+	}
 	nilfs_mdt_mark_buffer_dirty(su_bh);
 	nilfs_mdt_mark_dirty(sufile);
 }
@@ -657,3 +633,48 @@
 	up_read(&NILFS_MDT(sufile)->mi_sem);
 	return ret;
 }
+
+/**
+ * nilfs_sufile_read - read sufile inode
+ * @sufile: sufile inode
+ * @raw_inode: on-disk sufile inode
+ */
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode)
+{
+	struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_read_inode_common(sufile, raw_inode);
+	if (ret < 0)
+		return ret;
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (!ret) {
+		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+		header = kaddr + bh_offset(header_bh);
+		sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(header_bh);
+	}
+	return ret;
+}
+
+/**
+ * nilfs_sufile_new - create sufile
+ * @nilfs: nilfs object
+ * @susize: size of a segment usage entry
+ */
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize)
+{
+	struct inode *sufile;
+
+	sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO,
+			       sizeof(struct nilfs_sufile_info));
+	if (sufile)
+		nilfs_mdt_set_entry_size(sufile, susize,
+					 sizeof(struct nilfs_sufile_header));
+	return sufile;
+}
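
For callers, the kmap-based get/put pair is replaced by single calls that
map, update and dirty the entry internally, as the segment.c hunks above
show.  A sketch of updating one segment usage entry with the new interface
(update_one_usage is a hypothetical name):

	static int update_one_usage(struct inode *sufile, __u64 segnum,
				    unsigned long live_blocks, time_t ctime)
	{
		int err;

		/* make sure the segusage buffer is dirty and pinned in memory */
		err = nilfs_sufile_mark_dirty(sufile, segnum);
		if (err)
			return err;

		/* map the entry, set su_lastmod/su_nblocks and mark it dirty */
		return nilfs_sufile_set_segment_usage(sufile, segnum,
						      live_blocks, ctime);
	}
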
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 0e99e5c..15163b8 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -34,14 +34,13 @@
 	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
 }
 
+unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
+
 int nilfs_sufile_alloc(struct inode *, __u64 *);
-int nilfs_sufile_get_segment_usage(struct inode *, __u64,
-				   struct nilfs_segment_usage **,
-				   struct buffer_head **);
-void nilfs_sufile_put_segment_usage(struct inode *, __u64,
-				    struct buffer_head *);
+int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
+int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
+				   unsigned long nblocks, time_t modtime);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
-int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
 ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
 				size_t);
 
@@ -62,6 +61,9 @@
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
+int nilfs_sufile_read(struct inode *sufile, struct nilfs_inode *raw_inode);
+struct inode *nilfs_sufile_new(struct the_nilfs *nilfs, size_t susize);
+
 /**
  * nilfs_sufile_scrap - make a segment garbage
  * @sufile: inode of segment usage file
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 644e667..5403b3e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -363,14 +363,10 @@
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
-	sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
+	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
 		return -ENOMEM;
 
-	err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
-	if (unlikely(err))
-		goto failed;
-
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
 					  &bh_cp);
@@ -411,7 +407,6 @@
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_mdt_clear(sbi->s_ifile);
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 	down_write(&nilfs->ns_super_sem);
@@ -419,22 +414,6 @@
 	up_write(&nilfs->ns_super_sem);
 }
 
-static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
-{
-	struct the_nilfs *nilfs = sbi->s_nilfs;
-	int err = 0;
-
-	down_write(&nilfs->ns_sem);
-	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
-		nilfs->ns_mount_state |= NILFS_VALID_FS;
-		err = nilfs_commit_super(sbi, 1);
-		if (likely(!err))
-			printk(KERN_INFO "NILFS: recovery complete.\n");
-	}
-	up_write(&nilfs->ns_sem);
-	return err;
-}
-
 static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -490,7 +469,7 @@
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 
 	if (!nilfs_test_opt(sbi, BARRIER))
-		seq_printf(seq, ",barrier=off");
+		seq_printf(seq, ",nobarrier");
 	if (nilfs_test_opt(sbi, SNAPSHOT))
 		seq_printf(seq, ",cp=%llu",
 			   (unsigned long long int)sbi->s_snapshot_cno);
@@ -500,6 +479,8 @@
 		seq_printf(seq, ",errors=panic");
 	if (nilfs_test_opt(sbi, STRICT_ORDER))
 		seq_printf(seq, ",order=strict");
+	if (nilfs_test_opt(sbi, NORECOVERY))
+		seq_printf(seq, ",norecovery");
 
 	return 0;
 }
@@ -568,7 +549,7 @@
 
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_barrier, Opt_snapshot, Opt_order,
+	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
 	Opt_err,
 };
 
@@ -576,25 +557,13 @@
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
-	{Opt_barrier, "barrier=%s"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
+	{Opt_norecovery, "norecovery"},
 	{Opt_err, NULL}
 };
 
-static int match_bool(substring_t *s, int *result)
-{
-	int len = s->to - s->from;
-
-	if (strncmp(s->from, "on", len) == 0)
-		*result = 1;
-	else if (strncmp(s->from, "off", len) == 0)
-		*result = 0;
-	else
-		return 1;
-	return 0;
-}
-
 static int parse_options(char *options, struct super_block *sb)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
@@ -612,13 +581,8 @@
 
 		token = match_token(p, tokens, args);
 		switch (token) {
-		case Opt_barrier:
-			if (match_bool(&args[0], &option))
-				return 0;
-			if (option)
-				nilfs_set_opt(sbi, BARRIER);
-			else
-				nilfs_clear_opt(sbi, BARRIER);
+		case Opt_nobarrier:
+			nilfs_clear_opt(sbi, BARRIER);
 			break;
 		case Opt_order:
 			if (strcmp(args[0].from, "relaxed") == 0)
@@ -647,6 +611,9 @@
 			sbi->s_snapshot_cno = option;
 			nilfs_set_opt(sbi, SNAPSHOT);
 			break;
+		case Opt_norecovery:
+			nilfs_set_opt(sbi, NORECOVERY);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -672,9 +639,7 @@
 	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
 
 	/* nilfs->sem must be locked by the caller. */
-	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
-		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
-	} else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
+	if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
 		printk(KERN_WARNING
 		       "NILFS warning: mounting fs with errors\n");
 #if 0
@@ -782,11 +747,10 @@
 	sb->s_root = NULL;
 	sb->s_time_gran = 1;
 
-	if (!nilfs_loaded(nilfs)) {
-		err = load_nilfs(nilfs, sbi);
-		if (err)
-			goto failed_sbi;
-	}
+	err = load_nilfs(nilfs, sbi);
+	if (err)
+		goto failed_sbi;
+
 	cno = nilfs_last_cno(nilfs);
 
 	if (sb->s_flags & MS_RDONLY) {
@@ -854,12 +818,6 @@
 		up_write(&nilfs->ns_sem);
 	}
 
-	err = nilfs_mark_recovery_complete(sbi);
-	if (unlikely(err)) {
-		printk(KERN_ERR "NILFS: recovery failed.\n");
-		goto failed_root;
-	}
-
 	down_write(&nilfs->ns_super_sem);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
 		nilfs->ns_current = sbi;
@@ -867,10 +825,6 @@
 
 	return 0;
 
- failed_root:
-	dput(sb->s_root);
-	sb->s_root = NULL;
-
  failed_segctor:
 	nilfs_detach_segment_constructor(sbi);
 
@@ -915,6 +869,14 @@
 		goto restore_opts;
 	}
 
+	if (!nilfs_valid_fs(nilfs)) {
+		printk(KERN_WARNING "NILFS (device %s): couldn't "
+		       "remount because the filesystem is in an "
+		       "incomplete recovery state.\n", sb->s_id);
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		goto out;
 	if (*flags & MS_RDONLY) {
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad391a8..6241e17 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -146,13 +146,9 @@
 
 	might_sleep();
 	if (nilfs_loaded(nilfs)) {
-		nilfs_mdt_clear(nilfs->ns_sufile);
 		nilfs_mdt_destroy(nilfs->ns_sufile);
-		nilfs_mdt_clear(nilfs->ns_cpfile);
 		nilfs_mdt_destroy(nilfs->ns_cpfile);
-		nilfs_mdt_clear(nilfs->ns_dat);
 		nilfs_mdt_destroy(nilfs->ns_dat);
-		/* XXX: how and when to clear nilfs->ns_gc_dat? */
 		nilfs_mdt_destroy(nilfs->ns_gc_dat);
 	}
 	if (nilfs_init(nilfs)) {
@@ -166,7 +162,6 @@
 static int nilfs_load_super_root(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi, sector_t sr_block)
 {
-	static struct lock_class_key dat_lock_key;
 	struct buffer_head *bh_sr;
 	struct nilfs_super_root *raw_sr;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
@@ -187,51 +182,36 @@
 	inode_size = nilfs->ns_inode_size;
 
 	err = -ENOMEM;
-	nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+	nilfs->ns_dat = nilfs_dat_new(nilfs, dat_entry_size);
 	if (unlikely(!nilfs->ns_dat))
 		goto failed;
 
-	nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
+	nilfs->ns_gc_dat = nilfs_dat_new(nilfs, dat_entry_size);
 	if (unlikely(!nilfs->ns_gc_dat))
 		goto failed_dat;
 
-	nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
+	nilfs->ns_cpfile = nilfs_cpfile_new(nilfs, checkpoint_size);
 	if (unlikely(!nilfs->ns_cpfile))
 		goto failed_gc_dat;
 
-	nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
+	nilfs->ns_sufile = nilfs_sufile_new(nilfs, segment_usage_size);
 	if (unlikely(!nilfs->ns_sufile))
 		goto failed_cpfile;
 
-	err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
-	if (unlikely(err))
-		goto failed_sufile;
-
-	err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
-	if (unlikely(err))
-		goto failed_sufile;
-
-	lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
-	lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
-
 	nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
-	nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
-				 sizeof(struct nilfs_cpfile_header));
-	nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
-				 sizeof(struct nilfs_sufile_header));
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
+	err = nilfs_dat_read(nilfs->ns_dat, (void *)bh_sr->b_data +
+			     NILFS_SR_DAT_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
+	err = nilfs_cpfile_read(nilfs->ns_cpfile, (void *)bh_sr->b_data +
+				NILFS_SR_CPFILE_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
-	err = nilfs_mdt_read_inode_direct(
-		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
+	err = nilfs_sufile_read(nilfs->ns_sufile, (void *)bh_sr->b_data +
+				NILFS_SR_SUFILE_OFFSET(inode_size));
 	if (unlikely(err))
 		goto failed_sufile;
 
@@ -281,29 +261,30 @@
 	struct nilfs_recovery_info ri;
 	unsigned int s_flags = sbi->s_super->s_flags;
 	int really_read_only = bdev_read_only(nilfs->ns_bdev);
-	unsigned valid_fs;
-	int err = 0;
+	int valid_fs = nilfs_valid_fs(nilfs);
+	int err;
+
+	if (nilfs_loaded(nilfs)) {
+		if (valid_fs ||
+		    ((s_flags & MS_RDONLY) && nilfs_test_opt(sbi, NORECOVERY)))
+			return 0;
+		printk(KERN_ERR "NILFS: the filesystem is in an incomplete "
+		       "recovery state.\n");
+		return -EINVAL;
+	}
+
+	if (!valid_fs) {
+		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+		if (s_flags & MS_RDONLY) {
+			printk(KERN_INFO "NILFS: INFO: recovery "
+			       "required for readonly filesystem.\n");
+			printk(KERN_INFO "NILFS: write access will "
+			       "be enabled during recovery.\n");
+		}
+	}
 
 	nilfs_init_recovery_info(&ri);
 
-	down_write(&nilfs->ns_sem);
-	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
-	up_write(&nilfs->ns_sem);
-
-	if (!valid_fs && (s_flags & MS_RDONLY)) {
-		printk(KERN_INFO "NILFS: INFO: recovery "
-		       "required for readonly filesystem.\n");
-		if (really_read_only) {
-			printk(KERN_ERR "NILFS: write access "
-			       "unavailable, cannot proceed.\n");
-			err = -EROFS;
-			goto failed;
-		}
-		printk(KERN_INFO "NILFS: write access will "
-		       "be enabled during recovery.\n");
-		sbi->s_super->s_flags &= ~MS_RDONLY;
-	}
-
 	err = nilfs_search_super_root(nilfs, sbi, &ri);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error searching super root.\n");
@@ -316,19 +297,56 @@
 		goto failed;
 	}
 
-	if (!valid_fs) {
-		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
-		if (unlikely(err)) {
-			nilfs_mdt_destroy(nilfs->ns_cpfile);
-			nilfs_mdt_destroy(nilfs->ns_sufile);
-			nilfs_mdt_destroy(nilfs->ns_dat);
-			goto failed;
+	if (valid_fs)
+		goto skip_recovery;
+
+	if (s_flags & MS_RDONLY) {
+		if (nilfs_test_opt(sbi, NORECOVERY)) {
+			printk(KERN_INFO "NILFS: norecovery option specified. "
+			       "skipping roll-forward recovery\n");
+			goto skip_recovery;
 		}
-		if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
-			sbi->s_super->s_dirt = 1;
+		if (really_read_only) {
+			printk(KERN_ERR "NILFS: write access "
+			       "unavailable, cannot proceed.\n");
+			err = -EROFS;
+			goto failed_unload;
+		}
+		sbi->s_super->s_flags &= ~MS_RDONLY;
+	} else if (nilfs_test_opt(sbi, NORECOVERY)) {
+		printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
+		       "option was specified for a read/write mount\n");
+		err = -EINVAL;
+		goto failed_unload;
 	}
 
+	err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+	if (err)
+		goto failed_unload;
+
+	down_write(&nilfs->ns_sem);
+	nilfs->ns_mount_state |= NILFS_VALID_FS;
+	nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+	err = nilfs_commit_super(sbi, 1);
+	up_write(&nilfs->ns_sem);
+
+	if (err) {
+		printk(KERN_ERR "NILFS: failed to update super block. "
+		       "recovery unfinished.\n");
+		goto failed_unload;
+	}
+	printk(KERN_INFO "NILFS: recovery complete.\n");
+
+ skip_recovery:
 	set_nilfs_loaded(nilfs);
+	nilfs_clear_recovery_info(&ri);
+	sbi->s_super->s_flags = s_flags;
+	return 0;
+
+ failed_unload:
+	nilfs_mdt_destroy(nilfs->ns_cpfile);
+	nilfs_mdt_destroy(nilfs->ns_sufile);
+	nilfs_mdt_destroy(nilfs->ns_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
@@ -632,30 +650,23 @@
 {
 	struct inode *dat = nilfs_dat_inode(nilfs);
 	unsigned long ncleansegs;
-	int err;
 
 	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
 	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	if (likely(!err))
-		*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
-	return err;
+	*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+	return 0;
 }
 
 int nilfs_near_disk_full(struct the_nilfs *nilfs)
 {
-	struct inode *sufile = nilfs->ns_sufile;
 	unsigned long ncleansegs, nincsegs;
-	int ret;
 
-	ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
-	if (likely(!ret)) {
-		nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
-			nilfs->ns_blocks_per_segment + 1;
-		if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
-			ret++;
-	}
-	return ret;
+	ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
+	nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+		nilfs->ns_blocks_per_segment + 1;
+
+	return ncleansegs <= nilfs->ns_nrsvsegs + nincsegs;
 }
 
 /**
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 20abd55..589786e 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -258,6 +258,16 @@
 		kfree(sbi);
 }
 
+static inline int nilfs_valid_fs(struct the_nilfs *nilfs)
+{
+	unsigned valid_fs;
+
+	down_read(&nilfs->ns_sem);
+	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+	up_read(&nilfs->ns_sem);
+	return valid_fs;
+}
+
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
 			sector_t *seg_start, sector_t *seg_end)
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index ce52040..3fe02cf 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -151,6 +151,8 @@
 #define NILFS_MOUNT_BARRIER		0x1000  /* Use block barriers */
 #define NILFS_MOUNT_STRICT_ORDER	0x2000  /* Apply strict in-order
 						   semantics also for data */
+#define NILFS_MOUNT_NORECOVERY		0x4000  /* Disable write access during
+						   mount-time recovery */
 
 
 /**
@@ -403,6 +405,28 @@
 #define NILFS_SS_GC     0x0010  /* segment written for cleaner operation */
 
 /**
+ * struct nilfs_btree_node - B-tree node
+ * @bn_flags: flags
+ * @bn_level: level
+ * @bn_nchildren: number of children
+ * @bn_pad: padding
+ */
+struct nilfs_btree_node {
+	__u8 bn_flags;
+	__u8 bn_level;
+	__le16 bn_nchildren;
+	__le32 bn_pad;
+};
+
+/* flags */
+#define NILFS_BTREE_NODE_ROOT   0x01
+
+/* level */
+#define NILFS_BTREE_LEVEL_DATA          0
+#define NILFS_BTREE_LEVEL_NODE_MIN      (NILFS_BTREE_LEVEL_DATA + 1)
+#define NILFS_BTREE_LEVEL_MAX           14
+
+/**
  * struct nilfs_palloc_group_desc - block group descriptor
  * @pg_nfrees: number of free entries in block group
  */