Btrfs: implement memory reclaim for leaf reference cache

The memory reclaiming issue happens when snapshot exists. In that
case, some cache entries may not be used during old snapshot dropping,
so they will remain in the cache until umount.

The patch adds a field to struct btrfs_leaf_ref to record create time. Besides,
the patch makes all dead roots of a given snapshot linked together in order of
create time. After a old snapshot was completely dropped, we check the dead
root list and remove all cache entries created before the oldest dead root in
the list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 245eb00..c479206 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3275,4 +3275,3 @@
 	}
 	return 1;
 }
-
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8342208..be16cd4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -666,7 +666,8 @@
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
 
-	spinlock_t orphan_lock;
+	spinlock_t list_lock;
+	struct list_head dead_list;
 	struct list_head orphan_list;
 };
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index eb4dd3d..1250946 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -340,4 +340,3 @@
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec1ba8d..e826730 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -735,8 +735,9 @@
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
+	INIT_LIST_HEAD(&root->dead_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->orphan_lock);
+	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
@@ -1717,7 +1718,7 @@
 		printk("btrfs: at umount reference cache size %Lu\n",
 			fs_info->total_ref_cache_size);
 	}
-	
+
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe1ddbd..37ca8df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -867,8 +867,8 @@
 		/*
 		 * For (parent_gen > 0 && parent_gen > ref_gen):
 		 *
-		 * we reach here through the oldest root, therefore 
-		 * all other reference from same snapshot should have 
+		 * we reach here through the oldest root, therefore
+		 * all other reference from same snapshot should have
 		 * a larger generation.
 		 */
 		if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
@@ -954,7 +954,7 @@
 			if (!eb)
 				continue;
 			extent_start = eb->start;
-		} else 
+		} else
 			extent_start = bytenr;
 
 		ret = get_reference_status(root, extent_start, ref_generation,
@@ -1048,7 +1048,7 @@
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
-		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
 		if (!ref) {
 			WARN_ON(1);
 			goto out;
@@ -1059,7 +1059,7 @@
 		ref->generation = btrfs_header_generation(buf);
 		ref->nritems = nr_file_extents;
 		info = ref->extents;
-		
+
 		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
@@ -1085,7 +1085,7 @@
 		BUG_ON(!root->ref_tree);
 		ret = btrfs_add_leaf_ref(root, ref);
 		WARN_ON(ret);
-		btrfs_free_leaf_ref(ref);
+		btrfs_free_leaf_ref(root, ref);
 	}
 out:
 	return 0;
@@ -2316,7 +2316,7 @@
 }
 
 static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-				  	   struct btrfs_root *root,
+					   struct btrfs_root *root,
 					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
@@ -2367,7 +2367,7 @@
 }
 
 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  	 struct btrfs_root *root,
+					 struct btrfs_root *root,
 					 struct btrfs_leaf_ref *ref)
 {
 	int i;
@@ -2521,7 +2521,7 @@
 				ret = drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(ref);
+				btrfs_free_leaf_ref(root, ref);
 				*level = 0;
 				break;
 			}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 964ec16..5368e3b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3497,4 +3497,3 @@
 	return ret;
 }
 EXPORT_SYMBOL(try_release_extent_buffer);
-
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index afe42d0..2311061 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -422,4 +422,3 @@
 	BUG_ON(ret);
 	return ret;
 }
-
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ded5281..412ab4a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1095,4 +1095,3 @@
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
-
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3aa82ce..7af8be0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -835,17 +835,17 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	/* already on the orphan list, we're good */
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	/*
 	 * insert an orphan item to track this unlinked/truncated file
@@ -864,20 +864,20 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_del_init(&BTRFS_I(inode)->i_orphan);
 	if (!trans) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
 
@@ -973,9 +973,9 @@
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->orphan_lock);
+		spin_lock(&root->list_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 
 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -3269,13 +3269,13 @@
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_lock(&BTRFS_I(inode)->root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d617c29..d43e14c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,4 +56,3 @@
 {
 	return mutex_is_locked(&eb->mutex);
 }
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 14d8637..f1374d5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -198,4 +198,3 @@
 		free_extent_buffer(next);
 	}
 }
-
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index ec95877..272b989 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,12 +21,18 @@
 #include "ref-cache.h"
 #include "transaction.h"
 
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents)
 {
 	struct btrfs_leaf_ref *ref;
+	size_t size = btrfs_leaf_ref_size(nr_extents);
 
-	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	ref = kmalloc(size, GFP_NOFS);
 	if (ref) {
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
 		memset(ref, 0, sizeof(*ref));
 		atomic_set(&ref->usage, 1);
 		INIT_LIST_HEAD(&ref->list);
@@ -34,14 +40,20 @@
 	return ref;
 }
 
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	if (!ref)
 		return;
 	WARN_ON(atomic_read(&ref->usage) == 0);
 	if (atomic_dec_and_test(&ref->usage)) {
+		size_t size = btrfs_leaf_ref_size(ref->nritems);
+
 		BUG_ON(ref->in_tree);
 		kfree(ref);
+
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size -= size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
 	}
 }
 
@@ -64,7 +76,7 @@
 		else
 			return parent;
 	}
-	
+
 	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
 	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
@@ -91,9 +103,8 @@
 	return NULL;
 }
 
-int btrfs_remove_leaf_refs(struct btrfs_root *root)
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen)
 {
-	struct rb_node *rb;
 	struct btrfs_leaf_ref *ref = NULL;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
@@ -101,17 +112,18 @@
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!btrfs_leaf_ref_tree_empty(tree)) {
-		rb = rb_first(&tree->root);
-		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+	while(!list_empty(&tree->list)) {
+		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+		BUG_ON(!ref->in_tree);
+		if (ref->root_gen > max_root_gen)
+			break;
+
 		rb_erase(&ref->rb_node, &tree->root);
 		ref->in_tree = 0;
 		list_del_init(&ref->list);
 
 		spin_unlock(&tree->lock);
-
-		btrfs_free_leaf_ref(ref);
-
+		btrfs_free_leaf_ref(root, ref);
 		cond_resched();
 		spin_lock(&tree->lock);
 	}
@@ -143,7 +155,6 @@
 {
 	int ret = 0;
 	struct rb_node *rb;
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	spin_lock(&tree->lock);
@@ -151,9 +162,6 @@
 	if (rb) {
 		ret = -EEXIST;
 	} else {
-		spin_lock(&root->fs_info->ref_cache_lock);
-		root->fs_info->total_ref_cache_size += size;
-		spin_unlock(&root->fs_info->ref_cache_lock);
 		atomic_inc(&ref->usage);
 		list_add_tail(&ref->list, &tree->list);
 	}
@@ -163,15 +171,10 @@
 
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
-	
-	spin_lock(&root->fs_info->ref_cache_lock);
-	root->fs_info->total_ref_cache_size -= size;
-	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	rb_erase(&ref->rb_node, &tree->root);
 	ref->in_tree = 0;
@@ -179,7 +182,6 @@
 
 	spin_unlock(&tree->lock);
 
-	btrfs_free_leaf_ref(ref);
+	btrfs_free_leaf_ref(root, ref);
 	return 0;
 }
-
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 823c049..c361b32 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -30,6 +30,7 @@
 	int in_tree;
 	atomic_t usage;
 
+	u64 root_gen;
 	u64 bytenr;
 	u64 owner;
 	u64 generation;
@@ -41,14 +42,13 @@
 
 static inline size_t btrfs_leaf_ref_size(int nr_extents)
 {
-	return sizeof(struct btrfs_leaf_ref) + 
+	return sizeof(struct btrfs_leaf_ref) +
 	       sizeof(struct btrfs_extent_info) * nr_extents;
 }
 
 static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
 {
 	tree->root.rb_node = NULL;
-	tree->last = NULL;
 	INIT_LIST_HEAD(&tree->list);
 	spin_lock_init(&tree->lock);
 }
@@ -59,12 +59,13 @@
 }
 
 void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-int btrfs_remove_leaf_refs(struct btrfs_root *root);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 216f315..52c5524 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -98,20 +98,24 @@
 			BUG_ON(!dirty);
 			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
 			BUG_ON(!dirty->root);
-
 			dirty->latest_root = root;
 			INIT_LIST_HEAD(&dirty->list);
 
 			root->commit_root = btrfs_root_node(root);
-			root->dirty_root = dirty;
 
 			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->ref_tree = &root->ref_tree_struct;
-
 			spin_lock_init(&dirty->root->node_lock);
+			spin_lock_init(&dirty->root->list_lock);
 			mutex_init(&dirty->root->objectid_mutex);
+			INIT_LIST_HEAD(&dirty->root->dead_list);
 			dirty->root->node = root->commit_root;
 			dirty->root->commit_root = NULL;
+
+			spin_lock(&root->list_lock);
+			list_add(&dirty->root->dead_list, &root->dead_list);
+			spin_unlock(&root->list_lock);
+
+			root->dirty_root = dirty;
 		} else {
 			WARN_ON(1);
 		}
@@ -356,8 +360,6 @@
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		update_cowonly_root(trans, root);
-		if (root->fs_info->closing)
-			btrfs_remove_leaf_refs(root);
 	}
 	return 0;
 }
@@ -410,7 +412,11 @@
 
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
-				
+
+				spin_lock(&root->list_lock);
+				list_del_init(&dirty->root->dead_list);
+				spin_unlock(&root->list_lock);
+
 				kfree(dirty->root);
 				kfree(dirty);
 
@@ -497,6 +503,7 @@
 	unsigned long nr;
 	u64 num_bytes;
 	u64 bytes_used;
+	u64 max_useless;
 	int ret = 0;
 	int err;
 
@@ -554,10 +561,25 @@
 		}
 		mutex_unlock(&root->fs_info->drop_mutex);
 
+		spin_lock(&root->list_lock);
+		list_del_init(&dirty->root->dead_list);
+		if (!list_empty(&root->dead_list)) {
+			struct btrfs_root *oldest;
+			oldest = list_entry(root->dead_list.prev,
+					    struct btrfs_root, dead_list);
+			max_useless = oldest->root_key.offset - 1;
+		} else {
+			max_useless = root->root_key.offset - 1;
+		}
+		spin_unlock(&root->list_lock);
+
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		ret = btrfs_remove_leaf_refs(root, max_useless);
+		BUG_ON(ret);
+
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -785,10 +807,9 @@
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
+	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 	if (root->fs_info->closing)
 		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
-	else
-		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -814,4 +835,3 @@
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e6ee7a..18db4cb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2527,4 +2527,3 @@
 error:
 	return ret;
 }
-