Btrfs: start of block group code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 26d0cdd..4199172 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -239,6 +239,19 @@
 	__le64 device_id;
 } __attribute__ ((__packed__));
 
+/* tag for the radix tree of block groups in ram */
+#define BTRFS_BLOCK_GROUP_DIRTY 0	/* cached item not yet written back */
+#define BTRFS_BLOCK_GROUP_HINTS 8	/* NOTE(review): no user visible here */
+#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024) /* divided by blocksize */
+struct btrfs_block_group_item {
+	__le64 used;	/* blocks allocated in the group, little endian */
+} __attribute__ ((__packed__));
+
+struct btrfs_block_group_cache {
+	struct btrfs_key key;	/* objectid: first block, offset: length */
+	struct btrfs_block_group_item item;	/* copy of the tree item */
+};
+
 struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
@@ -249,6 +262,7 @@
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root dev_radix;
+	struct radix_tree_root block_group_radix;
 
 	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
 	int extent_tree_insert_nr;
@@ -301,49 +315,67 @@
  * info about object characteristics.  There is one for every file and dir in
  * the FS
  */
-#define BTRFS_INODE_ITEM_KEY	1
+#define BTRFS_INODE_ITEM_KEY		1
+
+/* reserve 2-15 close to the inode for later flexibility */
 
 /*
  * dir items are the name -> inode pointers in a directory.  There is one
  * for every name in a directory.
  */
-#define BTRFS_DIR_ITEM_KEY	2
-#define BTRFS_DIR_INDEX_KEY	3
+#define BTRFS_DIR_ITEM_KEY	16
+#define BTRFS_DIR_INDEX_KEY	17
 /*
- * inline data is file data that fits in the btree.
+ * extent data is for file data
  */
-#define BTRFS_INLINE_DATA_KEY	4
-/*
- * extent data is for data that can't fit in the btree.  It points to
- * a (hopefully) huge chunk of disk
- */
-#define BTRFS_EXTENT_DATA_KEY	5
+#define BTRFS_EXTENT_DATA_KEY	18
 /*
  * csum items have the checksums for data in the extents
  */
-#define BTRFS_CSUM_ITEM_KEY	6
+#define BTRFS_CSUM_ITEM_KEY	19
+
+/* reserve 20-31 for other file stuff */
 
 /*
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY	7
+#define BTRFS_ROOT_ITEM_KEY	32
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY	8
+#define BTRFS_EXTENT_ITEM_KEY	33
+
+/*
+ * block group items give us hints into the extent allocation tree, such
+ * as how many blocks of each group are in use
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 34
 
 /*
  * dev items list the devices that make up the FS
  */
-#define BTRFS_DEV_ITEM_KEY	9
+#define BTRFS_DEV_ITEM_KEY	35
 
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	10
+#define BTRFS_STRING_ITEM_KEY	253
+
+/* return the item's used count, converted from little endian */
+static inline u64 btrfs_block_group_used(struct btrfs_block_group_item *bi)
+{
+	return le64_to_cpu(bi->used);
+}
+
+/* store val as the item's used count, converting it to little endian */
+static inline void
+btrfs_set_block_group_used(struct btrfs_block_group_item *bi, u64 val)
+{
+	bi->used = cpu_to_le64(val);
+}
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -1037,6 +1069,10 @@
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 956727f..1c27eb6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -529,6 +529,7 @@
 	init_bit_radix(&fs_info->pending_del_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
+	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
@@ -613,6 +614,8 @@
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
+	btrfs_read_block_groups(extent_root);
+
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
 	kobj_set_kset_s(fs_info, btrfs_subsys);
@@ -741,6 +744,7 @@
 	iput(fs_info->btree_inode);
 
 	free_dev_radix(fs_info);
+	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6fe3fd..0bb4fc8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -127,6 +127,105 @@
 	return 0;
 }
 
+/*
+ * overwrite the extent tree item at cache->key with the cached copy, then
+ * run finish_current_insert() and del_pending_extents() on the extent root
+ */
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item *disk_item;
+	struct btrfs_key ins;
+	int pending_ret;
+	int ret;
+
+	find_free_extent(trans, extent_root, 0, 0, (u64)-1, &ins);
+	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+	BUG_ON(ret);
+	disk_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				   path->slots[0],
+				   struct btrfs_block_group_item);
+	*disk_item = cache->item;
+	mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(extent_root, path);
+
+	finish_current_insert(trans, extent_root);
+	pending_ret = del_pending_extents(trans, extent_root);
+	return ret ? ret : pending_ret;
+}
+
+/* write back every block group tagged dirty, returns the last error or 0 */
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct radix_tree_root *radix = &root->fs_info->block_group_radix;
+	struct btrfs_block_group_cache *cache[8];
+	struct btrfs_path *path;
+	int werr = 0;
+	int found, err, i;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	for (;;) {
+		found = radix_tree_gang_lookup_tag(radix, (void **)cache, 0,
+						   ARRAY_SIZE(cache),
+						   BTRFS_BLOCK_GROUP_DIRTY);
+		if (found == 0)
+			break;
+		for (i = 0; i < found; i++) {
+			/* the tag lives at the group's last block */
+			radix_tree_tag_clear(radix, cache[i]->key.objectid +
+					     cache[i]->key.offset - 1,
+					     BTRFS_BLOCK_GROUP_DIRTY);
+			err = write_one_cache_group(trans, root, path,
+						    cache[i]);
+			if (err)
+				werr = err;
+		}
+	}
+	btrfs_free_path(path);
+	return werr;
+}
+
+/*
+ * account num blocks starting at blocknr in the cached block groups: the
+ * used count grows when alloc is set and shrinks otherwise, and each group
+ * touched is tagged dirty.  Returns 0, or -1 if the lookup finds nothing
+ */
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      u64 blocknr, u64 num, int alloc)
+{
+	struct radix_tree_root *radix = &root->fs_info->block_group_radix;
+	struct btrfs_block_group_cache *cache;
+	u64 remaining = num;
+	u64 group_off;
+	u64 used;
+
+	while (remaining) {
+		if (!radix_tree_gang_lookup(radix, (void **)&cache, blocknr, 1))
+			return -1;
+		group_off = blocknr - cache->key.objectid;
+		WARN_ON(group_off > cache->key.offset);
+		radix_tree_tag_set(radix,
+				   cache->key.objectid + cache->key.offset - 1,
+				   BTRFS_BLOCK_GROUP_DIRTY);
+		/* never count past the end of this group */
+		num = min(remaining, cache->key.offset - group_off);
+		remaining -= num;
+		blocknr += num;
+		used = btrfs_block_group_used(&cache->item);
+		used = alloc ? used + num : used - num;
+		btrfs_set_block_group_used(&cache->item, used);
+	}
+	return 0;
+}
+
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
@@ -264,6 +363,8 @@
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret)
 			BUG();
+		ret = update_block_group(trans, root, blocknr, num_blocks, 0);
+		BUG_ON(ret);
 	}
 	btrfs_release_path(extent_root, path);
 	btrfs_free_path(path);
@@ -365,21 +466,6 @@
 		num_blocks = 1;
 		total_needed = min(level + 2, BTRFS_MAX_LEVEL) * 3;
 	}
-	if (info->last_insert.objectid == 0 && search_end == (u64)-1) {
-		struct btrfs_disk_key *last_key;
-		btrfs_init_path(path);
-		ins->objectid = (u64)-1;
-		ins->offset = (u64)-1;
-		ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
-		if (ret < 0)
-			goto error;
-		BUG_ON(ret == 0);
-		if (path->slots[0] > 0)
-			path->slots[0]--;
-		l = btrfs_buffer_leaf(path->nodes[0]);
-		last_key = &l->items[path->slots[0]].key;
-		search_start = btrfs_disk_key_objectid(last_key);
-	}
 	if (info->last_insert.objectid > search_start)
 		search_start = info->last_insert.objectid;
 
@@ -420,6 +506,8 @@
 			goto check_pending;
 		}
 		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+			goto next;
 		if (key.objectid >= search_start) {
 			if (start_found) {
 				if (last_block < search_start)
@@ -434,6 +522,7 @@
 		}
 		start_found = 1;
 		last_block = key.objectid + key.offset;
+next:
 		path->slots[0]++;
 	}
 	// FIXME -ENOSPC
@@ -498,7 +587,6 @@
 	btrfs_free_path(path);
 	return ret;
 }
-
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -532,6 +620,9 @@
 		ins->objectid = info->extent_tree_prealloc[nr];
 		info->extent_tree_insert[info->extent_tree_insert_nr++] =
 			ins->objectid;
+		ret = update_block_group(trans, root,
+					 ins->objectid, ins->offset, 1);
+		BUG_ON(ret);
 		return 0;
 	}
 	/* do the real allocation */
@@ -558,6 +649,7 @@
 		return ret;
 	if (pending_ret)
 		return pending_ret;
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	return 0;
 }
 
@@ -578,6 +670,7 @@
 		BUG();
 		return NULL;
 	}
+	BUG_ON(ret);
 	buf = btrfs_find_create_tree_block(root, ins.objectid);
 	set_buffer_uptodate(buf);
 	return buf;
@@ -758,3 +851,82 @@
 	btrfs_free_path(path);
 	return ret;
 }
+
+/* remove every block group from the radix tree and kfree it, returns 0 */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	struct radix_tree_root *radix = &info->block_group_radix;
+	struct btrfs_block_group_cache *cache[8];
+	int found, i;
+
+	for (;;) {
+		found = radix_tree_gang_lookup(radix, (void **)cache, 0,
+					       ARRAY_SIZE(cache));
+		if (found == 0)
+			break;
+		for (i = 0; i < found; i++) {
+			/* entries are indexed by the group's last block */
+			radix_tree_delete(radix, cache[i]->key.objectid +
+					  cache[i]->key.offset - 1);
+			kfree(cache[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * cache every block group item found in the extent tree, indexed by the
+ * group's last block.  Returns 0 or a negative error (failed search, ENOMEM)
+ */
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	int ret;
+	int err = 0;
+	struct btrfs_block_group_item *bi;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_key key, found_key;
+	struct btrfs_leaf *leaf;
+	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
+
+	root = root->fs_info->extent_root;
+	key.objectid = 0;
+	key.offset = group_size_blocks;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
+					&key, path, 0, 0);
+		if (ret != 0) {
+			err = ret < 0 ? ret : 0; /* > 0: key not found */
+			break;
+		}
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		cache = kmalloc(sizeof(*cache), GFP_NOFS);
+		if (!cache) {
+			err = -ENOMEM;
+			break;
+		}
+		bi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_block_group_item);
+		memcpy(&cache->item, bi, sizeof(*bi));
+		memcpy(&cache->key, &found_key, sizeof(found_key));
+		key.objectid = found_key.objectid + found_key.offset;
+		btrfs_release_path(root, path);
+		ret = radix_tree_insert(&root->fs_info->block_group_radix,
+					key.objectid - 1, (void *)cache);
+		BUG_ON(ret);
+		if (key.objectid >=
+		    btrfs_super_total_blocks(root->fs_info->disk_super))
+			break;
+	}
+	btrfs_free_path(path);
+	return err;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1e7038b..2f95fc6 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -11,6 +11,7 @@
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
+	struct btrfs_block_group_item *bi;
 	u32 type;
 
 	printk("leaf %Lu total ptrs %d free space %d\n",
@@ -53,6 +54,12 @@
 			printk("\t\textent data refs %u\n",
 				btrfs_extent_refs(ei));
 			break;
+		case BTRFS_BLOCK_GROUP_ITEM_KEY:
+			bi = btrfs_item_ptr(l, i,
+					    struct btrfs_block_group_item);
+			printk("\t\tblock group used %Lu\n",
+			       btrfs_block_group_used(bi));
+			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5bbccbc..edcebf7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -377,7 +377,6 @@
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
 		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
-		    btrfs_disk_key_type(found_key) != BTRFS_INLINE_DATA_KEY &&
 		    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 078cb9c..8a2545f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -125,6 +125,7 @@
 		btrfs_set_super_device_root(fs_info->disk_super,
 					    bh_blocknr(dev_root->node));
 	}
+	btrfs_write_dirty_block_groups(trans, extent_root);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == bh_blocknr(extent_root->node))
@@ -135,6 +136,7 @@
 					&extent_root->root_key,
 					&extent_root->root_item);
 		BUG_ON(ret);
+		btrfs_write_dirty_block_groups(trans, extent_root);
 	}
 	return 0;
 }