Btrfs: Block sized tree extents and extent deletion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1b4e82d..f0abcf1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -8,9 +8,12 @@
 #define SEARCH_READ 0
 #define SEARCH_WRITE 1
 
-static int refill_alloc_extent(struct ctree_root *root);
+#define CTREE_EXTENT_PENDING 0
+
 int split_node(struct ctree_root *root, struct ctree_path *path, int level);
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
 
 static inline void init_path(struct ctree_path *p)
 {
@@ -682,8 +685,6 @@
 	unsigned int data_end;
 	struct ctree_path path;
 
-	refill_alloc_extent(root);
-
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
@@ -756,6 +757,7 @@
 	struct tree_buffer *t;
 	struct node *node;
 	int nritems;
+	u64 blocknr;
 
 	while(1) {
 		t = path->nodes[level];
@@ -774,6 +776,7 @@
 		}
 		node->header.nritems--;
 		write_tree_block(root, t);
+		blocknr = t->blocknr;
 		if (node->header.nritems != 0) {
 			int tslot;
 			if (slot == 0)
@@ -799,6 +802,7 @@
 			break;
 		}
 		level++;
+		free_extent(root, blocknr, 1);
 		if (!path->nodes[level])
 			BUG();
 	}
@@ -841,8 +845,10 @@
 		if (leaf_buf == root->node) {
 			leaf->header.flags = node_level(0);
 			write_tree_block(root, leaf_buf);
-		} else
+		} else {
 			del_ptr(root, path, 1);
+			free_extent(root, leaf_buf->blocknr, 1);
+		}
 	} else {
 		if (slot == 0)
 			fixup_low_keys(root, path, &leaf->items[0].key, 1);
@@ -867,6 +873,72 @@
 	return 0;
 }
 
+/* Delete the extent item of every block tagged CTREE_EXTENT_PENDING in
+ * extent_root's cache, clearing each tag and releasing the tagged buffer. */
+static int del_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	int nr;		/* gang size; separate from ret so the loop bound survives */
+	int i;
+	struct key key;
+	struct tree_buffer *gang[4];
+	struct ctree_path path;
+
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0, ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			init_path(&path);
+			ret = search_slot(extent_root, &key, &path, 0);
+			if (!ret)
+				ret = del_item(extent_root, &path);
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			release_path(extent_root, &path);
+			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
+						CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+{
+	struct ctree_path path;
+	struct key key;
+	struct ctree_root *extent_root = root->extent_root;
+	struct tree_buffer *t;
+	int pending_ret;
+	int ret;
+	/* extent items are keyed by start block (objectid) and length (offset) */
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = num_blocks;
+	if (root == extent_root) {	/* extent tree freeing its own block: defer */
+		t = read_tree_block(root, key.objectid);	/* t unused below; presumably pins the buffer until del_pending_extents() */
+		radix_tree_tag_set(&root->cache_radix, key.objectid, CTREE_EXTENT_PENDING);
+		return 0;	/* the item itself is removed later by del_pending_extents() */
+	}
+	init_path(&path);
+	ret = search_slot(extent_root, &key, &path, 0);
+	if (ret)
+		BUG();	/* any nonzero search_slot result is treated as fatal */
+	ret = del_item(extent_root, &path);
+	release_path(extent_root, &path);
+	pending_ret = del_pending_extents(root->extent_root);	/* flush deletions tagged in the extent tree's cache */
+	return ret ? ret : pending_ret;	/* del_item's error takes precedence */
+}
+
 int next_leaf(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
@@ -904,8 +976,8 @@
 	return 0;
 }
 
-int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
-		 u64 search_end, u64 owner, struct key *ins)
+int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
+			 u64 search_end, struct key *ins)
 {
 	struct ctree_path path;
 	struct key *key;
@@ -915,15 +987,13 @@
 	u64 last_block;
 	int start_found = 0;
 	struct leaf *l;
-	struct extent_item extent_item;
 	struct ctree_root * root = orig_root->extent_root;
 
 	init_path(&path);
 	ins->objectid = search_start;
 	ins->offset = 0;
 	ins->flags = 0;
-
-	ret = search_slot(root, ins, &path, sizeof(struct extent_item));
+	ret = search_slot(root, ins, &path, 0);
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
@@ -938,6 +1008,7 @@
 				ins->objectid = search_start;
 				ins->offset = num_blocks;
 				hole_size = search_end - search_start;
+				start_found = 1;
 				goto insert;
 			}
 			ins->objectid = last_block;
@@ -956,51 +1027,119 @@
 		} else
 			start_found = 1;
 		last_block = key->objectid + key->offset;
+insert_failed:
 		path.slots[0]++;
 	}
 	// FIXME -ENOSPC
 insert:
-	release_path(root, &path);
-	extent_item.refs = 1;
-	extent_item.owner = owner;
-	if (root == orig_root && root->reserve_extent->num_blocks == 0) {
-		root->reserve_extent->blocknr = ins->objectid;
-		root->reserve_extent->num_blocks = ins->offset;
-		root->reserve_extent->num_used = 0;
+	if (orig_root->extent_root == orig_root) {
+		BUG_ON(num_blocks != 1);
+		if ((root->current_insert.objectid <= ins->objectid &&
+		    root->current_insert.objectid + root->current_insert.offset >
+		    ins->objectid) ||
+		   (root->current_insert.objectid > ins->objectid &&
+		    root->current_insert.objectid <= ins->objectid + ins->offset) ||
+		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
+				      CTREE_EXTENT_PENDING)) {
+			last_block = ins->objectid + 1;
+			search_start = last_block;
+			goto insert_failed;
+		}
 	}
-	ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
-	return ret;
+	release_path(root, &path);
+	if (ins->offset != 1)
+		BUG();
+	return 0;
 }
 
-static int refill_alloc_extent(struct ctree_root *root)
+static int insert_pending_extents(struct ctree_root *extent_root)
 {
-	struct alloc_extent *ae = root->alloc_extent;
-	struct key key;
 	int ret;
-	int min_blocks = MAX_LEVEL * 2;
+	struct key key;
+	struct extent_item item;
+	struct tree_buffer *gang[4];
+	int i, nr;	/* nr: gang size, kept apart from ret so the loop bound survives */
 
-	if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used >
-	    min_blocks)
-		return 0;
-	ae = root->reserve_extent;
-	if (ae->num_blocks > ae->num_used) {
-		if (root->alloc_extent->num_blocks == 0) {
-			/* we should swap reserve/alloc_extent when alloc
-			 * fills up
-			 */
-			BUG();
+	/* add an extent item for each CTREE_EXTENT_PENDING block; FIXME -ENOSPC */
+	item.refs = 1;
+	item.owner = extent_root->node->node.header.parentid;
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0, ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			ret = insert_item(extent_root, &key, &item, sizeof(item));
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
+						CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
 		}
-		if (ae->num_blocks - ae->num_used < min_blocks)
-			BUG();
+	}
+	return 0;
+}
+
+int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+			 u64 search_end, u64 owner, struct key *ins, struct tree_buffer **buf)
+{
+	int ret;
+	int pending_ret;
+	struct extent_item extent_item;
+	/* item written for the new extent: one reference, charged to owner */
+	extent_item.refs = 1;
+	extent_item.owner = owner;
+	/* pick the range into *ins; this call does not insert an extent item */
+	ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
+	if (ret)
+		return ret;
+	/* ordinary tree: insert the item now, then flush pending extent-tree blocks */
+	if (root != root->extent_root) {
+		memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));	/* find_free_extent() skips this range for extent-tree allocations */
+		ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
+		memset(&root->extent_root->current_insert, 0, sizeof(struct key));
+		pending_ret = insert_pending_extents(root->extent_root);	/* runs even if insert_item failed */
+		if (ret)
+			return ret;
+		if (pending_ret)
+			return pending_ret;
+		*buf = find_tree_block(root, ins->objectid);
 		return 0;
 	}
-	ret = alloc_extent(root,
-			   min_blocks * 2, 0, (unsigned long)-1,
-			   root->node->node.header.parentid, &key);
-	ae->blocknr = key.objectid;
-	ae->num_blocks = key.offset;
-	ae->num_used = 0;
-	return ret;
+	/* we're allocating an extent for the extent tree, don't recurse */
+	BUG_ON(ins->offset != 1);
+	*buf = find_tree_block(root, ins->objectid);
+	BUG_ON(!*buf);
+	radix_tree_tag_set(&root->cache_radix, ins->objectid, CTREE_EXTENT_PENDING);	/* item is added later by insert_pending_extents() */
+	(*buf)->count++;	/* extra reference; presumably the one insert_pending_extents() releases */
+	return 0;
+
+}
+
+/* Allocate one block for root and return its tree_buffer; BUG()s on failure. */
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+	struct key ins;
+	int ret;
+	struct tree_buffer *buf = NULL;
+
+	ret = alloc_extent(root, 1, 0, (u64)-1, root->node->node.header.parentid,
+			   &ins, &buf);	/* search_end is u64: (u64)-1, not unsigned long */
+	if (ret) {
+		BUG();
+		return NULL;
+	}
+	if (root != root->extent_root)	/* a block handed to an ordinary tree must not be pending */
+		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, buf->blocknr,
+					  CTREE_EXTENT_PENDING));
+	return buf;
 }
 
 void print_leaf(struct leaf *l)
@@ -1096,6 +1235,7 @@
 	print_tree(root, root->node);
 	printf("map tree\n");
 	print_tree(root->extent_root, root->extent_root->node);
+	fflush(stdout);
 
 	srand(55);
 	for (i = 0; i < run_size; i++) {
@@ -1111,12 +1251,6 @@
 		if (!ret)
 			tree_size++;
 	}
-	printf("root used: %lu\n", root->alloc_extent->num_used);
-	printf("root tree\n");
-	// print_tree(root, root->node);
-	printf("map tree\n");
-	printf("map used: %lu\n", root->extent_root->alloc_extent->num_used);
-	// print_tree(root->extent_root, root->extent_root->node);
 	write_ctree_super(root, &super);
 	close_ctree(root);
 
@@ -1167,12 +1301,27 @@
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
+		if (i >= 5) {
+			struct key ugh;
+			ugh.objectid = 5;
+			ugh.flags = 0;
+			ugh.offset = 0;
+			init_path(&path);
+			ret = search_slot(root, &ugh, &path, 0);
+			if (ret) {
+				print_tree(root, root->node);
+				printf("unable to find 5 %d\n", num);
+				exit(1);
+			}
+			release_path(root, &path);
+
+		}
 	}
 	write_ctree_super(root, &super);
 	close_ctree(root);
 	root = open_ctree("dbfile", &super);
-	printf("starting search2\n");
 	srand(128);
+	printf("starting search2\n");
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
@@ -1219,5 +1368,7 @@
 	write_ctree_super(root, &super);
 	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78407d3..8c32c0e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,21 +28,12 @@
 
 struct tree_buffer;
 
-struct alloc_extent {
-	u64 blocknr;
-	u64 num_blocks;
-	u64 num_used;
-} __attribute__ ((__packed__));
-
 struct ctree_root {
 	struct tree_buffer *node;
 	struct ctree_root *extent_root;
-	struct alloc_extent *alloc_extent;
-	struct alloc_extent *reserve_extent;
+	struct key current_insert;	/* extent key alloc_extent() is inserting; zeroed once insert_item returns */
 	int fp;
 	struct radix_tree_root cache_radix;
-	struct alloc_extent ai1;
-	struct alloc_extent ai2;
 };
 
 struct ctree_root_info {
@@ -52,8 +43,6 @@
 	u64 tree_root; /* the tree root */
 	u32 csum;
 	u32 ham;
-	struct alloc_extent alloc_extent;
-	struct alloc_extent reserve_extent;
 	u64 snapuuid[2]; /* root specific uuid */
 } __attribute__ ((__packed__));
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a696a42..14955e4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -12,33 +12,13 @@
 
 static int allocated_blocks = 0;
 
-static int get_free_block(struct ctree_root *root, u64 *block)
+static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
-	struct stat st;
-	int ret = 0;
-
-	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks)
-		return -1;
-
-	*block = root->alloc_extent->blocknr + root->alloc_extent->num_used;
-	root->alloc_extent->num_used += 1;
-	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks) {
-		struct alloc_extent *ae = root->alloc_extent;
-		root->alloc_extent = root->reserve_extent;
-		root->reserve_extent = ae;
-		ae->num_blocks = 0;
-	}
-	st.st_size = 0;
-	ret = fstat(root->fp, &st);
-	if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE) {
-		ret = ftruncate(root->fp,
-				(*block + 1) * CTREE_BLOCKSIZE);
-		if (ret) {
-			perror("ftruncate");
-			exit(1);
-		}
-	}
-	return ret;
+	if (buf->blocknr != buf->node.header.blocknr)
+		BUG();	/* header blocknr disagrees with the buffer's blocknr */
+	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+		BUG();	/* parentid differs from the root node's; skipped while root->node is NULL */
+	return 0;	/* always 0: mismatches trap through BUG() above */
 }
 
 struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
@@ -61,22 +41,23 @@
 	return buf;
 }
 
-struct tree_buffer *alloc_free_block(struct ctree_root *root)
+struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr)
 {
-	u64 free_block;
-	int ret;
-	struct tree_buffer * buf;
-	ret = get_free_block(root, &free_block);
-	if (ret) {
-		BUG();
-		return NULL;
+	struct tree_buffer *buf;
+	buf = radix_tree_lookup(&root->cache_radix, blocknr);	/* cached buffer for blocknr, if any */
+	if (buf) {
+		buf->count++;	/* cache hit: take another reference */
+	} else {
+		buf = alloc_tree_block(root, blocknr);	/* cache miss: new buffer; no pread is issued here */
+		if (!buf) {
+			BUG();
+			return NULL;
+		}
 	}
-	buf = alloc_tree_block(root, free_block);
-	if (!buf)
-		BUG();
 	return buf;
 }
 
+
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 {
 	loff_t offset = blocknr * CTREE_BLOCKSIZE;
@@ -86,20 +67,17 @@
 	buf = radix_tree_lookup(&root->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
-		goto test;
+	} else {
+		buf = alloc_tree_block(root, blocknr);
+		if (!buf)
+			return NULL;
+		ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+		if (ret != CTREE_BLOCKSIZE) {
+			free(buf);
+			return NULL;
+		}
 	}
-	buf = alloc_tree_block(root, blocknr);
-	if (!buf)
-		return NULL;
-	ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
-	if (ret != CTREE_BLOCKSIZE) {
-		free(buf);
-		return NULL;
-	}
-test:
-	if (buf->blocknr != buf->node.header.blocknr)
-		BUG();
-	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+	if (check_tree_block(root, buf))
 		BUG();
 	return buf;
 }
@@ -121,17 +99,10 @@
 static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 			struct ctree_root_info *info, int fp)
 {
-	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
 	root->fp = fp;
 	root->node = NULL;
 	root->node = read_tree_block(root, info->tree_root);
 	root->extent_root = extent_root;
-	memcpy(&root->ai1, &info->alloc_extent, sizeof(info->alloc_extent));
-	memcpy(&root->ai2, &info->reserve_extent, sizeof(info->reserve_extent));
-	root->alloc_extent = &root->ai1;
-	root->reserve_extent = &root->ai2;
-	printf("setup done reading root %p, used %lu available %lu\n", root, root->alloc_extent->num_used, root->alloc_extent->num_blocks);
-	printf("setup done reading root %p, reserve used %lu available %lu\n", root, root->reserve_extent->num_used, root->reserve_extent->num_blocks);
 	return 0;
 }
 
@@ -147,6 +118,8 @@
 		free(root);
 		return NULL;
 	}
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
 	ret = pread(fp, super, sizeof(struct ctree_super_block),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret == 0 || super->root_info.tree_root == 0) {
@@ -168,8 +141,6 @@
 static int __update_root(struct ctree_root *root, struct ctree_root_info *info)
 {
 	info->tree_root = root->node->blocknr;
-	memcpy(&info->alloc_extent, root->alloc_extent, sizeof(struct alloc_extent));
-	memcpy(&info->reserve_extent, root->reserve_extent, sizeof(struct alloc_extent));
 	return 0;
 }
 
@@ -201,6 +172,7 @@
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 {
 	buf->count--;
+	write_tree_block(root, buf);
 	if (buf->count < 0)
 		BUG();
 	if (buf->count == 0) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e288fe8..2729b75 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -11,11 +11,11 @@
 };
 
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr);
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
 struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
 int close_ctree(struct ctree_root *root);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
-struct tree_buffer *alloc_free_block(struct ctree_root *root);
 int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s);
 int mkfs(int fd);
 
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 584aba4..fd4e5de 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -23,17 +23,10 @@
 	info[0].blocknr = 16;
 	info[0].objectid = 1;
 	info[0].tree_root = 17;
-	info[0].alloc_extent.blocknr = 0;
-	info[0].alloc_extent.num_blocks = 64;
-	/* 0-17 are used (inclusive) */
-	info[0].alloc_extent.num_used = 18;
 
 	info[1].blocknr = 16;
 	info[1].objectid = 2;
-	info[1].tree_root = 64;
-	info[1].alloc_extent.blocknr = 64;
-	info[1].alloc_extent.num_blocks = 64;
-	info[1].alloc_extent.num_used = 1;
+	info[1].tree_root = 18;
 	ret = pwrite(fd, info, sizeof(info),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret != sizeof(info))
@@ -48,24 +41,36 @@
 		return -1;
 
 	empty_leaf.header.parentid = 2;
-	empty_leaf.header.blocknr = 64;
-	empty_leaf.header.nritems = 2;
+	empty_leaf.header.blocknr = 18;
+	empty_leaf.header.nritems = 3;
+
+	/* item1, reserve blocks 0-16 */
 	item.key.objectid = 0;
-	item.key.offset = 64;
+	item.key.offset = 17;
 	item.key.flags = 0;
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item);
 	item.size = sizeof(struct extent_item);
 	extent_item.refs = 1;
-	extent_item.owner = 1;
+	extent_item.owner = 0;
 	memcpy(empty_leaf.items, &item, sizeof(item));
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
-	item.key.objectid = 64;
-	item.key.offset = 64;
+
+	/* item2, give block 17 to the root */
+	item.key.objectid = 17;
+	item.key.offset = 1;
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 2;
-	extent_item.owner = 2;
+	extent_item.owner = 1;
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 64 * CTREE_BLOCKSIZE);
+
+	/* item3, give block 18 for the extent root */
+	item.key.objectid = 18;
+	item.key.offset = 1;
+	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 3;
+	extent_item.owner = 2;
+	memcpy(empty_leaf.items + 2, &item, sizeof(item));
+	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * CTREE_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
 	return 0;