ext4: teach mballoc preallocation code about bigalloc clusters
In most of mballoc.c, we do everything in units of clusters, since the
block allocation bitmaps and buddy bitmaps are all denominated in
clusters. The one place where we do deal with absolute block numbers
is in the code that handles the preallocation regions, since in the
case of inode-based preallocation regions, the start of the
preallocation region can't be relative to the beginning of the group.
So this adds a bit of complexity, where pa_pstart and pa_lstart are
block numbers, while pa_free, pa_len, fe_start, and fe_len are
denominated in units of clusters.
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 81e2865..8765f25 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -70,8 +70,8 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> length for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
@@ -459,7 +459,7 @@
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += first + i;
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -734,7 +734,7 @@
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u blocks in bitmap, %u in gd",
+ "%u clusters in bitmap, %u in gd",
free, grp->bb_free);
/*
* If we intent to continue, we consider group descritor
@@ -1339,7 +1339,7 @@
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += block;
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -1831,7 +1831,7 @@
* we have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But bitmap says 0",
free);
break;
@@ -1841,7 +1841,7 @@
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
/*
@@ -2752,7 +2752,7 @@
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_blks)
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
@@ -2791,7 +2791,7 @@
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- len = ac->ac_b_ex.fe_len;
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata\n", block, block+len);
@@ -2838,7 +2838,7 @@
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_clstrs);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -2886,6 +2886,7 @@
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
loff_t size, orig_size, start_off;
@@ -2916,7 +2917,7 @@
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -2988,7 +2989,8 @@
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3018,9 +3020,11 @@
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3043,7 +3047,7 @@
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3112,14 +3116,16 @@
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3127,7 +3133,7 @@
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
@@ -3193,6 +3199,7 @@
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
@@ -3210,12 +3217,14 @@
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3412,6 +3421,7 @@
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3443,16 +3453,18 @@
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3477,7 +3489,7 @@
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3592,7 +3604,7 @@
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3607,7 +3619,8 @@
free += next - bit;
trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
- trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
@@ -3958,7 +3971,7 @@
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -4019,18 +4032,15 @@
/* set up allocation goals */
memset(ac, 0, sizeof(struct ext4_allocation_context));
- ac->ac_b_ex.fe_logical = ar->logical;
+ ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll we work with a file or
@@ -4182,13 +4192,14 @@
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
- pa->pa_pstart += ac->ac_b_ex.fe_len;
- pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
@@ -4249,7 +4260,7 @@
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
- unsigned int reserv_blks = 0;
+ unsigned int reserv_clstrs = 0;
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
@@ -4279,12 +4290,14 @@
*errp = -ENOSPC;
return 0;
}
- reserv_blks = ar->len;
+ reserv_clstrs = ar->len;
if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
- dquot_alloc_block_nofail(ar->inode, ar->len);
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
} else {
while (ar->len &&
- dquot_alloc_block(ar->inode, ar->len)) {
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
@@ -4328,7 +4341,7 @@
ext4_mb_new_preallocation(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp == -EAGAIN) {
/*
* drop the reference that we took
@@ -4364,13 +4377,13 @@
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
- dquot_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
if (!ext4_test_inode_state(ar->inode,
EXT4_STATE_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- reserv_blks);
+ reserv_clstrs);
}
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 3cdb8aa..1641f4b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -139,9 +139,9 @@
struct ext4_free_extent {
ext4_lblk_t fe_logical;
- ext4_grpblk_t fe_start;
+ ext4_grpblk_t fe_start; /* In cluster units */
ext4_group_t fe_group;
- ext4_grpblk_t fe_len;
+ ext4_grpblk_t fe_len; /* In cluster units */
};
/*