blob: 4c616ce93d8f7974417fbe14d260608f8541dddb [file] [log] [blame]
/***********************************************************
** Copyright (C), 2008-2020, OPPO Mobile Comm Corp., Ltd.
** VENDOR_EDIT
** File: - fs/ext4/e4defrag.c
** Description: code to support ext4 defrag
**
** Version: 1.0
** Date : 2020/02/29
** Author: yanwu@TECH.Storage.FS.EXT4, add code to support ext4 defrag
**
** ------------------ Revision History:------------------------
** <author> <date> <version> <desc>
** yanwu 2020/02/29 1.0 create the file
****************************************************************/
#include <linux/fs.h>
#include <linux/version.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/freezer.h>
#include <linux/rbtree.h>
#include <linux/jiffies.h>
#include <linux/seq_file.h>
#include <linux/random.h>
#include <trace/events/block.h>
#include <asm/local.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/key.h>
#include <linux/task_io_accounting_ops.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0))
#include <linux/sched/mm.h>
#endif
#include "ext4.h"
#include "mballoc.h"
#include "extents_status.h"
#include "ext4_extents.h"
#include "ext4_jbd2.h"
/*
 * Accessors that navigate from a super_block (or from the embedded defrag
 * info) to the pieces of per-filesystem defrag state.
 */
/* defrag info -> enclosing ext4_sb_info (the info is embedded as member 'dfi') */
#define E4DEFRAG_SBI(_dfi) (container_of(_dfi, struct ext4_sb_info, dfi))
/* defrag info -> super_block */
#define E4DEFRAG_SB(_dfi) (E4DEFRAG_SBI(_dfi)->s_sb)
/* super_block -> defrag info */
#define E4DEFRAG_I(sb) (&EXT4_SB(sb)->dfi)
/* number of per-group state slots currently tracked */
#define E4DEFRAG_NGROUPS(sb) (E4DEFRAG_I(sb)->ngroups)
/* per-group state slot; no bounds check is done here */
#define E4DEFRAG_GS(sb, group) (&E4DEFRAG_I(sb)->groups[(group)])
/* the extent tree shared by the filesystem's defrag info */
#define E4DEFRAG_ET(sb) (&E4DEFRAG_I(sb)->tree)
/* inode number for bit index 'bit' of inode group 'group' (inode numbers are 1-based) */
#define E4DEFRAG_INO(sb, group, bit) (EXT4_INODES_PER_GROUP(sb) * (group) + (bit) + 1)
/* first inode number to scan in a group; group 0 starts at EXT4_FIRST_INO(sb) */
#define E4DEFRAG_FIRST_INO(sb, group) ((group) ? E4DEFRAG_INO(sb, group, 0) : EXT4_FIRST_INO(sb))
/* last inode number of a group: evaluates to EXT4_INODES_PER_GROUP(sb) * (group + 1) */
#define E4DEFRAG_LAST_INO(sb, group) (E4DEFRAG_INO(sb, (group) + 1, -1))
/* group table of a group, and entry i of a table */
#define E4DEFRAG_GT(sb, group) (&E4DEFRAG_GS(sb, group)->table)
#define E4DEFRAG_GE(tbl, i) (&(tbl)->entries[i])
/*
 * Iterate over the used entries of a group table.  'entry' is assigned
 * before the bound is tested, so after the loop ends it points at slot
 * nr_entries and must not be dereferenced.
 */
#define group_table_for_each(tbl, entry, i) \
for (i = 0; entry = E4DEFRAG_GE(tbl, i), i < (tbl)->nr_entries; i++)
/* #define DEBUG */
#ifdef DEBUG
#define e4defrag_dbg(f, a...) \
do { \
printk(KERN_DEBUG "e4defrag debug (%s, %d): %s:", \
__FILE__, __LINE__, __func__); \
printk(KERN_DEBUG f, ## a); \
} while (0)
#else
#define e4defrag_dbg(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define e4defrag_msg(f, a...) \
do { \
printk_ratelimited(KERN_INFO "e4defrag: " f, ## a); \
} while (0)
#define e4defrag_err(f, a...) \
do { \
printk_ratelimited(KERN_ERR "e4defrag: " f, ## a); \
} while (0)
/*
 * Values for ext4_defrag_protect.
 * NOTE(review): the names suggest normal / low-power / end-of-life
 * protection levels, but nothing in the visible part of this file reads
 * the variable - confirm the semantics against its users.
 */
#define DEFRAG_PROTECT_NORMAL (0x0)
#define DEFRAG_PROTECT_LOWPOWER (0x1)
#define DEFRAG_PROTECT_EOL (0x2)
/* Global (non-static) protect setting; starts as DEFRAG_PROTECT_NORMAL. */
int ext4_defrag_protect = DEFRAG_PROTECT_NORMAL;
/*
 * One mapped extent of a file, kept in the defrag extent tree.
 * The tree is keyed by pblk (see ex_cmp).
 */
struct defrag_extent {
/* linkage in the extent rb-tree */
struct rb_node node;
/* inode number owning the extent */
unsigned int ino;
/* extent length in blocks (16 bits wide) */
unsigned short len;
/* first logical block of the extent within the file */
ext4_lblk_t lblk;
/* first physical block of the extent */
ext4_fsblk_t pblk;
};
/* slab cache backing every struct defrag_extent */
static struct kmem_cache *defrag_extent_cachep;

/*
 * Create the defrag_extent slab cache.
 *
 * Returns 0 on success or -ENOMEM when the cache cannot be created.
 */
static int __init e4defrag_create_cache(void)
{
	struct kmem_cache *cache;

	cache = kmem_cache_create("defrag_extent",
				  sizeof(struct defrag_extent),
				  0, (SLAB_RECLAIM_ACCOUNT), NULL);
	defrag_extent_cachep = cache;

	return cache ? 0 : -ENOMEM;
}
/* Tear down the defrag_extent slab cache if it was ever created. */
static void __exit e4defrag_destroy_cache(void)
{
	if (!defrag_extent_cachep) {
		return;
	}

	kmem_cache_destroy(defrag_extent_cachep);
}
static struct defrag_extent *e4defrag_alloc_extent(unsigned long ino,
ext4_lblk_t lblk,
ext4_lblk_t len,
ext4_grpblk_t pblk)
{
struct defrag_extent *ex;
ex = kmem_cache_alloc(defrag_extent_cachep, GFP_NOFS);
if (ex == NULL) {
return NULL;
}
ex->ino = ino;
ex->lblk = lblk;
ex->len = len;
ex->pblk = pblk;
return ex;
}
/*
 * Return @ex to the defrag_extent slab cache.  The caller must already
 * have unlinked it from the extent tree; nothing is unlinked here.
 */
static void e4defrag_free_extent(struct defrag_extent *ex)
{
kmem_cache_free(defrag_extent_cachep, ex);
}
/* group table */

/*
 * Initialise @entry to describe a single inode: it belongs to inode group
 * @group, accounts for @len blocks, and both ends of the bit range are
 * set to @bit.
 */
static inline void group_entry_init(struct group_entry *entry,
				    ext4_group_t group, int len, int bit)
{
	entry->first_bit = bit;
	entry->last_bit = bit;
	entry->len = len;
	entry->group = group;
}
/*
 * Fold @entry2 into @entry1: the lengths are summed and the bit range of
 * @entry1 is widened so that it also covers the range of @entry2.
 */
static inline void group_entry_merge(struct group_entry *entry1,
				     struct group_entry *entry2)
{
	if (entry2->first_bit < entry1->first_bit) {
		entry1->first_bit = entry2->first_bit;
	}
	if (entry2->last_bit > entry1->last_bit) {
		entry1->last_bit = entry2->last_bit;
	}
	entry1->len += entry2->len;
}
/* Inode number corresponding to the lowest bit recorded in @entry. */
static inline unsigned long group_entry_first_ino(struct super_block *sb,
struct group_entry *entry)
{
return E4DEFRAG_INO(sb, entry->group, entry->first_bit);
}
/* Inode number corresponding to the highest bit recorded in @entry. */
static inline unsigned long group_entry_last_ino(struct super_block *sb,
struct group_entry *entry)
{
return E4DEFRAG_INO(sb, entry->group, entry->last_bit);
}
/* Empty the group table of @group by resetting its entry count to zero. */
static inline void group_table_init(struct super_block *sb, ext4_group_t group)
{
E4DEFRAG_GT(sb, group)->nr_entries = 0;
}
static inline ext4_grpblk_t defrag_free(struct super_block *sb,
					ext4_group_t group, bool idx);
static inline ext4_grpblk_t defrag_first_free(struct super_block *sb,
					      ext4_group_t group, bool idx);

/*
 * Decide whether the group table of @group is good enough to drive
 * extent loading.
 *
 * The lengths recorded in the table are summed, reduced by the recorded
 * first-free offset of the group, and compared against the number of
 * clusters past that offset that are not free.  The table is accepted
 * when at least a quarter of that amount is covered.  An empty table is
 * never valid.
 */
static inline bool is_group_table_valid(struct super_block *sb,
					ext4_group_t group)
{
	struct group_table *table = E4DEFRAG_GT(sb, group);
	struct group_entry *ge;
	unsigned int first, free, expect;
	unsigned int found = 0;
	int idx;

	if (table->nr_entries == 0) {
		return false;
	}

	first = defrag_first_free(sb, group, 0);
	free = defrag_free(sb, group, 0);
	expect = EXT4_CLUSTERS_PER_GROUP(sb) - first - free;

	group_table_for_each(table, ge, idx) {
		e4defrag_dbg("group %u entry (%u,%u,%u,%u)", group,
			     ge->group, ge->len, ge->first_bit,
			     ge->last_bit);
		found += ge->len;
	}

	/* discount the blocks in front of the first free block */
	if (found > first) {
		found -= first;
	} else {
		found = 0;
	}
	e4defrag_dbg("group %u table (%u,%u)", group, expect, found);

	/* valid when a quarter of the expected blocks are accounted for */
	return found >= expect / 4;
}
/*
 * Pick the inode number at which scanning for @group starts: the first
 * inode of the first table entry when the group table is populated,
 * otherwise the first inode of the group itself.
 */
static inline unsigned long get_first_ino(struct super_block *sb,
					  ext4_group_t group)
{
	struct group_table *table = E4DEFRAG_GT(sb, group);

	if (!table->nr_entries) {
		return E4DEFRAG_FIRST_INO(sb, group);
	}

	return group_entry_first_ino(sb, E4DEFRAG_GE(table, 0));
}
static inline bool defrag_scanned(struct super_block *sb, ext4_group_t group);

/*
 * Record in the table of block group @group that inode @ino owns @len
 * blocks there.
 *
 * Nothing is recorded when the inode's own group has already been fully
 * scanned.  Otherwise the inode is merged into the entry of its inode
 * group if one exists, appended if there is room, or else replaces the
 * entry with the smallest accumulated length.
 */
static inline void group_table_add_entry(struct super_block *sb,
					 ext4_group_t group,
					 unsigned int len, unsigned long ino)
{
	struct group_table *tbl = E4DEFRAG_GT(sb, group);
	struct group_entry *cur, *smallest, incoming;
	unsigned int bit;
	ext4_group_t owner;
	int slot;

	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	owner = (ino - 1) / EXT4_INODES_PER_GROUP(sb);

	if (defrag_scanned(sb, owner)) {
		return;
	}

	smallest = E4DEFRAG_GE(tbl, 0);
	for (slot = 0; slot < tbl->nr_entries; slot++) {
		cur = E4DEFRAG_GE(tbl, slot);
		if (cur->group == owner) {
			/* same inode group: widen the existing entry */
			group_entry_init(&incoming, owner, len, bit);
			group_entry_merge(cur, &incoming);
			return;
		}
		if (cur->len < smallest->len) {
			smallest = cur;
		}
	}

	if (tbl->nr_entries >= ARRAY_SIZE(tbl->entries)) {
		/* no free slot left: overwrite the least significant entry */
		e4defrag_msg("group %u table full, evict (%d,%d)", group,
			     smallest->group, smallest->len);
		group_entry_init(smallest, owner, len, bit);
		return;
	}

	/* slot == nr_entries here, i.e. the first unused slot */
	group_entry_init(E4DEFRAG_GE(tbl, slot), owner, len, bit);
	tbl->nr_entries++;
}
/*
 * Milliseconds elapsed between two jiffies samples, @start taken first.
 *
 * Both values are unsigned long, so 'end - start' is computed modulo the
 * width of the type and is already correct when the counter wrapped
 * between the two samples.  The previous special case for end < start
 * added MAX_JIFFY_OFFSET, which is not the wrap period of the counter and
 * therefore produced a wrong interval.
 */
static inline unsigned long
defrag_time_diff(unsigned long start, unsigned long end)
{
	return jiffies_to_msecs(end - start);
}
/* defrag state machine */
/*
 * Events fed to defrag_update_state(); each one updates the per-group
 * bookkeeping in a fixed way.
 */
enum defrag_phase {
/* reset all flags and counters of the group and empty its group table */
DEFRAG_INIT,
/* a defrag pass on the group begins; snapshot its free-space stats */
DEFRAG_START,
/* extent scanning was interrupted; remember the inode to resume from */
DEFRAG_SCAN_PAUSE,
/* the inodes of the group have been scanned */
DEFRAG_GROUP_SCANNED,
/* extent loading for the group finished */
DEFRAG_SCAN_DONE,
/* forget the scan result and the group table of the group */
DEFRAG_SCAN_RESET,
/* block moving was interrupted */
DEFRAG_PAUSE,
/* a paused pass is being resumed */
DEFRAG_CONTINUE,
/* the pass on the group finished; record the final stats */
DEFRAG_DONE,
};
/* the last argument for defrag_update_state */
/*
 * All members share storage.  Which one is read depends on the phase:
 * the anonymous struct for DEFRAG_START/DEFRAG_DONE, 'moved' for
 * DEFRAG_PAUSE and 'ino' for DEFRAG_SCAN_PAUSE.
 */
union defrag_state_data {
/* parameters for @DEFRAG_START and @DEFRAG_DONE */
struct {
ext4_grpblk_t first_free;
ext4_grpblk_t free;
ext4_grpblk_t fragments;
/* blocks moved; read by DEFRAG_DONE only */
ext4_lblk_t moved_;
};
/* parameters for @DEFRAG_PAUSE */
ext4_lblk_t moved;
/* parameter for @DEFRAG_SCAN_PAUSE: inode at which the scan stopped */
unsigned long ino;
};
/*
 * Apply the state transition @phase to the bookkeeping of @group.
 *
 * @dsd is dereferenced only by DEFRAG_START, DEFRAG_SCAN_PAUSE,
 * DEFRAG_PAUSE and DEFRAG_DONE; the other phases ignore it, so it may be
 * NULL for them.  An unknown phase triggers BUG_ON().
 *
 * Times are accumulated in milliseconds (see defrag_time_diff).
 */
static void defrag_update_state(struct super_block *sb, ext4_group_t group,
enum defrag_phase phase,
union defrag_state_data *dsd)
{
struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
struct defrag_group_state *dgs = E4DEFRAG_GS(sb, group);
unsigned long cost;
switch (phase) {
case DEFRAG_INIT:
/* wipe flags, counters and the group table */
dgs->scanned = 0;
dgs->started = 0;
dgs->paused = 0;
dgs->done = 0;
dgs->cost = 0;
dgs->count = 0;
dgs->duration = 0;
dgs->moved = 0;
dgs->scan = 0;
group_table_init(sb, group);
break;
case DEFRAG_START:
/* new pass: stamp the start time and snapshot the "before" stats in slot 0 */
dgs->start = jiffies;
dgs->update = jiffies;
dgs->loaded = 0;
dgs->started = 1;
dgs->paused = 0;
dgs->done = 0;
dgs->count++;
dgs->first_free[0] = dsd->first_free;
dgs->free[0] = dsd->free;
dgs->fragments[0] = dsd->fragments;
/* this group becomes the current defrag group; the scan starts at its first ino */
dfi->group = group;
dfi->last_ino = get_first_ino(sb, group);
break;
case DEFRAG_SCAN_PAUSE:
/* remember where to resume and charge the elapsed time to scan and cost */
dfi->last_ino = dsd->ino;
cost = defrag_time_diff(dgs->update, jiffies);
dgs->update = jiffies;
dgs->scan += cost;
dgs->cost += cost;
dgs->paused = 1;
break;
case DEFRAG_GROUP_SCANNED:
dgs->scanned = 1;
break;
case DEFRAG_SCAN_DONE:
/* charge the remaining scan time and mark the extents as loaded */
cost = defrag_time_diff(dgs->update, jiffies);
dgs->update = jiffies;
dgs->scan += cost;
dgs->cost += cost;
dgs->loaded = 1;
break;
case DEFRAG_SCAN_RESET:
dgs->scanned = 0;
group_table_init(sb, group);
break;
case DEFRAG_PAUSE:
/* 'update' is not refreshed here; DEFRAG_CONTINUE does that on resume */
dgs->paused = 1;
dgs->moved += dsd->moved;
dgs->cost += defrag_time_diff(dgs->update, jiffies);
break;
case DEFRAG_CONTINUE:
dgs->paused = 0;
dgs->update = jiffies;
break;
case DEFRAG_DONE:
/* pass finished: record the "after" stats in slot 1 */
dgs->loaded = 0;
dgs->started = 0;
dgs->paused = 0;
dgs->done = 1;
dgs->first_free[1] = dsd->first_free;
dgs->free[1] = dsd->free;
dgs->fragments[1] = dsd->fragments;
dgs->moved += dsd->moved_;
dgs->end = jiffies;
dgs->cost += defrag_time_diff(dgs->update, dgs->end);
/* weighted average: 1/8 of the previous duration plus 7/8 of this pass */
dgs->duration = (dgs->duration +
7 * defrag_time_diff(dgs->start,
dgs->end)) >> 3;
break;
default:
BUG_ON(1);
}
}
/* Report the force_mode setting of the filesystem's defrag info. */
static inline bool defrag_force_mode(struct super_block *sb)
{
return E4DEFRAG_I(sb)->force_mode;
}
/* get the current group to defrag (set by the DEFRAG_START transition) */
static inline ext4_group_t defrag_group(struct super_block *sb)
{
return E4DEFRAG_I(sb)->group;
}
/* Tell whether the inodes of @group have been scanned; BUGs on a bad group. */
static inline bool defrag_scanned(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].scanned;
}
/* Tell whether a defrag pass on @group has been started; BUGs on a bad group. */
static inline bool defrag_started(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].started;
}
/* Tell whether the extents of @group have been loaded; BUGs on a bad group. */
static inline bool defrag_extents_loaded(struct super_block *sb,
					 ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].loaded;
}
/* Tell whether the pass on @group is currently paused; BUGs on a bad group. */
static inline bool defrag_paused(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].paused;
}
/*
 * Tell whether @group counts as recently defragmented: its last pass
 * completed, and did so less than the configured interval ago.
 */
static inline bool defrag_done(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	struct defrag_group_state *state = &dfi->groups[group];
	unsigned int min_gap = dfi->interval;

	BUG_ON(group >= dfi->ngroups);

	if (state->done) {
		return defrag_time_diff(state->end, jiffies) < min_gap;
	}

	return false;
}
/*
 * Tell whether defragmenting @group is too expensive: either no pass was
 * ever counted, or the average cost per pass is at least 10000 ms.
 */
static inline bool defrag_too_long(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	struct defrag_group_state *state = &dfi->groups[group];

	BUG_ON(group >= dfi->ngroups);

	if (state->count == 0) {
		return true;
	}

	return state->cost / state->count >= 10000;
}
/*
 * Recorded first free block of @group.  @idx selects the snapshot:
 * 0 is written at DEFRAG_START, 1 at DEFRAG_DONE.
 */
static inline ext4_grpblk_t defrag_first_free(struct super_block *sb,
					      ext4_group_t group, bool idx)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].first_free[idx];
}
/*
 * Recorded free block count of @group.  @idx selects the snapshot:
 * 0 is written at DEFRAG_START, 1 at DEFRAG_DONE.
 */
static inline ext4_grpblk_t defrag_free(struct super_block *sb,
					ext4_group_t group, bool idx)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].free[idx];
}
/*
 * Recorded fragment count of @group.  @idx selects the snapshot:
 * 0 is written at DEFRAG_START, 1 at DEFRAG_DONE.
 */
static inline ext4_grpblk_t defrag_fragments(struct super_block *sb,
					     ext4_group_t group, bool idx)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].fragments[idx];
}
/* Accumulated time (ms) spent loading the extents of @group. */
static inline unsigned long defrag_scan_cost(struct super_block *sb,
					     ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].scan;
}
/* Accumulated total time (ms) spent defragmenting @group. */
static inline unsigned long defrag_cost(struct super_block *sb,
					ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].cost;
}
/* Number of defrag passes started on @group so far. */
static inline unsigned long defrag_count(struct super_block *sb,
					 ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].count;
}
/*
 * Duration figure for @group: the stored average once a pass has
 * completed, otherwise the time elapsed since the recorded start.
 */
static inline unsigned long defrag_duration(struct super_block *sb,
					    ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	struct defrag_group_state *state = &dfi->groups[group];

	BUG_ON(group >= dfi->ngroups);

	if (state->done) {
		return state->duration;
	}

	return defrag_time_diff(state->start, jiffies);
}
/* Total number of blocks moved so far while defragmenting @group. */
static inline unsigned long defrag_moved(struct super_block *sb,
					 ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	BUG_ON(group >= dfi->ngroups);

	return dfi->groups[group].moved;
}
/* extent routines */
/*
 * In/out cursor for the free-extent query callbacks: callers seed it and
 * the callbacks overwrite it with the free extent they settle on.
 */
struct free_extent_info {
/* group-relative first block of the free extent */
ext4_grpblk_t start;
/* length of the free extent; find_free_extent also reads it as the minimum wanted */
ext4_grpblk_t len;
};
/*
 * ext4_mballoc_query_range() callback that captures the first free extent
 * it is shown into the free_extent_info behind @priv and returns 1.
 * NOTE(review): a non-zero return presumably ends the walk - confirm
 * against ext4_mballoc_query_range().
 */
static int get_free_extent(struct super_block *sb, ext4_group_t group,
			   ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	struct free_extent_info *out = priv;

	out->len = len;
	out->start = start;

	return 1;
}
/*
 * Look up the first free extent of @group at or after @start.
 *
 * Returns the start of that extent and, when @len is non-NULL, stores its
 * length there.  If the query reports nothing, @start itself is returned
 * with a length of 0.
 */
static ext4_grpblk_t e4defrag_next_free_extent(struct super_block *sb,
					       ext4_group_t group,
					       ext4_grpblk_t start,
					       ext4_grpblk_t *len)
{
	struct free_extent_info found = {
		.start = start,
		.len = 0,
	};

	ext4_mballoc_query_range(sb, group, start, -1, get_free_extent,
				 &found);
	if (len != NULL) {
		*len = found.len;
	}

	return found.start;
}
/*
 * Ordering used by the extent tree: descending physical block number.
 * Returns -1 when @ex1 sorts first (larger pblk), 1 when it sorts last,
 * and 0 when both start at the same physical block.
 */
static inline int ex_cmp(struct defrag_extent *ex1, struct defrag_extent *ex2)
{
	if (ex1->pblk == ex2->pblk) {
		return 0;
	}

	return (ex1->pblk > ex2->pblk) ? -1 : 1;
}
/*
 * Account one mapped extent of @inode and, if it lies in the group that
 * is currently being defragmented, add it to the extent tree.
 *
 * @inode: owner of the extent
 * @lblk:  first logical block
 * @len:   length in blocks
 * @pblk:  first physical block
 *
 * The group table of the block group containing @pblk is always updated.
 * Extents outside the current defrag group, or in front of its recorded
 * first free block, are not kept.
 *
 * When the tree already holds an extent starting at @pblk, that node is
 * overwritten with the new description.  This includes the owning inode:
 * leaving the old inode number in place would pair the new logical block
 * with the wrong file when the extent is later moved.
 *
 * Returns @len when a new node was inserted, 0 when nothing was inserted
 * (extent skipped or existing node refreshed), or -ENOMEM when no node
 * could be allocated.  Callers only test for a negative value.
 */
static int e4defrag_add_extent(struct inode *inode, ext4_lblk_t lblk, int len,
			       ext4_fsblk_t pblk)
{
	struct defrag_extent *ex, *new_ex;
	ext4_group_t group;
	ext4_grpblk_t offset;
	struct group_extent_tree *tree = E4DEFRAG_ET(inode->i_sb);
	struct rb_node **n = &tree->root.rb_node;
	struct rb_node *parent = NULL;

	/* update group table */
	ext4_get_group_no_and_offset(inode->i_sb, pblk, &group, &offset);
	group_table_add_entry(inode->i_sb, group, len, inode->i_ino);

	/* only maintain the extents in the defrag group */
	if (group != defrag_group(inode->i_sb)
	    || offset < defrag_first_free(inode->i_sb, group, 0)) {
		return 0;
	}

	new_ex = e4defrag_alloc_extent(inode->i_ino, lblk, len, pblk);
	if (!new_ex) {
		return -ENOMEM;
	}

	while (*n) {
		int res;

		parent = *n;
		ex = rb_entry(parent, struct defrag_extent, node);
		res = ex_cmp(new_ex, ex);
		if (res < 0) {
			n = &(*n)->rb_left;
		} else if (res > 0) {
			n = &(*n)->rb_right;
		} else {
			/* same physical start: refresh the node in place */
			e4defrag_err("extent overlap found\n");
			e4defrag_free_extent(new_ex);
			tree->len += len - ex->len;
			ex->ino = inode->i_ino;
			ex->lblk = lblk;
			ex->len = len;
			ex->pblk = pblk;
			return 0;
		}
	}

	rb_link_node(&new_ex->node, parent, n);
	rb_insert_color(&new_ex->node, &tree->root);
	tree->len += len;

	return len;
}
/* remove defrag extent from extents tree */
static inline void e4defrag_remove_extent(struct super_block *sb,
struct defrag_extent *ex)
{
rb_erase(&ex->node, &E4DEFRAG_ET(sb)->root);
e4defrag_free_extent(ex);
}
/*
 * Tell whether block @offset of @group is free, i.e. whether the first
 * free extent at or after @offset starts exactly there and is non-empty.
 */
static inline bool is_block_free(struct super_block *sb, ext4_group_t group,
				 ext4_grpblk_t offset)
{
	ext4_grpblk_t free_start;
	ext4_grpblk_t free_len;

	free_start = e4defrag_next_free_extent(sb, group, offset, &free_len);
	if (free_start != offset) {
		return false;
	}

	return free_len > 0;
}
/**
 * Strictness levels for is_move_ok().
 *
 * @MC_NONE: all extent can be moved
 * @MC_LENGTH: extent which length matched can be moved
 * @MC_FREE_ONE: extent which length matched and one side free can be moved
 * @MC_FREE_BOTH: extent which length matched and both side free can be moved
 *
 * "Length matched" means the extent is not longer than the limit passed
 * to is_move_ok().
 */
enum move_check {
MC_NONE,
MC_LENGTH,
MC_FREE_ONE,
MC_FREE_BOTH,
};
/*
 * Decide whether extent @ex may be moved under policy @mc.
 *
 * @len is the maximum extent length that is eligible.  MC_NONE accepts
 * everything; every other policy rejects extents longer than @len, and
 * MC_LENGTH asks for nothing more.  The remaining policies look at the
 * blocks adjacent to the extent inside its group: a free neighbour on one
 * side is enough for MC_FREE_ONE, or for any policy when the extent is
 * exactly @len long; otherwise both neighbours must be free.
 */
static inline bool is_move_ok(struct super_block *sb, struct defrag_extent *ex,
			      ext4_grpblk_t len, enum move_check mc)
{
	ext4_group_t grp;
	ext4_grpblk_t pos;
	bool free_before, free_after;
	bool exact, one_side_enough;

	if (mc == MC_NONE) {
		return true;
	}
	/* extent is too large */
	if (ex->len > len) {
		return false;
	}
	if (mc == MC_LENGTH) {
		return true;
	}

	exact = (len == ex->len);
	one_side_enough = (mc == MC_FREE_ONE) || exact;

	ext4_get_group_no_and_offset(sb, ex->pblk, &grp, &pos);

	/* neighbour in front of the extent */
	free_before = (pos > 0) && is_block_free(sb, grp, pos - 1);
	if (free_before) {
		if (one_side_enough) {
			return true;
		}
	} else if (mc == MC_FREE_BOTH && !exact) {
		/* both sides are required and the front one already failed */
		return false;
	}

	/* neighbour right behind the extent */
	pos += ex->len;
	free_after = (pos < EXT4_CLUSTERS_PER_GROUP(sb)) &&
		     is_block_free(sb, grp, pos);

	return free_after && (free_before || one_side_enough);
}
/*
 * Per-extent callback for ext4_query_extents_range(): feed one
 * extent-status entry of the inode passed in @arg to
 * e4defrag_add_extent().
 *
 * Delayed entries, holes and entries whose physical block is 0 are
 * skipped.  Returns true only when adding the extent failed.
 * NOTE(review): true presumably stops the walk - confirm against
 * ext4_query_extents_range().
 */
static bool add_extent_fn(struct extent_status *es, void *arg)
{
	struct inode *inode = arg;
	ext4_fsblk_t pblk = 0;

	if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
		pblk = ext4_es_pblock(es);
	}

	/* only load extent with pblk mapped */
	if (pblk == 0) {
		return false;
	}

	return e4defrag_add_extent(inode, es->es_lblk, es->es_len, pblk) < 0;
}
static bool is_idle(struct super_block *sb);
static bool load_extents(struct inode *inode, void *priv)
{
int depth, bits;
ext4_lblk_t blocks;
struct super_block *sb = inode->i_sb;
if (!is_idle(sb)) {
union defrag_state_data dsd;
dsd.ino = inode->i_ino;
defrag_update_state(sb, defrag_group(sb), DEFRAG_SCAN_PAUSE,
&dsd);
return true;
}
e4defrag_dbg("load extents from ino: %lu\n", inode->i_ino);
/* only load the extent of regualar file not inlined */
if (!S_ISREG(inode->i_mode) || ext4_should_journal_data(inode) ||
IS_SWAPFILE(inode) || ext4_is_quota_file(inode) ||
ext4_has_inline_data(inode) || !inode->i_size ||
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
return false;
}
/* skip encrypted file without ICE */
if (IS_ENCRYPTED(inode)) {
int err = fscrypt_require_key(inode);
if (err) {
e4defrag_dbg("ino:%lu required key failed, err:%d\n",
inode->i_ino, err);
return false;
}
/* TODO: */
/*
if (!fscrypt_using_hardware_encryption(inode)) {
e4defrag_dbg("ino:%lu not ICE\n", inode->i_ino);
return false;
}
*/
}
down_read(&EXT4_I(inode)->i_data_sem);
depth = ext_depth(inode);
/* special case: all extents in i_data */
if (depth == 0) {
struct ext4_extent_header *eh;
struct ext4_extent *ex;
eh = ext_inode_hdr(inode);
for (ex = EXT_FIRST_EXTENT(eh); ex <= EXT_LAST_EXTENT(eh); ex++) {
e4defrag_add_extent(inode,
le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
}
up_read(&EXT4_I(inode)->i_data_sem);
return false;
}
up_read(&EXT4_I(inode)->i_data_sem);
/* load the extent of the inode */
bits = sb->s_blocksize_bits;
blocks = (inode->i_size + (1 << bits) - 1) >> bits;
ext4_query_extents_range(inode, 0, blocks, add_extent_fn,
(void *)inode);
return false;
}
/*
 * Tell whether enough extents of @group are known to make moving them
 * worthwhile: the blocks collected in the extent tree must cover at
 * least a quarter of the clusters past the recorded first free block
 * that are not free.
 */
static inline bool defrag_extents_enough(struct super_block *sb,
					 ext4_group_t group)
{
	unsigned int first = defrag_first_free(sb, group, 0);
	unsigned int free = defrag_free(sb, group, 0);
	unsigned int expect = EXT4_CLUSTERS_PER_GROUP(sb) - first - free;
	unsigned int found = E4DEFRAG_ET(sb)->len;

	e4defrag_dbg("group %u extents (%u,%u,%u,%u)", group, expect, found,
		     first, free);

	/* 1/4 of extents found */
	return found >= expect / 4;
}
/*
 * Build the extent tree of block group @group by scanning inodes.
 *
 * Phase 1 walks the group table of @group.  Every entry names an inode
 * group that is known to own blocks in @group.  If that inode group has
 * been fully scanned before, only the inode range recorded in the entry
 * is visited; otherwise the whole inode group named by the entry is
 * visited and then marked as scanned.  A paused scan resumes at
 * dfi->last_ino: entries in front of the one containing that inode are
 * skipped.
 *
 * Phase 2 runs when the table still does not cover enough of the group.
 * It scans not-yet-scanned inode groups, starting with the one containing
 * last_ino and continuing downwards with wrap-around, until the table
 * becomes valid or every group has been visited.
 *
 * Fixes over the previous version:
 *  - the "not scanned" branch of phase 1 used @group instead of the
 *    entry's inode group, both for the inode range and for the scanned
 *    check, so it visited the wrong inodes and could mark an inode group
 *    as scanned that had never been visited;
 *  - the entry's inode group is sampled before the scan, because loading
 *    extents can rewrite table entries (merge or eviction);
 *  - the inner loop counter no longer shadows the outer one.
 *
 * Returns true when loading completed (DEFRAG_SCAN_DONE was recorded) and
 * false when the scan was paused.
 */
static bool e4defrag_load_extents(struct super_block *sb, ext4_group_t group)
{
	unsigned long start, end;
	unsigned long last_ino;
	int i;
	bool is_first = true;
	struct group_table *tbl = E4DEFRAG_GT(sb, group);
	struct group_entry *entry;

	last_ino = E4DEFRAG_I(sb)->last_ino;
	e4defrag_dbg("group %u last_ino: %lu\n", group, last_ino);

	/* load from group table first */
	group_table_for_each(tbl, entry, i) {
		ext4_group_t ino_group = entry->group;
		bool was_scanned = defrag_scanned(sb, ino_group);

		if (was_scanned) {
			start = group_entry_first_ino(sb, entry);
			end = group_entry_last_ino(sb, entry);
		} else {
			start = E4DEFRAG_FIRST_INO(sb, ino_group);
			end = E4DEFRAG_LAST_INO(sb, ino_group);
		}
		if (is_first) {
			/* we continue from last paused point */
			if ((last_ino < start) || (last_ino > end)) {
				continue;
			}
			start = last_ino;
			is_first = false;
		}
		ext4_query_inode_range(sb, start, end, load_extents, NULL);
		if (defrag_paused(sb, group)) {
			return false;
		}
		if (!was_scanned) {
			defrag_update_state(sb, ino_group,
					    DEFRAG_GROUP_SCANNED, NULL);
		}
	}

	if (!is_group_table_valid(sb, group)) {
		ext4_group_t ngroups, visited, grp;

		grp = (last_ino - 1) / EXT4_INODES_PER_GROUP(sb);
		ngroups = E4DEFRAG_NGROUPS(sb);
		/* search group in reverse order from local group */
		for (visited = 0; visited < ngroups; visited++, grp--) {
			if (visited == 0) {
				start = last_ino;
			} else {
				/* stepped below group 0: wrap to the last group */
				if (grp == (ext4_group_t)-1) {
					grp = ngroups - 1;
				}
				start = E4DEFRAG_FIRST_INO(sb, grp);
			}
			end = E4DEFRAG_LAST_INO(sb, grp);
			if (defrag_scanned(sb, grp)) {
				continue;
			}
			ext4_query_inode_range(sb, start, end,
					       load_extents, NULL);
			if (defrag_paused(sb, group)) {
				return false;
			}
			defrag_update_state(sb, grp, DEFRAG_GROUP_SCANNED,
					    NULL);
			/* check if enough extents found */
			if (is_group_table_valid(sb, group)) {
				break;
			}
		}
	}

	defrag_update_state(sb, group, DEFRAG_SCAN_DONE, NULL);

	return true;
}
/*
 * Destroy the extent tree: unlink and free every node, then reset the
 * accumulated length.  @group is not consulted; the tree is the single
 * one attached to @sb.
 */
static void e4defrag_release_extents(struct super_block *sb, ext4_group_t group)
{
	struct group_extent_tree *tree = E4DEFRAG_ET(sb);
	struct rb_node *cur, *next;

	for (cur = rb_first(&tree->root); cur; cur = next) {
		/* fetch the successor before the node goes away */
		next = rb_next(cur);
		e4defrag_remove_extent(sb, rb_entry(cur, struct defrag_extent,
						    node));
	}

	tree->len = 0;
}
/*
 * Move the file range [@lblk, @lblk + @len) of inode @ino to the physical
 * blocks [@goal, @goal + @len).
 *
 * The target blocks are first allocated to the donor inode as one
 * unwritten extent at logical block 0, then swapped with the original
 * file's blocks through ext4_move_extents(); finally whatever the donor
 * holds in [0, @len) is released again.
 *
 * @moved_len, if non-NULL, receives the number of blocks moved, but only
 * when the function succeeds.
 *
 * Returns 0 on success or a negative errno.
 *
 * Fixes over the previous version:
 *  - a failed ext4_find_extent() no longer hands its ERR_PTR to
 *    ext4_ext_drop_refs()/kfree();
 *  - the allocation request and the two on-stack struct file objects are
 *    zeroed instead of carrying stack garbage in the fields not set here;
 *  - the jbd2 inode is attached before any block is allocated, so a
 *    failure there can no longer leave the freshly inserted extent in the
 *    donor inode;
 *  - on insert failure the number of blocks actually allocated (ar.len)
 *    is freed rather than the number requested.
 */
static int do_move(struct super_block *sb, unsigned long ino,
		   ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t goal,
		   ext4_lblk_t *moved_len)
{
	struct ext4_extent newext;
	struct ext4_ext_path *path;
	struct inode *orig_inode, *donor_inode;
	struct file orig_file, donor_file;
	handle_t *handle;
	__u64 moved = 0;
	int credits, err = 0;
	struct ext4_allocation_request ar;

#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0))
	orig_inode = ext4_iget_normal(sb, ino);
#else
	orig_inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
#endif
	if (IS_ERR(orig_inode)) {
		err = PTR_ERR(orig_inode);
		e4defrag_err("fail to get original ino: %lu, err: %d\n", ino,
			     err);
		goto out;
	}

	err = fscrypt_require_key(orig_inode);
	if (err) {
		e4defrag_err("fail to require key for ino: %lu, err: %d\n", ino,
			     err);
		goto out_iput;
	}

	/* needed by the extent swap; done first so failure leaves no state */
	err = ext4_inode_attach_jinode(orig_inode);
	if (err) {
		e4defrag_err("fail to attach jinode, err: %d\n", err);
		goto out_iput;
	}

	donor_inode = EXT4_SB(sb)->dfi.donor_inode;
	down_write(&EXT4_I(donor_inode)->i_data_sem);
	path = ext4_find_extent(donor_inode, 0, NULL, EXT4_EX_NOCACHE);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		e4defrag_err("fail to get extpath of donor inode, err: %d\n",
			     err);
		/* path is an error pointer: it must not be dropped or freed */
		goto out_iput;
	}

	credits =
	    3 + ext4_ext_calc_credits_for_single_extent(donor_inode, len, path);
	handle = ext4_journal_start(donor_inode, EXT4_HT_MOVE_EXTENTS, credits);
	if (IS_ERR(handle)) {
		err = PTR_ERR(handle);
		e4defrag_err("fail to start journal, err: %d\n", err);
		goto out_put;
	}

	/* allocate [@goal, @goal + @len) for donor inode */
	memset(&ar, 0, sizeof(ar));
	ar.inode = donor_inode;
	ar.goal = goal;
	ar.len = len;
	ar.flags = EXT4_MB_HINT_DATA | EXT4_MB_HINT_NOPREALLOC |
	    EXT4_MB_HINT_GOAL_ONLY | EXT4_MB_HINT_TRY_GOAL;
	ext4_mb_new_blocks(handle, &ar, &err);
	if (err) {
		e4defrag_err("fail to reserve [%llu,%d), err: %d\n",
			     ar.goal, len, err);
		ext4_journal_stop(handle);
		goto out_put;
	}

	/* prepare extent for donor inode */
	newext.ee_block = cpu_to_le32(0);
	newext.ee_len = cpu_to_le16(ar.len);
	ext4_ext_mark_unwritten(&newext);
	ext4_ext_store_pblock(&newext, ar.goal);
	down_write(&EXT4_I(donor_inode)->i_data_sem);
	err = ext4_ext_insert_extent(handle, donor_inode, &path, &newext, 0);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
	if (err) {
		e4defrag_err("fail to insert extent, err: %d\n", err);
		/* give back exactly what the allocator handed out */
		ext4_free_blocks(handle, donor_inode, NULL, goal, ar.len,
				 EXT4_FREE_BLOCKS_FORGET);
		ext4_journal_stop(handle);
		goto out_put;
	}
	ext4_journal_stop(handle);

	/* swap the extent of inode and donor inode */
	memset(&orig_file, 0, sizeof(orig_file));
	memset(&donor_file, 0, sizeof(donor_file));
	orig_file.f_inode = orig_inode;
	donor_file.f_inode = donor_inode;
	err = ext4_move_extents(&orig_file, &donor_file, lblk, 0, len, &moved);
	e4defrag_dbg("moved: %llu/%d\n", moved, err);
	if (!err && moved_len) {
		*moved_len = moved;
	}

	/* release whatever the donor holds in [0, len), moved or not */
	down_write(&EXT4_I(donor_inode)->i_data_sem);
	ext4_es_remove_extent(donor_inode, 0, len);
	ext4_ext_remove_space(donor_inode, 0, len - 1);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
out_put:
	ext4_ext_drop_refs(path);
	kfree(path);
out_iput:
	iput(orig_inode);
out:
	return err;
}
static long read_bio_inflight(void);

/*
 * Tell whether defrag work may proceed right now.  Force mode always says
 * yes; otherwise the answer is yes only while read_bio_inflight() reports
 * zero.
 */
static bool is_idle(struct super_block *sb)
{
	long inflight;

	if (defrag_force_mode(sb)) {
		return true;
	}

	inflight = read_bio_inflight();
	if (likely(inflight == 0)) {
		return true;
	}

	e4defrag_dbg("bio inflight: %ld\n", inflight);
	return false;
}
/*
 * ext4_mballoc_query_range() callback that accepts the first free extent
 * which is at least as long as the length currently stored behind @priv.
 * An accepted extent is recorded there and 1 is returned; shorter extents
 * yield 0.
 * NOTE(review): a non-zero return presumably ends the walk - confirm
 * against ext4_mballoc_query_range().
 */
static int find_free_extent(struct super_block *sb, ext4_group_t group,
			    ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	struct free_extent_info *want = priv;

	if (len < want->len) {
		return 0;
	}

	want->start = start;
	want->len = len;
	return 1;
}
/*
 * Find the first free extent in front of an extent that is large enough
 * to receive it.
 *
 * The free space of @group is searched in the range [0, @ex_blk) for an
 * extent of at least @ex_len blocks.  Returns the group-relative start of
 * that extent and, when @len is non-NULL, stores its length there.
 * Returns -1 when no suitable extent exists; *@len is then left untouched.
 */
static ext4_grpblk_t e4defrag_find_free_extent(struct super_block *sb,
					       ext4_group_t group,
					       ext4_grpblk_t ex_blk,
					       ext4_grpblk_t ex_len,
					       ext4_grpblk_t *len)
{
	struct free_extent_info target = {
		.start = ex_blk,
		.len = ex_len,
	};

	ext4_mballoc_query_range(sb, group, 0, ex_blk, find_free_extent,
				 &target);

	/* the seed is still in place: nothing in front of the extent fits */
	if (target.start >= ex_blk) {
		return -1;
	}

	if (len != NULL) {
		*len = target.len;
	}
	return target.start;
}
/*
 * Defragment one group by moving loaded extents into free space in front
 * of them, and return the number of blocks moved.
 *
 * Extents are visited in tree order (see ex_cmp: highest physical block
 * first).  An extent is dropped from the tree when it may not be moved,
 * when no free extent in front of it can hold it, when the move fails,
 * or once it has been moved completely.  After a partial move the extent
 * is shrunk and retried.
 *
 * The walk stops early, recording DEFRAG_PAUSE with the blocks moved so
 * far, as soon as the system is no longer idle.
 *
 * When too few extents of the group are known, nothing is moved; if the
 * group table nevertheless claims to be valid it is considered obsolete
 * and the scan state of every group is reset.
 *
 * Fixes over the previous version:
 *  - a move that succeeds but reports 0 blocks moved used to leave the
 *    extent and the cursor unchanged, so the same extent was retried
 *    until the idle check failed; such an extent is now dropped;
 *  - the moved length is received in an ext4_lblk_t, matching the
 *    parameter type of do_move(), instead of reusing the signed
 *    free-extent length variable.
 */
static ext4_lblk_t do_defrag(struct super_block *sb, ext4_group_t group)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	struct rb_node *n;
	struct defrag_extent *ex;
	ext4_grpblk_t len = 0;
	unsigned long moved = 0;
	ext4_fsblk_t base, goal;

	/* do not defrag the group if too many extents unknown */
	if (!defrag_extents_enough(sb, group)) {
		e4defrag_msg("group %u not enough extents found\n", group);
		/* this means group table is obsolete, we need to reset table */
		if (is_group_table_valid(sb, group)) {
			ext4_group_t i;

			for (i = 0; i < E4DEFRAG_NGROUPS(sb); i++) {
				defrag_update_state(sb, i, DEFRAG_SCAN_RESET,
						    NULL);
			}
		}
		return 0;
	}

	base = ext4_group_first_block_no(sb, group);
	n = rb_first(&dfi->tree.root);
	while (n) {
		ext4_lblk_t chunk, done = 0;

		if (!is_idle(sb)) {
			union defrag_state_data dsd;

			dsd.moved = moved;
			defrag_update_state(sb, group, DEFRAG_PAUSE, &dsd);
			break;
		}

		ex = rb_entry(n, struct defrag_extent, node);
		if (!is_move_ok(sb, ex, dfi->max_len, MC_FREE_ONE)) {
			n = rb_next(n);
			e4defrag_remove_extent(sb, ex);
			continue;
		}
		e4defrag_dbg("ex[%u,%u/%llu/%u)\n", ex->lblk, ex->len, ex->pblk,
			     ex->ino);

		/* find the free extent move to */
		goal =
		    e4defrag_find_free_extent(sb, group, ex->pblk - base,
					      ex->len, &len);
		if (goal == -1) {
			n = rb_next(n);
			e4defrag_remove_extent(sb, ex);
			continue;
		}
		goal += base;
		e4defrag_dbg("fex[%llu,%u)\n", goal, len);

		chunk = min_t(ext4_lblk_t, len, ex->len);
		/* 'done' is only written by do_move() on success */
		if (!do_move(sb, ex->ino, ex->lblk, chunk, goal, &done)) {
			moved += done;
		}

		if (done > 0 && done < ex->len) {
			/* partial moved: keep the remainder for another round */
			ex->lblk += done;
			ex->pblk += done;
			ex->len -= done;
		} else {
			/* fully moved, move failed, or no progress: drop it */
			n = rb_next(n);
			e4defrag_remove_extent(sb, ex);
		}
	}
	e4defrag_dbg("total moved: %lu\n", moved);

	return moved;
}
#ifdef DEBUG
/* Free-space statistics of one group, gathered by acct_freefrag(). */
struct freefrag_info {
	/* start of the first free extent seen, -1 while none was seen */
	ext4_grpblk_t first_free;
	/* total number of free blocks */
	ext4_grpblk_t free;
	/* number of free extents */
	ext4_grpblk_t fragments;
	/* histogram of free extents by floor(log2(len)); bucket 7 takes all >= 128 */
	ext4_grpblk_t counters[8];
};

/*
 * ext4_mballoc_query_range() callback accounting one free extent in the
 * freefrag_info behind @priv.  Always returns 0 so every extent is seen.
 */
static int acct_freefrag(struct super_block *sb, ext4_group_t group,
			 ext4_grpblk_t start, ext4_grpblk_t len, void *priv)
{
	struct freefrag_info *ffi = priv;
	int order;

	if (ffi->first_free == -1) {
		ffi->first_free = start;
	}
	ffi->free += len;
	ffi->fragments++;

	order = fls(len) - 1;
	/* clamp to the histogram; fls(0) - 1 would index in front of it */
	if (order < 0) {
		order = 0;
	}
	if (order > 7) {
		order = 7;
	}
	ffi->counters[order]++;

	return 0;
}

/*
 * Log the free-space statistics and the defrag counters of @group.
 * All eight histogram buckets are printed, and each message ends with a
 * newline.
 */
static void show_freefrag(struct super_block *sb, ext4_group_t group)
{
	struct freefrag_info ffi;

	memset(&ffi, 0, sizeof(ffi));
	ffi.first_free = -1;
	ext4_mballoc_query_range(sb, group, 0, -1, acct_freefrag, &ffi);

	e4defrag_msg("group %u free stats: %d %d %d\n", group,
		     ffi.first_free, ffi.free, ffi.fragments);
	e4defrag_msg("fragments: %d %d %d %d %d %d %d %d\n", ffi.counters[0],
		     ffi.counters[1], ffi.counters[2], ffi.counters[3],
		     ffi.counters[4], ffi.counters[5], ffi.counters[6],
		     ffi.counters[7]);
	e4defrag_msg("defrag stats: %lu %lu %lu %lu %lu\n",
		     defrag_count(sb, group), defrag_moved(sb, group),
		     defrag_scan_cost(sb, group), defrag_cost(sb, group),
		     defrag_duration(sb, group));
}
#else
/* Statistics dump is compiled out without DEBUG. */
static inline void show_freefrag(struct super_block *sb, ext4_group_t group)
{
}
#endif
/*
 * ext4_mb_query_group_info() callback: copy the free-space counters of
 * group info @grp into the defrag_state_data behind @priv.  Always
 * returns true.
 */
bool get_free_stats(struct ext4_group_info *grp, ext4_group_t group, void *priv)
{
	union defrag_state_data *out = priv;

	out->fragments = grp->bb_fragments;
	out->free = grp->bb_free;
	out->first_free = grp->bb_first_free;

	return true;
}
/*
 * Run (or resume) a defrag pass on @group.
 *
 * Returns true when the pass was closed - finished normally, or paused
 * but already too expensive on average - and false when it was
 * interrupted and should be resumed later.
 */
static bool defrag_one_group(struct super_block *sb, ext4_group_t group)
{
	union defrag_state_data dsd;
	unsigned long moved;

	/* resuming a paused pass restarts the cost clock */
	if (defrag_paused(sb, group)) {
		defrag_update_state(sb, group, DEFRAG_CONTINUE, NULL);
	}

	/* the extent tree must be populated before anything can be moved */
	if (!defrag_extents_loaded(sb, group)) {
		e4defrag_load_extents(sb, group);
		if (defrag_paused(sb, group)) {
			return false;
		}
	}

	show_freefrag(sb, group);
	/* move data to do defrag */
	moved = do_defrag(sb, group);
	show_freefrag(sb, group);

	/* interrupted, and still cheap enough to be worth resuming */
	if (defrag_paused(sb, group) && !defrag_too_long(sb, group)) {
		return false;
	}

	/* the pass is over: drop the tree and record the final stats */
	e4defrag_release_extents(sb, group);
	memset(&dsd, 0, sizeof(dsd));
	ext4_mb_query_group_info(sb, group, 1, get_free_stats, &dsd,
				 false);
	/* on a pause the moved blocks were already accounted by DEFRAG_PAUSE */
	if (!defrag_paused(sb, group)) {
		dsd.moved_ = moved;
	}
	defrag_update_state(sb, group, DEFRAG_DONE, &dsd);

	return true;
}
/**
 * Policies used by defrag_select_group() to choose among eligible groups.
 *
 * @GSP_FIRST_MATCH: select the first group satisfy free/score threshold
 * @GSP_MAX_FRAGMENTS: select the group with max fragments
 * @GSP_MAX_SCORE: select the group with max fragmentation score
 */
enum bg_select_policy {
GSP_FIRST_MATCH,
GSP_MAX_FRAGMENTS,
GSP_MAX_SCORE,
};
/* Input and result of one group selection run; see defrag_select_group(). */
struct bg_select_data {
struct super_block *sb;
enum bg_select_policy policy;
/* when set, 'next' is computed as the group below the visited one */
bool reverse;
/* 'group': selected group, -1 while none; 'next': updated on every visited group */
ext4_group_t group, next;
/* groups scoring below this are skipped */
unsigned int min_score;
/* groups with no more free blocks than this are skipped */
unsigned int min_free;
/* stats of selected group */
unsigned int score;
unsigned int first_free;
unsigned int free;
unsigned int fragments;
};
/*
 * Prepare @gsd for a selection run over the groups of @sb using @policy.
 *
 * The thresholds are taken from the filesystem's defrag info, the walk
 * direction is set to reverse, no group is selected yet (group == -1)
 * and all result statistics start at zero.  first_free is cleared as
 * well; it used to be left uninitialised when no group was selected.
 */
static inline void bg_select_data_init(struct super_block *sb,
				       struct bg_select_data *gsd,
				       enum bg_select_policy policy)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);

	gsd->sb = sb;
	gsd->policy = policy;
	gsd->reverse = true;
	gsd->min_free = dfi->min_free;
	gsd->min_score = FRAG_SCORE(dfi->min_score);
	gsd->group = -1;
	gsd->next = 0;
	gsd->score = 0;
	gsd->first_free = 0;
	gsd->free = 0;
	gsd->fragments = 0;
}
/*
 * Tell whether a group with @free free blocks split into @fragments
 * fragments is worth defragmenting: it must have more than min_free free
 * blocks and a fragmentation score of at least min_score.
 */
bool need_defrag(struct super_block *sb, unsigned free, unsigned fragments)
{
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	unsigned score;

	/* too little free space to be worth the effort */
	if (free <= dfi->min_free) {
		return false;
	}

	score = (fragments - 1) * FRAG_SCORE_FACTOR / free;

	/* only groups that are fragmented enough qualify */
	return score >= dfi->min_score;
}
/*
 * Per-group callback that selects the group to defragment.
 *
 * @grp:   buddy info of the group being visited
 * @group: number of that group
 * @priv:  struct bg_select_data describing policy, thresholds and result
 *
 * Groups with too little free space, groups defragmented recently and
 * groups scoring below the threshold are skipped.  For GSP_FIRST_MATCH
 * the first remaining group is taken and true is returned; for the "max"
 * policies the best group seen so far is remembered and false is
 * returned.
 * NOTE(review): true presumably stops the caller's walk - confirm against
 * the iterator that invokes this callback.
 *
 * If @group lies beyond the tracked per-group state, the state array is
 * grown to the current group count first.
 *
 * Fixes over the previous version:
 *  - the old state array is freed after the new one is installed; before,
 *    the pointer was overwritten and then freed, which released the new
 *    array (use-after-free) and leaked the old one;
 *  - the new array is allocated zeroed and with an overflow-checked size;
 *  - a group that is still out of range after consulting the group count
 *    is skipped instead of running into the BUG_ON() of the accessors;
 *  - the local group counter no longer shadows the @grp parameter.
 * NOTE(review): the array swap is not synchronised here; confirm that
 * callers serialise against other users of dfi->groups.
 */
bool defrag_select_group(struct ext4_group_info *grp,
			 ext4_group_t group, void *priv)
{
	struct bg_select_data *gsd = priv;
	struct ext4_defrag_info *dfi = E4DEFRAG_I(gsd->sb);
	unsigned int score;

	if (unlikely(group >= dfi->ngroups)) {
		struct defrag_group_state *groups, *old_groups;
		ext4_group_t g, ngroups;

		e4defrag_msg("group number exceed ngroups (%u, %u)",
			     group, dfi->ngroups);
		ngroups = ext4_get_groups_count(gsd->sb);
		if (group >= ngroups) {
			/* not a valid group even by the current count */
			return false;
		}

		groups = kvmalloc_array(ngroups, sizeof(*groups),
					GFP_KERNEL | __GFP_ZERO);
		if (!groups) {
			return false;
		}
		memcpy(groups, dfi->groups,
		       dfi->ngroups * sizeof(*groups));

		/* install the new array, then release the one it replaces */
		old_groups = dfi->groups;
		dfi->groups = groups;
		kvfree(old_groups);

		for (g = dfi->ngroups; g < ngroups; g++) {
			defrag_update_state(gsd->sb, g, DEFRAG_INIT,
					    NULL);
		}
		dfi->ngroups = ngroups;
	}

	gsd->next = gsd->reverse ? group - 1 : group + 1;

	/* skip group if free space too small */
	if (grp->bb_free <= gsd->min_free) {
		return false;
	}
	/* skip group if the group defragged recently */
	if (defrag_done(gsd->sb, group)) {
		return false;
	}
	score = (grp->bb_fragments - 1) * FRAG_SCORE_FACTOR / grp->bb_free;
	/* skip group not fragmented */
	if (score < gsd->min_score) {
		return false;
	}

	switch (gsd->policy) {
	case GSP_FIRST_MATCH:
		gsd->group = group;
		gsd->score = score;
		gsd->first_free = grp->bb_first_free;
		gsd->free = grp->bb_free;
		gsd->fragments = grp->bb_fragments;
		return true;
	case GSP_MAX_FRAGMENTS:
		if (grp->bb_fragments > gsd->fragments) {
			/* the previous best becomes the follow-up candidate */
			if (gsd->group != -1) {
				gsd->next = gsd->group;
			}
			gsd->group = group;
			gsd->score = score;
			gsd->first_free = grp->bb_first_free;
			gsd->free = grp->bb_free;
			gsd->fragments = grp->bb_fragments;
		}
		return false;
	case GSP_MAX_SCORE:
		if (score > gsd->score) {
			/* the previous best becomes the follow-up candidate */
			if (gsd->group != -1) {
				gsd->next = gsd->group;
			}
			gsd->group = group;
			gsd->score = score;
			gsd->first_free = grp->bb_first_free;
			gsd->free = grp->bb_free;
			gsd->fragments = grp->bb_fragments;
		}
		return false;
	default:
		e4defrag_err("unknown defrag group select policy: %u\n",
			     gsd->policy);
		BUG_ON(1);
	}

	return false;
}
/*
 * Defrag thread flag: a bit kept in current->ptrace that tags the task as
 * the defrag thread (see mark_e4defrag_thread()/is_e4defrag_thread()).
 * NOTE(review): bit 29 is presumably unused by the kernel's own PT_* flags
 * on the supported versions — confirm.
 */
#define PT_E4DEFRAG_BIT (29)
#define PT_E4DEFRAG_THREAD (1 << PT_E4DEFRAG_BIT)
/* tag the calling task as the defrag thread */
static inline void mark_e4defrag_thread(void)
{
	current->ptrace = current->ptrace | PT_E4DEFRAG_THREAD;
}
/* remove the defrag thread tag from the calling task */
static inline void clear_e4defrag_thread(void)
{
	current->ptrace = current->ptrace & ~PT_E4DEFRAG_THREAD;
}
/* true when the calling task carries the defrag thread tag */
static inline bool is_e4defrag_thread(void)
{
	return !!(current->ptrace & PT_E4DEFRAG_THREAD);
}
/*
 * Install the session keyring of task @pid into the credentials of the
 * calling task; the session keyring is needed to encrypt/decrypt files.
 *
 * Returns the installed keyring (NULL if the task has none), or NULL when
 * the task cannot be found or new credentials cannot be prepared.
 */
static struct key *e4defrag_install_session_keyring(pid_t pid)
{
	struct pid *pid_ref;
	struct task_struct *task;
	struct cred *new;
	struct key *key = NULL;

	/* find_get_pid() takes a reference that has to be dropped again */
	pid_ref = find_get_pid(pid);
	task = get_pid_task(pid_ref, PIDTYPE_PID);
	put_pid(pid_ref);
	if (!task) {
		e4defrag_err("fail to find task, pid=%d\n", pid);
		goto out;
	}
	new = prepare_kernel_cred(current);
	if (!new) {
		e4defrag_err("fail to prepare new cred\n");
		goto out_put;
	}
	key = key_get(task_cred_xxx(task, session_keyring));
	/*
	 * NOTE(review): this drops a reference on the session keyring of the
	 * current cred rather than on new->session_keyring — presumably the
	 * same key, since new was prepared from current; confirm.
	 */
	key_put(current_cred_xxx(session_keyring));
	rcu_assign_pointer(new->session_keyring, key);
	commit_creds(new);
	/* the source task may have no session keyring at all */
	if (key) {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0))
		e4defrag_dbg("key usage: %d\n", refcount_read(&key->usage));
#else
		e4defrag_dbg("key usage: %d\n", atomic_read(&key->usage));
#endif
	}
out_put:
	put_task_struct(task);
out:
	return key;
}
static inline void set_sched_policy(void)
{
cpumask_t allowed_mask = CPU_MASK_NONE;
unsigned int cpu;
/* set nice as THREAD_PRIORITY_BACKGROUND */
set_user_nice(current, 10);
/* bound to little cluster */
for (cpu = 0; cpu < 4; cpu++) {
cpumask_set_cpu(cpu, &allowed_mask);
}
set_cpus_allowed_ptr(current, &allowed_mask);
}
/*
 * Main function of the defrag thread; @data is the super block to work on.
 *
 * The thread sleeps on dfi->wq for up to wait_ms milliseconds (or until it
 * is stopped, frozen or woken), then either selects a new group through
 * defrag_select_group() or carries on with the group already in progress
 * via defrag_one_group(). wait_ms is re-tuned after every pass.
 * Always returns 0.
 */
static int e4defrag_func(void *data)
{
struct super_block *sb = (struct super_block *)data;
struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
/* bg_select_data_init() sets gsd.reverse, so the scan starts at the last group */
ext4_group_t first_group = dfi->ngroups - 1, nr_skip = 0;
unsigned int wait_ms;
struct bg_select_data gsd;
/* true while no group is in progress, i.e. a new one has to be selected */
bool done = true;
wait_ms = dfi->sleep_time;
set_sched_policy();
set_freezable();
mark_e4defrag_thread();
/* publish the task and release the waiter in e4defrag_start_thread() */
dfi->task = current;
wake_up(&dfi->init_wq);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0))
/* do not call into the fs layer in the memory allocation path */
memalloc_nofs_save();
#endif
/* we get the session keyring from init process and install it */
e4defrag_install_session_keyring(1);
loop:
/* sleep until timeout, stop request, freeze request or an unprotected wake */
wait_event_interruptible_timeout(dfi->wq, kthread_should_stop()
|| freezing(current)
|| (dfi->wake && !ext4_defrag_protect),
msecs_to_jiffies(wait_ms));
if (freezing(current)) {
e4defrag_dbg("suspending online defrag thread\n");
try_to_freeze();
} else if (kthread_should_stop() ||
((ext4_defrag_protect == DEFRAG_PROTECT_EOL))) {
/* the thread also exits by itself when protection is DEFRAG_PROTECT_EOL */
e4defrag_dbg("exiting online defrag thread\n");
clear_e4defrag_thread();
dfi->task = NULL;
e4defrag_msg("online defrag thread exited\n");
return 0;
} else if (ext4_defrag_protect == DEFRAG_PROTECT_LOWPOWER) {
/* do nothing in this pass, just go back to sleep */
e4defrag_dbg("protecting online defrag thread as low power\n");
} else {
/* never scan more groups than exist */
ext4_group_t nr_to_scan =
min_t(ext4_group_t, dfi->ngroups, dfi->nr_to_scan);
e4defrag_dbg("defrag wake up: %u\n", dfi->wake);
/* an explicit wake request is consumed by resetting it to WAKE_TIMEOUT */
if (dfi->wake) {
dfi->wake = WAKE_TIMEOUT;
}
/* find a group to defrag */
if (done) {
bg_select_data_init(sb, &gsd, dfi->policy);
ext4_mb_query_group_info(sb, first_group,
nr_to_scan,
defrag_select_group, &gsd,
gsd.reverse);
/* the next scan resumes where the callback told us to */
first_group = gsd.next;
if (gsd.group != -1) {
union defrag_state_data dsd;
dfi->state = DS_RUNNING;
dsd.first_free = gsd.first_free;
dsd.free = gsd.free;
dsd.fragments = gsd.fragments;
defrag_update_state(sb, gsd.group,
DEFRAG_START, &dsd);
}
}
if (gsd.group != -1) {
/* a group is selected (now or in an earlier pass): work on it */
nr_skip = 0;
e4defrag_msg("defrag group %u (%u, %u, %u)\n",
gsd.group, gsd.free, gsd.fragments,
NORM_SCORE(gsd.score));
done = defrag_one_group(sb, gsd.group);
/* force mode: no sleep; group finished: normal sleep; otherwise short sleep */
if (defrag_force_mode(sb)) {
wait_ms = 0;
} else if (done) {
wait_ms = dfi->sleep_time;
} else {
wait_ms = dfi->min_sleep_time;
}
} else {
/* nothing selected in this batch: count the groups scanned so far */
nr_skip += nr_to_scan;
if (nr_skip >= dfi->ngroups) {
/* all groups scanned, wait for max interval */
nr_skip = 0;
dfi->state = DS_DONE;
e4defrag_dbg("no group need to defrag now\n");
wait_ms = dfi->max_sleep_time;
} else {
/* scan next batch of groups */
/* NOTE(review): the second value logged is nr_to_scan, not an end group */
e4defrag_dbg("check groups [%u, %u)\n",
first_group, nr_to_scan);
if (defrag_force_mode(sb)) {
wait_ms = 0;
} else {
wait_ms = dfi->min_sleep_time;
}
}
}
}
if (!kthread_should_stop()) {
goto loop;
}
/* stop requested while a pass was running */
clear_e4defrag_thread();
dfi->task = NULL;
e4defrag_msg("online defrag thread exit\n");
return 0;
}
/*
 * Create the donor inode used by defrag.
 *
 * A regular-file inode is allocated under the root directory of @sb inside
 * its own journal transaction. Its link count is cleared and it is put on
 * the orphan list.
 *
 * Returns the new inode, or an ERR_PTR() when the transaction cannot be
 * started or the inode cannot be allocated. The transaction is always
 * closed before returning.
 */
static struct inode *e4defrag_create_donor_inode(struct super_block *sb)
{
	handle_t *handle;
	struct inode *inode, *dir = d_inode(sb->s_root);
	int credits = (EXT4_DATA_TRANS_BLOCKS(sb) +
		       EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

	handle = ext4_journal_start(dir, EXT4_HT_MOVE_EXTENTS, credits);
	if (IS_ERR(handle)) {
		e4defrag_err("fail to start journal, err: %ld\n",
			     PTR_ERR(handle));
		return ERR_PTR(PTR_ERR(handle));
	}
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0))
	inode = ext4_new_inode(handle, dir, S_IFREG, NULL, 0, NULL);
#else
	inode = ext4_new_inode(handle, dir, S_IFREG, NULL, 0, NULL, 0);
#endif
	if (IS_ERR(inode)) {
		e4defrag_err("fail to create donor inode, err: %ld\n",
			     PTR_ERR(inode));
		/* close the transaction and report the inode error itself */
		ext4_journal_stop(handle);
		return inode;
	}
	clear_nlink(inode);
	i_size_write(inode, DEFRAG_MAX_LEN * EXT4_BLOCK_SIZE(sb));
	ext4_ext_tree_init(handle, inode);
	/* ext4_set_aops(inode); */
	ext4_orphan_add(handle, inode);
	ext4_journal_stop(handle);
	unlock_new_inode(inode);
	e4defrag_dbg("create donor inode: %lu\n", inode->i_ino);
	return inode;
}
/* detach the donor inode from the defrag info of @sb and drop it */
static void e4defrag_delete_donor_inode(struct super_block *sb)
{
	struct ext4_defrag_info *info = E4DEFRAG_I(sb);
	struct inode *donor;

	donor = info->donor_inode;
	info->donor_inode = NULL;
	if (!donor) {
		return;
	}
	iput(donor);
}
static int e4defrag_start_thread(struct super_block *sb)
{
struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
struct task_struct *t;
ext4_group_t ngroups, group;
e4defrag_msg("start online defrag thread");
init_waitqueue_head(&dfi->init_wq);
init_waitqueue_head(&dfi->wq);
dfi->force_mode = 0;
dfi->state = DS_INIT;
dfi->min_sleep_time = DEFRAG_MIN_SLEEP_TIME;
dfi->sleep_time = DEFRAG_SLEEP_TIME;
dfi->max_sleep_time = DEFRAG_MAX_SLEEP_TIME;
dfi->interval = DEFRAG_INTERVAL;
dfi->max_len = DEFRAG_MAX_LEN;
dfi->min_score = DEFRAG_MIN_SCORE;
dfi->min_free = DEFRAG_MIN_FREE(sb);
dfi->nr_to_scan = DEFRAG_NR_TO_SCAN;
dfi->policy = GSP_FIRST_MATCH;
dfi->group = -1;
dfi->tree.len = 0;
dfi->tree.root = RB_ROOT;
ngroups = ext4_get_groups_count(sb);
dfi->groups =
kvmalloc(ngroups * sizeof(struct defrag_group_state), GFP_KERNEL);
if (ZERO_OR_NULL_PTR(dfi->groups)) {
return -1;
}
for (group = 0; group < ngroups; group++) {
defrag_update_state(sb, group, DEFRAG_INIT, NULL);
}
dfi->ngroups = ngroups;
dfi->donor_inode = e4defrag_create_donor_inode(sb);
if (IS_ERR(dfi->donor_inode)) {
return -1;
}
t = kthread_run(e4defrag_func, sb, "e4defrag/%s", sb->s_id);
if (IS_ERR(t)) {
e4defrag_err("fail to start online defrag thread\n");
return PTR_ERR(t);
}
wait_event(dfi->init_wq, dfi->task != NULL);
return 0;
}
static void e4defrag_stop_thread(struct super_block *sb)
{
struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
dfi->ngroups = 0;
kvfree(dfi->groups);
e4defrag_delete_donor_inode(sb);
e4defrag_msg("stop online defrag thread\n");
kthread_stop(dfi->task);
}
/*
 * Wake the defrag thread of @sb with WAKE_SYS.
 *
 * Nothing happens unless the thread exists, the caller is not the defrag
 * thread itself, the rate-limit deadline has passed, is_idle() reports the
 * filesystem idle and the thread is actually sleeping on dfi->wq. The
 * deadline is a single static shared by all callers.
 */
void e4defrag_wake_up_thread(struct super_block *sb)
{
	static unsigned long next_time = 0;
	struct ext4_defrag_info *info = E4DEFRAG_I(sb);

	if (!info->task || is_e4defrag_thread()) {
		return;
	}
	if (!time_is_before_eq_jiffies(next_time)) {
		return;
	}
	if (!is_idle(sb) || !wq_has_sleeper(&info->wq)) {
		return;
	}

	/* no further wake-up before another sleep_time has elapsed */
	next_time = jiffies + msecs_to_jiffies(info->sleep_time);
	info->wake = WAKE_SYS;
	wake_up_interruptible(&info->wq);
}
/*
 * Parameters controlling defrag, in /sys/fs/ext4/<disk>/defrag/.
 * The id selects how an attribute is shown and stored.
 */
typedef enum {
	attr_pointer_ui = 0,
	attr_group_select_policy = 1,
	attr_wake = 2,
	attr_force_mode = 3,
	attr_state = 4,
	attr_stats = 5,
} attr_id_t;
/* how the value location of an attribute is described (see calc_ptr()) */
typedef enum {
	ptr_explicit = 0,
	ptr_ext4_defrag_info_offset = 1,
} attr_ptr_t;
/* one sysfs attribute of the defrag kobject */
struct e4defrag_attr {
/* generic sysfs attribute (name and mode) */
struct attribute attr;
/* an attr_id_t value: selects the show/store handling */
short attr_id;
/* an attr_ptr_t value: tells calc_ptr() which member of u to use */
short attr_ptr;
union {
/* byte offset of the value inside struct ext4_defrag_info */
int offset;
/* direct pointer to the value */
void *explicit_ptr;
} u;
};
/* define attribute e4defrag_attr_<_name> that is handled by id attr_<_id> */
#define E4DEFRAG_ATTR(_name, _mode, _id) \
static struct e4defrag_attr e4defrag_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.attr_id = attr_##_id, \
}
/* same, for attributes whose id carries the attribute's own name */
#define E4DEFRAG_ATTR_FUNC(_name, _mode) E4DEFRAG_ATTR(_name, _mode, _name)
/* define an attribute backed by member _elname of struct _struct */
#define E4DEFRAG_ATTR_OFFSET(_name, _mode, _id, _struct, _elname) \
static struct e4defrag_attr e4defrag_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.attr_id = attr_##_id, \
.attr_ptr = ptr_##_struct##_offset, \
.u = { \
.offset = offsetof(struct _struct, _elname), \
}, \
}
/* address of the struct attribute embedded in e4defrag_attr_<name> */
#define ATTR_LIST(name) &e4defrag_attr_##name.attr
/* read-only (0444) unsigned int member of struct ext4_defrag_info */
#define E4DEFRAG_RO_ATTR(_name, _elname) \
E4DEFRAG_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_defrag_info, _elname)
/* read-write (0644) unsigned int member of struct ext4_defrag_info */
#define E4DEFRAG_RW_ATTR(_name, _elname) \
E4DEFRAG_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_defrag_info, _elname)
/* tunables mapped directly onto unsigned int members of struct ext4_defrag_info */
E4DEFRAG_RW_ATTR(min_sleep_time, min_sleep_time);
E4DEFRAG_RW_ATTR(sleep_time, sleep_time);
E4DEFRAG_RW_ATTR(max_sleep_time, max_sleep_time);
E4DEFRAG_RW_ATTR(interval, interval);
E4DEFRAG_RW_ATTR(min_score, min_score);
E4DEFRAG_RW_ATTR(min_free_blocks, min_free);
E4DEFRAG_RW_ATTR(nr_to_scan, nr_to_scan);
E4DEFRAG_RW_ATTR(max_extent_size, max_len);
E4DEFRAG_RO_ATTR(current_group, group);
/* attributes with dedicated handling in e4defrag_attr_show()/e4defrag_attr_store() */
E4DEFRAG_ATTR_FUNC(group_select_policy, 0644);
/* write-only: a non-zero value wakes the defrag thread */
E4DEFRAG_ATTR_FUNC(wake, 0200);
E4DEFRAG_ATTR_FUNC(force_mode, 0644);
E4DEFRAG_ATTR_FUNC(state, 0444);
E4DEFRAG_ATTR_FUNC(stats, 0444);
/* NULL-terminated attribute list used as default_attrs of e4defrag_ktype */
static struct attribute *e4defrag_attrs[] = {
ATTR_LIST(min_sleep_time),
ATTR_LIST(sleep_time),
ATTR_LIST(max_sleep_time),
ATTR_LIST(interval),
ATTR_LIST(min_score),
ATTR_LIST(min_free_blocks),
ATTR_LIST(nr_to_scan),
ATTR_LIST(max_extent_size),
ATTR_LIST(current_group),
ATTR_LIST(group_select_policy),
ATTR_LIST(wake),
ATTR_LIST(force_mode),
ATTR_LIST(state),
ATTR_LIST(stats),
NULL,
};
/* print the human-readable name of the current group select policy to @buf */
static ssize_t group_select_policy_show(struct ext4_defrag_info *dfi, char *buf)
{
	const char *label;

	if (dfi->policy == GSP_FIRST_MATCH) {
		label = "first match";
	} else if (dfi->policy == GSP_MAX_FRAGMENTS) {
		label = "max fragments";
	} else if (dfi->policy == GSP_MAX_SCORE) {
		label = "max score";
	} else {
		label = "unknown";
	}
	return snprintf(buf, PAGE_SIZE, "%s\n", label);
}
/*
 * Parse a numeric policy from @buf and store it in @dfi.
 * Returns @len on success, the kstrtoul() error for unparsable input, or
 * -EINVAL when the number is not one of the GSP_* policies.
 */
static ssize_t group_select_policy_store(struct ext4_defrag_info *dfi,
					 const char *buf, size_t len)
{
	unsigned long val;
	int err = kstrtoul(skip_spaces(buf), 0, &val);

	if (err) {
		return err;
	}
	if (val != GSP_FIRST_MATCH && val != GSP_MAX_FRAGMENTS &&
	    val != GSP_MAX_SCORE) {
		return -EINVAL;
	}
	dfi->policy = val;
	return len;
}
/* print the human-readable name of the defrag state to @buf */
static ssize_t state_show(struct ext4_defrag_info *dfi, char *buf)
{
	const char *label;

	if (dfi->state == DS_INIT) {
		label = "init";
	} else if (dfi->state == DS_RUNNING) {
		label = "running";
	} else if (dfi->state == DS_DONE) {
		label = "done";
	} else {
		label = "unknown";
	}
	return snprintf(buf, PAGE_SIZE, "%s\n", label);
}
/*
 * Print the accumulated defrag statistics to @buf.
 *
 * moved/cost/count are summed over all groups that were defragged at least
 * once; moved is scaled by (block size bits - 10), i.e. from blocks to KiB.
 * read/write come from the I/O accounting of the defrag thread and are
 * reported as 0 when the thread is not running (dfi->task is NULL after the
 * thread has exited).
 *
 * NOTE(review): dfi->task is sampled without synchronisation against thread
 * exit, so a small race window remains.
 */
static ssize_t stats_show(struct ext4_defrag_info *dfi, char *buf)
{
	struct super_block *sb = E4DEFRAG_SB(dfi);
	struct task_struct *task = dfi->task;
	unsigned long moved = 0, cost = 0, count = 0, read = 0, write = 0;
	ext4_group_t group;

	for (group = 0; group < dfi->ngroups; group++) {
		if (defrag_count(sb, group)) {
			cost += defrag_cost(sb, group);
			moved += defrag_moved(sb, group);
			count += defrag_count(sb, group);
		}
	}
	moved <<= (EXT4_BLOCK_SIZE_BITS(sb) - 10);
	if (task) {
		get_task_struct(task);
		/* presumably 512-byte units, halved to get KiB — confirm */
		read = task_io_get_inblock(task) >> 1;
		write = task_io_get_oublock(task) >> 1;
		put_task_struct(task);
	}
	return snprintf(buf, PAGE_SIZE, "moved: %lu\nread: %lu\nwrite: %lu\n"
			"cost: %lu\ncount: %lu\n", moved, read, write, cost,
			count);
}
/*
 * Resolve the location of the value behind attribute @a: either its
 * explicit pointer or the member of @dfi at the recorded offset.
 * Returns NULL for an unknown pointer kind.
 */
static void *calc_ptr(struct e4defrag_attr *a, struct ext4_defrag_info *dfi)
{
	if (a->attr_ptr == ptr_explicit) {
		return a->u.explicit_ptr;
	}
	if (a->attr_ptr == ptr_ext4_defrag_info_offset) {
		char *base = (char *)dfi;

		return (void *)(base + a->u.offset);
	}
	return NULL;
}
/*
 * sysfs show handler of the defrag kobject: dispatch on the attribute id.
 * Plain unsigned int attributes are printed with "%u"; reading the wake
 * attribute yields -EINVAL; unknown ids produce no output.
 */
static ssize_t e4defrag_attr_show(struct kobject *kobj,
				  struct attribute *attr, char *buf)
{
	struct ext4_defrag_info *info =
	    container_of(kobj, struct ext4_defrag_info, kobj);
	struct e4defrag_attr *da =
	    container_of(attr, struct e4defrag_attr, attr);
	unsigned int *field;

	switch (da->attr_id) {
	case attr_pointer_ui:
		field = calc_ptr(da, info);
		return field ? snprintf(buf, PAGE_SIZE, "%u\n", *field) : 0;
	case attr_group_select_policy:
		return group_select_policy_show(info, buf);
	case attr_wake:
		return -EINVAL;
	case attr_force_mode:
		return snprintf(buf, PAGE_SIZE, "%u\n", info->force_mode);
	case attr_state:
		return state_show(info, buf);
	case attr_stats:
		return stats_show(info, buf);
	default:
		return 0;
	}
}
/*
 * sysfs store handler of the defrag kobject: dispatch on the attribute id.
 *
 * - attr_pointer_ui: parse a number and store it in the unsigned int member;
 *   values that do not fit in an unsigned int are rejected with -ERANGE
 *   instead of being silently truncated.
 * - attr_group_select_policy: see group_select_policy_store().
 * - attr_wake: a non-zero value wakes the defrag thread with WAKE_USER.
 * - attr_force_mode: any non-zero value enables force mode.
 *
 * Returns @len on success, a negative errno on bad input, and 0 for ids
 * that have no store handling.
 */
static ssize_t e4defrag_attr_store(struct kobject *kobj,
				   struct attribute *attr,
				   const char *buf, size_t len)
{
	struct ext4_defrag_info *dfi =
	    container_of(kobj, struct ext4_defrag_info,
			 kobj);
	struct e4defrag_attr *a =
	    container_of(attr, struct e4defrag_attr, attr);
	void *ptr = calc_ptr(a, dfi);
	unsigned long t;
	int ret;

	switch (a->attr_id) {
	case attr_pointer_ui:
		if (!ptr) {
			return 0;
		}
		ret = kstrtoul(skip_spaces(buf), 0, &t);
		if (ret) {
			return ret;
		}
		/* the backing field is only an unsigned int */
		if (t > UINT_MAX) {
			return -ERANGE;
		}
		*((unsigned int *)ptr) = t;
		return len;
	case attr_group_select_policy:
		return group_select_policy_store(dfi, buf, len);
	case attr_wake:
		ret = kstrtoul(skip_spaces(buf), 0, &t);
		if (ret) {
			return ret;
		}
		if (t != 0) {
			dfi->wake = WAKE_USER;
			wake_up_interruptible(&dfi->wq);
		}
		return len;
	case attr_force_mode:
		ret = kstrtoul(skip_spaces(buf), 0, &t);
		if (ret) {
			return ret;
		}
		dfi->force_mode = t ? 1 : 0;
		return len;
	}
	return 0;
}
/* kobject release callback: wipe the embedded kobject */
static void e4defrag_kobject_release(struct kobject *kobj)
{
	e4defrag_dbg("release defrag kobject\n");
	memset(kobj, 0, sizeof(struct kobject));
}
/* sysfs read/write entry points for every attribute of the defrag kobject */
static const struct sysfs_ops e4defrag_attr_ops = {
.show = e4defrag_attr_show,
.store = e4defrag_attr_store,
};
/* kobject type of the "defrag" sysfs directory created in e4defrag_init() */
static struct kobj_type e4defrag_ktype = {
.default_attrs = e4defrag_attrs,
.sysfs_ops = &e4defrag_attr_ops,
.release = e4defrag_kobject_release,
};
/*
 * /proc/fs/ext4/<disk>/defrag_groups_state
 *
 * seq_file start: position *pos maps to group *pos. The iterator value is
 * the group number plus one, so that group 0 is not mistaken for NULL.
 */
static void *e4defrag_seq_groups_start(struct seq_file *seq, loff_t * pos)
{
	struct ext4_defrag_info *info = PDE_DATA(file_inode(seq->file));
	loff_t idx = *pos;

	if (idx < 0 || idx >= info->ngroups) {
		return NULL;
	}
	return (void *)((unsigned long)(ext4_group_t)(idx + 1));
}
/*
 * seq_file next: advance *pos and return the iterator value (group number
 * plus one) of the new position, or NULL past the last group.
 */
static void *e4defrag_seq_groups_next(struct seq_file *seq, void *v,
				      loff_t * pos)
{
	struct ext4_defrag_info *info = PDE_DATA(file_inode(seq->file));
	loff_t idx;

	*pos += 1;
	idx = *pos;
	if (idx < 0 || idx >= info->ngroups) {
		return NULL;
	}
	return (void *)((unsigned long)(ext4_group_t)(idx + 1));
}
/*
 * seq_file show: print one CSV line with the defrag statistics of the
 * group encoded in @v. The CSV header is printed along with group 0, and
 * groups that have never been defragged produce no line.
 */
static int e4defrag_seq_groups_show(struct seq_file *seq, void *v)
{
	struct ext4_defrag_info *info = PDE_DATA(file_inode(seq->file));
	struct super_block *sb = E4DEFRAG_SB(info);
	/* the iterator value is the group number plus one */
	ext4_group_t group = (ext4_group_t) ((unsigned long)v) - 1;

	if (!group) {
		seq_puts(seq, "group,count,moved,scan-cost,total-cost,duration,"
			 "first0,free0,frags0,first1,free1,frags1\n");
	}
	if (defrag_count(sb, group)) {
		seq_printf(seq, "%u,%lu,%lu,%lu,%lu,%lu,%u,%u,%u,%u,%u,%u\n",
			   group,
			   defrag_count(sb, group),
			   defrag_moved(sb, group),
			   defrag_scan_cost(sb, group),
			   defrag_cost(sb, group),
			   defrag_duration(sb, group),
			   defrag_first_free(sb, group, 0),
			   defrag_free(sb, group, 0),
			   defrag_fragments(sb, group, 0),
			   defrag_first_free(sb, group, 1),
			   defrag_free(sb, group, 1),
			   defrag_fragments(sb, group, 1));
	}
	return 0;
}
/* seq_file stop: nothing to release, start/next hold no resources */
static void e4defrag_seq_groups_stop(struct seq_file *seq, void *v)
{
}
/* seq_file iterator behind the defrag_groups_state proc file */
const struct seq_operations e4defrag_seq_groups_ops = {
.start = e4defrag_seq_groups_start,
.next = e4defrag_seq_groups_next,
.stop = e4defrag_seq_groups_stop,
.show = e4defrag_seq_groups_show,
};
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0))
/*
 * open handler of defrag_groups_state on kernels before 4.19: open the
 * seq_file and remember the defrag info as its private data.
 */
static int e4defrag_groups_open(struct inode *inode, struct file *file)
{
	struct seq_file *sf;
	int err;

	err = seq_open(file, &e4defrag_seq_groups_ops);
	if (err) {
		return err;
	}
	sf = file->private_data;
	sf->private = PDE_DATA(inode);
	return 0;
}
/* file operations of defrag_groups_state, used with proc_create_data() before 4.19 */
static const struct file_operations e4defrag_seq_groups_fops = {
.open = e4defrag_groups_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif
/*
 * Start online defrag for @sb: start the defrag thread, add the "defrag"
 * sysfs directory below the ext4 kobject of @sb and, if the ext4 proc
 * directory exists, the defrag_groups_state proc file.
 *
 * Returns 0 on success (also when the thread is already running),
 * -ENOTSUPP when the filesystem lacks extents or uses bigalloc, or a
 * negative error from thread or kobject setup. When the kobject cannot be
 * added, its reference is dropped and the thread is stopped again, so a
 * failed init leaves nothing running.
 *
 * NOTE(review): -ENOTSUPP is presumably only seen by in-kernel callers —
 * confirm it never reaches user space.
 */
int e4defrag_init(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
	int ret;

	if (!ext4_has_feature_extents(sb)
	    || ext4_has_feature_bigalloc(sb)) {
		e4defrag_err("online defrag not supported\n");
		return -ENOTSUPP;
	}
	if (dfi->task) {
		e4defrag_dbg("online defrag thread already started\n");
		return 0;
	}
	ret = e4defrag_start_thread(sb);
	if (ret) {
		return ret;
	}
	ret = kobject_init_and_add(&dfi->kobj, &e4defrag_ktype,
				   &sbi->s_kobj, "defrag");
	if (ret) {
		/* a failed kobject_init_and_add() still needs kobject_put() */
		kobject_put(&dfi->kobj);
		if (dfi->task) {
			e4defrag_stop_thread(sb);
		}
		return ret;
	}
	if (sbi->s_proc) {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0))
		proc_create_data("defrag_groups_state", S_IRUGO, sbi->s_proc,
				 &e4defrag_seq_groups_fops, dfi);
#else
		proc_create_seq_data("defrag_groups_state", S_IRUGO,
				     sbi->s_proc, &e4defrag_seq_groups_ops,
				     dfi);
#endif
	}
	return 0;
}
void e4defrag_exit(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_defrag_info *dfi = E4DEFRAG_I(sb);
if (!dfi->task) {
return;
}
if (sbi->s_proc) {
remove_proc_entry("defrag_groups_state", sbi->s_proc);
}
kobject_put(&dfi->kobj);
e4defrag_stop_thread(sb);
}
/*
 * Per-CPU count of traced bios queued minus traced bios completed, used for
 * the device idle check. A single CPU's value may go negative because a bio
 * can complete on another CPU than the one that queued it; only the sum
 * computed by read_bio_inflight() is meaningful.
 */
static DEFINE_PER_CPU(local_t, bio_inflight) = LOCAL_INIT(0);
/*
 * Return the number of traced bios currently in flight.
 *
 * The per-CPU counters are summed over all POSSIBLE CPUs: a counter stays
 * valid when its CPU is no longer present, and dropping it would skew the
 * total for good.
 */
static long read_bio_inflight(void)
{
	int cpu;
	long cnt = 0;

	for_each_possible_cpu(cpu) {
		cnt += local_read(&per_cpu(bio_inflight, cpu));
	}
	return cnt;
}
static inline bool need_trace(struct bio *bio)
{
return !is_e4defrag_thread() && bio_is_sync(bio);
}
/*
 * bio flag bit marking a bio that was counted by monit_bio_queue().
 * NOTE(review): bit 12 is presumably unused by the kernel's own BIO_* flags
 * on the supported versions — confirm.
 */
#define BIO_TRACED (12)
/* block_bio_queue probe: mark and count bios selected by need_trace() */
static void monit_bio_queue(void *priv, struct request_queue *q,
			    struct bio *bio)
{
	if (!need_trace(bio)) {
		return;
	}
	bio_set_flag(bio, BIO_TRACED);
	local_inc(this_cpu_ptr(&bio_inflight));
}
/* block_bio_complete probe: uncount bios that were marked when queued */
static void monit_bio_complete(void *priv, struct request_queue *q,
			       struct bio *bio, int err)
{
	if (!bio_flagged(bio, BIO_TRACED)) {
		return;
	}
	bio_clear_flag(bio, BIO_TRACED);
	local_dec(this_cpu_ptr(&bio_inflight));
}
/*
 * Register the bio queue/complete tracepoint probes used for in-flight
 * accounting.
 *
 * Returns 0 on success or the error of the registration that failed. A
 * failure of the first registration is no longer overwritten by the result
 * of the second, and a failure of the second unregisters the first, so the
 * probes are always registered as a pair or not at all.
 */
static int __init e4defrag_bio_monitor_init(void)
{
	int ret;

	ret = register_trace_block_bio_complete(monit_bio_complete, NULL);
	if (WARN_ON(ret)) {
		return ret;
	}
	ret = register_trace_block_bio_queue(monit_bio_queue, NULL);
	if (WARN_ON(ret)) {
		unregister_trace_block_bio_complete(monit_bio_complete, NULL);
	}
	return ret;
}
/*
 * Unregister both bio tracepoint probes. The queue probe goes first, i.e.
 * in the reverse order of registration in e4defrag_bio_monitor_init().
 */
static void e4defrag_bio_monitor_exit(void)
{
unregister_trace_block_bio_queue(monit_bio_queue, NULL);
unregister_trace_block_bio_complete(monit_bio_complete, NULL);
}
/*
 * Global defrag initialisation: create the defrag cache and register the
 * bio monitor.
 *
 * Returns 0 on success or the error of the step that failed; the cache is
 * destroyed again when the bio monitor cannot be registered.
 */
int __init e4defrag_init_fs(void)
{
	int ret;

	ret = e4defrag_create_cache();
	if (ret) {
		return ret;
	}
	ret = e4defrag_bio_monitor_init();
	if (ret) {
		e4defrag_destroy_cache();
	}
	return ret;
}
/*
 * Global defrag teardown: undo e4defrag_init_fs() in reverse order.
 * Declared with (void) so the definition is a real prototype.
 */
void e4defrag_exit_fs(void)
{
	e4defrag_bio_monitor_exit();
	e4defrag_destroy_cache();
}