Btrfs: Shrink delay allocated space in a synchronized
Shrink delayed allocation space in a synchronized manner is more
controllable than flushing all delay allocated space in an async
thread.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a68f346..85c7b95 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -700,10 +700,6 @@
struct list_head list;
- /* for controlling how we free up space for allocations */
- wait_queue_head_t flush_wait;
- int flushing;
-
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
spinlock_t lock;
@@ -928,7 +924,6 @@
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
- struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -2312,6 +2307,7 @@
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca041..05f26ac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1759,9 +1759,6 @@
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
&fs_info->generic_worker);
- btrfs_init_workers(&fs_info->enospc_workers, "enospc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1809,7 +1806,6 @@
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2040,7 +2036,6 @@
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2473,7 +2468,6 @@
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c95507..f32b161 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,6 +74,9 @@
static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_space_info *sinfo, u64 num_bytes);
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 to_reclaim);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2693,7 +2696,6 @@
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
- init_waitqueue_head(&found->flush_wait);
spin_lock_init(&found->lock);
found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
BTRFS_BLOCK_GROUP_SYSTEM |
@@ -2907,105 +2909,6 @@
meta_sinfo->force_delalloc = 0;
}
-struct async_flush {
- struct btrfs_root *root;
- struct btrfs_space_info *info;
- struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
-{
- struct async_flush *async;
- struct btrfs_root *root;
- struct btrfs_space_info *info;
-
- async = container_of(work, struct async_flush, work);
- root = async->root;
- info = async->info;
-
- btrfs_start_delalloc_inodes(root, 0);
- wake_up(&info->flush_wait);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-
- kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
- DEFINE_WAIT(wait);
- u64 used;
-
- while (1) {
- prepare_to_wait(&info->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&info->lock);
- if (!info->flushing) {
- spin_unlock(&info->lock);
- break;
- }
-
- used = info->bytes_used + info->bytes_reserved +
- info->bytes_pinned + info->bytes_readonly +
- info->bytes_super + info->bytes_root +
- info->bytes_may_use + info->bytes_delalloc;
- if (used < info->total_bytes) {
- spin_unlock(&info->lock);
- break;
- }
- spin_unlock(&info->lock);
- schedule();
- }
- finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct async_flush *async;
- bool wait = false;
-
- spin_lock(&info->lock);
-
- if (!info->flushing)
- info->flushing = 1;
- else
- wait = true;
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_on_flush(info);
- return;
- }
-
- async = kzalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- goto flush;
-
- async->root = root;
- async->info = info;
- async->work.func = flush_delalloc_async;
-
- btrfs_queue_worker(&root->fs_info->enospc_workers,
- &async->work);
- wait_on_flush(info);
- return;
-
-flush:
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-}
-
/*
* Reserve metadata space for delalloc.
*/
@@ -3058,7 +2961,7 @@
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
- flush_delalloc(root, meta_sinfo);
+ shrink_delalloc(NULL, root, meta_sinfo, num_bytes);
goto again;
}
spin_lock(&meta_sinfo->lock);
@@ -3171,7 +3074,7 @@
}
if (retries == 2) {
- flush_delalloc(root, meta_sinfo);
+ shrink_delalloc(NULL, root, meta_sinfo, num_bytes);
goto again;
}
spin_lock(&meta_sinfo->lock);
@@ -3197,7 +3100,7 @@
{
struct btrfs_space_info *data_sinfo;
u64 used;
- int ret = 0, committed = 0, flushed = 0;
+ int ret = 0, committed = 0;
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3217,13 +3120,6 @@
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
- if (!flushed) {
- spin_unlock(&data_sinfo->lock);
- flush_delalloc(root, data_sinfo);
- flushed = 1;
- goto again;
- }
-
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
@@ -3467,6 +3363,55 @@
return ret == 1 ? 1 : 0;
}
+/*
+ * shrink metadata reservation for delalloc
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 to_reclaim)
+{
+ u64 reserved;
+ u64 max_reclaim;
+ u64 reclaimed = 0;
+ int pause = 1;
+ int ret;
+
+ spin_lock(&sinfo->lock);
+ reserved = sinfo->bytes_delalloc;
+ spin_unlock(&sinfo->lock);
+
+ if (reserved == 0)
+ return 0;
+
+ max_reclaim = min(reserved, to_reclaim);
+
+ while (1) {
+ ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+ if (!ret) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+ } else {
+ pause = 1;
+ }
+
+ spin_lock(&sinfo->lock);
+ if (reserved > sinfo->bytes_delalloc)
+ reclaimed = reserved - sinfo->bytes_delalloc;
+ reserved = sinfo->bytes_delalloc;
+ spin_unlock(&sinfo->lock);
+
+ if (reserved == 0 || reclaimed >= max_reclaim)
+ break;
+
+ if (trans && trans->transaction->blocked)
+ return -EAGAIN;
+ }
+ return reclaimed >= to_reclaim;
+}
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc64..d53cad1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5611,6 +5611,38 @@
return 0;
}
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode = NULL;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(&root->fs_info->delalloc_inodes)) {
+ binode = list_entry(root->fs_info->delalloc_inodes.next,
+ struct btrfs_inode, delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (inode) {
+ list_move_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ break;
+ }
+
+ list_del_init(&binode->delalloc_inodes);
+ cond_resched_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ if (inode) {
+ write_inode_now(inode, 0);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ return 1;
+ }
+ return 0;
+}
+
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{