[RFC,v2,7/9] ext4: add LRU eviction for free space trees

Message ID 20200821015523.1698374-8-harshads@google.com
State New
Series ext4: add free-space extent based allocator

Commit Message

Harshad Shirwadkar Aug. 21, 2020, 1:55 a.m. UTC
From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>

This patch adds an LRU eviction policy for free space trees. To avoid
contention on a single LRU lock, the scheme is implemented as two
lists - an active list and an inactive list. Trees on the active list
are not moved within the list, which avoids taking the lock on every
access. Trees on the inactive list become active only if they are
accessed twice within a short interval, which prevents one-off
accesses from promoting outliers to the active list.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 fs/ext4/ext4.h    |  46 +++++++++++++
 fs/ext4/mballoc.c | 164 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 197 insertions(+), 13 deletions(-)

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 93bf2fe35cf1..64d0dbbcd517 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1454,6 +1454,10 @@  struct ext4_frsp_tree {
 							 */
 	struct list_head frsp_list_node;
 	struct rb_node frsp_len_node;
+	atomic_t frsp_fragments;
+	struct list_head frsp_lru_active_node;
+	unsigned long frsp_last_access;
+	atomic_t frsp_ref;
 };
 
 /* Freespace tree flags */
@@ -1468,6 +1472,47 @@  struct ext4_frsp_tree {
 #define EXT4_MB_FRSP_DEFAULT_CACHE_AGGRESSION		0x1
 #define EXT4_MB_FRSP_MAX_CACHE_AGGRESSION		0x3
 
+/* LRU management for free space trees */
+struct ext4_mb_frsp_lru {
+	rwlock_t frsp_lru_lock;
+	atomic_t frsp_active_fragments;		/* Current number of fragments
+						 * in the active queue
+						 */
+	u32 frsp_max_active_fragments;
+	/*
+	 * List of active trees. Trees at the tail are the oldest in the active set.
+	 */
+	struct list_head frsp_lru_active;
+	/*
+	 * List of inactive but loaded trees.
+	 */
+	struct list_head frsp_lru_inactive;
+	struct super_block *frsp_lru_sb;
+};
+
+/*
+ * Minimum memory needed for our allocator. The pathological worst case for
+ * freespace trees is when every other block is allocated. In that case,
+ * the allocator ends up storing an extent node of length 1 for each free
+ * block. We make sure that the minimum memory available to the allocator
+ * is enough for one such worst-case tree, so that at least one tree can
+ * always be kept in memory and thrashing is avoided.
+ */
+#define EXT4_MB_FRSP_MEM_MIN(sb)					\
+	((sizeof(struct ext4_frsp_node) * ext4_flex_bg_size(EXT4_SB(sb))) \
+		* (EXT4_BLOCKS_PER_GROUP(sb) / 2))
+
+/* Half of the total memory available is allowed for the active list */
+#define EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb)				\
+	((EXT4_SB(sb)->s_mb_frsp_mem_limit /				\
+		(2 * sizeof(struct ext4_frsp_node))))
+
+/*
+ * Maximum number of jiffies allowed between two successive hits on a freespace
+ * tree for it to be moved to the "active" queue.
+ */
+#define EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES		100
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1615,6 +1660,7 @@  struct ext4_sb_info {
 	u32 s_mb_frsp_cache_aggression;
 	atomic_t s_mb_num_fragments;
 	u32 s_mb_frsp_mem_limit;
+	struct ext4_mb_frsp_lru s_mb_frsp_lru;
 
 	/* workqueue for reserved extent conversions (buffered io) */
 	struct workqueue_struct *rsv_conversion_wq;
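
To put rough numbers on the limits above (illustrative arithmetic
only; the real node size depends on the build): assuming a 4 KiB
block size (32768 blocks per group), the default flex_bg size of 16,
and, say, a 64-byte struct ext4_frsp_node, EXT4_MB_FRSP_MEM_MIN works
out to 64 * 16 * (32768 / 2) = 16 MiB - enough to hold one worst-case
tree. The default s_mb_frsp_mem_limit (set in
ext4_mb_init_freespace_trees() below) is two block-size units per
group, and EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS then permits half of
that limit, counted in nodes, on the active list.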
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b28b7fb0506e..aea0eb8d28da 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -866,10 +866,12 @@  void ext4_mb_frsp_print_tree_len(struct super_block *sb,
 }
 #endif
 
-static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
+static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb,
+			struct ext4_frsp_tree *tree)
 {
 	struct ext4_frsp_node *node;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
 
 	node = kmem_cache_alloc(ext4_freespace_node_cachep, GFP_NOFS);
 	if (!node)
@@ -879,6 +881,11 @@  static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
 	RB_CLEAR_NODE(&node->frsp_len_node);
 
 	atomic_inc(&sbi->s_mb_num_fragments);
+	atomic_inc(&tree->frsp_fragments);
+	read_lock(&lru->frsp_lru_lock);
+	if (!list_empty(&tree->frsp_lru_active_node))
+		atomic_inc(&lru->frsp_active_fragments);
+	read_unlock(&lru->frsp_lru_lock);
 
 	if (sbi->s_mb_frsp_mem_limit &&
 		atomic_read(&sbi->s_mb_num_fragments) >
@@ -892,12 +899,18 @@  static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb)
 }
 
 static void ext4_mb_frsp_free_node(struct super_block *sb,
-		struct ext4_frsp_node *node)
+		struct ext4_frsp_tree *tree, struct ext4_frsp_node *node)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
 
 	kmem_cache_free(ext4_freespace_node_cachep, node);
 	atomic_dec(&sbi->s_mb_num_fragments);
+	atomic_dec(&tree->frsp_fragments);
+	read_lock(&lru->frsp_lru_lock);
+	if (!list_empty(&tree->frsp_lru_active_node))
+		atomic_dec(&lru->frsp_active_fragments);
+	read_unlock(&lru->frsp_lru_lock);
 
 	if (!sbi->s_mb_frsp_mem_limit ||
 		atomic_read(&sbi->s_mb_num_fragments) <
@@ -915,6 +928,8 @@  void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree)
 	mutex_lock(&tree->frsp_lock);
 	if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED))
 		goto out;
+	if (atomic_read(&tree->frsp_ref))
+		goto out;
 
 	node = rb_first_cached(&tree->frsp_offset_root);
 	while (node) {
@@ -922,7 +937,7 @@  void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree)
 		rb_erase_cached(node, &tree->frsp_offset_root);
 		rb_erase_cached(&frsp_node->frsp_len_node,
 				&tree->frsp_len_root);
-		ext4_mb_frsp_free_node(sb, frsp_node);
+		ext4_mb_frsp_free_node(sb, tree, frsp_node);
 		node = rb_first_cached(&tree->frsp_offset_root);
 	}
 	tree->frsp_flags &= ~EXT4_MB_FRSP_FLAG_LOADED;
@@ -985,7 +1000,7 @@  int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
 				*prev_entry = NULL;
 	struct rb_node *left = NULL, *right = NULL;
 
-	new_entry = ext4_mb_frsp_alloc_node(sb);
+	new_entry = ext4_mb_frsp_alloc_node(sb, tree);
 	if (!new_entry)
 		return -ENOMEM;
 
@@ -1004,7 +1019,7 @@  int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
 			rb_erase_cached(left, &tree->frsp_offset_root);
 			rb_erase_cached(&prev_entry->frsp_len_node,
 						&tree->frsp_len_root);
-			ext4_mb_frsp_free_node(sb, prev_entry);
+			ext4_mb_frsp_free_node(sb, tree, prev_entry);
 		}
 	}
 
@@ -1017,7 +1032,7 @@  int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree,
 			rb_erase_cached(right, &tree->frsp_offset_root);
 			rb_erase_cached(&next_entry->frsp_len_node,
 						&tree->frsp_len_root);
-			ext4_mb_frsp_free_node(sb, next_entry);
+			ext4_mb_frsp_free_node(sb, tree, next_entry);
 		}
 	}
 	ext4_mb_frsp_insert_len(tree, new_entry);
@@ -1601,6 +1616,10 @@  static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index)
 	tree->frsp_flags = 0;
 	tree->frsp_index = index;
 	INIT_LIST_HEAD(&tree->frsp_list_node);
+	atomic_set(&tree->frsp_fragments, 0);
+	tree->frsp_last_access = 0;
+	INIT_LIST_HEAD(&tree->frsp_lru_active_node);
+	atomic_set(&tree->frsp_ref, 0);
 }
 
 int ext4_mb_init_freespace_trees(struct super_block *sb)
@@ -1627,6 +1646,15 @@  int ext4_mb_init_freespace_trees(struct super_block *sb)
 	rwlock_init(&sbi->s_mb_frsp_lock);
 	atomic_set(&sbi->s_mb_num_frsp_trees_cached, 0);
 	atomic_set(&sbi->s_mb_num_fragments, 0);
+	INIT_LIST_HEAD(&sbi->s_mb_frsp_lru.frsp_lru_active);
+	INIT_LIST_HEAD(&sbi->s_mb_frsp_lru.frsp_lru_inactive);
+	rwlock_init(&sbi->s_mb_frsp_lru.frsp_lru_lock);
+	/* Default hard limit: as much memory as the buddy bitmaps would use */
+	sbi->s_mb_frsp_mem_limit = ext4_get_groups_count(sb) <<
+				(sb->s_blocksize_bits + 1);
+	if (sbi->s_mb_frsp_mem_limit < EXT4_MB_FRSP_MEM_MIN(sb))
+		sbi->s_mb_frsp_mem_limit = EXT4_MB_FRSP_MEM_MIN(sb);
+	atomic_set(&sbi->s_mb_frsp_lru.frsp_active_fragments, 0);
 
 	return 0;
 }
@@ -1664,6 +1692,99 @@  int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups)
 	return 0;
 }
 
+/*
+ * Evict one tree from the inactive list.
+ */
+static void ext4_frsp_evict(struct super_block *sb)
+{
+	struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
+	struct ext4_frsp_tree *tree = NULL;
+	bool found = false;
+
+	write_lock(&lru->frsp_lru_lock);
+	if (!list_empty(&lru->frsp_lru_inactive)) {
+		/* Evict from front, insert at tail */
+		found = false;
+		list_for_each_entry(tree, &lru->frsp_lru_inactive,
+			frsp_list_node) {
+			if (!atomic_read(&tree->frsp_ref)) {
+				found = true;
+				break;
+			}
+		}
+		if (found)
+			list_del_init(&tree->frsp_list_node);
+	}
+	write_unlock(&lru->frsp_lru_lock);
+	if (found)
+		ext4_mb_frsp_free_tree(sb, tree);
+}
+
+/*
+ * Maintain the LRU lists; "tree" has just been accessed.
+ */
+static void ext4_mb_frsp_maintain_lru(struct super_block *sb,
+					struct ext4_frsp_tree *tree)
+{
+	struct ext4_mb_frsp_lru *lru = &EXT4_SB(sb)->s_mb_frsp_lru;
+	struct ext4_frsp_tree *last;
+	unsigned long current_jiffies = jiffies;
+
+	read_lock(&lru->frsp_lru_lock);
+	if (!list_empty(&tree->frsp_lru_active_node)) {
+		/* Already active, nothing to do */
+		read_unlock(&lru->frsp_lru_lock);
+		goto out;
+	}
+
+	/*
+	 * Check if this tree needs to be moved to the active list. We move
+	 * it there if one of the following three conditions is true:
+	 * - This is the first access to this tree.
+	 * - We haven't yet reached the max active fragments threshold, so
+	 *   there is room in the active list.
+	 * - This tree was accessed twice within the
+	 *   EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES interval.
+	 */
+	if (tree->frsp_last_access &&
+		EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+		atomic_read(&lru->frsp_active_fragments) >
+			EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+		current_jiffies - tree->frsp_last_access >
+		EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES) {
+		read_unlock(&lru->frsp_lru_lock);
+		goto out;
+	}
+	read_unlock(&lru->frsp_lru_lock);
+
+	write_lock(&lru->frsp_lru_lock);
+	/* Re-check under the write lock in case we raced with another thread */
+	if (!list_empty(&tree->frsp_lru_active_node)) {
+		write_unlock(&lru->frsp_lru_lock);
+		goto out;
+	}
+	list_add(&tree->frsp_lru_active_node, &lru->frsp_lru_active);
+	list_del_init(&tree->frsp_list_node);
+	atomic_add(atomic_read(&tree->frsp_fragments),
+			&lru->frsp_active_fragments);
+	/* Remove trees from the active queue until we are below the limit */
+	while (EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) &&
+		atomic_read(&lru->frsp_active_fragments) >
+			EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb)) {
+		last = list_last_entry(&lru->frsp_lru_active,
+				struct ext4_frsp_tree, frsp_lru_active_node);
+		list_del_init(&last->frsp_lru_active_node);
+		/* Demote to the inactive tail; eviction scans from the front */
+		list_add_tail(&last->frsp_list_node, &lru->frsp_lru_inactive);
+		atomic_sub(atomic_read(&last->frsp_fragments),
+			&lru->frsp_active_fragments);
+	}
+	write_unlock(&lru->frsp_lru_lock);
+
+out:
+	tree->frsp_last_access = current_jiffies;
+}
+
 /*
  * Divide blocks started from @first with length @len into
  * smaller chunks with power of 2 blocks.
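
A note on reading ext4_mb_frsp_maintain_lru() above: the early-return
test is the De Morgan negation of the promotion conditions listed in
its comment (plus the degenerate case of no configured limit). Spelled
out as a positive predicate - a sketch, not code from the patch - the
decision is:

	promote = !tree->frsp_last_access ||		/* first access */
		  !EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) ||
		  atomic_read(&lru->frsp_active_fragments) <=
			EXT4_MB_FRSP_MAX_ACTIVE_FRAGMENTS(sb) ||
		  current_jiffies - tree->frsp_last_access <=
			EXT4_MB_FRSP_ACTIVE_THRESHOLD_JIFFIES;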
@@ -2154,10 +2275,12 @@  ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group,
 		e4b->frsp_tree = ext4_get_frsp_tree(sb,
 					ext4_flex_group(sbi, group));
 		e4b->frsp_flags = flags;
+		ext4_mb_frsp_maintain_lru(sb, e4b->frsp_tree);
 		if (flags & EXT4_ALLOCATOR_FRSP_NOLOAD)
 			return 0;
 
 		mutex_lock(&e4b->frsp_tree->frsp_lock);
+		atomic_inc(&e4b->frsp_tree->frsp_ref);
 		if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) {
 			mutex_unlock(&e4b->frsp_tree->frsp_lock);
 			return 0;
@@ -2285,8 +2408,15 @@  static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group,
 
 static void ext4_mb_unload_allocator(struct ext4_buddy *e4b)
 {
-	if (ext4_mb_frsp_on(e4b->bd_sb))
+	if (ext4_mb_frsp_on(e4b->bd_sb)) {
+		if (!e4b->frsp_tree ||
+			e4b->frsp_flags & EXT4_ALLOCATOR_FRSP_NOLOAD)
+			return;
+		atomic_dec(&e4b->frsp_tree->frsp_ref);
+		if (test_opt2(e4b->bd_sb, FRSP_MEM_CRUNCH))
+			ext4_frsp_evict(e4b->bd_sb);
 		return;
+	}
 	if (e4b->bd_bitmap_page)
 		put_page(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
@@ -2658,7 +2788,8 @@  static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 				ex->fe_node->frsp_offset +
 				ex->fe_node->frsp_len) {
 				/* Need to split the node */
-				new = ext4_mb_frsp_alloc_node(e4b->bd_sb);
+				new = ext4_mb_frsp_alloc_node(e4b->bd_sb,
+								e4b->frsp_tree);
 				if (!new)
 					return -ENOMEM;
 				new->frsp_offset = flex_offset + ex->fe_len;
@@ -2675,7 +2806,8 @@  static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 			ex->fe_node->frsp_offset += ex->fe_len;
 			ex->fe_node->frsp_len -= ex->fe_len;
 		} else {
-			ext4_mb_frsp_free_node(e4b->bd_sb, ex->fe_node);
+			ext4_mb_frsp_free_node(e4b->bd_sb, e4b->frsp_tree,
+							ex->fe_node);
 			return 0;
 		}
 
@@ -4166,7 +4298,9 @@  static void ext4_free_data_in_buddy(struct super_block *sb,
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
 		 */
-		if (!ext4_mb_frsp_on(sb)) {
+		if (ext4_mb_frsp_on(sb)) {
+			atomic_dec(&e4b.frsp_tree->frsp_ref);
+		} else {
 			put_page(e4b.bd_buddy_page);
 			put_page(e4b.bd_bitmap_page);
 		}
@@ -6146,14 +6280,18 @@  ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	new_node = &new_entry->efd_node;
 	cluster = new_entry->efd_start_cluster;
 
-	if (!*n && !ext4_mb_frsp_on(sb)) {
+	if (!*n) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
 		 * otherwise we'll refresh it from
 		 * on-disk bitmap and lose not-yet-available
 		 * blocks */
-		get_page(e4b->bd_buddy_page);
-		get_page(e4b->bd_bitmap_page);
+		if (ext4_mb_frsp_on(sb)) {
+			atomic_inc(&e4b->frsp_tree->frsp_ref);
+		} else {
+			get_page(e4b->bd_buddy_page);
+			get_page(e4b->bd_bitmap_page);
+		}
 	}
 	while (*n) {
 		parent = *n;