mm: multi-gen LRU: merge v11 patchset

Signed-off-by: Juhyung Park <qkrwngud825@gmail.com>
Signed-off-by: 0wnerDied <z1281552865@gmail.com>
Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com>
This commit is contained in:
Juhyung Park
2022-10-27 15:59:15 +09:00
committed by kondors1995
parent 77e80d09d6
commit 5e73282a84
12 changed files with 367 additions and 330 deletions

View File

@@ -1167,7 +1167,6 @@ config ARCH_USE_MEMREMAP_PROT
config ARCH_HAS_NONLEAF_PMD_YOUNG
bool
depends on PGTABLE_LEVELS > 2
help
Architectures that select this option are capable of setting the
accessed bit in non-leaf PMD entries when using them as part of linear
address translation. Page table walkers that clear the accessed bit
may use this capability to reduce their search space.

View File

@@ -57,7 +57,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
# Causing hangs/crashes, see the commit that added this change for details.
select ARCH_HAS_REFCOUNT
select ARCH_HAS_NONLEAF_PMD_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN

View File

@@ -5,6 +5,17 @@
#include <linux/huge_mm.h>
#include <linux/swap.h>
#ifndef try_cmpxchg
#define try_cmpxchg(_ptr, _oldp, _new) \
({ \
typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
___r = cmpxchg((_ptr), ___o, (_new)); \
if (unlikely(___r != ___o)) \
*___op = ___r; \
likely(___r == ___o); \
})
#endif /* try_cmpxchg */
/**
* page_is_file_cache - should the page be on a file LRU or anon LRU?
* @page: the page to test
@@ -121,17 +132,38 @@ static inline int lru_hist_from_seq(unsigned long seq)
static inline int lru_tier_from_refs(int refs)
{
VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
/* see the comment on MAX_NR_TIERS */
/* see the comment in page_lru_refs() */
return order_base_2(refs + 1);
}
static inline int page_lru_refs(struct page *page)
{
unsigned long flags = READ_ONCE(page->flags);
bool workingset = flags & BIT(PG_workingset);
/*
* Return the number of accesses beyond PG_referenced, i.e., N-1 if the
* total number of accesses is N>1, since N=0,1 both map to the first
* tier. lru_tier_from_refs() will account for this off-by-one. Also see
* the comment on MAX_NR_TIERS.
*/
return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}
static inline int page_lru_gen(struct page *page)
{
unsigned long flags = READ_ONCE(page->flags);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->lrugen.max_seq;
VM_BUG_ON(gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
/* see the comment on MIN_NR_GENS */
return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
@@ -146,9 +178,9 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
enum lru_list lru = type * LRU_INACTIVE_FILE;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
VM_BUG_ON(old_gen == -1 && new_gen == -1);
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
if (old_gen >= 0)
WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
@@ -180,17 +212,19 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
}
/* demotion requires isolation, e.g., lru_deactivate_fn() */
VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
int gen;
unsigned long old_flags, new_flags;
unsigned long mask, flags;
int gen = page_lru_gen(page);
int type = page_is_file_cache(page);
int zone = page_zonenum(page);
struct lru_gen_struct *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_PAGE(gen != -1, page);
if (PageUnevictable(page) || !lrugen->enabled)
return false;
/*
@@ -210,14 +244,10 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo
else
gen = lru_gen_from_seq(lrugen->min_seq[type]);
do {
new_flags = old_flags = READ_ONCE(page->flags);
VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
/* see the comment on MIN_NR_GENS */
new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
/* see the comment on MIN_NR_GENS */
mask = LRU_GEN_MASK | BIT(PG_active);
flags = (gen + 1UL) << LRU_GEN_PGOFF;
set_mask_bits(&page->flags, mask, flags);
lru_gen_update_size(lruvec, page, -1, gen);
/* for rotate_reclaimable_page() */
@@ -231,28 +261,25 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo
static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
int gen;
unsigned long old_flags, new_flags;
unsigned long mask, flags;
int gen = page_lru_gen(page);
do {
new_flags = old_flags = READ_ONCE(page->flags);
if (!(new_flags & LRU_GEN_MASK))
return false;
if (gen < 0)
return false;
VM_BUG_ON_PAGE(PageActive(page), page);
VM_BUG_ON_PAGE(PageUnevictable(page), page);
VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
mask = LRU_GEN_MASK;
flags = 0;
/* for shrink_page_list() or page_migrate_flags() */
if (reclaiming)
mask |= BIT(PG_referenced) | BIT(PG_reclaim);
else if (lru_gen_is_active(lruvec, gen))
flags |= BIT(PG_active);
new_flags &= ~LRU_GEN_MASK;
if (!(new_flags & BIT(PG_referenced)))
new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
/* for shrink_page_list() */
if (reclaiming)
new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
else if (lru_gen_is_active(lruvec, gen))
new_flags |= BIT(PG_active);
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
flags = set_mask_bits(&page->flags, mask, flags);
gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
lru_gen_update_size(lruvec, page, gen, -1);
list_del(&page->lru);

View File

@@ -15,7 +15,6 @@
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/nodemask.h>
#include <linux/mmdebug.h>
#include <asm/mmu.h>
@@ -542,7 +541,7 @@ struct mm_struct {
* whether it has been used since the last time per-node
* page table walkers cleared the corresponding bits.
*/
nodemask_t nodes;
unsigned long bitmap;
} lru_gen;
#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
@@ -584,16 +583,16 @@ static inline void lru_gen_init_mm(struct mm_struct *mm)
#ifdef CONFIG_MEMCG
mm->lru_gen.memcg = NULL;
#endif
nodes_clear(mm->lru_gen.nodes);
mm->lru_gen.bitmap = 0;
}
static inline void lru_gen_use_mm(struct mm_struct *mm)
{
/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
VM_WARN_ON(list_empty(&mm->lru_gen.list));
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lru_gen.nodes))
nodes_setall(mm->lru_gen.nodes);
if (!(current->flags & PF_KTHREAD))
WRITE_ONCE(mm->lru_gen.bitmap, -1);
}
#else /* !CONFIG_LRU_GEN */

View File

@@ -250,9 +250,9 @@ struct zone_reclaim_stat {
* Evictable pages are divided into multiple generations. The youngest and the
* oldest generation numbers, max_seq and min_seq, are monotonically increasing.
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, gen, indexes the LRU list of the corresponding
* generation. The gen counter in page->flags stores gen+1 while a page is on
* one of lrugen->lists[]. Otherwise it stores 0.
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in page->flags stores gen+1 while
* a page is on one of lrugen->lists[]. Otherwise it stores 0.
*
* A page is added to the youngest generation on faulting. The aging needs to
* check the accessed bit at least twice before handing this page over to the
@@ -260,16 +260,17 @@ struct zone_reclaim_stat {
* fault; the second check makes sure this page hasn't been used since then.
* This process, AKA second chance, requires a minimum of two generations,
* hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
* LRU, these two generations are considered active; the rest of generations, if
* they exist, are considered inactive. See lru_gen_is_active(). PG_active is
* always cleared while a page is on one of lrugen->lists[] so that the aging
* needs not to worry about it. And it's set again when a page considered active
* is isolated for non-reclaiming purposes, e.g., migration. See
* lru_gen_add_page() and lru_gen_del_page().
* LRU, e.g., /proc/vmstat, these two generations are considered active; the
* rest of generations, if they exist, are considered inactive. See
* lru_gen_is_active(). PG_active is always cleared while a page is on one of
* lrugen->lists[] so that the aging needs not to worry about it. And it's set
* again when a page considered active is isolated for non-reclaiming purposes,
* e.g., migration. See lru_gen_add_page() and lru_gen_del_page().
*
* MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice of the
* categories of the active/inactive LRU when keeping track of accesses through
* page tables. It requires order_base_2(MAX_NR_GENS+1) bits in page->flags.
* MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through page tables. It requires order_base_2(MAX_NR_GENS+1) bits in
* page->flags (LRU_GEN_MASK).
*/
#define MIN_NR_GENS 2U
#define MAX_NR_GENS 4U
@@ -280,18 +281,20 @@ struct zone_reclaim_stat {
* times through file descriptors is in tier order_base_2(N). A page in the
* first tier (N=0,1) is marked by PG_referenced unless it was faulted in
* though page tables or read ahead. A page in any other tier (N>1) is marked
* by PG_referenced and PG_workingset.
* by PG_referenced and PG_workingset. This implies a minimum of two tiers is
* supported without using additional bits in page->flags.
*
* In contrast to moving across generations which requires the LRU lock, moving
* across tiers only requires operations on page->flags and therefore has a
* negligible cost in the buffered access path. In the eviction path,
* across tiers only involves atomic operations on page->flags and therefore
* has a negligible cost in the buffered access path. In the eviction path,
* comparisons of refaulted/(evicted+protected) from the first tier and the
* rest infer whether pages accessed multiple times through file descriptors
* are statistically hot and thus worth protecting.
*
* MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice of the
* categories of the active/inactive LRU when keeping track of accesses through
* file descriptors. It requires MAX_NR_TIERS-2 additional bits in page->flags.
* MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when keeping track of
* accesses through file descriptors. It uses MAX_NR_TIERS-2 spare bits in
* page->flags (LRU_REFS_MASK).
*/
#define MAX_NR_TIERS 4U
@@ -303,7 +306,6 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
#ifdef CONFIG_LRU_GEN
@@ -338,6 +340,9 @@ enum {
* Normally anon and file min_seq are in sync. But if swapping is constrained,
* e.g., out of swap space, file min_seq is allowed to advance and leave anon
* min_seq behind.
*
* nr_pages[] are eventually consistent and therefore can be transiently
* negative when reset_batch_size() is pending.
*/
struct lru_gen_struct {
/* the aging increments the youngest generation number */
@@ -349,7 +354,7 @@ struct lru_gen_struct {
/* the multi-gen LRU lists */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the sizes of the above lists */
unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
/* the exponential moving average of evicted+protected */
@@ -412,7 +417,7 @@ struct lru_gen_mm_walk {
/* total batched items */
int batched;
bool can_swap;
bool full_scan;
bool force_scan;
};
void lru_gen_init_lruvec(struct lruvec *lruvec);

View File

@@ -55,7 +55,7 @@
#define ZONES_WIDTH ZONES_SHIFT
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LRU_GEN_WIDTH+LRU_REFS_WIDTH \
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LRU_GEN_WIDTH \
<= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#else
@@ -77,7 +77,7 @@
#define LAST_CPUPID_SHIFT 0
#endif
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LRU_GEN_WIDTH+LRU_REFS_WIDTH+ \
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LRU_GEN_WIDTH+ \
LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
@@ -86,7 +86,7 @@
#ifdef CONFIG_KASAN_SW_TAGS
#define KASAN_TAG_WIDTH 8
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LRU_GEN_WIDTH+LRU_REFS_WIDTH+ \
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_WIDTH+LRU_GEN_WIDTH+ \
KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "KASAN: not enough bits in page flags for tag"
#endif
@@ -106,4 +106,9 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
/* see the comment on MAX_NR_TIERS */
#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */

View File

@@ -24,10 +24,10 @@ int main(void)
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
#ifdef CONFIG_LRU_GEN
DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
DEFINE(LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
#else
DEFINE(LRU_GEN_WIDTH, 0);
DEFINE(LRU_REFS_WIDTH, 0);
DEFINE(__LRU_REFS_WIDTH, 0);
#endif
/* End of constants */

View File

@@ -852,8 +852,8 @@ config FORCE_ALLOC_FROM_DMA_ZONE
config LRU_GEN
bool "Multi-Gen LRU"
depends on MMU
# the following options can use up the spare bits in page flags
depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
# make sure page->flags has enough spare bits
depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
help
A high performance LRU implementation to overcommit memory.

View File

@@ -5241,9 +5241,10 @@ static void mem_cgroup_move_task(void)
#ifdef CONFIG_LRU_GEN
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
struct task_struct *task = NULL;
/* find the first leader if there is any */
cgroup_taskset_for_each_leader(task, css, tset)
break;

View File

@@ -364,33 +364,30 @@ static void __lru_cache_activate_page(struct page *page)
#ifdef CONFIG_LRU_GEN
static void page_inc_refs(struct page *page)
{
unsigned long refs;
unsigned long old_flags, new_flags;
unsigned long new_flags, old_flags = READ_ONCE(page->flags);
if (PageUnevictable(page))
return;
if (!PageReferenced(page)) {
SetPageReferenced(page);
return;
}
if (!PageWorkingset(page)) {
SetPageWorkingset(page);
return;
}
/* see the comment on MAX_NR_TIERS */
do {
new_flags = old_flags = READ_ONCE(page->flags);
new_flags = old_flags & LRU_REFS_MASK;
if (new_flags == LRU_REFS_MASK)
break;
if (!(new_flags & BIT(PG_referenced))) {
new_flags |= BIT(PG_referenced);
continue;
}
if (!(new_flags & BIT(PG_workingset))) {
new_flags |= BIT(PG_workingset);
continue;
}
refs = new_flags & LRU_REFS_MASK;
refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
new_flags &= ~LRU_REFS_MASK;
new_flags |= refs;
} while (new_flags != old_flags &&
cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
new_flags += BIT(LRU_REFS_PGOFF);
new_flags |= old_flags & ~LRU_REFS_MASK;
} while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
}
#else
static void page_inc_refs(struct page *page)

File diff suppressed because it is too large Load Diff

View File

@@ -216,43 +216,34 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
#ifdef CONFIG_LRU_GEN
static int page_lru_refs(struct page *page)
{
unsigned long flags = READ_ONCE(page->flags);
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
/* see the comment on MAX_NR_TIERS */
return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
}
void *lru_gen_eviction(struct page *page)
{
int hist, tier;
int hist;
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_struct *lrugen;
int type = page_is_file_cache(page);
int refs = page_lru_refs(page);
int delta = hpage_nr_pages(page);
bool workingset = PageWorkingset(page);
int refs = page_lru_refs(page);
int tier = lru_tier_from_refs(refs);
struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
if (!mem_cgroup_disabled() && !memcg)
return NULL;
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
lruvec = mem_cgroup_lruvec(pgdat, memcg);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
token = (min_seq << LRU_REFS_WIDTH) | refs;
token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
hist = lru_hist_from_seq(min_seq);
tier = lru_tier_from_refs(refs + workingset);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
void lru_gen_refault(struct page *page, void *shadow)
@@ -271,27 +262,27 @@ void lru_gen_refault(struct page *page, void *shadow)
unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
refs = token & (BIT(LRU_REFS_WIDTH) - 1);
if (refs && !workingset)
return;
if (page_pgdat(page) != pgdat)
return;
/* see the comment in page_lru_refs() */
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
rcu_read_lock();
memcg = mem_cgroup_from_id(memcg_id);
if (!mem_cgroup_disabled() && !memcg)
goto unlock;
token >>= LRU_REFS_WIDTH;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
token >>= LRU_REFS_WIDTH;
if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
goto unlock;
hist = lru_hist_from_seq(min_seq);
tier = lru_tier_from_refs(refs + workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT, delta);
@@ -302,7 +293,7 @@ void lru_gen_refault(struct page *page, void *shadow)
* 2. For pages accessed through file descriptors, numbers of accesses
* might have been beyond the limit.
*/
if (lru_gen_in_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
SetPageWorkingset(page);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE, delta);
}