Revert "FROMLIST: mm: multi-gen LRU: minimal implementation"

This reverts commit a1537a68c5.

To be replaced with upstream version.

Bug: 249601646
Change-Id: I3dfbb3ec56cfdb5a2db7ec00c124dae471cce932
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Author: Kalesh Singh
Date:   2022-11-07 16:00:54 -08:00
Commit: 6e620d5117 (parent: 4bce99229a)

7 changed files with 4 additions and 1041 deletions

include/linux/mm_inline.h

@@ -111,19 +111,6 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
}
static inline int lru_hist_from_seq(unsigned long seq)
{
return seq % NR_HIST_GENS;
}
static inline int lru_tier_from_refs(int refs)
{
VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
/* see the comment on MAX_NR_TIERS */
return order_base_2(refs + 1);
}
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
unsigned long max_seq = lruvec->lrugen.max_seq;
@@ -169,15 +156,6 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
__update_lru_size(lruvec, lru, zone, -delta);
return;
}
/* promotion */
if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
__update_lru_size(lruvec, lru, zone, -delta);
__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
}
/* demotion requires isolation, e.g., lru_deactivate_fn() */
VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
@@ -242,8 +220,6 @@ static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bo
gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
new_flags &= ~LRU_GEN_MASK;
if (!(new_flags & BIT(PG_referenced)))
new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
/* for shrink_page_list() */
if (reclaiming)
new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
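
The tier mapping above is compact enough to check by hand: lru_tier_from_refs() buckets reference counts logarithmically. Below is a minimal userspace sketch, with order_base_2() reimplemented since the kernel headers are not available outside the tree; the printed table is only illustrative.

#include <stdio.h>

/* order_base_2(n): log2 of n rounded up to the next power of 2; the
 * kernel version lives in <linux/log2.h>. */
static int order_base_2(unsigned long n)
{
        int order = 0;

        while ((1UL << order) < n)
                order++;
        return order;
}

static int lru_tier_from_refs(int refs)
{
        return order_base_2(refs + 1);
}

int main(void)
{
        for (int refs = 0; refs <= 7; refs++)
                printf("refs=%d -> tier=%d\n", refs, lru_tier_from_refs(refs));
        /* 0->0, 1->1, 2..3->2, 4..7->3: four tiers cover refs up to
         * BIT(LRU_REFS_WIDTH), hence the VM_BUG_ON() above. */
        return 0;
}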

include/linux/mmzone.h

@@ -327,34 +327,12 @@ enum lruvec_flags {
#define MIN_NR_GENS 2U
#define MAX_NR_GENS 4U
/*
* Each generation is divided into multiple tiers. Tiers represent different
* ranges of numbers of accesses through file descriptors. A page accessed N
* times through file descriptors is in tier order_base_2(N). A page in the
* first tier (N=0,1) is marked by PG_referenced unless it was faulted in
* through page tables or read ahead. A page in any other tier (N>1) is marked
* by PG_referenced and PG_workingset.
*
* In contrast to moving across generations which requires the LRU lock, moving
* across tiers only requires operations on page->flags and therefore has a
* negligible cost in the buffered access path. In the eviction path,
* comparisons of refaulted/(evicted+protected) from the first tier and the
* rest infer whether pages accessed multiple times through file descriptors
* are statistically hot and thus worth protecting.
*
* MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
* number of categories of the active/inactive LRU when tracking accesses through
* file descriptors. It requires MAX_NR_TIERS-2 additional bits in page->flags.
*/
#define MAX_NR_TIERS 4U
#ifndef __GENERATING_BOUNDS_H
struct lruvec;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
#ifdef CONFIG_LRU_GEN
@@ -363,16 +341,6 @@ enum {
LRU_GEN_FILE,
};
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128)
/* whether to keep historical stats from evicted generations */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS MAX_NR_GENS
#else
#define NR_HIST_GENS 1U
#endif
/*
* The youngest generation number is stored in max_seq for both anon and file
* types as they are aged on an equal footing. The oldest generation numbers are
@@ -392,15 +360,6 @@ struct lru_gen_struct {
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the sizes of the above lists */
unsigned long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
/* the exponential moving average of evicted+protected */
unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
/* the first tier doesn't need protection, hence the minus one */
unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
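
The struct above never moves pages between arrays when a generation ages out; sequence numbers grow monotonically and are folded into the fixed-size lists[] ring. A small sketch of that indexing, assuming MAX_NR_GENS = 4 as defined above and an invented window of live generations:

#include <stdio.h>

#define MAX_NR_GENS 4U

/* same folding as lru_gen_from_seq() in mm_inline.h */
static int lru_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

int main(void)
{
        /* a hypothetical window: three live generations out of four slots */
        unsigned long min_seq = 6, max_seq = 8;

        for (unsigned long seq = min_seq; seq <= max_seq; seq++)
                printf("seq %lu -> lists[%d]\n", seq, lru_gen_from_seq(seq));
        /* seqs 6..8 occupy ring slots 2, 3 and 0; slot 1 stays free, so
         * max_seq can be incremented without touching the oldest lists. */
        return 0;
}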

kernel/bounds.c

@@ -24,7 +24,7 @@ int main(void)
 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 #ifdef CONFIG_LRU_GEN
 	DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
-	DEFINE(LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+	DEFINE(LRU_REFS_WIDTH, 0);
 #else
 	DEFINE(LRU_GEN_WIDTH, 0);
 	DEFINE(LRU_REFS_WIDTH, 0);
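
The widths being reverted here are worth spelling out. With MAX_NR_GENS = 4 and MAX_NR_TIERS = 4, the generation takes order_base_2(4 + 1) = 3 bits of page->flags (the stored value 0 means "not on a multi-gen LRU list", so 0..4 must fit), and the tier refs counter takes 4 - 2 = 2 bits, the first two tiers being encoded by PG_referenced and PG_workingset instead. A compile-time sketch of that arithmetic, with the constants mirrored from mmzone.h:

/* compile-time sketch of the bit budget implied by bounds.c */
#define MAX_NR_GENS	4U
#define MAX_NR_TIERS	4U
#define LRU_GEN_WIDTH	3	/* order_base_2(MAX_NR_GENS + 1) */
#define LRU_REFS_WIDTH	(MAX_NR_TIERS - 2)

_Static_assert((1U << LRU_GEN_WIDTH) >= MAX_NR_GENS + 1,
	       "gen field must hold 0 (off-list) plus gens 1..4");
_Static_assert(LRU_REFS_WIDTH == 2,
	       "two extra flag bits count accesses for tiers 2 and 3");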

mm/Kconfig

@@ -916,7 +916,6 @@ config ANON_VMA_NAME
area from being merged with adjacent virtual memory areas due to the
difference in their name.
# multi-gen LRU {
config LRU_GEN
bool "Multi-Gen LRU"
depends on MMU
@@ -925,16 +924,6 @@ config LRU_GEN
help
A high-performance LRU implementation to overcommit memory.
config LRU_GEN_STATS
bool "Full stats for debugging"
depends on LRU_GEN
help
Do not enable this option unless you plan to look at historical stats
from evicted generations for debugging purposes.
This option has a per-memcg and per-node memory overhead.
# }
source "mm/damon/Kconfig"
config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
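
Before this revert, a board that wanted the feature but not the debugging overhead would have carried a defconfig fragment like the following (a hypothetical example, not part of this commit):

CONFIG_LRU_GEN=y
# CONFIG_LRU_GEN_STATS is not set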

mm/swap.c

@@ -389,43 +389,6 @@ static void __lru_cache_activate_page(struct page *page)
local_unlock(&lru_pvecs.lock);
}
#ifdef CONFIG_LRU_GEN
static void page_inc_refs(struct page *page)
{
unsigned long refs;
unsigned long old_flags, new_flags;
if (PageUnevictable(page))
return;
/* see the comment on MAX_NR_TIERS */
do {
new_flags = old_flags = READ_ONCE(page->flags);
if (!(new_flags & BIT(PG_referenced))) {
new_flags |= BIT(PG_referenced);
continue;
}
if (!(new_flags & BIT(PG_workingset))) {
new_flags |= BIT(PG_workingset);
continue;
}
refs = new_flags & LRU_REFS_MASK;
refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
new_flags &= ~LRU_REFS_MASK;
new_flags |= refs;
} while (new_flags != old_flags &&
cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
}
#else
static void page_inc_refs(struct page *page)
{
}
#endif /* CONFIG_LRU_GEN */
/*
* Mark a page as having seen activity.
*
@@ -440,11 +403,6 @@ void mark_page_accessed(struct page *page)
{
page = compound_head(page);
if (lru_gen_enabled()) {
page_inc_refs(page);
return;
}
trace_android_vh_mark_page_accessed(page);
if (!PageReferenced(page)) {
SetPageReferenced(page);
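
page_inc_refs() above spreads a saturating access counter across PG_referenced, PG_workingset and the LRU_REFS_MASK field. The following userspace sketch replays that state machine without the cmpxchg() retry loop; the bit positions are invented for illustration and do not match the real page-flags layout.

#include <stdio.h>

#define PG_referenced	(1UL << 0)	/* invented bit positions */
#define PG_workingset	(1UL << 1)
#define LRU_REFS_PGOFF	2
#define LRU_REFS_WIDTH	2
#define LRU_REFS_MASK	(((1UL << LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

static void inc_refs(unsigned long *flags)
{
	if (!(*flags & PG_referenced)) {
		*flags |= PG_referenced;		/* first access */
	} else if (!(*flags & PG_workingset)) {
		*flags |= PG_workingset;		/* second access */
	} else {
		unsigned long refs = *flags & LRU_REFS_MASK;

		refs += 1UL << LRU_REFS_PGOFF;
		if (refs > LRU_REFS_MASK)
			refs = LRU_REFS_MASK;		/* saturate */
		*flags = (*flags & ~LRU_REFS_MASK) | refs;
	}
}

int main(void)
{
	unsigned long flags = 0;

	for (int access = 1; access <= 8; access++) {
		inc_refs(&flags);
		printf("access %d: flags=%#lx\n", access, flags);
	}
	/* accesses 1 and 2 set the two flag bits; accesses 3..5 count refs
	 * 1..3, after which the counter saturates. The kernel performs the
	 * same update under a cmpxchg() loop to tolerate concurrent
	 * modifications of page->flags. */
	return 0;
}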

mm/vmscan.c

@@ -1153,11 +1153,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
-		/* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */
+		mem_cgroup_swapout(page, swap);
 		if (reclaimed && !mapping_exiting(mapping))
 			shadow = workingset_eviction(page, target_memcg);
-		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
@@ -2617,9 +2615,6 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
unsigned long file;
struct lruvec *target_lruvec;
if (lru_gen_enabled())
return;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
/*
@@ -2953,38 +2948,11 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
* shorthand helpers
******************************************************************************/
#define DEFINE_MAX_SEQ(lruvec) \
unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
#define DEFINE_MIN_SEQ(lruvec) \
unsigned long min_seq[ANON_AND_FILE] = { \
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
static int page_lru_gen(struct page *page)
{
unsigned long flags = READ_ONCE(page->flags);
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}
static int page_lru_tier(struct page *page)
{
int refs;
unsigned long flags = READ_ONCE(page->flags);
refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
return lru_tier_from_refs(refs);
}
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3005,755 +2973,6 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
return pgdat ? &pgdat->__lruvec : NULL;
}
static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
{
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
return mem_cgroup_swappiness(memcg);
}
static int get_nr_gens(struct lruvec *lruvec, int type)
{
return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
}
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
/* see the comment on lru_gen_struct */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
}
/******************************************************************************
* refault feedback loop
******************************************************************************/
/*
* A feedback loop based on Proportional-Integral-Derivative (PID) controller.
*
* The P term is refaulted/(evicted+protected) from a tier in the generation
* currently being evicted; the I term is the exponential moving average of the
* P term over the generations previously evicted, using the smoothing factor
* 1/2; the D term isn't supported.
*
* The setpoint (SP) is always the first tier of one type; the process variable
* (PV) is either any tier of the other type or any other tier of the same
* type.
*
* The error is the difference between the SP and the PV; the correction is to
* turn off protection when SP>PV or to turn on protection when SP<PV.
*
* For future optimizations:
* 1. The D term may discount the other two terms over time so that long-lived
* generations can resist stale information.
*/
struct ctrl_pos {
unsigned long refaulted;
unsigned long total;
int gain;
};
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
pos->refaulted = lrugen->avg_refaulted[type][tier] +
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
pos->total = lrugen->avg_total[type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
if (tier)
pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
}
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
lockdep_assert_held(&lruvec->lru_lock);
if (!carryover && !clear)
return;
hist = lru_hist_from_seq(seq);
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
if (carryover) {
unsigned long sum;
sum = lrugen->avg_refaulted[type][tier] +
atomic_long_read(&lrugen->refaulted[hist][type][tier]);
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
if (tier)
sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
if (tier)
WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
}
}
}
static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
/*
* Return true if the PV has a limited number of refaults or a lower
* refaulted/total ratio than the SP.
*/
return pv->refaulted < MIN_LRU_BATCH ||
pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
(sp->refaulted + 1) * pv->total * pv->gain;
}
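
Plugging invented numbers into positive_ctrl_err() makes the comparison concrete. In the sketch below, both tiers hold total = 1000 evicted pages; one refaulted half of them, the other close to the first tier's rate. MIN_LRU_BATCH is 64, as on a 64-bit build, and the 2x gain is the 1:2 margin described in get_tier_idx() below.

#include <stdbool.h>
#include <stdio.h>

#define MIN_LRU_BATCH 64	/* BITS_PER_LONG on a 64-bit build */

struct ctrl_pos {
	unsigned long refaulted;
	unsigned long total;
	int gain;
};

static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
{
	return pv->refaulted < MIN_LRU_BATCH ||
	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
	       (sp->refaulted + 1) * pv->total * pv->gain;
}

int main(void)
{
	struct ctrl_pos sp = { .refaulted = 100, .total = 1000, .gain = 1 };
	struct ctrl_pos hot = { .refaulted = 500, .total = 1000, .gain = 2 };
	struct ctrl_pos cold = { .refaulted = 120, .total = 1000, .gain = 2 };

	/* 500*1064*1 <= 101*1000*2 is false and 500 >= 64, so the hot tier
	 * returns 0: its refault ratio beats the setpoint even with the 2x
	 * margin, and it should be protected */
	printf("hot:  %d\n", positive_ctrl_err(&sp, &hot));
	/* 120*1064*1 <= 101*1000*2 holds, so the cold tier returns 1: it is
	 * statistically no hotter than the first tier and can be evicted */
	printf("cold: %d\n", positive_ctrl_err(&sp, &cold));
	return 0;
}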
/******************************************************************************
* the aging
******************************************************************************/
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
unsigned long old_flags, new_flags;
int type = page_is_file_lru(page);
struct lru_gen_struct *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
do {
new_flags = old_flags = READ_ONCE(page->flags);
VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page);
new_gen = (old_gen + 1) % MAX_NR_GENS;
new_flags &= ~LRU_GEN_MASK;
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
/* for end_page_writeback() */
if (reclaiming)
new_flags |= BIT(PG_reclaim);
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
lru_gen_update_size(lruvec, page, old_gen, new_gen);
return new_gen;
}
static void inc_min_seq(struct lruvec *lruvec)
{
int type;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
VM_BUG_ON(!seq_is_valid(lruvec));
for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
}
}
static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
{
int gen, type, zone;
bool success = false;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_BUG_ON(!seq_is_valid(lruvec));
for (type = !can_swap; type < ANON_AND_FILE; type++) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
if (!list_empty(&lrugen->lists[gen][type][zone]))
goto next;
}
min_seq[type]++;
}
next:
;
}
/* see the comment on lru_gen_struct */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
}
for (type = !can_swap; type < ANON_AND_FILE; type++) {
if (min_seq[type] == lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
success = true;
}
return success;
}
static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
{
int prev, next;
int type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
spin_lock_irq(&lruvec->lru_lock);
VM_BUG_ON(!seq_is_valid(lruvec));
if (max_seq != lrugen->max_seq)
goto unlock;
inc_min_seq(lruvec);
/*
* Update the active/inactive LRU sizes for compatibility. Both sides of
* the current max_seq need to be covered, since max_seq+1 can overlap
* with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
* overlap, cold/hot inversion happens. This can be solved by moving
* pages from min_seq to min_seq+1 but is omitted for simplicity.
*/
prev = lru_gen_from_seq(lrugen->max_seq - 1);
next = lru_gen_from_seq(lrugen->max_seq + 1);
for (type = 0; type < ANON_AND_FILE; type++) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
enum lru_list lru = type * LRU_INACTIVE_FILE;
long delta = lrugen->nr_pages[prev][type][zone] -
lrugen->nr_pages[next][type][zone];
if (!delta)
continue;
__update_lru_size(lruvec, lru, zone, delta);
__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
}
}
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
unlock:
spin_unlock_irq(&lruvec->lru_lock);
}
static long get_nr_evictable(struct lruvec *lruvec, unsigned long max_seq,
unsigned long *min_seq, bool can_swap, bool *need_aging)
{
int gen, type, zone;
long old = 0;
long young = 0;
long total = 0;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
for (type = !can_swap; type < ANON_AND_FILE; type++) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
long size = 0;
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += READ_ONCE(lrugen->nr_pages[gen][type][zone]);
total += size;
if (seq == max_seq)
young += size;
if (seq + MIN_NR_GENS == max_seq)
old += size;
}
}
/*
* The aging and the eviction form a typical producer-consumer model. The
* aging tries to be lazy to reduce the unnecessary overhead. On the
* other hand, the eviction stalls when the number of generations
* reaches MIN_NR_GENS. So ideally, there should be MIN_NR_GENS+1
* generations, hence the first two if's.
*
* In addition, it's ideal to spread pages out evenly, meaning
* 1/(MIN_NR_GENS+1) of the total number of pages for each generation. A
* reasonable range for this average portion would be [1/MIN_NR_GENS,
* 1/(MIN_NR_GENS+2)]. From the consumer's POV, the eviction only cares
* about the lower bound of cold pages, i.e., 1/(MIN_NR_GENS+2), whereas
* from the producer's POV, the aging only cares about the upper bound
* of hot pages, i.e., 1/MIN_NR_GENS.
*/
if (min_seq[LRU_GEN_FILE] + MIN_NR_GENS > max_seq)
*need_aging = true;
else if (min_seq[LRU_GEN_FILE] + MIN_NR_GENS < max_seq)
*need_aging = false;
else if (young * MIN_NR_GENS > total)
*need_aging = true;
else if (old * (MIN_NR_GENS + 2) < total)
*need_aging = true;
else
*need_aging = false;
return total > 0 ? total : 0;
}
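
A worked instance of the decision above, with invented page counts and MIN_NR_GENS = 2. The sketch folds the two seq checks and the two ratio checks into one freestanding function; young and old stand for the sizes that the loop in get_nr_evictable() accumulates.

#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2U

static bool need_aging(unsigned long min_seq, unsigned long max_seq,
		       long young, long old, long total)
{
	if (min_seq + MIN_NR_GENS > max_seq)
		return true;	/* too few generations: eviction would stall */
	if (min_seq + MIN_NR_GENS < max_seq)
		return false;	/* already more than MIN_NR_GENS+1 gens */
	if (young * MIN_NR_GENS > total)
		return true;	/* over 1/2 of the pages sit in max_seq */
	if (old * (MIN_NR_GENS + 2) < total)
		return true;	/* under 1/4 of the pages are cold */
	return false;
}

int main(void)
{
	/* exactly MIN_NR_GENS+1 generations, but the youngest holds 60% of
	 * the pages: age again so hot pages do not dominate */
	printf("%d\n", need_aging(6, 8, 600, 300, 1000));	/* 1 */
	/* same window, pages spread evenly: no aging needed */
	printf("%d\n", need_aging(6, 8, 300, 300, 1000));	/* 0 */
	return 0;
}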
static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
bool need_aging;
long nr_to_scan;
int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(memcg))
return;
nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, swappiness, &need_aging);
if (!nr_to_scan)
return;
nr_to_scan >>= sc->priority;
if (!mem_cgroup_online(memcg))
nr_to_scan++;
if (nr_to_scan && need_aging && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
inc_max_seq(lruvec, max_seq);
}
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
VM_BUG_ON(!current_is_kswapd());
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
age_lruvec(lruvec, sc);
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}
/******************************************************************************
* the eviction
******************************************************************************/
static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
{
bool success;
int gen = page_lru_gen(page);
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
int tier = page_lru_tier(page);
int delta = thp_nr_pages(page);
struct lru_gen_struct *lrugen = &lruvec->lrugen;
VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
if (!page_evictable(page)) {
success = lru_gen_del_page(lruvec, page, true);
VM_BUG_ON_PAGE(!success, page);
SetPageUnevictable(page);
add_page_to_lru_list(page, lruvec);
__count_vm_events(UNEVICTABLE_PGCULLED, delta);
return true;
}
if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
success = lru_gen_del_page(lruvec, page, true);
VM_BUG_ON_PAGE(!success, page);
SetPageSwapBacked(page);
add_page_to_lru_list_tail(page, lruvec);
return true;
}
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = page_inc_gen(lruvec, page, false);
list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
__mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
return true;
}
if (PageLocked(page) || PageWriteback(page) ||
(type == LRU_GEN_FILE && PageDirty(page))) {
gen = page_inc_gen(lruvec, page, true);
list_move(&page->lru, &lrugen->lists[gen][type][zone]);
return true;
}
return false;
}
static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
{
bool success;
if (!sc->may_unmap && page_mapped(page))
return false;
if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
(PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
return false;
if (!get_page_unless_zero(page))
return false;
if (!TestClearPageLRU(page)) {
put_page(page);
return false;
}
success = lru_gen_del_page(lruvec, page, true);
VM_BUG_ON_PAGE(!success, page);
return true;
}
static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
int type, int tier, struct list_head *list)
{
int gen, zone;
enum vm_event_item item;
int sorted = 0;
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
VM_BUG_ON(!list_empty(list));
if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
return 0;
gen = lru_gen_from_seq(lrugen->min_seq[type]);
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
struct list_head *head = &lrugen->lists[gen][type][zone];
while (!list_empty(head)) {
struct page *page = lru_to_page(head);
int delta = thp_nr_pages(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageActive(page), page);
VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
prefetchw_prev_lru_page(page, head, flags);
scanned += delta;
if (sort_page(lruvec, page, tier))
sorted += delta;
else if (isolate_page(lruvec, page, sc)) {
list_add(&page->lru, list);
isolated += delta;
} else {
list_move(&page->lru, &moved);
skipped += delta;
}
if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
break;
}
if (skipped) {
list_splice(&moved, head);
__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
}
if (!remaining || isolated >= MIN_LRU_BATCH)
break;
}
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
}
__count_memcg_events(memcg, item, isolated);
__count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
/*
* There might not be eligible pages due to reclaim_idx, may_unmap and
* may_writepage. Check the remaining to prevent livelock if there is no
* progress.
*/
return isolated || !remaining ? scanned : 0;
}
static int get_tier_idx(struct lruvec *lruvec, int type)
{
int tier;
struct ctrl_pos sp, pv;
/*
* To leave a margin for fluctuations, use a larger gain factor (1:2).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
read_ctrl_pos(lruvec, type, 0, 1, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
read_ctrl_pos(lruvec, type, tier, 2, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
return tier - 1;
}
static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
{
int type, tier;
struct ctrl_pos sp, pv;
int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
/*
* Compare the first tier of anon with that of file to determine which
* type to scan. Also need to compare other tiers of the selected type
* with the first tier of the other type to determine the last tier (of
* the selected type) to evict.
*/
read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
type = positive_ctrl_err(&sp, &pv);
read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
*tier_idx = tier - 1;
return type;
}
static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
int type;
int scanned;
int tier = -1;
DEFINE_MIN_SEQ(lruvec);
VM_BUG_ON(!seq_is_valid(lruvec));
/*
* Try to make the obvious choice first. When anon and file are both
* available from the same generation, interpret swappiness 1 as file
* first and 200 as anon first.
*/
if (!swappiness)
type = LRU_GEN_FILE;
else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
type = LRU_GEN_ANON;
else if (swappiness == 1)
type = LRU_GEN_FILE;
else if (swappiness == 200)
type = LRU_GEN_ANON;
else
type = get_type_to_scan(lruvec, swappiness, &tier);
for (i = !swappiness; i < ANON_AND_FILE; i++) {
if (tier < 0)
tier = get_tier_idx(lruvec, type);
scanned = scan_pages(lruvec, sc, type, tier, list);
if (scanned)
break;
type = !type;
tier = -1;
}
*type_scanned = type;
return scanned;
}
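
The "obvious choice" ladder at the top of isolate_pages() reads most easily as a freestanding decision function. In the sketch below, -1 stands for deferring to get_type_to_scan() and the PID controller; the retry over the other type is left out, as the function only models the initial pick.

#include <stdio.h>

enum { LRU_GEN_ANON, LRU_GEN_FILE };

static int choose_type(int swappiness, unsigned long min_seq_anon,
		       unsigned long min_seq_file)
{
	if (!swappiness)
		return LRU_GEN_FILE;	/* swapping disabled */
	if (min_seq_anon < min_seq_file)
		return LRU_GEN_ANON;	/* anon has the older generation */
	if (swappiness == 1)
		return LRU_GEN_FILE;	/* minimal swapping: file first */
	if (swappiness == 200)
		return LRU_GEN_ANON;	/* maximal swapping: anon first */
	return -1;			/* defer to the refault feedback loop */
}

int main(void)
{
	printf("%d\n", choose_type(60, 5, 5));	/* -1: compare refault ratios */
	printf("%d\n", choose_type(60, 4, 5));	/* 0: anon generation older */
	return 0;
}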
static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
int reclaimed;
LIST_HEAD(list);
struct page *page;
enum vm_event_item item;
struct reclaim_stat stat;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
spin_lock_irq(&lruvec->lru_lock);
scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
if (try_to_inc_min_seq(lruvec, swappiness))
scanned++;
if (get_nr_gens(lruvec, LRU_GEN_FILE) == MIN_NR_GENS)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
if (list_empty(&list))
return scanned;
reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
/*
* To avoid livelock, don't add rejected pages back to the same lists
* they were isolated from. See lru_gen_add_page().
*/
list_for_each_entry(page, &list, lru) {
ClearPageReferenced(page);
ClearPageWorkingset(page);
if (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))
ClearPageActive(page);
else if (page_is_file_lru(page) || PageSwapCache(page))
SetPageActive(page);
}
spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &list);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
__count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_list(&list);
free_unref_page_list(&list);
sc->nr_reclaimed += reclaimed;
return scanned;
}
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
bool need_aging;
long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
if (mem_cgroup_below_min(memcg) ||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
nr_to_scan = get_nr_evictable(lruvec, max_seq, min_seq, can_swap, &need_aging);
if (!nr_to_scan)
return 0;
/* reset the priority if the target has been met */
nr_to_scan >>= sc->nr_reclaimed < sc->nr_to_reclaim ? sc->priority : DEF_PRIORITY;
if (!mem_cgroup_online(memcg))
nr_to_scan++;
if (!nr_to_scan)
return 0;
if (!need_aging)
return nr_to_scan;
/* leave the work to lru_gen_age_node() */
if (current_is_kswapd())
return 0;
/* try other memcgs before going to the aging path */
if (!cgroup_reclaim(sc) && !sc->force_deactivate) {
sc->skipped_deactivate = true;
return 0;
}
inc_max_seq(lruvec, max_seq);
return nr_to_scan;
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
long scanned = 0;
lru_add_drain();
blk_start_plug(&plug);
while (true) {
int delta;
int swappiness;
long nr_to_scan;
if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
swappiness = 1;
else
swappiness = 0;
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (!nr_to_scan)
break;
delta = evict_pages(lruvec, sc, swappiness);
if (!delta)
break;
scanned += delta;
if (scanned >= nr_to_scan)
break;
cond_resched();
}
blk_finish_plug(&plug);
}
/******************************************************************************
* initialization
******************************************************************************/
@@ -3796,16 +3015,6 @@ static int __init init_lru_gen(void)
};
late_initcall(init_lru_gen);
#else
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
}
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -3819,11 +3028,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
struct blk_plug plug;
bool scan_adjusted;
if (lru_gen_enabled()) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
@@ -4296,9 +3500,6 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
if (lru_gen_enabled())
return;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
target_lruvec->refaults[0] = refaults;
@@ -4670,11 +3871,6 @@ static void age_active_anon(struct pglist_data *pgdat,
struct mem_cgroup *memcg;
struct lruvec *lruvec;
if (lru_gen_enabled()) {
lru_gen_age_node(pgdat, sc);
return;
}
if (!can_age_anon_pages(pgdat, sc))
return;

mm/workingset.c

@@ -187,6 +187,7 @@ static unsigned int bucket_order __read_mostly;
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 			 bool workingset)
 {
+	eviction >>= bucket_order;
 	eviction &= EVICTION_MASK;
 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -211,116 +212,10 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	*memcgidp = memcgid;
 	*pgdat = NODE_DATA(nid);
-	*evictionp = entry;
+	*evictionp = entry << bucket_order;
 	*workingsetp = workingset;
}
#ifdef CONFIG_LRU_GEN
static int page_lru_refs(struct page *page)
{
unsigned long flags = READ_ONCE(page->flags);
BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
/* see the comment on MAX_NR_TIERS */
return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
}
static void *lru_gen_eviction(struct page *page)
{
int hist, tier;
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_struct *lrugen;
int type = page_is_file_lru(page);
int refs = page_lru_refs(page);
int delta = thp_nr_pages(page);
bool workingset = PageWorkingset(page);
struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
token = (min_seq << LRU_REFS_WIDTH) | refs;
hist = lru_hist_from_seq(min_seq);
tier = lru_tier_from_refs(refs + workingset);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}
static void lru_gen_refault(struct page *page, void *shadow)
{
int hist, tier, refs;
int memcg_id;
bool workingset;
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_struct *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = page_is_file_lru(page);
int delta = thp_nr_pages(page);
unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
refs = token & (BIT(LRU_REFS_WIDTH) - 1);
if (refs && !workingset)
return;
if (page_pgdat(page) != pgdat)
return;
rcu_read_lock();
memcg = page_memcg_rcu(page);
if (mem_cgroup_id(memcg) != memcg_id)
goto unlock;
token >>= LRU_REFS_WIDTH;
lruvec = mem_cgroup_lruvec(memcg, pgdat);
lrugen = &lruvec->lrugen;
min_seq = READ_ONCE(lrugen->min_seq[type]);
if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
goto unlock;
hist = lru_hist_from_seq(min_seq);
tier = lru_tier_from_refs(refs + workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
/*
* Count the following two cases as stalls:
* 1. For pages accessed through page tables, hotter pages pushed out
* hot pages which refaulted immediately.
* 2. For pages accessed through file descriptors, the number of accesses
*    might have exceeded the limit.
*/
if (lru_gen_in_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
SetPageWorkingset(page);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
}
unlock:
rcu_read_unlock();
}
#else
static void *lru_gen_eviction(struct page *page)
{
return NULL;
}
static void lru_gen_refault(struct page *page, void *shadow)
{
}
#endif /* CONFIG_LRU_GEN */
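
lru_gen_eviction() and lru_gen_refault() above round-trip (min_seq, refs) through the shadow entry. The sketch below keeps only that token arithmetic; EVICTION_MASK is invented for the example, and the real shadow additionally packs the memcg id, node id and workingset bit via pack_shadow().

#include <stdio.h>

#define LRU_REFS_WIDTH	2
#define EVICTION_MASK	(~0UL >> 16)	/* invented width for the sketch */

int main(void)
{
	unsigned long min_seq = 42, refs = 3;

	/* eviction: token = (min_seq << LRU_REFS_WIDTH) | refs */
	unsigned long token = (min_seq << LRU_REFS_WIDTH) | refs;

	/* refault: split the token back apart */
	unsigned long got_refs = token & ((1UL << LRU_REFS_WIDTH) - 1);
	unsigned long got_seq = token >> LRU_REFS_WIDTH;

	printf("refs=%lu min_seq=%lu\n", got_refs, got_seq);

	/* the refault is only counted if min_seq has not moved on */
	unsigned long cur_min_seq = 43;

	if (got_seq != (cur_min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
		printf("stale shadow: the generation was already recycled\n");
	return 0;
}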
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -369,14 +264,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (lru_gen_enabled())
return lru_gen_eviction(page);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
eviction >>= bucket_order;
workingset_age_nonresident(lruvec, thp_nr_pages(page));
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -405,13 +296,7 @@ void workingset_refault(struct page *page, void *shadow)
bool workingset;
int memcgid;
if (lru_gen_enabled()) {
lru_gen_refault(page, shadow);
return;
}
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
eviction <<= bucket_order;
rcu_read_lock();
/*