commit d108b36d5f
Author: Yu Zhao <yuzhao@google.com>
Date:   2024-05-20 22:55:48 +03:00

BACKPORT: mm: multi-gen LRU: fix underprotected page cache

Unmapped folios accessed through file descriptors can be underprotected.
Those folios are added to the oldest generation based on:

1. The fact that they are less costly to reclaim (no need to walk the
   rmap and flush the TLB) and have less impact on performance (don't
   cause major PFs and can be non-blocking if needed again).
2. The observation that they are likely to be single-use. E.g., for
   client use cases like Android, its apps parse configuration files
   and store the data in heap (anon); for server use cases like MySQL,
   it reads from InnoDB files and holds the cached data for tables in
   buffer pools (anon).

However, the oldest generation can be very short lived, and if so, it
doesn't provide the PID controller with enough time to respond to a surge
of refaults.  (Note that the PID controller uses weighted refaults, and
those from evicted generations carry only half the weight.)  In other
words, for a short-lived generation, the moving average smooths out the
spike quickly.
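
As a toy illustration (the helper below is invented for this message; it
is not kernel code), the signal the controller averages behaves like:

	/* refaults against evicted generations carry half the weight */
	static unsigned long weighted_refaults(unsigned long live,
					       unsigned long evicted)
	{
		return live + evicted / 2;
	}

With a short-lived oldest generation, most of a refault surge lands in
the half-weighted evicted bucket, so the smoothed signal rises too slowly
for the controller to start protecting the page cache in time.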

To fix the problem:
1. For folios that are already on LRU, if they can be beyond the
   tracking range of tiers, i.e., five accesses through file
   descriptors, move them to the second oldest generation to give them
   more time to age. (Note that tiers are used by the PID controller
   to statistically determine whether folios accessed multiple times
   through file descriptors are worth protecting.)
2. When adding unmapped folios to LRU, adjust the placement of them so
   that they are not too close to the tail. The effect of this is
   similar to the above; see the sketch after this list.
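
The second part can be sketched as follows (a simplification of
lru_gen_add_page() in the file below; cannot_evict_now stands in for the
dirty/writeback test):

	if (PageActive(page))
		gen = max_seq;		/* hot: youngest generation */
	else if (cannot_evict_now)
		gen = max_seq - 1;	/* second youngest */
	else if (reclaiming || min_seq + MIN_NR_GENS >= max_seq)
		gen = min_seq;		/* evict first: oldest */
	else
		gen = min_seq + 1;	/* second oldest: more time to age */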

On Android, launching 55 apps sequentially:
                           Before     After      Change
  workingset_refault_anon  25641024   25598972   0%
  workingset_refault_file  115016834  106178438  -8%

Link: https://lkml.kernel.org/r/20231208061407.2125867-1-yuzhao@google.com
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Charan Teja Kalla <quic_charante@quicinc.com>
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: T.J. Mercier <tjmercier@google.com>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jaroslav Pulchart <jaroslav.pulchart@gooddata.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ folio -> page ]
Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com>

include/linux/mm_inline.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/huge_mm.h>
#include <linux/swap.h>

#ifndef try_cmpxchg
#define try_cmpxchg(_ptr, _oldp, _new) \
({ \
	typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
	___r = cmpxchg((_ptr), ___o, (_new)); \
	if (unlikely(___r != ___o)) \
		*___op = ___r; \
	likely(___r == ___o); \
})
#endif /* try_cmpxchg */
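
/*
 * Usage sketch (added for illustration; not part of the original header).
 * With this helper, a flags-update loop like the plain cmpxchg() loops
 * further down could be written as:
 *
 *	unsigned long old = READ_ONCE(page->flags);
 *	unsigned long new;
 *
 *	do {
 *		new = old | BIT(PG_active);
 *	} while (!try_cmpxchg(&page->flags, &old, new));
 *
 * On failure, try_cmpxchg() refreshes 'old' with the current value, so
 * the loop doesn't need to re-read page->flags itself.
 */
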
/**
 * page_is_file_cache - should the page be on a file LRU or anon LRU?
 * @page: the page to test
 *
 * Returns 1 if @page is a page cache page backed by a regular filesystem,
 * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
 * Used by functions that manipulate the LRU lists, to sort a page
 * onto the right LRU list.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the page is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 */
static inline int page_is_file_cache(struct page *page)
{
	return !PageSwapBacked(page);
}

static __always_inline void __update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				int nr_pages)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lockdep_assert_held(&pgdat->lru_lock);
	__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
	__mod_zone_page_state(&pgdat->node_zones[zid],
				NR_ZONE_LRU_BASE + lru, nr_pages);
}

static __always_inline void update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				int nr_pages)
{
	__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __clear_page_lru_flags - clear page lru flags before releasing a page
 * @page: the page that was on lru and now has a zero reference
 */
static __always_inline void __clear_page_lru_flags(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLRU(page), page);

	__ClearPageLRU(page);

	/* this shouldn't happen, so leave the flags to bad_page() */
	if (PageActive(page) && PageUnevictable(page))
		return;

	__ClearPageActive(page);
	__ClearPageUnevictable(page);
}

/**
 * page_lru - which LRU list should a page be on?
 * @page: the page to test
 *
 * Returns the LRU list a page should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page)
{
	enum lru_list lru;

	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);

	if (PageUnevictable(page))
		return LRU_UNEVICTABLE;

	lru = page_is_file_cache(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
	if (PageActive(page))
		lru += LRU_ACTIVE;

	return lru;
}

#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))

#ifdef CONFIG_LRU_GEN

static inline bool lru_gen_enabled(void)
{
#ifdef CONFIG_LRU_GEN_ENABLED
	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
#else
	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
#endif
}

static inline bool lru_gen_in_fault(void)
{
	return current->in_lru_fault;
}

static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

static inline int lru_hist_from_seq(unsigned long seq)
{
	return seq % NR_HIST_GENS;
}

static inline int lru_tier_from_refs(int refs)
{
	VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));

	/* see the comment on MAX_NR_TIERS */
	return order_base_2(refs + 1);
}

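/*
 * Worked example (added for illustration; not in the original header):
 * order_base_2(refs + 1) maps the access count beyond PG_referenced to
 * a tier as follows:
 *
 *	refs 0   -> tier 0
 *	refs 1   -> tier 1
 *	refs 2-3 -> tier 2
 *	refs 4-7 -> tier 3
 *
 * i.e., tier n > 0 holds pages with refs in [2^(n-1), 2^n).
 */
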
static inline int page_lru_refs(struct page *page)
{
	unsigned long flags = READ_ONCE(page->flags);
	bool workingset = flags & BIT(PG_workingset);

	/*
	 * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
	 * total number of accesses is N>1, since N=0,1 both map to the first
	 * tier. lru_tier_from_refs() will account for this off-by-one. Also
	 * see the comment on MAX_NR_TIERS.
	 */
	return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

static inline int page_lru_gen(struct page *page)
{
	unsigned long flags = READ_ONCE(page->flags);

	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
	unsigned long max_seq = lruvec->lrugen.max_seq;

	VM_BUG_ON(gen >= MAX_NR_GENS);

	/* see the comment on MIN_NR_GENS */
	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

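/*
 * Worked example (added for illustration; not in the original header):
 * with MAX_NR_GENS == 4 and max_seq == 7, the two youngest generations
 * occupy ring-buffer slots 7 % 4 == 3 and 6 % 4 == 2, so only pages in
 * those two slots are reported as "active" by lru_gen_update_size()
 * below.
 */
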
static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
				       int old_gen, int new_gen)
{
	int type = page_is_file_cache(page);
	int zone = page_zonenum(page);
	int delta = hpage_nr_pages(page);
	enum lru_list lru = type * LRU_INACTIVE_FILE;
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
	VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
	VM_BUG_ON(old_gen == -1 && new_gen == -1);

	if (old_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
			   lrugen->nr_pages[old_gen][type][zone] - delta);
	if (new_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
			   lrugen->nr_pages[new_gen][type][zone] + delta);

	/* addition */
	if (old_gen < 0) {
		if (lru_gen_is_active(lruvec, new_gen))
			lru += LRU_ACTIVE;
		update_lru_size(lruvec, lru, zone, delta);
		return;
	}

	/* deletion */
	if (new_gen < 0) {
		if (lru_gen_is_active(lruvec, old_gen))
			lru += LRU_ACTIVE;
		update_lru_size(lruvec, lru, zone, -delta);
		return;
	}

	/* promotion */
	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
		update_lru_size(lruvec, lru, zone, -delta);
		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
	}

	/* demotion requires isolation, e.g., lru_deactivate_fn() */
	VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	int gen;
	unsigned long old_flags, new_flags;
	int type = page_is_file_cache(page);
	int zone = page_zonenum(page);
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	if (PageUnevictable(page) || !lrugen->enabled)
		return false;

	/*
	 * There are four common cases for this page:
	 * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
	 *    generation, and it's protected over the rest below.
	 * 2. If it can't be evicted immediately, i.e., a dirty page pending
	 *    writeback, add it to the second youngest generation.
	 * 3. If it should be evicted first, e.g., cold and clean from
	 *    rotate_reclaimable_page(), add it to the oldest generation.
	 * 4. Everything else falls between 2 & 3 above and is added to the
	 *    second oldest generation if it's considered inactive, or the
	 *    oldest generation otherwise. See lru_gen_is_active().
	 */
	if (PageActive(page))
		gen = lru_gen_from_seq(lrugen->max_seq);
	else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
		 (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
		gen = lru_gen_from_seq(lrugen->max_seq - 1);
	else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
		gen = lru_gen_from_seq(lrugen->min_seq[type]);
	else
		gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);

	do {
		new_flags = old_flags = READ_ONCE(page->flags);
		VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);

		/* see the comment on MIN_NR_GENS */
		new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

	lru_gen_update_size(lruvec, page, -1, gen);
	/* for rotate_reclaimable_page() */
	if (reclaiming)
		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
	else
		list_add(&page->lru, &lrugen->lists[gen][type][zone]);

	return true;
}

static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	int gen;
	unsigned long old_flags, new_flags;

	do {
		new_flags = old_flags = READ_ONCE(page->flags);
		if (!(new_flags & LRU_GEN_MASK))
			return false;

		VM_BUG_ON_PAGE(PageActive(page), page);
		VM_BUG_ON_PAGE(PageUnevictable(page), page);

		gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

		new_flags &= ~LRU_GEN_MASK;
		if (!(new_flags & BIT(PG_referenced)))
			new_flags &= ~(LRU_REFS_MASK | (BIT(PG_referenced) | BIT(PG_workingset)));
		/* for shrink_page_list() */
		if (reclaiming)
			new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
		else if (lru_gen_is_active(lruvec, gen))
			new_flags |= BIT(PG_active);
	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);

	lru_gen_update_size(lruvec, page, gen, -1);
	list_del(&page->lru);

	return true;
}

#else /* !CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
	return false;
}

static inline bool lru_gen_in_fault(void)
{
	return false;
}

static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	return false;
}

static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	enum lru_list lru = page_lru(page);

	if (lru_gen_add_page(lruvec, page, false))
		return;

	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
	list_add(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
				struct lruvec *lruvec)
{
	enum lru_list lru = page_lru(page);

	if (lru_gen_add_page(lruvec, page, true))
		return;

	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
	list_add_tail(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	if (lru_gen_del_page(lruvec, page, false))
		return;

	list_del(&page->lru);
	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
			-hpage_nr_pages(page));
}

#endif