Merge remote-tracking branch 'slmk/linux-4.14' into auto-kernel

* slmk/linux-4.14:
  simple_lmk: Remove unnecessary clean-up when timeout is reached
  simple_lmk: Hold an RCU read lock instead of the tasklist read lock
  mm: Don't stop kswapd on a per-node basis when there are no waiters
  simple_lmk: Consider all positive adjs when finding victims
  mm: vmpressure: Ignore allocation orders above PAGE_ALLOC_COSTLY_ORDER
  mm: Don't warn on page allocation failures for OOM-killed processes
  mm: Adjust tsk_is_oom_victim() for Simple LMK
  mm: vmpressure: Don't cache the window size
  mm: vmpressure: Interpret zero scanned pages as 100% pressure
  mm: vmpressure: Don't exclude any allocation types
  simple_lmk: Update adj targeting for Android 10
  simple_lmk: Use vmpressure notifier to trigger kills
  mm: vmpressure: make vmpressure window variable
  mm: vmpressure: account allocstalls only on higher pressures
  mm: vmpressure: scale pressure based on reclaim context
  mm: vmpressure: allow in-kernel clients to subscribe for events
  mm, vmpressure: int cast vmpressure level/model for -1 comparison
  mm: Stop kswapd early when nothing's waiting for it to free pages
  simple_lmk: Include swap memory usage in the size of victims
  simple_lmk: Relax memory barriers and clean up some styling
  simple_lmk: Place victims onto SCHED_RR
  simple_lmk: Add a timeout to stop waiting for victims to die
  simple_lmk: Ignore tasks that won't free memory
  simple_lmk: Simplify tricks used to speed up the death process
  simple_lmk: Report mm as freed as soon as exit_mmap() finishes
  simple_lmk: Mark victim thread group with TIF_MEMDIE
  simple_lmk: Disable OOM killer when Simple LMK is enabled
  simple_lmk: Print a message when there are no processes to kill
  simple_lmk: Remove compat cruft not specific to 4.14
  simple_lmk: Update copyright to 2020
  simple_lmk: Don't queue up new reclaim requests during reclaim
  simple_lmk: Increase default minfree value
  simple_lmk: Clean up some code style nitpicks
  simple_lmk: Make reclaim deterministic
  simple_lmk: Fix broken multicopy atomicity for victims_to_kill
  simple_lmk: Use proper atomic_* operations where needed
  simple_lmk: Remove kthread_should_stop() exit condition
  simple_lmk: Fix pages_found calculation
  simple_lmk: Introduce Simple Low Memory Killer for Android

Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com>

Conflicts:
	kernel/exit.c
	kernel/fork.c
	mm/Makefile
	mm/vmpressure.c
	mm/vmscan.c
This commit is contained in:
UtsavBalar1231
2020-07-08 09:55:52 +05:30
13 changed files with 475 additions and 100 deletions

View File

@@ -54,6 +54,39 @@ config ANDROID_BINDER_IPC_SELFTEST
exhaustively with combinations of various buffer sizes and
alignments.
config ANDROID_SIMPLE_LMK
bool "Simple Android Low Memory Killer"
depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
---help---
This is a complete low memory killer solution for Android that is
small and simple. Processes are killed according to the priorities
that Android gives them, so that the least important processes are
always killed first. Processes are killed until memory deficits are
satisfied, as observed from kswapd struggling to free up pages. Simple
LMK stops killing processes when kswapd finally goes back to sleep.
if ANDROID_SIMPLE_LMK
config ANDROID_SIMPLE_LMK_MINFREE
int "Minimum MiB of memory to free per reclaim"
range 8 512
default 128
help
Simple LMK will try to free at least this much memory per reclaim.
config ANDROID_SIMPLE_LMK_TIMEOUT_MSEC
int "Reclaim timeout in milliseconds"
range 50 1000
default 200
help
Simple LMK tries to wait until all of the victims it kills have their
memory freed; however, sometimes victims can take a while to die,
which can block Simple LMK from killing more processes in time when
needed. After the specified timeout elapses, Simple LMK will stop
waiting and make itself available to kill more processes.
endif
endif # if ANDROID
endmenu

View File

@@ -3,3 +3,4 @@ ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
obj-$(CONFIG_ANDROID_SIMPLE_LMK) += simple_lmk.o

View File

@@ -0,0 +1,316 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
*/
#define pr_fmt(fmt) "simple_lmk: " fmt
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/moduleparam.h>
#include <linux/oom.h>
#include <linux/sort.h>
#include <linux/vmpressure.h>
#include <uapi/linux/sched/types.h>
/* The minimum number of pages to free per reclaim */
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
/* Kill up to this many victims per reclaim */
#define MAX_VICTIMS 1024
/* Timeout in jiffies for each reclaim */
#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC)
/* Metadata for one kill candidate; filled in by find_victims() */
struct victim_info {
	struct task_struct *tsk;
	struct mm_struct *mm;
	unsigned long size;
};

/* Pulled from the Android framework. Lower adj means higher priority. */
static const unsigned short adjs[] = {
	SHRT_MAX + 1, /* Include all positive adjs in the final range */
	950, /* CACHED_APP_LMK_FIRST_ADJ */
	900, /* CACHED_APP_MIN_ADJ */
	800, /* SERVICE_B_ADJ */
	700, /* PREVIOUS_APP_ADJ */
	600, /* HOME_APP_ADJ */
	500, /* SERVICE_ADJ */
	400, /* HEAVY_WEIGHT_APP_ADJ */
	300, /* BACKUP_APP_ADJ */
	250, /* PERCEPTIBLE_LOW_APP_ADJ */
	200, /* PERCEPTIBLE_APP_ADJ */
	100, /* VISIBLE_APP_ADJ */
	50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */
	0 /* FOREGROUND_APP_ADJ */
};

/* Victim storage shared between the find, select, and kill phases */
static struct victim_info victims[MAX_VICTIMS];
/* Reclaim kthread sleeps here until the vmpressure callback wakes it */
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
/* Completed once every victim's mm has been freed (or timeout elapses) */
static DECLARE_COMPLETION(reclaim_done);
/* Guards nr_victims and the victims array against simple_lmk_mm_freed() */
static DEFINE_RWLOCK(mm_free_lock);
/* Number of victims selected in the current reclaim cycle */
static int nr_victims;
/* Latched by the vmpressure notifier; cleared when reclaim completes */
static atomic_t needs_reclaim = ATOMIC_INIT(0);
/* Count of victims whose mm has been freed during this cycle */
static atomic_t nr_killed = ATOMIC_INIT(0);
/*
 * sort() comparator: orders victims by size, largest first.
 *
 * The previous implementation returned `rhs->size - lhs->size`, but both
 * sizes are unsigned longs: the subtraction never goes negative, and the
 * implicit truncation to int only happens to produce the right sign when
 * the difference fits in 32 bits. Compare explicitly so the result is
 * well-defined for any pair of sizes.
 */
static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
{
	const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
	const struct victim_info *rhs = (typeof(rhs))rhs_ptr;

	if (rhs->size > lhs->size)
		return 1;
	if (rhs->size < lhs->size)
		return -1;
	return 0;
}
static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk)
{
int i;
for (i = 0; i < vlen; i++) {
if (same_thread_group(victims[i].tsk, vtsk))
return true;
}
return false;
}
/*
 * Sum every mm RSS counter to get @mm's total memory footprint in pages,
 * which includes swap usage so large swapped-out victims still rank high.
 */
static unsigned long get_total_mm_pages(struct mm_struct *mm)
{
	unsigned long total = 0;
	int counter;

	for (counter = NR_MM_COUNTERS - 1; counter >= 0; counter--)
		total += get_mm_counter(mm, counter);

	return total;
}
/*
 * Scan the global process list for tasks whose oom_score_adj falls in
 * [target_adj_min, target_adj_max) and record them in the victims array.
 *
 * @vindex: in/out cursor into the global victims array, advanced for each
 *          victim stored.
 * Returns the total number of pages used by the newly found victims.
 *
 * Caller must hold an RCU read lock. Each stored victim's task lock is
 * left held (acquired via find_lock_task_mm()) and must later be released
 * by process_victims() or scan_and_kill().
 */
static unsigned long find_victims(int *vindex, unsigned short target_adj_min,
				  unsigned short target_adj_max)
{
	unsigned long pages_found = 0;
	int old_vindex = *vindex;
	struct task_struct *tsk;

	for_each_process(tsk) {
		struct signal_struct *sig;
		struct task_struct *vtsk;
		short adj;

		/*
		 * Search for suitable tasks with the targeted importance (adj).
		 * Since only tasks with a positive adj can be targeted, that
		 * naturally excludes tasks which shouldn't be killed, like init
		 * and kthreads. Although oom_score_adj can still be changed
		 * while this code runs, it doesn't really matter. We just need
		 * to make sure that if the adj changes, we won't deadlock
		 * trying to lock a task that we locked earlier.
		 */
		sig = tsk->signal;
		adj = READ_ONCE(sig->oom_score_adj);
		/* Skip out-of-range adjs, dying groups, and duplicates */
		if (adj < target_adj_min || adj > target_adj_max - 1 ||
		    sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) ||
		    (thread_group_empty(tsk) && tsk->flags & PF_EXITING) ||
		    vtsk_is_duplicate(*vindex, tsk))
			continue;

		/* Returns a thread with a valid mm, task lock held, or NULL */
		vtsk = find_lock_task_mm(tsk);
		if (!vtsk)
			continue;

		/* Store this potential victim away for later */
		victims[*vindex].tsk = vtsk;
		victims[*vindex].mm = vtsk->mm;
		victims[*vindex].size = get_total_mm_pages(vtsk->mm);

		/* Keep track of the number of pages that have been found */
		pages_found += victims[*vindex].size;

		/* Make sure there's space left in the victim array */
		if (++*vindex == MAX_VICTIMS)
			break;
	}

	/*
	 * Sort the victims in descending order of size to prioritize killing
	 * the larger ones first.
	 */
	if (pages_found)
		sort(&victims[old_vindex], *vindex - old_vindex,
		     sizeof(*victims), victim_size_cmp, NULL);

	return pages_found;
}
static int process_victims(int vlen, unsigned long pages_needed)
{
unsigned long pages_found = 0;
int i, nr_to_kill = 0;
/*
* Calculate the number of tasks that need to be killed and quickly
* release the references to those that'll live.
*/
for (i = 0; i < vlen; i++) {
struct victim_info *victim = &victims[i];
struct task_struct *vtsk = victim->tsk;
/* The victim's mm lock is taken in find_victims; release it */
if (pages_found >= pages_needed) {
task_unlock(vtsk);
} else {
pages_found += victim->size;
nr_to_kill++;
}
}
return nr_to_kill;
}
/*
 * Core reclaim routine: select enough victims to free @pages_needed pages,
 * kill them, then wait (bounded by RECLAIM_EXPIRES) for their memory to be
 * reported freed via simple_lmk_mm_freed().
 */
static void scan_and_kill(unsigned long pages_needed)
{
	int i, nr_to_kill = 0, nr_found = 0;
	unsigned long pages_found = 0;

	/* Hold an RCU read lock while traversing the global process list */
	rcu_read_lock();
	/* Walk adj bands least-important first (adjs[] is descending) */
	for (i = 1; i < ARRAY_SIZE(adjs); i++) {
		pages_found += find_victims(&nr_found, adjs[i], adjs[i - 1]);
		if (pages_found >= pages_needed || nr_found == MAX_VICTIMS)
			break;
	}
	rcu_read_unlock();

	/* Pretty unlikely but it can happen */
	if (unlikely(!nr_found)) {
		pr_err("No processes available to kill!\n");
		return;
	}

	/* First round of victim processing to weed out unneeded victims */
	nr_to_kill = process_victims(nr_found, pages_needed);

	/*
	 * Try to kill as few of the chosen victims as possible by sorting the
	 * chosen victims by size, which means larger victims that have a lower
	 * adj can be killed in place of smaller victims with a high adj.
	 */
	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);

	/* Second round of victim processing to finally select the victims */
	nr_to_kill = process_victims(nr_to_kill, pages_needed);

	/* Store the final number of victims for simple_lmk_mm_freed() */
	write_lock(&mm_free_lock);
	nr_victims = nr_to_kill;
	write_unlock(&mm_free_lock);

	/* Kill the victims */
	for (i = 0; i < nr_to_kill; i++) {
		static const struct sched_param sched_zero_prio;
		struct victim_info *victim = &victims[i];
		struct task_struct *t, *vtsk = victim->tsk;

		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
			vtsk->signal->oom_score_adj,
			victim->size << (PAGE_SHIFT - 10));

		/* Accelerate the victim's death by forcing the kill signal */
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);

		/* Mark the thread group dead so that other kernel code knows */
		rcu_read_lock();
		for_each_thread(vtsk, t)
			set_tsk_thread_flag(t, TIF_MEMDIE);
		rcu_read_unlock();

		/* Elevate the victim to SCHED_RR with zero RT priority */
		sched_setscheduler_nocheck(vtsk, SCHED_RR, &sched_zero_prio);

		/* Allow the victim to run on any CPU. This won't schedule. */
		set_cpus_allowed_ptr(vtsk, cpu_all_mask);

		/* Finally release the victim's task lock acquired earlier */
		task_unlock(vtsk);
	}

	/* Wait until all the victims die or until the timeout is reached */
	wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES);

	/* Reset state under the write lock so simple_lmk_mm_freed() can't race */
	write_lock(&mm_free_lock);
	reinit_completion(&reclaim_done);
	nr_victims = 0;
	nr_killed = (atomic_t)ATOMIC_INIT(0);
	write_unlock(&mm_free_lock);
}
/*
 * Dedicated kthread that performs the actual reclaim whenever the
 * vmpressure callback sets needs_reclaim. Runs forever once started.
 */
static int simple_lmk_reclaim_thread(void *data)
{
	static const struct sched_param sched_max_rt_prio = {
		.sched_priority = MAX_RT_PRIO - 1
	};

	/* Run at the highest FIFO priority so reclaim is never starved */
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);

	for (;;) {
		/* Sleep until the vmpressure notifier requests a reclaim */
		wait_event(oom_waitq, atomic_read(&needs_reclaim));
		scan_and_kill(MIN_FREE_PAGES);
		/* Release pairs with the acquire cmpxchg in the notifier */
		atomic_set_release(&needs_reclaim, 0);
	}

	return 0;
}
/*
 * Called from __mmput() once an mm is fully released. If @mm belongs to a
 * current victim, record its death; when every victim has died, signal
 * scan_and_kill() through the reclaim_done completion.
 */
void simple_lmk_mm_freed(struct mm_struct *mm)
{
	int idx;

	read_lock(&mm_free_lock);
	for (idx = 0; idx < nr_victims; idx++) {
		if (victims[idx].mm != mm)
			continue;

		/* Clear the entry so a repeat call can't double count */
		victims[idx].mm = NULL;
		if (atomic_inc_return_relaxed(&nr_killed) == nr_victims)
			complete(&reclaim_done);
		break;
	}
	read_unlock(&mm_free_lock);
}
/*
 * vmpressure notifier callback. On 100% pressure, atomically latch the
 * needs_reclaim flag and wake the reclaim kthread; the cmpxchg guarantees
 * only one wake-up per reclaim cycle.
 */
static int simple_lmk_vmpressure_cb(struct notifier_block *nb,
				    unsigned long pressure, void *data)
{
	if (pressure == 100) {
		if (!atomic_cmpxchg_acquire(&needs_reclaim, 0, 1))
			wake_up(&oom_waitq);
	}

	return NOTIFY_OK;
}
/* Highest priority so Simple LMK runs before any other vmpressure user */
static struct notifier_block vmpressure_notif = {
	.notifier_call = simple_lmk_vmpressure_cb,
	.priority = INT_MAX
};
/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
/*
 * Parameter setter invoked when Android's lmkd writes to the
 * "lowmemorykiller.minfree" module parameter. The first write spawns the
 * reclaim kthread and registers the vmpressure notifier exactly once;
 * all later writes are ignored.
 */
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
{
	static atomic_t init_done = ATOMIC_INIT(0);
	struct task_struct *thread;

	/* Only the very first writer performs initialization */
	if (atomic_cmpxchg(&init_done, 0, 1))
		return 0;

	thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd");
	BUG_ON(IS_ERR(thread));
	BUG_ON(vmpressure_notifier_register(&vmpressure_notif));

	return 0;
}
static const struct kernel_param_ops simple_lmk_init_ops = {
	.set = simple_lmk_init_set
};

/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "lowmemorykiller."
/* Exposes a write-only /sys/module/lowmemorykiller/parameters/minfree */
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);

View File

@@ -63,7 +63,11 @@ static inline bool oom_task_origin(const struct task_struct *p)
static inline bool tsk_is_oom_victim(struct task_struct * tsk)
{
#ifdef CONFIG_ANDROID_SIMPLE_LMK
	/* Simple LMK sets TIF_MEMDIE on every thread of a victim it kills */
	return test_ti_thread_flag(task_thread_info(tsk), TIF_MEMDIE);
#else
	/* Stock OOM killer: the victim's signal struct pins the oom mm */
	return tsk->signal->oom_mm;
#endif
}
/*

View File

@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
 */
#ifndef _SIMPLE_LMK_H_
#define _SIMPLE_LMK_H_

struct mm_struct;

#ifdef CONFIG_ANDROID_SIMPLE_LMK
/* Notify Simple LMK that @mm has been fully freed (called from __mmput) */
void simple_lmk_mm_freed(struct mm_struct *mm);
#else
/* No-op stub when Simple LMK is disabled */
static inline void simple_lmk_mm_freed(struct mm_struct *mm)
{
}
#endif

#endif /* _SIMPLE_LMK_H_ */

View File

@@ -33,7 +33,8 @@ struct mem_cgroup;
extern int vmpressure_notifier_register(struct notifier_block *nb);
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed);
unsigned long scanned, unsigned long reclaimed,
int order);
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
#ifdef CONFIG_MEMCG

View File

@@ -545,8 +545,12 @@ static void exit_mm(void)
mm_update_next_owner(mm);
mm_released = mmput(mm);
#ifdef CONFIG_ANDROID_SIMPLE_LMK
clear_thread_flag(TIF_MEMDIE);
#else
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
#endif
if (mm_released)
set_tsk_thread_flag(current, TIF_MM_RELEASED);
}

View File

@@ -96,6 +96,7 @@
#include <linux/scs.h>
#include <linux/cpu_input_boost.h>
#include <linux/devfreq_boost.h>
#include <linux/simple_lmk.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -949,6 +950,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
simple_lmk_mm_freed(mm);
mm_put_huge_zero_page(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {

View File

@@ -187,6 +187,7 @@ extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
extern atomic_long_t kswapd_waiters;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA

View File

@@ -1001,7 +1001,7 @@ bool out_of_memory(struct oom_control *oc)
unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
if (oom_killer_disabled)
if (oom_killer_disabled || IS_ENABLED(CONFIG_ANDROID_SIMPLE_LMK))
return false;
if (try_online_one_block(numa_node_id())) {

View File

@@ -77,6 +77,8 @@
#include <asm/div64.h>
#include "internal.h"
atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -4067,6 +4069,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;
bool woke_kswapd = false;
/*
* We also sanity check to catch abuse of atomic reserves being used by
@@ -4100,8 +4103,13 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
if (gfp_mask & __GFP_KSWAPD_RECLAIM) {
if (!woke_kswapd) {
atomic_long_inc(&kswapd_waiters);
woke_kswapd = true;
}
wake_all_kswapds(order, ac);
}
/*
* The adjusted alloc_flags might result in immediate success, so try
@@ -4249,8 +4257,10 @@ retry:
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
(gfp_mask & __GFP_NOMEMALLOC))) {
gfp_mask |= __GFP_NOWARN;
goto nopage;
}
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
@@ -4304,9 +4314,12 @@ nopage:
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
if (woke_kswapd)
atomic_long_dec(&kswapd_waiters);
if (!page)
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
return page;
}

View File

@@ -27,22 +27,6 @@
#include <linux/module.h>
#include <linux/vmpressure.h>
/*
* The window size (vmpressure_win) is the number of scanned pages before
* we try to analyze scanned/reclaimed ratio. So the window is used as a
* rate-limit tunable for the "low" level notification, and also for
* averaging the ratio for medium/critical levels. Using small window
* sizes can cause lot of false positives, but too big window size will
* delay the notifications.
*
* As the vmscan reclaimer logic works with chunks which are multiple of
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
*
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
/*
* These thresholds are used when we account memory pressure through
* scanned/reclaimed ratio. The current values were chosen empirically. In
@@ -271,26 +255,35 @@ static void vmpressure_work_fn(struct work_struct *work)
} while ((vmpr = vmpressure_parent(vmpr)));
}
static unsigned long calculate_vmpressure_win(void)
{
long x;
x = global_node_page_state(NR_FILE_PAGES) -
global_node_page_state(NR_SHMEM) -
total_swapcache_pages() +
global_zone_page_state(NR_FREE_PAGES);
if (x < 1)
return 1;
/*
* For low (free + cached), vmpressure window should be
* small, and high for higher values of (free + cached).
* But it should not be linear as well. This ensures
* timely vmpressure notifications when system is under
* memory pressure, and optimal number of events when
* cached is high. The square root function is empirically
* found to serve the purpose.
*/
return int_sqrt(x);
}
#ifdef CONFIG_MEMCG
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
bool tree, unsigned long scanned,
unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
/*
* Here we only want to account pressure that userland is able to
* help us with. For example, suppose that DMA zone is under
* pressure; if we notify userland about that kind of pressure,
* then it will be mostly a waste as it will trigger unnecessary
* freeing of memory by userland (since userland is more likely to
* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
* is why we include only movable, highmem and FS/IO pages.
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
* we account it too.
*/
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
/*
* If we got here with no pages scanned, then that is an indicator
* that reclaimer was unable to find any shrinkable LRUs at the
@@ -299,7 +292,9 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* (scanning depth) goes too high (deep), we will be notified
* through vmpressure_prio(). But so far, keep calm.
*/
if (!scanned)
if (critical)
scanned = calculate_vmpressure_win();
else if (!scanned)
return;
if (tree) {
@@ -308,7 +303,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
vmpr->tree_reclaimed += reclaimed;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
if (!critical && scanned < calculate_vmpressure_win())
return;
schedule_work(&vmpr->work);
} else {
@@ -322,7 +317,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
spin_lock(&vmpr->sr_lock);
scanned = vmpr->scanned += scanned;
reclaimed = vmpr->reclaimed += reclaimed;
if (scanned < vmpressure_win) {
if (!critical && scanned < calculate_vmpressure_win()) {
spin_unlock(&vmpr->sr_lock);
return;
}
@@ -346,65 +341,37 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
}
}
#else
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
}
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
bool tree, unsigned long scanned,
unsigned long reclaimed) { }
#endif
static void calculate_vmpressure_win(void)
{
long x;
x = global_node_page_state(NR_FILE_PAGES) -
global_node_page_state(NR_SHMEM) -
total_swapcache_pages() +
global_zone_page_state(NR_FREE_PAGES);
if (x < 1)
x = 1;
/*
* For low (free + cached), vmpressure window should be
* small, and high for higher values of (free + cached).
* But it should not be linear as well. This ensures
* timely vmpressure notifications when system is under
* memory pressure, and optimal number of events when
* cached is high. The square root function is empirically
* found to serve the purpose.
*/
x = int_sqrt(x);
vmpressure_win = x;
}
static void vmpressure_global(gfp_t gfp, unsigned long scanned,
unsigned long reclaimed)
static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
unsigned long reclaimed)
{
struct vmpressure *vmpr = &global_vmpressure;
unsigned long pressure;
unsigned long stall;
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
return;
if (critical)
scanned = calculate_vmpressure_win();
if (!scanned)
return;
if (scanned) {
spin_lock(&vmpr->sr_lock);
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
spin_lock(&vmpr->sr_lock);
if (!vmpr->scanned)
calculate_vmpressure_win();
if (!current_is_kswapd())
vmpr->stall += scanned;
vmpr->scanned += scanned;
vmpr->reclaimed += reclaimed;
stall = vmpr->stall;
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
spin_unlock(&vmpr->sr_lock);
if (!current_is_kswapd())
vmpr->stall += scanned;
stall = vmpr->stall;
scanned = vmpr->scanned;
reclaimed = vmpr->reclaimed;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
return;
if (!critical && scanned < calculate_vmpressure_win())
return;
}
spin_lock(&vmpr->sr_lock);
vmpr->scanned = 0;
@@ -412,11 +379,26 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
vmpr->stall = 0;
spin_unlock(&vmpr->sr_lock);
pressure = vmpressure_calc_pressure(scanned, reclaimed);
pressure = vmpressure_account_stall(pressure, stall, scanned);
if (scanned) {
pressure = vmpressure_calc_pressure(scanned, reclaimed);
pressure = vmpressure_account_stall(pressure, stall, scanned);
} else {
pressure = 100;
}
vmpressure_notify(pressure);
}
static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
bool tree, unsigned long scanned,
unsigned long reclaimed)
{
if (!memcg && tree)
vmpressure_global(gfp, scanned, critical, reclaimed);
if (IS_ENABLED(CONFIG_MEMCG))
vmpressure_memcg(gfp, memcg, critical, tree, scanned, reclaimed);
}
/**
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
@@ -439,13 +421,12 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
* This function does not return any value.
*/
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
unsigned long scanned, unsigned long reclaimed, int order)
{
if (!memcg && tree)
vmpressure_global(gfp, scanned, reclaimed);
if (order > PAGE_ALLOC_COSTLY_ORDER)
return;
if (IS_ENABLED(CONFIG_MEMCG))
vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
}
/**
@@ -475,7 +456,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
vmpressure(gfp, memcg, true, vmpressure_win, 0);
__vmpressure(gfp, memcg, true, true, 0, 0);
}
static enum vmpressure_levels str_to_level(const char *arg)

View File

@@ -2727,7 +2727,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
sc->nr_reclaimed - reclaimed, sc->order);
/*
* Direct reclaim and kswapd have to scan all memory
@@ -2760,7 +2760,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
*/
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
sc->nr_reclaimed - nr_reclaimed, sc->order);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -3506,7 +3506,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
if (try_to_freeze() || kthread_should_stop() ||
!atomic_long_read(&kswapd_waiters))
break;
/*