Merge remote-tracking branch 'slmk/linux-4.14' into auto-kernel
* slmk/linux-4.14: simple_lmk: Remove unnecessary clean-up when timeout is reached simple_lmk: Hold an RCU read lock instead of the tasklist read lock mm: Don't stop kswapd on a per-node basis when there are no waiters simple_lmk: Consider all positive adjs when finding victims mm: vmpressure: Ignore allocation orders above PAGE_ALLOC_COSTLY_ORDER mm: Don't warn on page allocation failures for OOM-killed processes mm: Adjust tsk_is_oom_victim() for Simple LMK mm: vmpressure: Don't cache the window size mm: vmpressure: Interpret zero scanned pages as 100% pressure mm: vmpressure: Don't exclude any allocation types simple_lmk: Update adj targeting for Android 10 simple_lmk: Use vmpressure notifier to trigger kills mm: vmpressure: make vmpressure window variable mm: vmpressure: account allocstalls only on higher pressures mm: vmpressure: scale pressure based on reclaim context mm: vmpressure: allow in-kernel clients to subscribe for events mm, vmpressure: int cast vmpressure level/model for -1 comparison mm: Stop kswapd early when nothing's waiting for it to free pages simple_lmk: Include swap memory usage in the size of victims simple_lmk: Relax memory barriers and clean up some styling simple_lmk: Place victims onto SCHED_RR simple_lmk: Add a timeout to stop waiting for victims to die simple_lmk: Ignore tasks that won't free memory simple_lmk: Simplify tricks used to speed up the death process simple_lmk: Report mm as freed as soon as exit_mmap() finishes simple_lmk: Mark victim thread group with TIF_MEMDIE simple_lmk: Disable OOM killer when Simple LMK is enabled simple_lmk: Print a message when there are no processes to kill simple_lmk: Remove compat cruft not specific to 4.14 simple_lmk: Update copyright to 2020 simple_lmk: Don't queue up new reclaim requests during reclaim simple_lmk: Increase default minfree value simple_lmk: Clean up some code style nitpicks simple_lmk: Make reclaim deterministic simple_lmk: Fix broken multicopy atomicity for victims_to_kill 
simple_lmk: Use proper atomic_* operations where needed simple_lmk: Remove kthread_should_stop() exit condition simple_lmk: Fix pages_found calculation simple_lmk: Introduce Simple Low Memory Killer for Android Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com> Conflicts: kernel/exit.c kernel/fork.c mm/Makefile mm/vmpressure.c mm/vmscan.c
This commit is contained in:
@@ -54,6 +54,39 @@ config ANDROID_BINDER_IPC_SELFTEST
|
||||
exhaustively with combinations of various buffer sizes and
|
||||
alignments.
|
||||
|
||||
config ANDROID_SIMPLE_LMK
|
||||
bool "Simple Android Low Memory Killer"
|
||||
depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
|
||||
---help---
|
||||
This is a complete low memory killer solution for Android that is
|
||||
small and simple. Processes are killed according to the priorities
|
||||
that Android gives them, so that the least important processes are
|
||||
always killed first. Processes are killed until memory deficits are
|
||||
satisfied, as observed from kswapd struggling to free up pages. Simple
|
||||
LMK stops killing processes when kswapd finally goes back to sleep.
|
||||
|
||||
if ANDROID_SIMPLE_LMK
|
||||
|
||||
config ANDROID_SIMPLE_LMK_MINFREE
|
||||
int "Minimum MiB of memory to free per reclaim"
|
||||
range 8 512
|
||||
default 128
|
||||
help
|
||||
Simple LMK will try to free at least this much memory per reclaim.
|
||||
|
||||
config ANDROID_SIMPLE_LMK_TIMEOUT_MSEC
|
||||
int "Reclaim timeout in milliseconds"
|
||||
range 50 1000
|
||||
default 200
|
||||
help
|
||||
Simple LMK tries to wait until all of the victims it kills have their
|
||||
memory freed; however, sometimes victims can take a while to die,
|
||||
which can block Simple LMK from killing more processes in time when
|
||||
needed. After the specified timeout elapses, Simple LMK will stop
|
||||
waiting and make itself available to kill more processes.
|
||||
|
||||
endif
|
||||
|
||||
endif # if ANDROID
|
||||
|
||||
endmenu
|
||||
|
||||
@@ -3,3 +3,4 @@ ccflags-y += -I$(src) # needed for trace events
|
||||
obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o
|
||||
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
|
||||
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
|
||||
obj-$(CONFIG_ANDROID_SIMPLE_LMK) += simple_lmk.o
|
||||
|
||||
316
drivers/android/simple_lmk.c
Normal file
316
drivers/android/simple_lmk.c
Normal file
@@ -0,0 +1,316 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "simple_lmk: " fmt
|
||||
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/oom.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/vmpressure.h>
|
||||
#include <uapi/linux/sched/types.h>
|
||||
|
||||
/* The minimum number of pages to free per reclaim */
|
||||
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
|
||||
|
||||
/* Kill up to this many victims per reclaim */
|
||||
#define MAX_VICTIMS 1024
|
||||
|
||||
/* Timeout in jiffies for each reclaim */
|
||||
#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC)
|
||||
|
||||
struct victim_info {
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
unsigned long size;
|
||||
};
|
||||
|
||||
/* Pulled from the Android framework. Lower adj means higher priority. */
|
||||
static const unsigned short adjs[] = {
|
||||
SHRT_MAX + 1, /* Include all positive adjs in the final range */
|
||||
950, /* CACHED_APP_LMK_FIRST_ADJ */
|
||||
900, /* CACHED_APP_MIN_ADJ */
|
||||
800, /* SERVICE_B_ADJ */
|
||||
700, /* PREVIOUS_APP_ADJ */
|
||||
600, /* HOME_APP_ADJ */
|
||||
500, /* SERVICE_ADJ */
|
||||
400, /* HEAVY_WEIGHT_APP_ADJ */
|
||||
300, /* BACKUP_APP_ADJ */
|
||||
250, /* PERCEPTIBLE_LOW_APP_ADJ */
|
||||
200, /* PERCEPTIBLE_APP_ADJ */
|
||||
100, /* VISIBLE_APP_ADJ */
|
||||
50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */
|
||||
0 /* FOREGROUND_APP_ADJ */
|
||||
};
|
||||
|
||||
static struct victim_info victims[MAX_VICTIMS];
|
||||
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
|
||||
static DECLARE_COMPLETION(reclaim_done);
|
||||
static DEFINE_RWLOCK(mm_free_lock);
|
||||
static int nr_victims;
|
||||
static atomic_t needs_reclaim = ATOMIC_INIT(0);
|
||||
static atomic_t nr_killed = ATOMIC_INIT(0);
|
||||
|
||||
static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
|
||||
{
|
||||
const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
|
||||
const struct victim_info *rhs = (typeof(rhs))rhs_ptr;
|
||||
|
||||
return rhs->size - lhs->size;
|
||||
}
|
||||
|
||||
static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < vlen; i++) {
|
||||
if (same_thread_group(victims[i].tsk, vtsk))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned long get_total_mm_pages(struct mm_struct *mm)
|
||||
{
|
||||
unsigned long pages = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NR_MM_COUNTERS; i++)
|
||||
pages += get_mm_counter(mm, i);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
static unsigned long find_victims(int *vindex, unsigned short target_adj_min,
|
||||
unsigned short target_adj_max)
|
||||
{
|
||||
unsigned long pages_found = 0;
|
||||
int old_vindex = *vindex;
|
||||
struct task_struct *tsk;
|
||||
|
||||
for_each_process(tsk) {
|
||||
struct signal_struct *sig;
|
||||
struct task_struct *vtsk;
|
||||
short adj;
|
||||
|
||||
/*
|
||||
* Search for suitable tasks with the targeted importance (adj).
|
||||
* Since only tasks with a positive adj can be targeted, that
|
||||
* naturally excludes tasks which shouldn't be killed, like init
|
||||
* and kthreads. Although oom_score_adj can still be changed
|
||||
* while this code runs, it doesn't really matter. We just need
|
||||
* to make sure that if the adj changes, we won't deadlock
|
||||
* trying to lock a task that we locked earlier.
|
||||
*/
|
||||
sig = tsk->signal;
|
||||
adj = READ_ONCE(sig->oom_score_adj);
|
||||
if (adj < target_adj_min || adj > target_adj_max - 1 ||
|
||||
sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) ||
|
||||
(thread_group_empty(tsk) && tsk->flags & PF_EXITING) ||
|
||||
vtsk_is_duplicate(*vindex, tsk))
|
||||
continue;
|
||||
|
||||
vtsk = find_lock_task_mm(tsk);
|
||||
if (!vtsk)
|
||||
continue;
|
||||
|
||||
/* Store this potential victim away for later */
|
||||
victims[*vindex].tsk = vtsk;
|
||||
victims[*vindex].mm = vtsk->mm;
|
||||
victims[*vindex].size = get_total_mm_pages(vtsk->mm);
|
||||
|
||||
/* Keep track of the number of pages that have been found */
|
||||
pages_found += victims[*vindex].size;
|
||||
|
||||
/* Make sure there's space left in the victim array */
|
||||
if (++*vindex == MAX_VICTIMS)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the victims in descending order of size to prioritize killing
|
||||
* the larger ones first.
|
||||
*/
|
||||
if (pages_found)
|
||||
sort(&victims[old_vindex], *vindex - old_vindex,
|
||||
sizeof(*victims), victim_size_cmp, NULL);
|
||||
|
||||
return pages_found;
|
||||
}
|
||||
|
||||
static int process_victims(int vlen, unsigned long pages_needed)
|
||||
{
|
||||
unsigned long pages_found = 0;
|
||||
int i, nr_to_kill = 0;
|
||||
|
||||
/*
|
||||
* Calculate the number of tasks that need to be killed and quickly
|
||||
* release the references to those that'll live.
|
||||
*/
|
||||
for (i = 0; i < vlen; i++) {
|
||||
struct victim_info *victim = &victims[i];
|
||||
struct task_struct *vtsk = victim->tsk;
|
||||
|
||||
/* The victim's mm lock is taken in find_victims; release it */
|
||||
if (pages_found >= pages_needed) {
|
||||
task_unlock(vtsk);
|
||||
} else {
|
||||
pages_found += victim->size;
|
||||
nr_to_kill++;
|
||||
}
|
||||
}
|
||||
|
||||
return nr_to_kill;
|
||||
}
|
||||
|
||||
static void scan_and_kill(unsigned long pages_needed)
|
||||
{
|
||||
int i, nr_to_kill = 0, nr_found = 0;
|
||||
unsigned long pages_found = 0;
|
||||
|
||||
/* Hold an RCU read lock while traversing the global process list */
|
||||
rcu_read_lock();
|
||||
for (i = 1; i < ARRAY_SIZE(adjs); i++) {
|
||||
pages_found += find_victims(&nr_found, adjs[i], adjs[i - 1]);
|
||||
if (pages_found >= pages_needed || nr_found == MAX_VICTIMS)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Pretty unlikely but it can happen */
|
||||
if (unlikely(!nr_found)) {
|
||||
pr_err("No processes available to kill!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* First round of victim processing to weed out unneeded victims */
|
||||
nr_to_kill = process_victims(nr_found, pages_needed);
|
||||
|
||||
/*
|
||||
* Try to kill as few of the chosen victims as possible by sorting the
|
||||
* chosen victims by size, which means larger victims that have a lower
|
||||
* adj can be killed in place of smaller victims with a high adj.
|
||||
*/
|
||||
sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
|
||||
|
||||
/* Second round of victim processing to finally select the victims */
|
||||
nr_to_kill = process_victims(nr_to_kill, pages_needed);
|
||||
|
||||
/* Store the final number of victims for simple_lmk_mm_freed() */
|
||||
write_lock(&mm_free_lock);
|
||||
nr_victims = nr_to_kill;
|
||||
write_unlock(&mm_free_lock);
|
||||
|
||||
/* Kill the victims */
|
||||
for (i = 0; i < nr_to_kill; i++) {
|
||||
static const struct sched_param sched_zero_prio;
|
||||
struct victim_info *victim = &victims[i];
|
||||
struct task_struct *t, *vtsk = victim->tsk;
|
||||
|
||||
pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
|
||||
vtsk->signal->oom_score_adj,
|
||||
victim->size << (PAGE_SHIFT - 10));
|
||||
|
||||
/* Accelerate the victim's death by forcing the kill signal */
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);
|
||||
|
||||
/* Mark the thread group dead so that other kernel code knows */
|
||||
rcu_read_lock();
|
||||
for_each_thread(vtsk, t)
|
||||
set_tsk_thread_flag(t, TIF_MEMDIE);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Elevate the victim to SCHED_RR with zero RT priority */
|
||||
sched_setscheduler_nocheck(vtsk, SCHED_RR, &sched_zero_prio);
|
||||
|
||||
/* Allow the victim to run on any CPU. This won't schedule. */
|
||||
set_cpus_allowed_ptr(vtsk, cpu_all_mask);
|
||||
|
||||
/* Finally release the victim's task lock acquired earlier */
|
||||
task_unlock(vtsk);
|
||||
}
|
||||
|
||||
/* Wait until all the victims die or until the timeout is reached */
|
||||
wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES);
|
||||
write_lock(&mm_free_lock);
|
||||
reinit_completion(&reclaim_done);
|
||||
nr_victims = 0;
|
||||
nr_killed = (atomic_t)ATOMIC_INIT(0);
|
||||
write_unlock(&mm_free_lock);
|
||||
}
|
||||
|
||||
static int simple_lmk_reclaim_thread(void *data)
|
||||
{
|
||||
static const struct sched_param sched_max_rt_prio = {
|
||||
.sched_priority = MAX_RT_PRIO - 1
|
||||
};
|
||||
|
||||
sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
|
||||
|
||||
while (1) {
|
||||
wait_event(oom_waitq, atomic_read(&needs_reclaim));
|
||||
scan_and_kill(MIN_FREE_PAGES);
|
||||
atomic_set_release(&needs_reclaim, 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void simple_lmk_mm_freed(struct mm_struct *mm)
|
||||
{
|
||||
int i;
|
||||
|
||||
read_lock(&mm_free_lock);
|
||||
for (i = 0; i < nr_victims; i++) {
|
||||
if (victims[i].mm == mm) {
|
||||
victims[i].mm = NULL;
|
||||
if (atomic_inc_return_relaxed(&nr_killed) == nr_victims)
|
||||
complete(&reclaim_done);
|
||||
break;
|
||||
}
|
||||
}
|
||||
read_unlock(&mm_free_lock);
|
||||
}
|
||||
|
||||
static int simple_lmk_vmpressure_cb(struct notifier_block *nb,
|
||||
unsigned long pressure, void *data)
|
||||
{
|
||||
if (pressure == 100 && !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1))
|
||||
wake_up(&oom_waitq);
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block vmpressure_notif = {
|
||||
.notifier_call = simple_lmk_vmpressure_cb,
|
||||
.priority = INT_MAX
|
||||
};
|
||||
|
||||
/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
|
||||
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
|
||||
{
|
||||
static atomic_t init_done = ATOMIC_INIT(0);
|
||||
struct task_struct *thread;
|
||||
|
||||
if (!atomic_cmpxchg(&init_done, 0, 1)) {
|
||||
thread = kthread_run(simple_lmk_reclaim_thread, NULL,
|
||||
"simple_lmkd");
|
||||
BUG_ON(IS_ERR(thread));
|
||||
BUG_ON(vmpressure_notifier_register(&vmpressure_notif));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct kernel_param_ops simple_lmk_init_ops = {
|
||||
.set = simple_lmk_init_set
|
||||
};
|
||||
|
||||
/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
|
||||
#undef MODULE_PARAM_PREFIX
|
||||
#define MODULE_PARAM_PREFIX "lowmemorykiller."
|
||||
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
|
||||
@@ -63,7 +63,11 @@ static inline bool oom_task_origin(const struct task_struct *p)
|
||||
|
||||
static inline bool tsk_is_oom_victim(struct task_struct * tsk)
|
||||
{
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
return test_ti_thread_flag(task_thread_info(tsk), TIF_MEMDIE);
|
||||
#else
|
||||
return tsk->signal->oom_mm;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
18
include/linux/simple_lmk.h
Normal file
18
include/linux/simple_lmk.h
Normal file
@@ -0,0 +1,18 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
|
||||
*/
|
||||
#ifndef _SIMPLE_LMK_H_
|
||||
#define _SIMPLE_LMK_H_
|
||||
|
||||
struct mm_struct;
|
||||
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
void simple_lmk_mm_freed(struct mm_struct *mm);
|
||||
#else
|
||||
static inline void simple_lmk_mm_freed(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _SIMPLE_LMK_H_ */
|
||||
@@ -33,7 +33,8 @@ struct mem_cgroup;
|
||||
extern int vmpressure_notifier_register(struct notifier_block *nb);
|
||||
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
|
||||
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed);
|
||||
unsigned long scanned, unsigned long reclaimed,
|
||||
int order);
|
||||
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
|
||||
@@ -545,8 +545,12 @@ static void exit_mm(void)
|
||||
mm_update_next_owner(mm);
|
||||
|
||||
mm_released = mmput(mm);
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
clear_thread_flag(TIF_MEMDIE);
|
||||
#else
|
||||
if (test_thread_flag(TIF_MEMDIE))
|
||||
exit_oom_victim();
|
||||
#endif
|
||||
if (mm_released)
|
||||
set_tsk_thread_flag(current, TIF_MM_RELEASED);
|
||||
}
|
||||
|
||||
@@ -96,6 +96,7 @@
|
||||
#include <linux/scs.h>
|
||||
#include <linux/cpu_input_boost.h>
|
||||
#include <linux/devfreq_boost.h>
|
||||
#include <linux/simple_lmk.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
@@ -949,6 +950,7 @@ static inline void __mmput(struct mm_struct *mm)
|
||||
ksm_exit(mm);
|
||||
khugepaged_exit(mm); /* must run before exit_mmap */
|
||||
exit_mmap(mm);
|
||||
simple_lmk_mm_freed(mm);
|
||||
mm_put_huge_zero_page(mm);
|
||||
set_mm_exe_file(mm, NULL);
|
||||
if (!list_empty(&mm->mmlist)) {
|
||||
|
||||
@@ -187,6 +187,7 @@ extern void prep_compound_page(struct page *page, unsigned int order);
|
||||
extern void post_alloc_hook(struct page *page, unsigned int order,
|
||||
gfp_t gfp_flags);
|
||||
extern int user_min_free_kbytes;
|
||||
extern atomic_long_t kswapd_waiters;
|
||||
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
|
||||
|
||||
@@ -1001,7 +1001,7 @@ bool out_of_memory(struct oom_control *oc)
|
||||
unsigned long freed = 0;
|
||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||
|
||||
if (oom_killer_disabled)
|
||||
if (oom_killer_disabled || IS_ENABLED(CONFIG_ANDROID_SIMPLE_LMK))
|
||||
return false;
|
||||
|
||||
if (try_online_one_block(numa_node_id())) {
|
||||
|
||||
@@ -77,6 +77,8 @@
|
||||
#include <asm/div64.h>
|
||||
#include "internal.h"
|
||||
|
||||
atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
|
||||
|
||||
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
|
||||
static DEFINE_MUTEX(pcp_batch_high_lock);
|
||||
#define MIN_PERCPU_PAGELIST_FRACTION (8)
|
||||
@@ -4067,6 +4069,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
int no_progress_loops;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
int reserve_flags;
|
||||
bool woke_kswapd = false;
|
||||
|
||||
/*
|
||||
* We also sanity check to catch abuse of atomic reserves being used by
|
||||
@@ -4100,8 +4103,13 @@ retry_cpuset:
|
||||
if (!ac->preferred_zoneref->zone)
|
||||
goto nopage;
|
||||
|
||||
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
|
||||
if (gfp_mask & __GFP_KSWAPD_RECLAIM) {
|
||||
if (!woke_kswapd) {
|
||||
atomic_long_inc(&kswapd_waiters);
|
||||
woke_kswapd = true;
|
||||
}
|
||||
wake_all_kswapds(order, ac);
|
||||
}
|
||||
|
||||
/*
|
||||
* The adjusted alloc_flags might result in immediate success, so try
|
||||
@@ -4249,8 +4257,10 @@ retry:
|
||||
/* Avoid allocations with no watermarks from looping endlessly */
|
||||
if (tsk_is_oom_victim(current) &&
|
||||
(alloc_flags == ALLOC_OOM ||
|
||||
(gfp_mask & __GFP_NOMEMALLOC)))
|
||||
(gfp_mask & __GFP_NOMEMALLOC))) {
|
||||
gfp_mask |= __GFP_NOWARN;
|
||||
goto nopage;
|
||||
}
|
||||
|
||||
/* Retry as long as the OOM killer is making progress */
|
||||
if (did_some_progress) {
|
||||
@@ -4304,9 +4314,12 @@ nopage:
|
||||
goto retry;
|
||||
}
|
||||
fail:
|
||||
warn_alloc(gfp_mask, ac->nodemask,
|
||||
"page allocation failure: order:%u", order);
|
||||
got_pg:
|
||||
if (woke_kswapd)
|
||||
atomic_long_dec(&kswapd_waiters);
|
||||
if (!page)
|
||||
warn_alloc(gfp_mask, ac->nodemask,
|
||||
"page allocation failure: order:%u", order);
|
||||
return page;
|
||||
}
|
||||
|
||||
|
||||
163
mm/vmpressure.c
163
mm/vmpressure.c
@@ -27,22 +27,6 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/vmpressure.h>
|
||||
|
||||
/*
|
||||
* The window size (vmpressure_win) is the number of scanned pages before
|
||||
* we try to analyze scanned/reclaimed ratio. So the window is used as a
|
||||
* rate-limit tunable for the "low" level notification, and also for
|
||||
* averaging the ratio for medium/critical levels. Using small window
|
||||
* sizes can cause lot of false positives, but too big window size will
|
||||
* delay the notifications.
|
||||
*
|
||||
* As the vmscan reclaimer logic works with chunks which are multiple of
|
||||
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
|
||||
*
|
||||
* TODO: Make the window size depend on machine size, as we do for vmstat
|
||||
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
|
||||
*/
|
||||
static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
|
||||
|
||||
/*
|
||||
* These thresholds are used when we account memory pressure through
|
||||
* scanned/reclaimed ratio. The current values were chosen empirically. In
|
||||
@@ -271,26 +255,35 @@ static void vmpressure_work_fn(struct work_struct *work)
|
||||
} while ((vmpr = vmpressure_parent(vmpr)));
|
||||
}
|
||||
|
||||
static unsigned long calculate_vmpressure_win(void)
|
||||
{
|
||||
long x;
|
||||
|
||||
x = global_node_page_state(NR_FILE_PAGES) -
|
||||
global_node_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages() +
|
||||
global_zone_page_state(NR_FREE_PAGES);
|
||||
if (x < 1)
|
||||
return 1;
|
||||
/*
|
||||
* For low (free + cached), vmpressure window should be
|
||||
* small, and high for higher values of (free + cached).
|
||||
* But it should not be linear as well. This ensures
|
||||
* timely vmpressure notifications when system is under
|
||||
* memory pressure, and optimal number of events when
|
||||
 * cached is high. The square root function is empirically
|
||||
* found to serve the purpose.
|
||||
*/
|
||||
return int_sqrt(x);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
|
||||
|
||||
/*
|
||||
* Here we only want to account pressure that userland is able to
|
||||
* help us with. For example, suppose that DMA zone is under
|
||||
* pressure; if we notify userland about that kind of pressure,
|
||||
* then it will be mostly a waste as it will trigger unnecessary
|
||||
* freeing of memory by userland (since userland is more likely to
|
||||
* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
|
||||
* is why we include only movable, highmem and FS/IO pages.
|
||||
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
|
||||
* we account it too.
|
||||
*/
|
||||
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we got here with no pages scanned, then that is an indicator
|
||||
* that reclaimer was unable to find any shrinkable LRUs at the
|
||||
@@ -299,7 +292,9 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
* (scanning depth) goes too high (deep), we will be notified
|
||||
* through vmpressure_prio(). But so far, keep calm.
|
||||
*/
|
||||
if (!scanned)
|
||||
if (critical)
|
||||
scanned = calculate_vmpressure_win();
|
||||
else if (!scanned)
|
||||
return;
|
||||
|
||||
if (tree) {
|
||||
@@ -308,7 +303,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
vmpr->tree_reclaimed += reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (scanned < vmpressure_win)
|
||||
if (!critical && scanned < calculate_vmpressure_win())
|
||||
return;
|
||||
schedule_work(&vmpr->work);
|
||||
} else {
|
||||
@@ -322,7 +317,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
scanned = vmpr->scanned += scanned;
|
||||
reclaimed = vmpr->reclaimed += reclaimed;
|
||||
if (scanned < vmpressure_win) {
|
||||
if (!critical && scanned < calculate_vmpressure_win()) {
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
return;
|
||||
}
|
||||
@@ -346,65 +341,37 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
}
|
||||
}
|
||||
#else
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
{
|
||||
}
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed) { }
|
||||
#endif
|
||||
|
||||
static void calculate_vmpressure_win(void)
|
||||
{
|
||||
long x;
|
||||
|
||||
x = global_node_page_state(NR_FILE_PAGES) -
|
||||
global_node_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages() +
|
||||
global_zone_page_state(NR_FREE_PAGES);
|
||||
if (x < 1)
|
||||
x = 1;
|
||||
/*
|
||||
* For low (free + cached), vmpressure window should be
|
||||
* small, and high for higher values of (free + cached).
|
||||
* But it should not be linear as well. This ensures
|
||||
* timely vmpressure notifications when system is under
|
||||
* memory pressure, and optimal number of events when
|
||||
 * cached is high. The square root function is empirically
|
||||
* found to serve the purpose.
|
||||
*/
|
||||
x = int_sqrt(x);
|
||||
vmpressure_win = x;
|
||||
}
|
||||
|
||||
static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
struct vmpressure *vmpr = &global_vmpressure;
|
||||
unsigned long pressure;
|
||||
unsigned long stall;
|
||||
|
||||
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
|
||||
return;
|
||||
if (critical)
|
||||
scanned = calculate_vmpressure_win();
|
||||
|
||||
if (!scanned)
|
||||
return;
|
||||
if (scanned) {
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
vmpr->scanned += scanned;
|
||||
vmpr->reclaimed += reclaimed;
|
||||
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
if (!vmpr->scanned)
|
||||
calculate_vmpressure_win();
|
||||
if (!current_is_kswapd())
|
||||
vmpr->stall += scanned;
|
||||
|
||||
vmpr->scanned += scanned;
|
||||
vmpr->reclaimed += reclaimed;
|
||||
stall = vmpr->stall;
|
||||
scanned = vmpr->scanned;
|
||||
reclaimed = vmpr->reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (!current_is_kswapd())
|
||||
vmpr->stall += scanned;
|
||||
|
||||
stall = vmpr->stall;
|
||||
scanned = vmpr->scanned;
|
||||
reclaimed = vmpr->reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (scanned < vmpressure_win)
|
||||
return;
|
||||
if (!critical && scanned < calculate_vmpressure_win())
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
vmpr->scanned = 0;
|
||||
@@ -412,11 +379,26 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
vmpr->stall = 0;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
pressure = vmpressure_calc_pressure(scanned, reclaimed);
|
||||
pressure = vmpressure_account_stall(pressure, stall, scanned);
|
||||
if (scanned) {
|
||||
pressure = vmpressure_calc_pressure(scanned, reclaimed);
|
||||
pressure = vmpressure_account_stall(pressure, stall, scanned);
|
||||
} else {
|
||||
pressure = 100;
|
||||
}
|
||||
vmpressure_notify(pressure);
|
||||
}
|
||||
|
||||
static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
if (!memcg && tree)
|
||||
vmpressure_global(gfp, scanned, critical, reclaimed);
|
||||
|
||||
if (IS_ENABLED(CONFIG_MEMCG))
|
||||
vmpressure_memcg(gfp, memcg, critical, tree, scanned, reclaimed);
|
||||
}
|
||||
|
||||
/**
|
||||
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
|
||||
* @gfp: reclaimer's gfp mask
|
||||
@@ -439,13 +421,12 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
* This function does not return any value.
|
||||
*/
|
||||
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
unsigned long scanned, unsigned long reclaimed, int order)
|
||||
{
|
||||
if (!memcg && tree)
|
||||
vmpressure_global(gfp, scanned, reclaimed);
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER)
|
||||
return;
|
||||
|
||||
if (IS_ENABLED(CONFIG_MEMCG))
|
||||
vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
|
||||
__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -475,7 +456,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
|
||||
* to the vmpressure() basically means that we signal 'critical'
|
||||
* level.
|
||||
*/
|
||||
vmpressure(gfp, memcg, true, vmpressure_win, 0);
|
||||
__vmpressure(gfp, memcg, true, true, 0, 0);
|
||||
}
|
||||
|
||||
static enum vmpressure_levels str_to_level(const char *arg)
|
||||
|
||||
@@ -2727,7 +2727,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||
/* Record the group's reclaim efficiency */
|
||||
vmpressure(sc->gfp_mask, memcg, false,
|
||||
sc->nr_scanned - scanned,
|
||||
sc->nr_reclaimed - reclaimed);
|
||||
sc->nr_reclaimed - reclaimed, sc->order);
|
||||
|
||||
/*
|
||||
* Direct reclaim and kswapd have to scan all memory
|
||||
@@ -2760,7 +2760,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||
*/
|
||||
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
|
||||
sc->nr_scanned - nr_scanned,
|
||||
sc->nr_reclaimed - nr_reclaimed);
|
||||
sc->nr_reclaimed - nr_reclaimed, sc->order);
|
||||
|
||||
if (reclaim_state) {
|
||||
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
|
||||
@@ -3506,7 +3506,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||
wake_up_all(&pgdat->pfmemalloc_wait);
|
||||
|
||||
/* Check if kswapd should be suspending */
|
||||
if (try_to_freeze() || kthread_should_stop())
|
||||
if (try_to_freeze() || kthread_should_stop() ||
|
||||
!atomic_long_read(&kswapd_waiters))
|
||||
break;
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user