Merge remote-tracking branch 'slmk/linux-4.14' into auto-kernel
* slmk/linux-4.14: simple_lmk: Remove unnecessary clean-up when timeout is reached simple_lmk: Hold an RCU read lock instead of the tasklist read lock mm: Don't stop kswapd on a per-node basis when there are no waiters simple_lmk: Consider all positive adjs when finding victims mm: vmpressure: Ignore allocation orders above PAGE_ALLOC_COSTLY_ORDER mm: Don't warn on page allocation failures for OOM-killed processes mm: Adjust tsk_is_oom_victim() for Simple LMK mm: vmpressure: Don't cache the window size mm: vmpressure: Interpret zero scanned pages as 100% pressure mm: vmpressure: Don't exclude any allocation types simple_lmk: Update adj targeting for Android 10 simple_lmk: Use vmpressure notifier to trigger kills mm: vmpressure: make vmpressure window variable mm: vmpressure: account allocstalls only on higher pressures mm: vmpressure: scale pressure based on reclaim context mm: vmpressure: allow in-kernel clients to subscribe for events mm, vmpressure: int cast vmpressure level/model for -1 comparison mm: Stop kswapd early when nothing's waiting for it to free pages simple_lmk: Include swap memory usage in the size of victims simple_lmk: Relax memory barriers and clean up some styling simple_lmk: Place victims onto SCHED_RR simple_lmk: Add a timeout to stop waiting for victims to die simple_lmk: Ignore tasks that won't free memory simple_lmk: Simplify tricks used to speed up the death process simple_lmk: Report mm as freed as soon as exit_mmap() finishes simple_lmk: Mark victim thread group with TIF_MEMDIE simple_lmk: Disable OOM killer when Simple LMK is enabled simple_lmk: Print a message when there are no processes to kill simple_lmk: Remove compat cruft not specific to 4.14 simple_lmk: Update copyright to 2020 simple_lmk: Don't queue up new reclaim requests during reclaim simple_lmk: Increase default minfree value simple_lmk: Clean up some code style nitpicks simple_lmk: Make reclaim deterministic simple_lmk: Fix broken multicopy atomicity for victims_to_kill 
simple_lmk: Use proper atomic_* operations where needed simple_lmk: Remove kthread_should_stop() exit condition simple_lmk: Fix pages_found calculation simple_lmk: Introduce Simple Low Memory Killer for Android Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com> Conflicts: kernel/exit.c kernel/fork.c mm/Makefile mm/vmpressure.c mm/vmscan.c
This commit is contained in:
@@ -54,6 +54,39 @@ config ANDROID_BINDER_IPC_SELFTEST
|
||||
exhaustively with combinations of various buffer sizes and
|
||||
alignments.
|
||||
|
||||
config ANDROID_SIMPLE_LMK
|
||||
bool "Simple Android Low Memory Killer"
|
||||
depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
|
||||
---help---
|
||||
This is a complete low memory killer solution for Android that is
|
||||
small and simple. Processes are killed according to the priorities
|
||||
that Android gives them, so that the least important processes are
|
||||
always killed first. Processes are killed until memory deficits are
|
||||
satisfied, as observed from kswapd struggling to free up pages. Simple
|
||||
LMK stops killing processes when kswapd finally goes back to sleep.
|
||||
|
||||
if ANDROID_SIMPLE_LMK
|
||||
|
||||
config ANDROID_SIMPLE_LMK_MINFREE
|
||||
int "Minimum MiB of memory to free per reclaim"
|
||||
range 8 512
|
||||
default 128
|
||||
help
|
||||
Simple LMK will try to free at least this much memory per reclaim.
|
||||
|
||||
config ANDROID_SIMPLE_LMK_TIMEOUT_MSEC
|
||||
int "Reclaim timeout in milliseconds"
|
||||
range 50 1000
|
||||
default 200
|
||||
help
|
||||
Simple LMK tries to wait until all of the victims it kills have their
|
||||
memory freed; however, sometimes victims can take a while to die,
|
||||
which can block Simple LMK from killing more processes in time when
|
||||
needed. After the specified timeout elapses, Simple LMK will stop
|
||||
waiting and make itself available to kill more processes.
|
||||
|
||||
endif
|
||||
|
||||
endif # if ANDROID
|
||||
|
||||
endmenu
|
||||
|
||||
@@ -3,3 +3,4 @@ ccflags-y += -I$(src) # needed for trace events
|
||||
obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o
|
||||
obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o
|
||||
obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
|
||||
obj-$(CONFIG_ANDROID_SIMPLE_LMK) += simple_lmk.o
|
||||
|
||||
316
drivers/android/simple_lmk.c
Normal file
316
drivers/android/simple_lmk.c
Normal file
@@ -0,0 +1,316 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "simple_lmk: " fmt
|
||||
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/oom.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/vmpressure.h>
|
||||
#include <uapi/linux/sched/types.h>
|
||||
|
||||
/* The minimum number of pages to free per reclaim */
|
||||
#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
|
||||
|
||||
/* Kill up to this many victims per reclaim */
|
||||
#define MAX_VICTIMS 1024
|
||||
|
||||
/* Timeout in jiffies for each reclaim */
|
||||
#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC)
|
||||
|
||||
struct victim_info {
|
||||
struct task_struct *tsk;
|
||||
struct mm_struct *mm;
|
||||
unsigned long size;
|
||||
};
|
||||
|
||||
/* Pulled from the Android framework. Lower adj means higher priority. */
|
||||
static const unsigned short adjs[] = {
|
||||
SHRT_MAX + 1, /* Include all positive adjs in the final range */
|
||||
950, /* CACHED_APP_LMK_FIRST_ADJ */
|
||||
900, /* CACHED_APP_MIN_ADJ */
|
||||
800, /* SERVICE_B_ADJ */
|
||||
700, /* PREVIOUS_APP_ADJ */
|
||||
600, /* HOME_APP_ADJ */
|
||||
500, /* SERVICE_ADJ */
|
||||
400, /* HEAVY_WEIGHT_APP_ADJ */
|
||||
300, /* BACKUP_APP_ADJ */
|
||||
250, /* PERCEPTIBLE_LOW_APP_ADJ */
|
||||
200, /* PERCEPTIBLE_APP_ADJ */
|
||||
100, /* VISIBLE_APP_ADJ */
|
||||
50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */
|
||||
0 /* FOREGROUND_APP_ADJ */
|
||||
};
|
||||
|
||||
static struct victim_info victims[MAX_VICTIMS];
|
||||
static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
|
||||
static DECLARE_COMPLETION(reclaim_done);
|
||||
static DEFINE_RWLOCK(mm_free_lock);
|
||||
static int nr_victims;
|
||||
static atomic_t needs_reclaim = ATOMIC_INIT(0);
|
||||
static atomic_t nr_killed = ATOMIC_INIT(0);
|
||||
|
||||
static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
|
||||
{
|
||||
const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
|
||||
const struct victim_info *rhs = (typeof(rhs))rhs_ptr;
|
||||
|
||||
return rhs->size - lhs->size;
|
||||
}
|
||||
|
||||
static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < vlen; i++) {
|
||||
if (same_thread_group(victims[i].tsk, vtsk))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static unsigned long get_total_mm_pages(struct mm_struct *mm)
|
||||
{
|
||||
unsigned long pages = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NR_MM_COUNTERS; i++)
|
||||
pages += get_mm_counter(mm, i);
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
static unsigned long find_victims(int *vindex, unsigned short target_adj_min,
|
||||
unsigned short target_adj_max)
|
||||
{
|
||||
unsigned long pages_found = 0;
|
||||
int old_vindex = *vindex;
|
||||
struct task_struct *tsk;
|
||||
|
||||
for_each_process(tsk) {
|
||||
struct signal_struct *sig;
|
||||
struct task_struct *vtsk;
|
||||
short adj;
|
||||
|
||||
/*
|
||||
* Search for suitable tasks with the targeted importance (adj).
|
||||
* Since only tasks with a positive adj can be targeted, that
|
||||
* naturally excludes tasks which shouldn't be killed, like init
|
||||
* and kthreads. Although oom_score_adj can still be changed
|
||||
* while this code runs, it doesn't really matter. We just need
|
||||
* to make sure that if the adj changes, we won't deadlock
|
||||
* trying to lock a task that we locked earlier.
|
||||
*/
|
||||
sig = tsk->signal;
|
||||
adj = READ_ONCE(sig->oom_score_adj);
|
||||
if (adj < target_adj_min || adj > target_adj_max - 1 ||
|
||||
sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) ||
|
||||
(thread_group_empty(tsk) && tsk->flags & PF_EXITING) ||
|
||||
vtsk_is_duplicate(*vindex, tsk))
|
||||
continue;
|
||||
|
||||
vtsk = find_lock_task_mm(tsk);
|
||||
if (!vtsk)
|
||||
continue;
|
||||
|
||||
/* Store this potential victim away for later */
|
||||
victims[*vindex].tsk = vtsk;
|
||||
victims[*vindex].mm = vtsk->mm;
|
||||
victims[*vindex].size = get_total_mm_pages(vtsk->mm);
|
||||
|
||||
/* Keep track of the number of pages that have been found */
|
||||
pages_found += victims[*vindex].size;
|
||||
|
||||
/* Make sure there's space left in the victim array */
|
||||
if (++*vindex == MAX_VICTIMS)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort the victims in descending order of size to prioritize killing
|
||||
* the larger ones first.
|
||||
*/
|
||||
if (pages_found)
|
||||
sort(&victims[old_vindex], *vindex - old_vindex,
|
||||
sizeof(*victims), victim_size_cmp, NULL);
|
||||
|
||||
return pages_found;
|
||||
}
|
||||
|
||||
static int process_victims(int vlen, unsigned long pages_needed)
|
||||
{
|
||||
unsigned long pages_found = 0;
|
||||
int i, nr_to_kill = 0;
|
||||
|
||||
/*
|
||||
* Calculate the number of tasks that need to be killed and quickly
|
||||
* release the references to those that'll live.
|
||||
*/
|
||||
for (i = 0; i < vlen; i++) {
|
||||
struct victim_info *victim = &victims[i];
|
||||
struct task_struct *vtsk = victim->tsk;
|
||||
|
||||
/* The victim's mm lock is taken in find_victims; release it */
|
||||
if (pages_found >= pages_needed) {
|
||||
task_unlock(vtsk);
|
||||
} else {
|
||||
pages_found += victim->size;
|
||||
nr_to_kill++;
|
||||
}
|
||||
}
|
||||
|
||||
return nr_to_kill;
|
||||
}
|
||||
|
||||
static void scan_and_kill(unsigned long pages_needed)
|
||||
{
|
||||
int i, nr_to_kill = 0, nr_found = 0;
|
||||
unsigned long pages_found = 0;
|
||||
|
||||
/* Hold an RCU read lock while traversing the global process list */
|
||||
rcu_read_lock();
|
||||
for (i = 1; i < ARRAY_SIZE(adjs); i++) {
|
||||
pages_found += find_victims(&nr_found, adjs[i], adjs[i - 1]);
|
||||
if (pages_found >= pages_needed || nr_found == MAX_VICTIMS)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Pretty unlikely but it can happen */
|
||||
if (unlikely(!nr_found)) {
|
||||
pr_err("No processes available to kill!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* First round of victim processing to weed out unneeded victims */
|
||||
nr_to_kill = process_victims(nr_found, pages_needed);
|
||||
|
||||
/*
|
||||
* Try to kill as few of the chosen victims as possible by sorting the
|
||||
* chosen victims by size, which means larger victims that have a lower
|
||||
* adj can be killed in place of smaller victims with a high adj.
|
||||
*/
|
||||
sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
|
||||
|
||||
/* Second round of victim processing to finally select the victims */
|
||||
nr_to_kill = process_victims(nr_to_kill, pages_needed);
|
||||
|
||||
/* Store the final number of victims for simple_lmk_mm_freed() */
|
||||
write_lock(&mm_free_lock);
|
||||
nr_victims = nr_to_kill;
|
||||
write_unlock(&mm_free_lock);
|
||||
|
||||
/* Kill the victims */
|
||||
for (i = 0; i < nr_to_kill; i++) {
|
||||
static const struct sched_param sched_zero_prio;
|
||||
struct victim_info *victim = &victims[i];
|
||||
struct task_struct *t, *vtsk = victim->tsk;
|
||||
|
||||
pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
|
||||
vtsk->signal->oom_score_adj,
|
||||
victim->size << (PAGE_SHIFT - 10));
|
||||
|
||||
/* Accelerate the victim's death by forcing the kill signal */
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);
|
||||
|
||||
/* Mark the thread group dead so that other kernel code knows */
|
||||
rcu_read_lock();
|
||||
for_each_thread(vtsk, t)
|
||||
set_tsk_thread_flag(t, TIF_MEMDIE);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Elevate the victim to SCHED_RR with zero RT priority */
|
||||
sched_setscheduler_nocheck(vtsk, SCHED_RR, &sched_zero_prio);
|
||||
|
||||
/* Allow the victim to run on any CPU. This won't schedule. */
|
||||
set_cpus_allowed_ptr(vtsk, cpu_all_mask);
|
||||
|
||||
/* Finally release the victim's task lock acquired earlier */
|
||||
task_unlock(vtsk);
|
||||
}
|
||||
|
||||
/* Wait until all the victims die or until the timeout is reached */
|
||||
wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES);
|
||||
write_lock(&mm_free_lock);
|
||||
reinit_completion(&reclaim_done);
|
||||
nr_victims = 0;
|
||||
nr_killed = (atomic_t)ATOMIC_INIT(0);
|
||||
write_unlock(&mm_free_lock);
|
||||
}
|
||||
|
||||
static int simple_lmk_reclaim_thread(void *data)
|
||||
{
|
||||
static const struct sched_param sched_max_rt_prio = {
|
||||
.sched_priority = MAX_RT_PRIO - 1
|
||||
};
|
||||
|
||||
sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
|
||||
|
||||
while (1) {
|
||||
wait_event(oom_waitq, atomic_read(&needs_reclaim));
|
||||
scan_and_kill(MIN_FREE_PAGES);
|
||||
atomic_set_release(&needs_reclaim, 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void simple_lmk_mm_freed(struct mm_struct *mm)
|
||||
{
|
||||
int i;
|
||||
|
||||
read_lock(&mm_free_lock);
|
||||
for (i = 0; i < nr_victims; i++) {
|
||||
if (victims[i].mm == mm) {
|
||||
victims[i].mm = NULL;
|
||||
if (atomic_inc_return_relaxed(&nr_killed) == nr_victims)
|
||||
complete(&reclaim_done);
|
||||
break;
|
||||
}
|
||||
}
|
||||
read_unlock(&mm_free_lock);
|
||||
}
|
||||
|
||||
static int simple_lmk_vmpressure_cb(struct notifier_block *nb,
|
||||
unsigned long pressure, void *data)
|
||||
{
|
||||
if (pressure == 100 && !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1))
|
||||
wake_up(&oom_waitq);
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block vmpressure_notif = {
|
||||
.notifier_call = simple_lmk_vmpressure_cb,
|
||||
.priority = INT_MAX
|
||||
};
|
||||
|
||||
/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
|
||||
static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
|
||||
{
|
||||
static atomic_t init_done = ATOMIC_INIT(0);
|
||||
struct task_struct *thread;
|
||||
|
||||
if (!atomic_cmpxchg(&init_done, 0, 1)) {
|
||||
thread = kthread_run(simple_lmk_reclaim_thread, NULL,
|
||||
"simple_lmkd");
|
||||
BUG_ON(IS_ERR(thread));
|
||||
BUG_ON(vmpressure_notifier_register(&vmpressure_notif));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct kernel_param_ops simple_lmk_init_ops = {
|
||||
.set = simple_lmk_init_set
|
||||
};
|
||||
|
||||
/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
|
||||
#undef MODULE_PARAM_PREFIX
|
||||
#define MODULE_PARAM_PREFIX "lowmemorykiller."
|
||||
module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
|
||||
@@ -63,7 +63,11 @@ static inline bool oom_task_origin(const struct task_struct *p)
|
||||
|
||||
static inline bool tsk_is_oom_victim(struct task_struct * tsk)
|
||||
{
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
return test_ti_thread_flag(task_thread_info(tsk), TIF_MEMDIE);
|
||||
#else
|
||||
return tsk->signal->oom_mm;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
18
include/linux/simple_lmk.h
Normal file
18
include/linux/simple_lmk.h
Normal file
@@ -0,0 +1,18 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
|
||||
*/
|
||||
#ifndef _SIMPLE_LMK_H_
|
||||
#define _SIMPLE_LMK_H_
|
||||
|
||||
struct mm_struct;
|
||||
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
void simple_lmk_mm_freed(struct mm_struct *mm);
|
||||
#else
|
||||
static inline void simple_lmk_mm_freed(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _SIMPLE_LMK_H_ */
|
||||
@@ -33,7 +33,8 @@ struct mem_cgroup;
|
||||
extern int vmpressure_notifier_register(struct notifier_block *nb);
|
||||
extern int vmpressure_notifier_unregister(struct notifier_block *nb);
|
||||
extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed);
|
||||
unsigned long scanned, unsigned long reclaimed,
|
||||
int order);
|
||||
extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
|
||||
@@ -545,8 +545,12 @@ static void exit_mm(void)
|
||||
mm_update_next_owner(mm);
|
||||
|
||||
mm_released = mmput(mm);
|
||||
#ifdef CONFIG_ANDROID_SIMPLE_LMK
|
||||
clear_thread_flag(TIF_MEMDIE);
|
||||
#else
|
||||
if (test_thread_flag(TIF_MEMDIE))
|
||||
exit_oom_victim();
|
||||
#endif
|
||||
if (mm_released)
|
||||
set_tsk_thread_flag(current, TIF_MM_RELEASED);
|
||||
}
|
||||
|
||||
@@ -96,6 +96,7 @@
|
||||
#include <linux/scs.h>
|
||||
#include <linux/cpu_input_boost.h>
|
||||
#include <linux/devfreq_boost.h>
|
||||
#include <linux/simple_lmk.h>
|
||||
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
@@ -949,6 +950,7 @@ static inline void __mmput(struct mm_struct *mm)
|
||||
ksm_exit(mm);
|
||||
khugepaged_exit(mm); /* must run before exit_mmap */
|
||||
exit_mmap(mm);
|
||||
simple_lmk_mm_freed(mm);
|
||||
mm_put_huge_zero_page(mm);
|
||||
set_mm_exe_file(mm, NULL);
|
||||
if (!list_empty(&mm->mmlist)) {
|
||||
|
||||
@@ -187,6 +187,7 @@ extern void prep_compound_page(struct page *page, unsigned int order);
|
||||
extern void post_alloc_hook(struct page *page, unsigned int order,
|
||||
gfp_t gfp_flags);
|
||||
extern int user_min_free_kbytes;
|
||||
extern atomic_long_t kswapd_waiters;
|
||||
|
||||
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
|
||||
|
||||
|
||||
@@ -1001,7 +1001,7 @@ bool out_of_memory(struct oom_control *oc)
|
||||
unsigned long freed = 0;
|
||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||
|
||||
if (oom_killer_disabled)
|
||||
if (oom_killer_disabled || IS_ENABLED(CONFIG_ANDROID_SIMPLE_LMK))
|
||||
return false;
|
||||
|
||||
if (try_online_one_block(numa_node_id())) {
|
||||
|
||||
@@ -77,6 +77,8 @@
|
||||
#include <asm/div64.h>
|
||||
#include "internal.h"
|
||||
|
||||
atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
|
||||
|
||||
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
|
||||
static DEFINE_MUTEX(pcp_batch_high_lock);
|
||||
#define MIN_PERCPU_PAGELIST_FRACTION (8)
|
||||
@@ -4067,6 +4069,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
int no_progress_loops;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
int reserve_flags;
|
||||
bool woke_kswapd = false;
|
||||
|
||||
/*
|
||||
* We also sanity check to catch abuse of atomic reserves being used by
|
||||
@@ -4100,8 +4103,13 @@ retry_cpuset:
|
||||
if (!ac->preferred_zoneref->zone)
|
||||
goto nopage;
|
||||
|
||||
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
|
||||
if (gfp_mask & __GFP_KSWAPD_RECLAIM) {
|
||||
if (!woke_kswapd) {
|
||||
atomic_long_inc(&kswapd_waiters);
|
||||
woke_kswapd = true;
|
||||
}
|
||||
wake_all_kswapds(order, ac);
|
||||
}
|
||||
|
||||
/*
|
||||
* The adjusted alloc_flags might result in immediate success, so try
|
||||
@@ -4249,8 +4257,10 @@ retry:
|
||||
/* Avoid allocations with no watermarks from looping endlessly */
|
||||
if (tsk_is_oom_victim(current) &&
|
||||
(alloc_flags == ALLOC_OOM ||
|
||||
(gfp_mask & __GFP_NOMEMALLOC)))
|
||||
(gfp_mask & __GFP_NOMEMALLOC))) {
|
||||
gfp_mask |= __GFP_NOWARN;
|
||||
goto nopage;
|
||||
}
|
||||
|
||||
/* Retry as long as the OOM killer is making progress */
|
||||
if (did_some_progress) {
|
||||
@@ -4304,9 +4314,12 @@ nopage:
|
||||
goto retry;
|
||||
}
|
||||
fail:
|
||||
warn_alloc(gfp_mask, ac->nodemask,
|
||||
"page allocation failure: order:%u", order);
|
||||
got_pg:
|
||||
if (woke_kswapd)
|
||||
atomic_long_dec(&kswapd_waiters);
|
||||
if (!page)
|
||||
warn_alloc(gfp_mask, ac->nodemask,
|
||||
"page allocation failure: order:%u", order);
|
||||
return page;
|
||||
}
|
||||
|
||||
|
||||
163
mm/vmpressure.c
163
mm/vmpressure.c
@@ -27,22 +27,6 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/vmpressure.h>
|
||||
|
||||
/*
|
||||
* The window size (vmpressure_win) is the number of scanned pages before
|
||||
* we try to analyze scanned/reclaimed ratio. So the window is used as a
|
||||
* rate-limit tunable for the "low" level notification, and also for
|
||||
* averaging the ratio for medium/critical levels. Using small window
|
||||
* sizes can cause lot of false positives, but too big window size will
|
||||
* delay the notifications.
|
||||
*
|
||||
* As the vmscan reclaimer logic works with chunks which are multiple of
|
||||
* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
|
||||
*
|
||||
* TODO: Make the window size depend on machine size, as we do for vmstat
|
||||
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
|
||||
*/
|
||||
static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
|
||||
|
||||
/*
|
||||
* These thresholds are used when we account memory pressure through
|
||||
* scanned/reclaimed ratio. The current values were chosen empirically. In
|
||||
@@ -271,26 +255,35 @@ static void vmpressure_work_fn(struct work_struct *work)
|
||||
} while ((vmpr = vmpressure_parent(vmpr)));
|
||||
}
|
||||
|
||||
static unsigned long calculate_vmpressure_win(void)
|
||||
{
|
||||
long x;
|
||||
|
||||
x = global_node_page_state(NR_FILE_PAGES) -
|
||||
global_node_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages() +
|
||||
global_zone_page_state(NR_FREE_PAGES);
|
||||
if (x < 1)
|
||||
return 1;
|
||||
/*
|
||||
* For low (free + cached), vmpressure window should be
|
||||
* small, and high for higher values of (free + cached).
|
||||
* But it should not be linear as well. This ensures
|
||||
* timely vmpressure notifications when system is under
|
||||
* memory pressure, and optimal number of events when
|
||||
 * cached is high. The square root function is empirically
|
||||
* found to serve the purpose.
|
||||
*/
|
||||
return int_sqrt(x);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
|
||||
|
||||
/*
|
||||
* Here we only want to account pressure that userland is able to
|
||||
* help us with. For example, suppose that DMA zone is under
|
||||
* pressure; if we notify userland about that kind of pressure,
|
||||
* then it will be mostly a waste as it will trigger unnecessary
|
||||
* freeing of memory by userland (since userland is more likely to
|
||||
* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
|
||||
* is why we include only movable, highmem and FS/IO pages.
|
||||
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
|
||||
* we account it too.
|
||||
*/
|
||||
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If we got here with no pages scanned, then that is an indicator
|
||||
* that reclaimer was unable to find any shrinkable LRUs at the
|
||||
@@ -299,7 +292,9 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
* (scanning depth) goes too high (deep), we will be notified
|
||||
* through vmpressure_prio(). But so far, keep calm.
|
||||
*/
|
||||
if (!scanned)
|
||||
if (critical)
|
||||
scanned = calculate_vmpressure_win();
|
||||
else if (!scanned)
|
||||
return;
|
||||
|
||||
if (tree) {
|
||||
@@ -308,7 +303,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
vmpr->tree_reclaimed += reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (scanned < vmpressure_win)
|
||||
if (!critical && scanned < calculate_vmpressure_win())
|
||||
return;
|
||||
schedule_work(&vmpr->work);
|
||||
} else {
|
||||
@@ -322,7 +317,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
scanned = vmpr->scanned += scanned;
|
||||
reclaimed = vmpr->reclaimed += reclaimed;
|
||||
if (scanned < vmpressure_win) {
|
||||
if (!critical && scanned < calculate_vmpressure_win()) {
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
return;
|
||||
}
|
||||
@@ -346,65 +341,37 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
}
|
||||
}
|
||||
#else
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
{
|
||||
}
|
||||
static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed) { }
|
||||
#endif
|
||||
|
||||
static void calculate_vmpressure_win(void)
|
||||
{
|
||||
long x;
|
||||
|
||||
x = global_node_page_state(NR_FILE_PAGES) -
|
||||
global_node_page_state(NR_SHMEM) -
|
||||
total_swapcache_pages() +
|
||||
global_zone_page_state(NR_FREE_PAGES);
|
||||
if (x < 1)
|
||||
x = 1;
|
||||
/*
|
||||
* For low (free + cached), vmpressure window should be
|
||||
* small, and high for higher values of (free + cached).
|
||||
* But it should not be linear as well. This ensures
|
||||
* timely vmpressure notifications when system is under
|
||||
* memory pressure, and optimal number of events when
|
||||
 * cached is high. The square root function is empirically
|
||||
* found to serve the purpose.
|
||||
*/
|
||||
x = int_sqrt(x);
|
||||
vmpressure_win = x;
|
||||
}
|
||||
|
||||
static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
struct vmpressure *vmpr = &global_vmpressure;
|
||||
unsigned long pressure;
|
||||
unsigned long stall;
|
||||
|
||||
if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
|
||||
return;
|
||||
if (critical)
|
||||
scanned = calculate_vmpressure_win();
|
||||
|
||||
if (!scanned)
|
||||
return;
|
||||
if (scanned) {
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
vmpr->scanned += scanned;
|
||||
vmpr->reclaimed += reclaimed;
|
||||
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
if (!vmpr->scanned)
|
||||
calculate_vmpressure_win();
|
||||
if (!current_is_kswapd())
|
||||
vmpr->stall += scanned;
|
||||
|
||||
vmpr->scanned += scanned;
|
||||
vmpr->reclaimed += reclaimed;
|
||||
stall = vmpr->stall;
|
||||
scanned = vmpr->scanned;
|
||||
reclaimed = vmpr->reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (!current_is_kswapd())
|
||||
vmpr->stall += scanned;
|
||||
|
||||
stall = vmpr->stall;
|
||||
scanned = vmpr->scanned;
|
||||
reclaimed = vmpr->reclaimed;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
if (scanned < vmpressure_win)
|
||||
return;
|
||||
if (!critical && scanned < calculate_vmpressure_win())
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
vmpr->scanned = 0;
|
||||
@@ -412,11 +379,26 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
vmpr->stall = 0;
|
||||
spin_unlock(&vmpr->sr_lock);
|
||||
|
||||
pressure = vmpressure_calc_pressure(scanned, reclaimed);
|
||||
pressure = vmpressure_account_stall(pressure, stall, scanned);
|
||||
if (scanned) {
|
||||
pressure = vmpressure_calc_pressure(scanned, reclaimed);
|
||||
pressure = vmpressure_account_stall(pressure, stall, scanned);
|
||||
} else {
|
||||
pressure = 100;
|
||||
}
|
||||
vmpressure_notify(pressure);
|
||||
}
|
||||
|
||||
static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
|
||||
bool tree, unsigned long scanned,
|
||||
unsigned long reclaimed)
|
||||
{
|
||||
if (!memcg && tree)
|
||||
vmpressure_global(gfp, scanned, critical, reclaimed);
|
||||
|
||||
if (IS_ENABLED(CONFIG_MEMCG))
|
||||
vmpressure_memcg(gfp, memcg, critical, tree, scanned, reclaimed);
|
||||
}
|
||||
|
||||
/**
|
||||
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
|
||||
* @gfp: reclaimer's gfp mask
|
||||
@@ -439,13 +421,12 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
|
||||
* This function does not return any value.
|
||||
*/
|
||||
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
|
||||
unsigned long scanned, unsigned long reclaimed)
|
||||
unsigned long scanned, unsigned long reclaimed, int order)
|
||||
{
|
||||
if (!memcg && tree)
|
||||
vmpressure_global(gfp, scanned, reclaimed);
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER)
|
||||
return;
|
||||
|
||||
if (IS_ENABLED(CONFIG_MEMCG))
|
||||
vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
|
||||
__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -475,7 +456,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
|
||||
* to the vmpressure() basically means that we signal 'critical'
|
||||
* level.
|
||||
*/
|
||||
vmpressure(gfp, memcg, true, vmpressure_win, 0);
|
||||
__vmpressure(gfp, memcg, true, true, 0, 0);
|
||||
}
|
||||
|
||||
static enum vmpressure_levels str_to_level(const char *arg)
|
||||
|
||||
@@ -2727,7 +2727,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||
/* Record the group's reclaim efficiency */
|
||||
vmpressure(sc->gfp_mask, memcg, false,
|
||||
sc->nr_scanned - scanned,
|
||||
sc->nr_reclaimed - reclaimed);
|
||||
sc->nr_reclaimed - reclaimed, sc->order);
|
||||
|
||||
/*
|
||||
* Direct reclaim and kswapd have to scan all memory
|
||||
@@ -2760,7 +2760,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
|
||||
*/
|
||||
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
|
||||
sc->nr_scanned - nr_scanned,
|
||||
sc->nr_reclaimed - nr_reclaimed);
|
||||
sc->nr_reclaimed - nr_reclaimed, sc->order);
|
||||
|
||||
if (reclaim_state) {
|
||||
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
|
||||
@@ -3506,7 +3506,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
|
||||
wake_up_all(&pgdat->pfmemalloc_wait);
|
||||
|
||||
/* Check if kswapd should be suspending */
|
||||
if (try_to_freeze() || kthread_should_stop())
|
||||
if (try_to_freeze() || kthread_should_stop() ||
|
||||
!atomic_long_read(&kswapd_waiters))
|
||||
break;
|
||||
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user