diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 3a90d51d8419..a1793579f159 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -54,6 +54,39 @@ config ANDROID_BINDER_IPC_SELFTEST
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
+	---help---
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. Processes are killed according to the priorities
+	  that Android gives them, so that the least important processes are
+	  always killed first. Processes are killed until memory deficits are
+	  satisfied, as observed from kswapd struggling to free up pages. Simple
+	  LMK stops killing processes when kswapd finally goes back to sleep.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	range 8 512
+	default 128
+	help
+	  Simple LMK will try to free at least this much memory per reclaim.
+
+config ANDROID_SIMPLE_LMK_TIMEOUT_MSEC
+	int "Reclaim timeout in milliseconds"
+	range 50 1000
+	default 200
+	help
+	  Simple LMK tries to wait until all of the victims it kills have their
+	  memory freed; however, sometimes victims can take a while to die,
+	  which can block Simple LMK from killing more processes in time when
+	  needed. After the specified timeout elapses, Simple LMK will stop
+	  waiting and make itself available to kill more processes.
+
+endif
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index c7856e3200da..7c91293b6d59 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -3,3 +3,4 @@ ccflags-y += -I$(src)			# needed for trace events
 obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
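
A quick illustration (userspace C, not part of the patch) of what the MINFREE default means in practice: the MIN_FREE_PAGES macro in simple_lmk.c below converts the Kconfig value from MiB into a page count.

#include <stdio.h>

/* Illustration only: the arithmetic behind MIN_FREE_PAGES, assuming the
 * Kconfig default of 128 MiB and a typical 4 KiB PAGE_SIZE. */
int main(void)
{
	const long minfree_mib = 128;	/* CONFIG_ANDROID_SIMPLE_LMK_MINFREE */
	const long sz_1m = 1024 * 1024;	/* SZ_1M */
	const long page_size = 4096;	/* PAGE_SIZE */

	/* Same expression as the MIN_FREE_PAGES macro below */
	printf("%ld pages per reclaim\n", minfree_mib * sz_1m / page_size);
	return 0;	/* prints: 32768 pages per reclaim */
}
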
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
new file mode 100644
index 000000000000..b0bffb991aa3
--- /dev/null
+++ b/drivers/android/simple_lmk.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/oom.h>
+#include <linux/sort.h>
+#include <linux/vmpressure.h>
+#include <uapi/linux/sched/types.h>
+
+/* The minimum number of pages to free per reclaim */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/* Kill up to this many victims per reclaim */
+#define MAX_VICTIMS 1024
+
+/* Timeout in jiffies for each reclaim */
+#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC)
+
+struct victim_info {
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	unsigned long size;
+};
+
+/* Pulled from the Android framework. Lower adj means higher priority. */
+static const unsigned short adjs[] = {
+	SHRT_MAX + 1, /* Include all positive adjs in the final range */
+	950, /* CACHED_APP_LMK_FIRST_ADJ */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	250, /* PERCEPTIBLE_LOW_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */
+	0 /* FOREGROUND_APP_ADJ */
+};
+
+static struct victim_info victims[MAX_VICTIMS];
+static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+static DECLARE_COMPLETION(reclaim_done);
+static DEFINE_RWLOCK(mm_free_lock);
+static int nr_victims;
+static atomic_t needs_reclaim = ATOMIC_INIT(0);
+static atomic_t nr_killed = ATOMIC_INIT(0);
+
+static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr)
+{
+	const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
+	const struct victim_info *rhs = (typeof(rhs))rhs_ptr;
+
+	return rhs->size - lhs->size;
+}
+
+static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk)
+{
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		if (same_thread_group(victims[i].tsk, vtsk))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned long get_total_mm_pages(struct mm_struct *mm)
+{
+	unsigned long pages = 0;
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++)
+		pages += get_mm_counter(mm, i);
+
+	return pages;
+}
+
+static unsigned long find_victims(int *vindex, unsigned short target_adj_min,
+				  unsigned short target_adj_max)
+{
+	unsigned long pages_found = 0;
+	int old_vindex = *vindex;
+	struct task_struct *tsk;
+
+	for_each_process(tsk) {
+		struct signal_struct *sig;
+		struct task_struct *vtsk;
+		short adj;
+
+		/*
+		 * Search for suitable tasks with the targeted importance (adj).
+		 * Since only tasks with a positive adj can be targeted, that
+		 * naturally excludes tasks which shouldn't be killed, like init
+		 * and kthreads. Although oom_score_adj can still be changed
+		 * while this code runs, it doesn't really matter. We just need
+		 * to make sure that if the adj changes, we won't deadlock
+		 * trying to lock a task that we locked earlier.
+		 */
+		sig = tsk->signal;
+		adj = READ_ONCE(sig->oom_score_adj);
+		if (adj < target_adj_min || adj > target_adj_max - 1 ||
+		    sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) ||
+		    (thread_group_empty(tsk) && tsk->flags & PF_EXITING) ||
+		    vtsk_is_duplicate(*vindex, tsk))
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Store this potential victim away for later */
+		victims[*vindex].tsk = vtsk;
+		victims[*vindex].mm = vtsk->mm;
+		victims[*vindex].size = get_total_mm_pages(vtsk->mm);
+
+		/* Keep track of the number of pages that have been found */
+		pages_found += victims[*vindex].size;
+
+		/* Make sure there's space left in the victim array */
+		if (++*vindex == MAX_VICTIMS)
+			break;
+	}
+
+	/*
+	 * Sort the victims in descending order of size to prioritize killing
+	 * the larger ones first.
+	 */
+	if (pages_found)
+		sort(&victims[old_vindex], *vindex - old_vindex,
+		     sizeof(*victims), victim_size_cmp, NULL);
+
+	return pages_found;
+}
+
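For reference, the adjs[] entries above pair up into the inclusive adj bands that scan_and_kill() further below feeds to find_victims(), walked from least to most important. A standalone sketch of the resulting ranges (illustrative, not part of the patch):

#include <limits.h>
#include <stdio.h>

/* Illustration only: the adj bands produced by the scan_and_kill() loop,
 * matching the "adj >= min && adj <= max - 1" check in find_victims(). */
int main(void)
{
	const int adjs[] = { SHRT_MAX + 1, 950, 900, 800, 700, 600, 500,
			     400, 300, 250, 200, 100, 50, 0 };
	int i;

	for (i = 1; i < (int)(sizeof(adjs) / sizeof(adjs[0])); i++)
		printf("band %2d: adj %4d..%5d\n", i, adjs[i], adjs[i - 1] - 1);
	return 0;	/* band 1 covers adj 950..32767, band 13 covers 0..49 */
}
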
+static int process_victims(int vlen, unsigned long pages_needed)
+{
+	unsigned long pages_found = 0;
+	int i, nr_to_kill = 0;
+
+	/*
+	 * Calculate the number of tasks that need to be killed and quickly
+	 * release the references to those that'll live.
+	 */
+	for (i = 0; i < vlen; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* The victim's mm lock is taken in find_victims; release it */
+		if (pages_found >= pages_needed) {
+			task_unlock(vtsk);
+		} else {
+			pages_found += victim->size;
+			nr_to_kill++;
+		}
+	}
+
+	return nr_to_kill;
+}
+
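A worked example of the two process_victims() passes that scan_and_kill() below drives, with hypothetical victim sizes (illustrative, not part of the patch): the size sort between the passes lets one large victim substitute for several small ones.

#include <stdio.h>

/* Worked example of the two selection rounds, with hypothetical victim
 * sizes (in pages) listed in the adj-band order they were found. */
int main(void)
{
	const long round1[] = { 30, 20, 80, 50 };	/* discovery order */
	const long round2[] = { 80, 30, 20 };		/* survivors, size-sorted */
	const long needed = 100;
	long found;
	int i, kill;

	/* Round 1: keep victims in discovery order until the goal is met */
	for (i = 0, found = 0, kill = 0; i < 4 && found < needed; i++) {
		found += round1[i];	/* the 50-page victim gets released */
		kill++;
	}
	printf("round 1: %d victims\n", kill);	/* 3 */

	/* Round 2: after the size sort, 80 + 30 already covers the goal */
	for (i = 0, found = 0, kill = 0; i < 3 && found < needed; i++) {
		found += round2[i];	/* the 20-page victim gets released */
		kill++;
	}
	printf("round 2: %d victims\n", kill);	/* 2 */
	return 0;
}
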
+static void scan_and_kill(unsigned long pages_needed)
+{
+	int i, nr_to_kill = 0, nr_found = 0;
+	unsigned long pages_found = 0;
+
+	/* Hold an RCU read lock while traversing the global process list */
+	rcu_read_lock();
+	for (i = 1; i < ARRAY_SIZE(adjs); i++) {
+		pages_found += find_victims(&nr_found, adjs[i], adjs[i - 1]);
+		if (pages_found >= pages_needed || nr_found == MAX_VICTIMS)
+			break;
+	}
+	rcu_read_unlock();
+
+	/* Pretty unlikely but it can happen */
+	if (unlikely(!nr_found)) {
+		pr_err("No processes available to kill!\n");
+		return;
+	}
+
+	/* First round of victim processing to weed out unneeded victims */
+	nr_to_kill = process_victims(nr_found, pages_needed);
+
+	/*
+	 * Try to kill as few of the chosen victims as possible by sorting the
+	 * chosen victims by size, which means larger victims that have a lower
+	 * adj can be killed in place of smaller victims with a high adj.
+	 */
+	sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL);
+
+	/* Second round of victim processing to finally select the victims */
+	nr_to_kill = process_victims(nr_to_kill, pages_needed);
+
+	/* Store the final number of victims for simple_lmk_mm_freed() */
+	write_lock(&mm_free_lock);
+	nr_victims = nr_to_kill;
+	write_unlock(&mm_free_lock);
+
+	/* Kill the victims */
+	for (i = 0; i < nr_to_kill; i++) {
+		static const struct sched_param sched_zero_prio;
+		struct victim_info *victim = &victims[i];
+		struct task_struct *t, *vtsk = victim->tsk;
+
+		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
+			vtsk->signal->oom_score_adj,
+			victim->size << (PAGE_SHIFT - 10));
+
+		/* Accelerate the victim's death by forcing the kill signal */
+		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);
+
+		/* Mark the thread group dead so that other kernel code knows */
+		rcu_read_lock();
+		for_each_thread(vtsk, t)
+			set_tsk_thread_flag(t, TIF_MEMDIE);
+		rcu_read_unlock();
+
+		/* Elevate the victim to SCHED_RR with zero RT priority */
+		sched_setscheduler_nocheck(vtsk, SCHED_RR, &sched_zero_prio);
+
+		/* Allow the victim to run on any CPU. This won't schedule. */
+		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
+
+		/* Finally release the victim's task lock acquired earlier */
+		task_unlock(vtsk);
+	}
+
+	/* Wait until all the victims die or until the timeout is reached */
+	wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES);
+
+	write_lock(&mm_free_lock);
+	reinit_completion(&reclaim_done);
+	nr_victims = 0;
+	nr_killed = (atomic_t)ATOMIC_INIT(0);
+	write_unlock(&mm_free_lock);
+}
+
+static int simple_lmk_reclaim_thread(void *data)
+{
+	static const struct sched_param sched_max_rt_prio = {
+		.sched_priority = MAX_RT_PRIO - 1
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
+
+	while (1) {
+		wait_event(oom_waitq, atomic_read(&needs_reclaim));
+		scan_and_kill(MIN_FREE_PAGES);
+		atomic_set_release(&needs_reclaim, 0);
+	}
+
+	return 0;
+}
+
+void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+	int i;
+
+	read_lock(&mm_free_lock);
+	for (i = 0; i < nr_victims; i++) {
+		if (victims[i].mm == mm) {
+			victims[i].mm = NULL;
+			if (atomic_inc_return_relaxed(&nr_killed) == nr_victims)
+				complete(&reclaim_done);
+			break;
+		}
+	}
+	read_unlock(&mm_free_lock);
+}
+
+static int simple_lmk_vmpressure_cb(struct notifier_block *nb,
+				    unsigned long pressure, void *data)
+{
+	if (pressure == 100 && !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1))
+		wake_up(&oom_waitq);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block vmpressure_notif = {
+	.notifier_call = simple_lmk_vmpressure_cb,
+	.priority = INT_MAX
+};
+
+/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	static atomic_t init_done = ATOMIC_INIT(0);
+	struct task_struct *thread;
+
+	if (!atomic_cmpxchg(&init_done, 0, 1)) {
+		thread = kthread_run(simple_lmk_reclaim_thread, NULL,
+				     "simple_lmkd");
+		BUG_ON(IS_ERR(thread));
+		BUG_ON(vmpressure_notifier_register(&vmpressure_notif));
+	}
+
+	return 0;
+}
+
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
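
The module_param_cb() trick at the end of the file means initialization is driven from userspace: Android's lmkd writes the familiar lowmemorykiller.minfree parameter on boot, and the first such write spawns the reclaim thread. A hypothetical userspace trigger, for illustration only:

#include <fcntl.h>
#include <unistd.h>

/* Hypothetical illustration: the first write to the "lowmemorykiller.minfree"
 * parameter is what brings Simple LMK up. The written value itself is
 * ignored by simple_lmk_init_set() above. */
int main(void)
{
	int fd = open("/sys/module/lowmemorykiller/parameters/minfree",
		      O_WRONLY);

	if (fd < 0)
		return 1;
	(void)write(fd, "0", 1);
	close(fd);
	return 0;
}
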
diff --git a/include/linux/oom.h b/include/linux/oom.h
index c9ad7f044997..013c7544d1cc 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -63,7 +63,11 @@ static inline bool oom_task_origin(const struct task_struct *p)
 
 static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 {
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	return test_ti_thread_flag(task_thread_info(tsk), TIF_MEMDIE);
+#else
 	return tsk->signal->oom_mm;
+#endif
 }
 
 /*
diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
new file mode 100644
index 000000000000..b02d1bec9731
--- /dev/null
+++ b/include/linux/simple_lmk.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019-2020 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+#ifndef _SIMPLE_LMK_H_
+#define _SIMPLE_LMK_H_
+
+struct mm_struct;
+
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+void simple_lmk_mm_freed(struct mm_struct *mm);
+#else
+static inline void simple_lmk_mm_freed(struct mm_struct *mm)
+{
+}
+#endif
+
+#endif /* _SIMPLE_LMK_H_ */
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index de86c6b946c7..31bffa69e864 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -33,7 +33,8 @@ struct mem_cgroup;
 extern int vmpressure_notifier_register(struct notifier_block *nb);
 extern int vmpressure_notifier_unregister(struct notifier_block *nb);
 extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
-		       unsigned long scanned, unsigned long reclaimed);
+		       unsigned long scanned, unsigned long reclaimed,
+		       int order);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 
 #ifdef CONFIG_MEMCG
diff --git a/kernel/exit.c b/kernel/exit.c
index 86e0fe725978..aba43d5fa859 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -545,8 +545,12 @@ static void exit_mm(void)
 	mm_update_next_owner(mm);
 	mm_released = mmput(mm);
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	clear_thread_flag(TIF_MEMDIE);
+#else
 	if (test_thread_flag(TIF_MEMDIE))
 		exit_oom_victim();
+#endif
 	if (mm_released)
 		set_tsk_thread_flag(current, TIF_MM_RELEASED);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index a512bbb75b7e..2e3225a6d286 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
 #include
 #include
 #include
+#include <linux/simple_lmk.h>
 #include
 #include
 
@@ -949,6 +950,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	simple_lmk_mm_freed(mm);
 	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
diff --git a/mm/internal.h b/mm/internal.h
index 7a9fe8fd1558..7cc88ece2204 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -187,6 +187,7 @@ extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
+extern atomic_long_t kswapd_waiters;
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4f9593a2ac47..5a8f4347c072 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1001,7 +1001,7 @@ bool out_of_memory(struct oom_control *oc)
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
-	if (oom_killer_disabled)
+	if (oom_killer_disabled || IS_ENABLED(CONFIG_ANDROID_SIMPLE_LMK))
 		return false;
 
 	if (try_online_one_block(numa_node_id())) {
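
One detail worth calling out in the kernel/fork.c hunk above: the hook runs after exit_mmap(), so a victim is only counted as freed once its pages are genuinely back in the allocator. A condensed sketch of the ordering (not literal kernel code):

/* Sketch of the __mmput() tail after this patch (condensed, not literal): */
static inline void __mmput_tail_sketch(struct mm_struct *mm)
{
	exit_mmap(mm);			/* tears down the victim's address space */
	simple_lmk_mm_freed(mm);	/* now safe to count toward reclaim_done */
}
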
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c0ae9336d18a..19a8f3bd276a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -77,6 +77,8 @@
 #include
 #include
 #include "internal.h"
 
+atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
@@ -4067,6 +4069,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int no_progress_loops;
 	unsigned int cpuset_mems_cookie;
 	int reserve_flags;
+	bool woke_kswapd = false;
 
 	/*
 	 * We also sanity check to catch abuse of atomic reserves being used by
@@ -4100,8 +4103,13 @@ retry_cpuset:
 	if (!ac->preferred_zoneref->zone)
 		goto nopage;
 
-	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM) {
+		if (!woke_kswapd) {
+			atomic_long_inc(&kswapd_waiters);
+			woke_kswapd = true;
+		}
 		wake_all_kswapds(order, ac);
+	}
 
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4249,8 +4257,10 @@ retry:
 	/* Avoid allocations with no watermarks from looping endlessly */
 	if (tsk_is_oom_victim(current) &&
 	    (alloc_flags == ALLOC_OOM ||
-	     (gfp_mask & __GFP_NOMEMALLOC)))
+	     (gfp_mask & __GFP_NOMEMALLOC))) {
+		gfp_mask |= __GFP_NOWARN;
 		goto nopage;
+	}
 
 	/* Retry as long as the OOM killer is making progress */
 	if (did_some_progress) {
@@ -4304,9 +4314,12 @@ nopage:
 		goto retry;
 	}
 fail:
-	warn_alloc(gfp_mask, ac->nodemask,
-			"page allocation failure: order:%u", order);
 got_pg:
+	if (woke_kswapd)
+		atomic_long_dec(&kswapd_waiters);
+	if (!page)
+		warn_alloc(gfp_mask, ac->nodemask,
+			   "page allocation failure: order:%u", order);
 	return page;
 }
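
A simplified restatement of the slowpath change above (not literal kernel code; try_reclaim_and_allocate() is a hypothetical placeholder): the waiter count brackets the entire slowpath attempt, so kswapd_waiters is nonzero exactly while some allocator is still struggling, which is the signal kswapd keys off in mm/vmscan.c below.

/* Simplified sketch of the new waiter accounting (assumptions noted above). */
static struct page *slowpath_sketch(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	atomic_long_inc(&kswapd_waiters);	/* kswapd gains a customer */
	page = try_reclaim_and_allocate(gfp_mask, order);
	atomic_long_dec(&kswapd_waiters);	/* satisfied or failed, either way */

	return page;
}
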
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 679fe3020b77..9c2c9d08718e 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -27,22 +27,6 @@
 #include
 #include
 
-/*
- * The window size (vmpressure_win) is the number of scanned pages before
- * we try to analyze scanned/reclaimed ratio. So the window is used as a
- * rate-limit tunable for the "low" level notification, and also for
- * averaging the ratio for medium/critical levels. Using small window
- * sizes can cause lot of false positives, but too big window size will
- * delay the notifications.
- *
- * As the vmscan reclaimer logic works with chunks which are multiple of
- * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
- *
- * TODO: Make the window size depend on machine size, as we do for vmstat
- * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
- */
-static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
-
 /*
  * These thresholds are used when we account memory pressure through
  * scanned/reclaimed ratio. The current values were chosen empirically. In
@@ -271,26 +255,35 @@ static void vmpressure_work_fn(struct work_struct *work)
 	} while ((vmpr = vmpressure_parent(vmpr)));
 }
 
+static unsigned long calculate_vmpressure_win(void)
+{
+	long x;
+
+	x = global_node_page_state(NR_FILE_PAGES) -
+	    global_node_page_state(NR_SHMEM) -
+	    total_swapcache_pages() +
+	    global_zone_page_state(NR_FREE_PAGES);
+	if (x < 1)
+		return 1;
+	/*
+	 * For low (free + cached), vmpressure window should be
+	 * small, and high for higher values of (free + cached).
+	 * But it should not be linear as well. This ensures
+	 * timely vmpressure notifications when system is under
+	 * memory pressure, and optimal number of events when
+	 * cached is high. The square root function is empirically
+	 * found to serve the purpose.
+	 */
+	return int_sqrt(x);
+}
+
 #ifdef CONFIG_MEMCG
-static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
-			     unsigned long scanned, unsigned long reclaimed)
+static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
+			     bool tree, unsigned long scanned,
+			     unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
 
-	/*
-	 * Here we only want to account pressure that userland is able to
-	 * help us with. For example, suppose that DMA zone is under
-	 * pressure; if we notify userland about that kind of pressure,
-	 * then it will be mostly a waste as it will trigger unnecessary
-	 * freeing of memory by userland (since userland is more likely to
-	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
-	 * is why we include only movable, highmem and FS/IO pages.
-	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
-	 * we account it too.
-	 */
-	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
-		return;
-
 	/*
 	 * If we got here with no pages scanned, then that is an indicator
 	 * that reclaimer was unable to find any shrinkable LRUs at the
@@ -299,7 +292,9 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 	 * (scanning depth) goes too high (deep), we will be notified
 	 * through vmpressure_prio(). But so far, keep calm.
 	 */
-	if (!scanned)
+	if (critical)
+		scanned = calculate_vmpressure_win();
+	else if (!scanned)
 		return;
 
 	if (tree) {
@@ -308,7 +303,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		vmpr->tree_reclaimed += reclaimed;
 		spin_unlock(&vmpr->sr_lock);
 
-		if (scanned < vmpressure_win)
+		if (!critical && scanned < calculate_vmpressure_win())
 			return;
 		schedule_work(&vmpr->work);
 	} else {
@@ -322,7 +317,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		spin_lock(&vmpr->sr_lock);
 		scanned = vmpr->scanned += scanned;
 		reclaimed = vmpr->reclaimed += reclaimed;
-		if (scanned < vmpressure_win) {
+		if (!critical && scanned < calculate_vmpressure_win()) {
 			spin_unlock(&vmpr->sr_lock);
 			return;
 		}
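
Some example window sizes produced by calculate_vmpressure_win() above (illustrative userspace C, not part of the patch; build with -lm; the kernel's int_sqrt() floors just as the cast does here):

#include <math.h>
#include <stdio.h>

/* Illustration only: notification windows for a few free+cached totals,
 * assuming 4 KiB pages. */
int main(void)
{
	const long pages[] = { 16384, 131072, 1048576 };  /* 64 MiB, 512 MiB, 4 GiB */
	int i;

	for (i = 0; i < 3; i++)
		printf("%8ld pages free+cached -> window %4ld pages\n",
		       pages[i], (long)sqrt((double)pages[i]));
	return 0;	/* windows: 128, 362, 1024 pages */
}
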
@@ -346,65 +341,37 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 	}
 }
 #else
-static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
-			     unsigned long scanned, unsigned long reclaimed)
-{
-}
+static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
+			     bool tree, unsigned long scanned,
+			     unsigned long reclaimed) { }
 #endif
 
-static void calculate_vmpressure_win(void)
-{
-	long x;
-
-	x = global_node_page_state(NR_FILE_PAGES) -
-	    global_node_page_state(NR_SHMEM) -
-	    total_swapcache_pages() +
-	    global_zone_page_state(NR_FREE_PAGES);
-	if (x < 1)
-		x = 1;
-	/*
-	 * For low (free + cached), vmpressure window should be
-	 * small, and high for higher values of (free + cached).
-	 * But it should not be linear as well. This ensures
-	 * timely vmpressure notifications when system is under
-	 * memory pressure, and optimal number of events when
-	 * cached is high. The sqaure root function is empirically
-	 * found to serve the purpose.
-	 */
-	x = int_sqrt(x);
-	vmpressure_win = x;
-}
-
-static void vmpressure_global(gfp_t gfp, unsigned long scanned,
-			      unsigned long reclaimed)
+static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
+			      unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = &global_vmpressure;
 	unsigned long pressure;
 	unsigned long stall;
 
-	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
-		return;
+	if (critical)
+		scanned = calculate_vmpressure_win();
 
-	if (!scanned)
-		return;
+	if (scanned) {
+		spin_lock(&vmpr->sr_lock);
+		vmpr->scanned += scanned;
+		vmpr->reclaimed += reclaimed;
 
-	spin_lock(&vmpr->sr_lock);
-	if (!vmpr->scanned)
-		calculate_vmpressure_win();
+		if (!current_is_kswapd())
+			vmpr->stall += scanned;
 
-	vmpr->scanned += scanned;
-	vmpr->reclaimed += reclaimed;
+		stall = vmpr->stall;
+		scanned = vmpr->scanned;
+		reclaimed = vmpr->reclaimed;
+		spin_unlock(&vmpr->sr_lock);
 
-	if (!current_is_kswapd())
-		vmpr->stall += scanned;
-
-	stall = vmpr->stall;
-	scanned = vmpr->scanned;
-	reclaimed = vmpr->reclaimed;
-	spin_unlock(&vmpr->sr_lock);
-
-	if (scanned < vmpressure_win)
-		return;
+		if (!critical && scanned < calculate_vmpressure_win())
+			return;
+	}
 
 	spin_lock(&vmpr->sr_lock);
 	vmpr->scanned = 0;
@@ -412,11 +379,26 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
 	vmpr->stall = 0;
 	spin_unlock(&vmpr->sr_lock);
 
-	pressure = vmpressure_calc_pressure(scanned, reclaimed);
-	pressure = vmpressure_account_stall(pressure, stall, scanned);
+	if (scanned) {
+		pressure = vmpressure_calc_pressure(scanned, reclaimed);
+		pressure = vmpressure_account_stall(pressure, stall, scanned);
+	} else {
+		pressure = 100;
+	}
 	vmpressure_notify(pressure);
 }
 
+static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
+			 bool tree, unsigned long scanned,
+			 unsigned long reclaimed)
+{
+	if (!memcg && tree)
+		vmpressure_global(gfp, scanned, critical, reclaimed);
+
+	if (IS_ENABLED(CONFIG_MEMCG))
+		vmpressure_memcg(gfp, memcg, critical, tree, scanned, reclaimed);
+}
+
 /**
  * vmpressure() - Account memory pressure through scanned/reclaimed ratio
  * @gfp: reclaimer's gfp mask
@@ -439,13 +421,12 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned,
  * This function does not return any value.
  */
 void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
-		unsigned long scanned, unsigned long reclaimed)
+		unsigned long scanned, unsigned long reclaimed, int order)
 {
-	if (!memcg && tree)
-		vmpressure_global(gfp, scanned, reclaimed);
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		return;
 
-	if (IS_ENABLED(CONFIG_MEMCG))
-		vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed);
+	__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
 }
 
 /**
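
The new order check above gates pressure accounting to non-costly allocations. PAGE_ALLOC_COSTLY_ORDER is 3 in mainline, so with 4 KiB pages only reclaim on behalf of requests up to 32 KiB feeds vmpressure; larger attempts are expected to fail gracefully without triggering kills. An illustrative table generator (not part of the patch):

#include <stdio.h>

/* Which allocation orders still feed vmpressure() after the gate above.
 * Assumes PAGE_ALLOC_COSTLY_ORDER == 3 (mainline) and 4 KiB pages. */
int main(void)
{
	const int costly_order = 3;
	int order;

	for (order = 0; order <= 5; order++)
		printf("order %d (%3d KiB): %s\n", order, 4 << order,
		       order > costly_order ? "ignored" : "accounted");
	return 0;
}
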
@@ -475,7 +456,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
 	 * to the vmpressure() basically means that we signal 'critical'
 	 * level.
 	 */
-	vmpressure(gfp, memcg, true, vmpressure_win, 0);
+	__vmpressure(gfp, memcg, true, true, 0, 0);
 }
 
 static enum vmpressure_levels str_to_level(const char *arg)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5cddeb373d8b..59caf095d169 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2727,7 +2727,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
 				   sc->nr_scanned - scanned,
-				   sc->nr_reclaimed - reclaimed);
+				   sc->nr_reclaimed - reclaimed, sc->order);
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2760,7 +2760,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
 			   sc->nr_scanned - nr_scanned,
-			   sc->nr_reclaimed - nr_reclaimed);
+			   sc->nr_reclaimed - nr_reclaimed, sc->order);
 
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -3506,7 +3506,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
-		if (try_to_freeze() || kthread_should_stop())
+		if (try_to_freeze() || kthread_should_stop() ||
+		    !atomic_long_read(&kswapd_waiters))
 			break;
 
 		/*
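
Putting the pieces together, the intended end-to-end flow (a summary reading of the hunks above, not literal kernel code):

/*
 * 1. An allocation enters the slowpath and bumps kswapd_waiters
 *    (mm/page_alloc.c), which keeps kswapd reclaiming (mm/vmscan.c).
 * 2. When reclaim struggles, vmpressure_prio() reports through
 *    __vmpressure(..., critical = true, ...) with a full scan window and
 *    no additional reclaimed pages, driving the computed pressure to 100
 *    (mm/vmpressure.c).
 * 3. simple_lmk_vmpressure_cb() sees pressure == 100, sets needs_reclaim,
 *    and wakes simple_lmkd, which kills victims from the least important
 *    adj band upward until MIN_FREE_PAGES is covered.
 * 4. As each victim's mm is torn down, simple_lmk_mm_freed() (hooked in
 *    kernel/fork.c) counts it; once all victims are dead, or the
 *    configured timeout expires, Simple LMK rearms for the next reclaim.
 */
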