From 82eb69d7ff827a7c1726044e8c4134837280e592 Mon Sep 17 00:00:00 2001 From: Alistair Delva Date: Mon, 23 Dec 2019 10:45:42 -0800 Subject: [PATCH 01/44] ANDROID: cuttlefish_defconfig: Disable TRANSPARENT_HUGEPAGE Fix conflict between jemalloc/scudo and MADV_NOHUGEPAGE by disabling the transparent hugepage support. It has also been suggested that this feature can make VM behavior less predictable. Bug: 131119917 Change-Id: I17556838fbf1f893e26c5658ee95b4e3b16b10ad Signed-off-by: Alistair Delva --- arch/arm64/configs/cuttlefish_defconfig | 1 - arch/x86/configs/x86_64_cuttlefish_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arm64/configs/cuttlefish_defconfig b/arch/arm64/configs/cuttlefish_defconfig index 01f5932d3574..55ccc9ad1b40 100644 --- a/arch/arm64/configs/cuttlefish_defconfig +++ b/arch/arm64/configs/cuttlefish_defconfig @@ -49,7 +49,6 @@ CONFIG_PREEMPT=y CONFIG_HZ_100=y # CONFIG_SPARSEMEM_VMEMMAP is not set CONFIG_KSM=y -CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_ZSMALLOC=y CONFIG_SECCOMP=y CONFIG_PARAVIRT=y diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 1e863514afbe..041475898a1f 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -55,7 +55,6 @@ CONFIG_X86_MSR=y CONFIG_X86_CPUID=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_ZSMALLOC=y # CONFIG_MTRR is not set CONFIG_HZ_100=y From f340444ffdcd16aef6e49ff0fd3b8bdeea800a77 Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Mon, 30 Dec 2019 10:07:34 +0800 Subject: [PATCH 02/44] ANDROID: drivers base/arch_topology: avoid a stuck risk when cpufreq policy free detect_share_cap_flag() calls cpufreq_cpu_get() to get cpufreq policy for each possible cpu, meanwhile, it also increments the kobject reference count of policy to mark it busy. 
However, a corresponding call to cpufreq_cpu_put() is ignored to decrement the kobject reference count back, which may lead to a potential stuck risk that percpu cpuhp thread deadly waits for dropping of kobject refcount when percpu cpufreq policy free. The call trace of stuck risk could be: cpufreq_online() //If cpufreq initialization failed, goto out_free_policy. ->cpufreq_policy_free() //Do cpufreq_policy free. ->cpufreq_policy_put_kobj() ->kobject_put() //Skip if policy kfref count is not 1. ->cpufreq_sysfs_release() ->complete() //Complete policy->kobj_unregister. ->wait_for_completion() //Wait for policy->kobj_unregister. With this patch, the cpuhp thread can be easily exercised by attempting to force an unbind of the CPUfreq driver. Bug: 120440300 Bug: 147378688 Signed-off-by: chenqiwu Signed-off-by: qiwu chen Signed-off-by: Quentin Perret Change-Id: I976fc697d4090324877b28fdb5bb5a5b63f6d947 --- drivers/base/arch_topology.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 2485f3fe5b31..a39d46a26f97 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -184,20 +184,25 @@ int detect_share_cap_flag(void) cpumask_equal(topology_sibling_cpumask(cpu), policy->related_cpus)) { share_cap_level = share_cap_thread; + cpufreq_cpu_put(policy); continue; } if (cpumask_equal(topology_core_cpumask(cpu), policy->related_cpus)) { share_cap_level = share_cap_core; + cpufreq_cpu_put(policy); continue; } if (cpumask_equal(cpu_cpu_mask(cpu), policy->related_cpus)) { share_cap_level = share_cap_die; + cpufreq_cpu_put(policy); continue; } + + cpufreq_cpu_put(policy); } if (share_cap != share_cap_level) { From e4896fc7e3fa06f4e89e5b97627ad76f3b6d1e0c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 10 Jan 2020 22:00:56 +0100 Subject: [PATCH 03/44] ANDROID: arm64: cpuinfo: fix up 4.14.151 merge There was an incorrect merge in this branch for the cpuinfo file. 
This commit fixes it up. Original patch from Will Deacon. Reported-by: Blagovest Kolenichev Cc: Will Deacon Signed-off-by: Greg Kroah-Hartman Change-Id: I688d3e83cf8c540853c34ed719c535888998a3ad --- arch/arm64/kernel/cpuinfo.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 3f037684e1b1..9ff64e04e63d 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -69,6 +69,11 @@ static const char *const hwcap_str[] = { "fcma", "lrcpc", "dcpop", + "sha3", + "sm3", + "sm4", + "asimddp", + "sha512", "sve", "asimdfhm", "dit", @@ -76,11 +81,6 @@ static const char *const hwcap_str[] = { "ilrcpc", "flagm", "ssbs", - "sha3", - "sm3", - "sm4", - "asimddp", - "sha512", NULL }; From e9e40f4fb82e4a5e40fcfde8f2855fa535fa1be3 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Wed, 22 Jan 2020 11:19:58 +0100 Subject: [PATCH 04/44] ANDROID: selinux: modify RTM_GETLINK permission Map the permission gating RTM_GETLINK messages to a new permission so that it can be distinguished from the other netlink route permissions in selinux policy. This is a temporary Android-only patch that will be deprecated in newer kernels once the long-term solution lands as discussed on the mailing list [1]. The maintainer's recommended solution is more general, much more complex, and likely not suitable for backporting. This patch provides the minimal change needed for Android including the userspace settable trigger which ensures that the permission change is only applied to the newest version of Android which contains the changes needed for userspace compatibility. 
[1]: https://lore.kernel.org/selinux/20200116142653.61738-1-jeffv@google.com/ Bug: 141455849 Bug: 148218425 Test: CtsSelinuxTargetSdkCurrentTestCases Test: atest bionic-unit-tests-static Test: atest NetworkInterfaceTest Test: Connect to Wi-Fi network Test: Set up hotspot Test: Cast from device Test: Pair Bluetooth device Test: Call getifaddrs() directly from within an app. Test: Call NetworkInterface#getNetworkInterfaces() from within an app. Change-Id: I7b44ce60ad98f858c412722d41b9842f8577151f Signed-off-by: Jeff Vander Stoep --- security/selinux/include/classmap.h | 2 +- security/selinux/include/security.h | 2 ++ security/selinux/nlmsgtab.c | 26 +++++++++++++++++++++++++- security/selinux/ss/policydb.c | 4 ++++ security/selinux/ss/policydb.h | 2 ++ security/selinux/ss/services.c | 4 ++++ 6 files changed, 38 insertions(+), 2 deletions(-) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 5ae315ab060b..702bdaf07252 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -115,7 +115,7 @@ struct security_class_mapping secclass_map[] = { { COMMON_IPC_PERMS, NULL } }, { "netlink_route_socket", { COMMON_SOCK_PERMS, - "nlmsg_read", "nlmsg_write", NULL } }, + "nlmsg_read", "nlmsg_write", "nlmsg_readpriv", NULL } }, { "netlink_tcpdiag_socket", { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } }, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 02f0412d42f2..45cc615fddae 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -81,6 +81,7 @@ enum { extern char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX]; +extern int selinux_android_netlink_route; extern int selinux_policycap_netpeer; extern int selinux_policycap_openperm; extern int selinux_policycap_extsockclass; @@ -276,6 +277,7 @@ extern struct vfsmount *selinuxfs_mount; extern void selnl_notify_setenforce(int val); extern void 
selnl_notify_policyload(u32 seqno); extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); +extern void selinux_nlmsg_init(void); #endif /* _SELINUX_SECURITY_H_ */ diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 7b7433a1a34c..963930b4e2f8 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -28,7 +28,7 @@ struct nlmsg_perm { u32 perm; }; -static const struct nlmsg_perm nlmsg_route_perms[] = +static struct nlmsg_perm nlmsg_route_perms[] = { { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, @@ -195,3 +195,27 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) return err; } + +static void nlmsg_set_getlink_perm(u32 perm) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nlmsg_route_perms); i++) { + if (nlmsg_route_perms[i].nlmsg_type == RTM_GETLINK) { + nlmsg_route_perms[i].perm = perm; + break; + } + } +} + +/** + * Use nlmsg_readpriv as the permission for RTM_GETLINK messages if the + * netlink_route_getlink policy capability is set. Otherwise use nlmsg_read. 
+ */ +void selinux_nlmsg_init(void) +{ + if (selinux_android_netlink_route) + nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READPRIV); + else + nlmsg_set_getlink_perm(NETLINK_ROUTE_SOCKET__NLMSG_READ); +} diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 9d9f6bb1e56e..51564b2aba9e 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -2386,6 +2386,10 @@ int policydb_read(struct policydb *p, void *fp) p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); + if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE)) { + p->android_netlink_route = 1; + } + if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 215f8f30ac5a..dbb0ed57ed8b 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -238,6 +238,7 @@ struct genfs { /* The policy database */ struct policydb { int mls_enabled; + int android_netlink_route; /* symbol tables */ struct symtab symtab[SYM_NUM]; @@ -324,6 +325,7 @@ extern int policydb_write(struct policydb *p, void *fp); #define PERM_SYMTAB_SIZE 32 #define POLICYDB_CONFIG_MLS 1 +#define POLICYDB_CONFIG_ANDROID_NETLINK_ROUTE (1 << 31) /* the config flags related to unknown classes/perms are bits 2 and 3 */ #define REJECT_UNKNOWN 0x00000002 diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index b275743e23cc..f91bcb90825c 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -80,6 +80,7 @@ char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = { "nnp_nosuid_transition" }; +int selinux_android_netlink_route; int selinux_policycap_netpeer; int selinux_policycap_openperm; int selinux_policycap_extsockclass; @@ -2026,6 +2027,9 @@ static void security_load_policycaps(void) pr_info("SELinux: unknown policy 
capability %u\n", i); } + + selinux_android_netlink_route = policydb.android_netlink_route; + selinux_nlmsg_init(); } static int security_preserve_bools(struct policydb *p); From 708813f817ebb679c67ff5d41cc432714eeef186 Mon Sep 17 00:00:00 2001 From: Andrey Shvetsov Date: Thu, 16 Jan 2020 18:22:39 +0100 Subject: [PATCH 05/44] UPSTREAM: staging: most: net: fix buffer overflow If the length of the socket buffer is 0xFFFFFFFF (max size for an unsigned int), then payload_len becomes 0xFFFFFFF1 after subtracting 14 (ETH_HLEN). Then, mdp_len is set to payload_len + 16 (MDP_HDR_LEN) which overflows and results in a value of 2. These values for payload_len and mdp_len will pass current buffer size checks. This patch checks if derived from skb->len sum may overflow. The check is based on the following idea: For any `unsigned V1, V2` and derived `unsigned SUM = V1 + V2`, `V1 + V2` overflows iff `SUM < V1`. Bug: 143560807 Reported-by: Greg Kroah-Hartman Signed-off-by: Andrey Shvetsov Cc: stable Link: https://lore.kernel.org/r/20200116172238.6046-1-andrey.shvetsov@microchip.com Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 4d1356ac12f4d5180d0df345d85ff0ee42b89c72) Signed-off-by: Greg Kroah-Hartman Change-Id: I71197b2963735ba181314332737fc0c1ca2cab96 --- drivers/staging/most/aim-network/networking.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/staging/most/aim-network/networking.c b/drivers/staging/most/aim-network/networking.c index 936f013c350e..6398c27563c9 100644 --- a/drivers/staging/most/aim-network/networking.c +++ b/drivers/staging/most/aim-network/networking.c @@ -85,6 +85,11 @@ static int skb_to_mamac(const struct sk_buff *skb, struct mbo *mbo) unsigned int payload_len = skb->len - ETH_HLEN; unsigned int mdp_len = payload_len + MDP_HDR_LEN; + if (mdp_len < skb->len) { + pr_err("drop: too large packet! (%u)\n", skb->len); + return -EINVAL; + } + if (mbo->buffer_length < mdp_len) { pr_err("drop: too small buffer! 
(%d for %d)\n", mbo->buffer_length, mdp_len); @@ -132,6 +137,11 @@ static int skb_to_mep(const struct sk_buff *skb, struct mbo *mbo) u8 *buff = mbo->virt_address; unsigned int mep_len = skb->len + MEP_HDR_LEN; + if (mep_len < skb->len) { + pr_err("drop: too large packet! (%u)\n", skb->len); + return -EINVAL; + } + if (mbo->buffer_length < mep_len) { pr_err("drop: too small buffer! (%d for %d)\n", mbo->buffer_length, mep_len); From 2d0aaa2e697e3787d96c7587197f5570f5e8fdd2 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 24 Jun 2019 07:22:55 -0700 Subject: [PATCH 06/44] simple_lmk: Introduce Simple Low Memory Killer for Android This is a complete low memory killer solution for Android that is small and simple. Processes are killed according to the priorities that Android gives them, so that the least important processes are always killed first. Processes are killed until memory deficits are satisfied, as observed from kswapd struggling to free up pages. Simple LMK stops killing processes when kswapd finally goes back to sleep. The only tunables are the desired amount of memory to be freed per reclaim event and desired frequency of reclaim events. Simple LMK tries to free at least the desired amount of memory per reclaim and waits until all of its victims' memory is freed before proceeding to kill more processes. 
Signed-off-by: Sultan Alsawaf --- drivers/android/Kconfig | 44 +++++ drivers/android/Makefile | 1 + drivers/android/simple_lmk.c | 332 +++++++++++++++++++++++++++++++++++ include/linux/simple_lmk.h | 26 +++ kernel/fork.c | 2 + mm/vmscan.c | 4 + 6 files changed, 409 insertions(+) create mode 100644 drivers/android/simple_lmk.c create mode 100644 include/linux/simple_lmk.h diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index ee4880bfdcdc..b572f685374f 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -42,6 +42,50 @@ config ANDROID_BINDER_IPC_SELFTEST exhaustively with combinations of various buffer sizes and alignments. +config ANDROID_SIMPLE_LMK + bool "Simple Android Low Memory Killer" + depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG + ---help--- + This is a complete low memory killer solution for Android that is + small and simple. Processes are killed according to the priorities + that Android gives them, so that the least important processes are + always killed first. Processes are killed until memory deficits are + satisfied, as observed from kswapd struggling to free up pages. Simple + LMK stops killing processes when kswapd finally goes back to sleep. + +if ANDROID_SIMPLE_LMK + +config ANDROID_SIMPLE_LMK_AGGRESSION + int "Reclaim frequency selection" + range 1 3 + default 1 + help + This value determines how frequently Simple LMK will perform memory + reclaims. A lower value corresponds to less frequent reclaims, which + maximizes memory usage. The range of values has a logarithmic + correlation; 2 is twice as aggressive as 1, and 3 is twice as + aggressive as 2, which makes 3 four times as aggressive as 1. + + The aggression is set as a factor of kswapd's scan depth. This means + that a system with more memory will have a more expensive aggression + factor compared to a system with less memory. 
For example, setting an + aggression factor of 1 with 4 GiB of memory would be like setting a + factor of 2 with 8 GiB of memory; the more memory a system has, the + more expensive it is to use a lower value. + + Choosing a value of 1 here works well with systems that have 4 GiB of + memory. If the default doesn't work well, then this value should be + tweaked based on empirical results using different values. + +config ANDROID_SIMPLE_LMK_MINFREE + int "Minimum MiB of memory to free per reclaim" + range 8 512 + default 100 + help + Simple LMK will try to free at least this much memory per reclaim. + +endif + endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index a01254c43ee3..81cc79664cf9 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -2,3 +2,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o +obj-$(CONFIG_ANDROID_SIMPLE_LMK) += simple_lmk.o diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c new file mode 100644 index 000000000000..29637ecc5be5 --- /dev/null +++ b/drivers/android/simple_lmk.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Sultan Alsawaf . 
+ */ + +#define pr_fmt(fmt) "simple_lmk: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* The sched_param struct is located elsewhere in newer kernels */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) +#include +#endif + +/* SEND_SIG_FORCED isn't present in newer kernels */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0) +#define SIG_INFO_TYPE SEND_SIG_FORCED +#else +#define SIG_INFO_TYPE SEND_SIG_PRIV +#endif + +/* The group argument to do_send_sig_info is different in newer kernels */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) +#define KILL_GROUP_TYPE true +#else +#define KILL_GROUP_TYPE PIDTYPE_TGID +#endif + +/* The minimum number of pages to free per reclaim */ +#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE) + +/* Kill up to this many victims per reclaim */ +#define MAX_VICTIMS 1024 + +struct victim_info { + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long size; +}; + +/* Pulled from the Android framework. Lower adj means higher priority. 
*/ +static const short adj_prio[] = { + 906, /* CACHED_APP_MAX_ADJ */ + 905, /* Cached app */ + 904, /* Cached app */ + 903, /* Cached app */ + 902, /* Cached app */ + 901, /* Cached app */ + 900, /* CACHED_APP_MIN_ADJ */ + 800, /* SERVICE_B_ADJ */ + 700, /* PREVIOUS_APP_ADJ */ + 600, /* HOME_APP_ADJ */ + 500, /* SERVICE_ADJ */ + 400, /* HEAVY_WEIGHT_APP_ADJ */ + 300, /* BACKUP_APP_ADJ */ + 200, /* PERCEPTIBLE_APP_ADJ */ + 100, /* VISIBLE_APP_ADJ */ + 0 /* FOREGROUND_APP_ADJ */ +}; + +static struct victim_info victims[MAX_VICTIMS]; +static DECLARE_WAIT_QUEUE_HEAD(oom_waitq); +static DECLARE_COMPLETION(reclaim_done); +static int victims_to_kill; +static bool needs_reclaim; + +static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr) +{ + const struct victim_info *lhs = (typeof(lhs))lhs_ptr; + const struct victim_info *rhs = (typeof(rhs))rhs_ptr; + + return rhs->size - lhs->size; +} + +static bool vtsk_is_duplicate(struct victim_info *varr, int vlen, + struct task_struct *vtsk) +{ + int i; + + for (i = 0; i < vlen; i++) { + if (same_thread_group(varr[i].tsk, vtsk)) + return true; + } + + return false; +} + +static unsigned long find_victims(struct victim_info *varr, int *vindex, + int vmaxlen, short target_adj) +{ + unsigned long pages_found = 0; + int old_vindex = *vindex; + struct task_struct *tsk; + + for_each_process(tsk) { + struct task_struct *vtsk; + unsigned long tasksize; + + /* + * Search for tasks with the targeted importance (adj). Since + * only tasks with a positive adj can be targeted, that + * naturally excludes tasks which shouldn't be killed, like init + * and kthreads. Although oom_score_adj can still be changed + * while this code runs, it doesn't really matter. We just need + * to make sure that if the adj changes, we won't deadlock + * trying to lock a task that we locked earlier. 
+ */ + if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj || + vtsk_is_duplicate(varr, *vindex, tsk)) + continue; + + vtsk = find_lock_task_mm(tsk); + if (!vtsk) + continue; + + /* Store this potential victim away for later */ + varr[*vindex].tsk = vtsk; + varr[*vindex].mm = vtsk->mm; + varr[*vindex].size = get_mm_rss(vtsk->mm); + + /* Keep track of the number of pages that have been found */ + pages_found += tasksize; + + /* Make sure there's space left in the victim array */ + if (++*vindex == vmaxlen) + break; + } + + /* + * Sort the victims in descending order of size to prioritize killing + * the larger ones first. + */ + if (pages_found) + sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr), + victim_size_cmp, NULL); + + return pages_found; +} + +static int process_victims(struct victim_info *varr, int vlen, + unsigned long pages_needed) +{ + unsigned long pages_found = 0; + int i, nr_to_kill = 0; + + /* + * Calculate the number of tasks that need to be killed and quickly + * release the references to those that'll live. + */ + for (i = 0; i < vlen; i++) { + struct victim_info *victim = &victims[i]; + struct task_struct *vtsk = victim->tsk; + + /* The victim's mm lock is taken in find_victims; release it */ + if (pages_found >= pages_needed) { + task_unlock(vtsk); + continue; + } + + pages_found += victim->size; + nr_to_kill++; + } + + return nr_to_kill; +} + +static void scan_and_kill(unsigned long pages_needed) +{ + int i, nr_to_kill = 0, nr_victims = 0; + unsigned long pages_found = 0; + + /* + * Hold the tasklist lock so tasks don't disappear while scanning. This + * is preferred to holding an RCU read lock so that the list of tasks + * is guaranteed to be up to date. 
+ */ + read_lock(&tasklist_lock); + for (i = 0; i < ARRAY_SIZE(adj_prio); i++) { + pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS, + adj_prio[i]); + if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS) + break; + } + read_unlock(&tasklist_lock); + + /* Pretty unlikely but it can happen */ + if (unlikely(!nr_victims)) + return; + + /* First round of victim processing to weed out unneeded victims */ + nr_to_kill = process_victims(victims, nr_victims, pages_needed); + + /* + * Try to kill as few of the chosen victims as possible by sorting the + * chosen victims by size, which means larger victims that have a lower + * adj can be killed in place of smaller victims with a high adj. + */ + sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL); + + /* Second round of victim processing to finally select the victims */ + nr_to_kill = process_victims(victims, nr_to_kill, pages_needed); + + /* Kill the victims */ + WRITE_ONCE(victims_to_kill, nr_to_kill); + for (i = 0; i < nr_to_kill; i++) { + struct victim_info *victim = &victims[i]; + struct task_struct *vtsk = victim->tsk; + + pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm, + vtsk->signal->oom_score_adj, + victim->size << (PAGE_SHIFT - 10)); + + /* Accelerate the victim's death by forcing the kill signal */ + do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE); + + /* Grab a reference to the victim for later before unlocking */ + get_task_struct(vtsk); + task_unlock(vtsk); + } + + /* Try to speed up the death process now that we can schedule again */ + for (i = 0; i < nr_to_kill; i++) { + struct task_struct *vtsk = victims[i].tsk; + + /* Increase the victim's priority to make it die faster */ + set_user_nice(vtsk, MIN_NICE); + + /* Allow the victim to run on any CPU */ + set_cpus_allowed_ptr(vtsk, cpu_all_mask); + + /* Finally release the victim reference acquired earlier */ + put_task_struct(vtsk); + } + + /* Wait until all the victims die */ + 
wait_for_completion(&reclaim_done); +} + +static int simple_lmk_reclaim_thread(void *data) +{ + static const struct sched_param sched_max_rt_prio = { + .sched_priority = MAX_RT_PRIO - 1 + }; + + sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); + + while (1) { + bool should_stop; + + wait_event(oom_waitq, (should_stop = kthread_should_stop()) || + READ_ONCE(needs_reclaim)); + + if (should_stop) + break; + + /* + * Kill a batch of processes and wait for their memory to be + * freed. After their memory is freed, sleep for 20 ms to give + * OOM'd allocations a chance to scavenge for the newly-freed + * pages. Rinse and repeat while there are still OOM'd + * allocations. + */ + do { + scan_and_kill(MIN_FREE_PAGES); + msleep(20); + } while (READ_ONCE(needs_reclaim)); + } + + return 0; +} + +void simple_lmk_decide_reclaim(int kswapd_priority) +{ + if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION) + return; + + if (!cmpxchg(&needs_reclaim, false, true)) + wake_up(&oom_waitq); +} + +void simple_lmk_stop_reclaim(void) +{ + WRITE_ONCE(needs_reclaim, false); +} + +void simple_lmk_mm_freed(struct mm_struct *mm) +{ + static atomic_t nr_killed = ATOMIC_INIT(0); + int i, nr_to_kill; + + nr_to_kill = READ_ONCE(victims_to_kill); + for (i = 0; i < nr_to_kill; i++) { + if (cmpxchg(&victims[i].mm, mm, NULL) == mm) { + if (atomic_inc_return(&nr_killed) == nr_to_kill) { + WRITE_ONCE(victims_to_kill, 0); + nr_killed = (atomic_t)ATOMIC_INIT(0); + complete(&reclaim_done); + } + break; + } + } +} + +/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */ +static int simple_lmk_init_set(const char *val, const struct kernel_param *kp) +{ + static bool init_done; + struct task_struct *thread; + + if (cmpxchg(&init_done, false, true)) + return 0; + + thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd"); + BUG_ON(IS_ERR(thread)); + + return 0; +} + +static const struct kernel_param_ops simple_lmk_init_ops = { + .set = 
simple_lmk_init_set +}; + +/* Needed to prevent Android from thinking there's no LMK and thus rebooting */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "lowmemorykiller." +module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200); diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h new file mode 100644 index 000000000000..b0c247f2f2a5 --- /dev/null +++ b/include/linux/simple_lmk.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Sultan Alsawaf . + */ +#ifndef _SIMPLE_LMK_H_ +#define _SIMPLE_LMK_H_ + +struct mm_struct; + +#ifdef CONFIG_ANDROID_SIMPLE_LMK +void simple_lmk_decide_reclaim(int kswapd_priority); +void simple_lmk_stop_reclaim(void); +void simple_lmk_mm_freed(struct mm_struct *mm); +#else +static inline void simple_lmk_decide_reclaim(int kswapd_priority) +{ +} +static inline void simple_lmk_stop_reclaim(void) +{ +} +static inline void simple_lmk_mm_freed(struct mm_struct *mm) +{ +} +#endif + +#endif /* _SIMPLE_LMK_H_ */ diff --git a/kernel/fork.c b/kernel/fork.c index c106b2c1681e..211239424b6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include @@ -940,6 +941,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + simple_lmk_mm_freed(mm); mmdrop(mm); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9fcfc5a9cfec..2a0abc7f3543 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -3350,6 +3351,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + simple_lmk_decide_reclaim(sc.priority); sc.reclaim_idx = classzone_idx; /* @@ -3482,6 +3484,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * succeed. 
*/ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + simple_lmk_stop_reclaim(); /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3518,6 +3521,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + simple_lmk_stop_reclaim(); trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* From f36cc0eba8e851ae226f47d69c8abfc434623a55 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 20 Jul 2019 09:54:58 -0700 Subject: [PATCH 07/44] simple_lmk: Fix pages_found calculation Previously, pages_found would be calculated using an uninitialized variable. Fix it. Reported-by: Julian Liu Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 29637ecc5be5..e7e91b7b4c7b 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -100,7 +100,6 @@ static unsigned long find_victims(struct victim_info *varr, int *vindex, for_each_process(tsk) { struct task_struct *vtsk; - unsigned long tasksize; /* * Search for tasks with the targeted importance (adj). Since @@ -125,7 +124,7 @@ static unsigned long find_victims(struct victim_info *varr, int *vindex, varr[*vindex].size = get_mm_rss(vtsk->mm); /* Keep track of the number of pages that have been found */ - pages_found += tasksize; + pages_found += varr[*vindex].size; /* Make sure there's space left in the victim array */ if (++*vindex == vmaxlen) From 33a41f60acf4b9c299eacfcead851ee2bd072843 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 21 Aug 2019 08:30:55 -0700 Subject: [PATCH 08/44] simple_lmk: Remove kthread_should_stop() exit condition Simple LMK's reclaim thread should never stop; there's no need to have this check. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index e7e91b7b4c7b..845679a4cbed 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -250,13 +250,7 @@ static int simple_lmk_reclaim_thread(void *data) sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); while (1) { - bool should_stop; - - wait_event(oom_waitq, (should_stop = kthread_should_stop()) || - READ_ONCE(needs_reclaim)); - - if (should_stop) - break; + wait_event(oom_waitq, READ_ONCE(needs_reclaim)); /* * Kill a batch of processes and wait for their memory to be From 89ac9d7c9b509fed3dde5c61677db450a9541742 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 21 Aug 2019 08:37:04 -0700 Subject: [PATCH 09/44] simple_lmk: Use proper atomic_* operations where needed cmpxchg() is only atomic with respect to the local CPU, so it cannot be relied on with how it's used in Simple LMK. Switch to fully atomic operations instead for full atomic guarantees. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 845679a4cbed..d1bc94027e5d 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -68,7 +68,7 @@ static struct victim_info victims[MAX_VICTIMS]; static DECLARE_WAIT_QUEUE_HEAD(oom_waitq); static DECLARE_COMPLETION(reclaim_done); static int victims_to_kill; -static bool needs_reclaim; +static atomic_t needs_reclaim = ATOMIC_INIT(0); static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr) { @@ -250,7 +250,7 @@ static int simple_lmk_reclaim_thread(void *data) sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); while (1) { - wait_event(oom_waitq, READ_ONCE(needs_reclaim)); + wait_event(oom_waitq, atomic_read(&needs_reclaim)); /* * Kill a batch of processes and wait for their memory to be @@ -262,7 +262,7 @@ static int simple_lmk_reclaim_thread(void *data) do { scan_and_kill(MIN_FREE_PAGES); msleep(20); - } while (READ_ONCE(needs_reclaim)); + } while (atomic_read(&needs_reclaim)); } return 0; @@ -273,13 +273,13 @@ void simple_lmk_decide_reclaim(int kswapd_priority) if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION) return; - if (!cmpxchg(&needs_reclaim, false, true)) + if (!atomic_cmpxchg(&needs_reclaim, 0, 1)) wake_up(&oom_waitq); } void simple_lmk_stop_reclaim(void) { - WRITE_ONCE(needs_reclaim, false); + atomic_set(&needs_reclaim, 0); } void simple_lmk_mm_freed(struct mm_struct *mm) @@ -303,10 +303,10 @@ void simple_lmk_mm_freed(struct mm_struct *mm) /* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */ static int simple_lmk_init_set(const char *val, const struct kernel_param *kp) { - static bool init_done; + static atomic_t init_done = ATOMIC_INIT(0); struct task_struct *thread; - if (cmpxchg(&init_done, false, true)) + if (atomic_cmpxchg(&init_done, 0, 1)) return 0; 
thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd"); From 496ee8b626ace7e29349bb06c2057c4944551950 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 4 Nov 2019 10:56:15 -0800 Subject: [PATCH 10/44] simple_lmk: Fix broken multicopy atomicity for victims_to_kill When the reclaim thread writes to victims_to_kill on one CPU, it expects the updated value to be immediately reflected on all CPUs in order for simple_lmk_mm_freed() to work correctly. Due to the lack of memory barriers to guarantee multicopy atomicity, simple_lmk_mm_freed() can be given a victim's mm without knowing the correct victims_to_kill value, which can cause the reclaim thread to remain stuck waiting forever for all victims to be freed. This scenario, despite being rare, has been observed. Fix this by using proper atomic helpers with memory barriers. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index d1bc94027e5d..079281fdcb57 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -67,7 +67,7 @@ static const short adj_prio[] = { static struct victim_info victims[MAX_VICTIMS]; static DECLARE_WAIT_QUEUE_HEAD(oom_waitq); static DECLARE_COMPLETION(reclaim_done); -static int victims_to_kill; +static atomic_t victims_to_kill = ATOMIC_INIT(0); static atomic_t needs_reclaim = ATOMIC_INIT(0); static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr) @@ -206,7 +206,7 @@ static void scan_and_kill(unsigned long pages_needed) nr_to_kill = process_victims(victims, nr_to_kill, pages_needed); /* Kill the victims */ - WRITE_ONCE(victims_to_kill, nr_to_kill); + atomic_set_release(&victims_to_kill, nr_to_kill); for (i = 0; i < nr_to_kill; i++) { struct victim_info *victim = &victims[i]; struct task_struct *vtsk = victim->tsk; @@ -287,11 +287,11 @@ void simple_lmk_mm_freed(struct mm_struct *mm) static atomic_t 
nr_killed = ATOMIC_INIT(0); int i, nr_to_kill; - nr_to_kill = READ_ONCE(victims_to_kill); + nr_to_kill = atomic_read_acquire(&victims_to_kill); for (i = 0; i < nr_to_kill; i++) { if (cmpxchg(&victims[i].mm, mm, NULL) == mm) { if (atomic_inc_return(&nr_killed) == nr_to_kill) { - WRITE_ONCE(victims_to_kill, 0); + atomic_set(&victims_to_kill, 0); nr_killed = (atomic_t)ATOMIC_INIT(0); complete(&reclaim_done); } From d158b07c2610c92c965a54fe8f0cbc74d4b10b6b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 4 Nov 2019 11:06:13 -0800 Subject: [PATCH 11/44] simple_lmk: Make reclaim deterministic The 20 ms delay in the reclaim thread is a hacky fudge factor that can cause Simple LMK to behave wildly differently depending on the circumstances of when it is invoked. When kswapd doesn't get enough CPU time to finish up and go back to sleep within 20 ms, Simple LMK performs superfluous reclaims. This is suboptimal, so make Simple LMK more deterministic by eliminating the delay and instead queuing up reclaim requests from kswapd. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 36 +++++++++++++----------------------- include/linux/simple_lmk.h | 4 ---- mm/vmscan.c | 2 -- 3 files changed, 13 insertions(+), 29 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 079281fdcb57..bce8cf651b81 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -5,7 +5,6 @@ #define pr_fmt(fmt) "simple_lmk: " fmt -#include #include #include #include @@ -250,19 +249,8 @@ static int simple_lmk_reclaim_thread(void *data) sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); while (1) { - wait_event(oom_waitq, atomic_read(&needs_reclaim)); - - /* - * Kill a batch of processes and wait for their memory to be - * freed. After their memory is freed, sleep for 20 ms to give - * OOM'd allocations a chance to scavenge for the newly-freed - * pages. Rinse and repeat while there are still OOM'd - * allocations. 
- */ - do { - scan_and_kill(MIN_FREE_PAGES); - msleep(20); - } while (atomic_read(&needs_reclaim)); + wait_event(oom_waitq, atomic_add_unless(&needs_reclaim, -1, 0)); + scan_and_kill(MIN_FREE_PAGES); } return 0; @@ -270,16 +258,18 @@ static int simple_lmk_reclaim_thread(void *data) void simple_lmk_decide_reclaim(int kswapd_priority) { - if (kswapd_priority != CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION) - return; + if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION) { + int v, v1; - if (!atomic_cmpxchg(&needs_reclaim, 0, 1)) - wake_up(&oom_waitq); -} - -void simple_lmk_stop_reclaim(void) -{ - atomic_set(&needs_reclaim, 0); + for (v = 0;; v = v1) { + v1 = atomic_cmpxchg(&needs_reclaim, v, v + 1); + if (likely(v1 == v)) { + if (!v) + wake_up(&oom_waitq); + break; + } + } + } } void simple_lmk_mm_freed(struct mm_struct *mm) diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h index b0c247f2f2a5..46cdb389be51 100644 --- a/include/linux/simple_lmk.h +++ b/include/linux/simple_lmk.h @@ -9,15 +9,11 @@ struct mm_struct; #ifdef CONFIG_ANDROID_SIMPLE_LMK void simple_lmk_decide_reclaim(int kswapd_priority); -void simple_lmk_stop_reclaim(void); void simple_lmk_mm_freed(struct mm_struct *mm); #else static inline void simple_lmk_decide_reclaim(int kswapd_priority) { } -static inline void simple_lmk_stop_reclaim(void) -{ -} static inline void simple_lmk_mm_freed(struct mm_struct *mm) { } diff --git a/mm/vmscan.c b/mm/vmscan.c index 2a0abc7f3543..03f8ff25a14b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3484,7 +3484,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * succeed. */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { - simple_lmk_stop_reclaim(); /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. 
@@ -3521,7 +3520,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { - simple_lmk_stop_reclaim(); trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* From d6bad99cdf3c5f47a75aeeae4c52ef3d6018f4bd Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 4 Nov 2019 11:27:29 -0800 Subject: [PATCH 12/44] simple_lmk: Clean up some code style nitpicks Using a parameter to pass around a unmodified pointer to a global variable is crufty; just use the `victims` variable directly instead. Also, compress the code in simple_lmk_init_set() a bit to make it look cleaner. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 45 ++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index bce8cf651b81..44f7319defc0 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -77,21 +77,19 @@ static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr) return rhs->size - lhs->size; } -static bool vtsk_is_duplicate(struct victim_info *varr, int vlen, - struct task_struct *vtsk) +static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk) { int i; for (i = 0; i < vlen; i++) { - if (same_thread_group(varr[i].tsk, vtsk)) + if (same_thread_group(victims[i].tsk, vtsk)) return true; } return false; } -static unsigned long find_victims(struct victim_info *varr, int *vindex, - int vmaxlen, short target_adj) +static unsigned long find_victims(int *vindex, short target_adj) { unsigned long pages_found = 0; int old_vindex = *vindex; @@ -110,7 +108,7 @@ static unsigned long find_victims(struct victim_info *varr, int *vindex, * trying to lock a task that we locked earlier. 
*/ if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj || - vtsk_is_duplicate(varr, *vindex, tsk)) + vtsk_is_duplicate(*vindex, tsk)) continue; vtsk = find_lock_task_mm(tsk); @@ -118,15 +116,15 @@ static unsigned long find_victims(struct victim_info *varr, int *vindex, continue; /* Store this potential victim away for later */ - varr[*vindex].tsk = vtsk; - varr[*vindex].mm = vtsk->mm; - varr[*vindex].size = get_mm_rss(vtsk->mm); + victims[*vindex].tsk = vtsk; + victims[*vindex].mm = vtsk->mm; + victims[*vindex].size = get_mm_rss(vtsk->mm); /* Keep track of the number of pages that have been found */ - pages_found += varr[*vindex].size; + pages_found += victims[*vindex].size; /* Make sure there's space left in the victim array */ - if (++*vindex == vmaxlen) + if (++*vindex == MAX_VICTIMS) break; } @@ -135,14 +133,13 @@ static unsigned long find_victims(struct victim_info *varr, int *vindex, * the larger ones first. */ if (pages_found) - sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr), - victim_size_cmp, NULL); + sort(&victims[old_vindex], *vindex - old_vindex, + sizeof(*victims), victim_size_cmp, NULL); return pages_found; } -static int process_victims(struct victim_info *varr, int vlen, - unsigned long pages_needed) +static int process_victims(int vlen, unsigned long pages_needed) { unsigned long pages_found = 0; int i, nr_to_kill = 0; @@ -180,8 +177,7 @@ static void scan_and_kill(unsigned long pages_needed) */ read_lock(&tasklist_lock); for (i = 0; i < ARRAY_SIZE(adj_prio); i++) { - pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS, - adj_prio[i]); + pages_found += find_victims(&nr_victims, adj_prio[i]); if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS) break; } @@ -192,7 +188,7 @@ static void scan_and_kill(unsigned long pages_needed) return; /* First round of victim processing to weed out unneeded victims */ - nr_to_kill = process_victims(victims, nr_victims, pages_needed); + nr_to_kill = process_victims(nr_victims, 
pages_needed); /* * Try to kill as few of the chosen victims as possible by sorting the @@ -202,7 +198,7 @@ static void scan_and_kill(unsigned long pages_needed) sort(victims, nr_to_kill, sizeof(*victims), victim_size_cmp, NULL); /* Second round of victim processing to finally select the victims */ - nr_to_kill = process_victims(victims, nr_to_kill, pages_needed); + nr_to_kill = process_victims(nr_to_kill, pages_needed); /* Kill the victims */ atomic_set_release(&victims_to_kill, nr_to_kill); @@ -296,12 +292,11 @@ static int simple_lmk_init_set(const char *val, const struct kernel_param *kp) static atomic_t init_done = ATOMIC_INIT(0); struct task_struct *thread; - if (atomic_cmpxchg(&init_done, 0, 1)) - return 0; - - thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd"); - BUG_ON(IS_ERR(thread)); - + if (!atomic_cmpxchg(&init_done, 0, 1)) { + thread = kthread_run(simple_lmk_reclaim_thread, NULL, + "simple_lmkd"); + BUG_ON(IS_ERR(thread)); + } return 0; } From fee32857e47f40f1ea73e861cf987be22ffc286b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 6 Nov 2019 10:02:57 -0800 Subject: [PATCH 13/44] simple_lmk: Increase default minfree value After commit "simple_lmk: Make reclaim deterministic", Simple LMK's behavior changed and thus requires some slight re-tuning to make it work well again. Signed-off-by: Sultan Alsawaf --- drivers/android/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index b572f685374f..f126cf569529 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -80,7 +80,7 @@ config ANDROID_SIMPLE_LMK_AGGRESSION config ANDROID_SIMPLE_LMK_MINFREE int "Minimum MiB of memory to free per reclaim" range 8 512 - default 100 + default 128 help Simple LMK will try to free at least this much memory per reclaim. 
From 44b7e9857c4f8d86137e35bd7ab64b1ae6fc8647 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 20 Jan 2020 16:03:44 -0800 Subject: [PATCH 14/44] simple_lmk: Don't queue up new reclaim requests during reclaim Queuing up reclaim requests while a reclaim is in progress doesn't make sense, since the additional reclaims may not be needed after the existing reclaim completes. This would cause Simple LMK to go berserk during periods of high memory pressure where kswapd would fire off reclaim requests nonstop. Make Simple LMK ignore new reclaim requests until an existing reclaim is finished to prevent a slaughter-fest. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 44f7319defc0..28b808a40a5a 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -245,8 +245,9 @@ static int simple_lmk_reclaim_thread(void *data) sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); while (1) { - wait_event(oom_waitq, atomic_add_unless(&needs_reclaim, -1, 0)); + wait_event(oom_waitq, atomic_read_acquire(&needs_reclaim)); scan_and_kill(MIN_FREE_PAGES); + atomic_set_release(&needs_reclaim, 0); } return 0; @@ -254,18 +255,9 @@ static int simple_lmk_reclaim_thread(void *data) void simple_lmk_decide_reclaim(int kswapd_priority) { - if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION) { - int v, v1; - - for (v = 0;; v = v1) { - v1 = atomic_cmpxchg(&needs_reclaim, v, v + 1); - if (likely(v1 == v)) { - if (!v) - wake_up(&oom_waitq); - break; - } - } - } + if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION && + !atomic_cmpxchg(&needs_reclaim, 0, 1)) + wake_up(&oom_waitq); } void simple_lmk_mm_freed(struct mm_struct *mm) From 6abeb5839badeebd659539d8f7cbb0de810da3df Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 6 Feb 2020 20:57:53 -0800 Subject: [PATCH 15/44] 
simple_lmk: Update copyright to 2020 Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 2 +- include/linux/simple_lmk.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 28b808a40a5a..2884030276ea 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2019 Sultan Alsawaf . + * Copyright (C) 2019-2020 Sultan Alsawaf . */ #define pr_fmt(fmt) "simple_lmk: " fmt diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h index 46cdb389be51..28103c1b1d4c 100644 --- a/include/linux/simple_lmk.h +++ b/include/linux/simple_lmk.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2019 Sultan Alsawaf . + * Copyright (C) 2019-2020 Sultan Alsawaf . */ #ifndef _SIMPLE_LMK_H_ #define _SIMPLE_LMK_H_ From 13f795a9246d9d237a31a525650fc6f1ff9f3296 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 6 Feb 2020 20:59:22 -0800 Subject: [PATCH 16/44] simple_lmk: Remove compat cruft not specific to 4.14 Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 2884030276ea..3816f8bf3946 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -10,26 +10,7 @@ #include #include #include -#include - -/* The sched_param struct is located elsewhere in newer kernels */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) #include -#endif - -/* SEND_SIG_FORCED isn't present in newer kernels */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0) -#define SIG_INFO_TYPE SEND_SIG_FORCED -#else -#define SIG_INFO_TYPE SEND_SIG_PRIV -#endif - -/* The group argument to do_send_sig_info is different in newer kernels */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) -#define KILL_GROUP_TYPE true 
-#else -#define KILL_GROUP_TYPE PIDTYPE_TGID -#endif /* The minimum number of pages to free per reclaim */ #define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE) @@ -211,7 +192,7 @@ static void scan_and_kill(unsigned long pages_needed) victim->size << (PAGE_SHIFT - 10)); /* Accelerate the victim's death by forcing the kill signal */ - do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, KILL_GROUP_TYPE); + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true); /* Grab a reference to the victim for later before unlocking */ get_task_struct(vtsk); From 80fd7c105a1b3a41e3844ac1a17cce7d34f717ee Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 6 Feb 2020 21:03:24 -0800 Subject: [PATCH 17/44] simple_lmk: Print a message when there are no processes to kill Makes it clear that Simple LMK tried its best but there was nothing it could do. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 3816f8bf3946..76f40e99b80d 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -165,8 +165,10 @@ static void scan_and_kill(unsigned long pages_needed) read_unlock(&tasklist_lock); /* Pretty unlikely but it can happen */ - if (unlikely(!nr_victims)) + if (unlikely(!nr_victims)) { + pr_err("No processes available to kill!\n"); return; + } /* First round of victim processing to weed out unneeded victims */ nr_to_kill = process_victims(nr_victims, pages_needed); From 1cfcc86d99f12653df298539701936074ff49eb6 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 7 Feb 2020 23:36:58 -0800 Subject: [PATCH 18/44] simple_lmk: Disable OOM killer when Simple LMK is enabled The OOM killer only serves to be a liability when Simple LMK is used. 
Signed-off-by: Sultan Alsawaf --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7a5c0b229c6a..143e717ebc3c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1025,7 +1025,7 @@ bool out_of_memory(struct oom_control *oc) unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; - if (oom_killer_disabled) + if (oom_killer_disabled || IS_ENABLED(CONFIG_ANDROID_SIMPLE_LMK)) return false; if (!is_memcg_oom(oc)) { From 4facfa347a568101d46285e2caae1e483e8cc7b6 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 8 Feb 2020 17:03:35 -0800 Subject: [PATCH 19/44] simple_lmk: Mark victim thread group with TIF_MEMDIE The OOM killer sets the TIF_MEMDIE thread flag for its victims to alert other kernel code that the current process was killed due to memory pressure, and needs to finish whatever it's doing quickly. In the page allocator this allows victim processes to quickly allocate memory using emergency reserves. This is especially important when memory pressure is high; if all processes are taking a while to allocate memory, then our victim processes will face the same problem and can potentially get stuck in the page allocator for a while rather than die expeditiously. To ensure that victim processes die quickly, set TIF_MEMDIE for the entire victim thread group. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 8 +++++++- kernel/exit.c | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 76f40e99b80d..77172da82701 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -187,7 +187,7 @@ static void scan_and_kill(unsigned long pages_needed) atomic_set_release(&victims_to_kill, nr_to_kill); for (i = 0; i < nr_to_kill; i++) { struct victim_info *victim = &victims[i]; - struct task_struct *vtsk = victim->tsk; + struct task_struct *t, *vtsk = victim->tsk; pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm, vtsk->signal->oom_score_adj, @@ -196,6 +196,12 @@ static void scan_and_kill(unsigned long pages_needed) /* Accelerate the victim's death by forcing the kill signal */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true); + /* Mark the thread group dead so that other kernel code knows */ + rcu_read_lock(); + for_each_thread(vtsk, t) + set_tsk_thread_flag(t, TIF_MEMDIE); + rcu_read_unlock(); + /* Grab a reference to the victim for later before unlocking */ get_task_struct(vtsk); task_unlock(vtsk); diff --git a/kernel/exit.c b/kernel/exit.c index d1baf9c96c3e..a9f6d7814248 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,8 +543,12 @@ static void exit_mm(void) task_unlock(current); mm_update_next_owner(mm); mmput(mm); +#ifdef CONFIG_ANDROID_SIMPLE_LMK + clear_thread_flag(TIF_MEMDIE); +#else if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); +#endif } static struct task_struct *find_alive_thread(struct task_struct *p) From 5f43e7cf08e8d3752d3d58518d1d4e64270bc3d6 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 9 Feb 2020 16:24:29 -0800 Subject: [PATCH 20/44] simple_lmk: Report mm as freed as soon as exit_mmap() finishes exit_mmap() is responsible for freeing the vast majority of an mm's memory; in order to unblock Simple LMK faster, report an mm as freed as soon as exit_mmap() 
finishes. Signed-off-by: Sultan Alsawaf --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 211239424b6b..f6a535bab967 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -932,6 +932,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + simple_lmk_mm_freed(mm); mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -941,7 +942,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - simple_lmk_mm_freed(mm); mmdrop(mm); } From 138c37fa81aa83e8e231437b034481c72691c616 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 8 Feb 2020 00:00:48 -0800 Subject: [PATCH 21/44] simple_lmk: Simplify tricks used to speed up the death process set_user_nice() doesn't schedule, and although set_cpus_allowed_ptr() can schedule, it will only do so when the specified task cannot run on the new set of allowed CPUs. Since cpu_all_mask is used, set_cpus_allowed_ptr() will never schedule. Therefore, both the priority elevation and cpus_allowed change can be moved to inside the task lock to simplify and speed things up. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 77172da82701..224299997dd4 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -202,23 +202,14 @@ static void scan_and_kill(unsigned long pages_needed) set_tsk_thread_flag(t, TIF_MEMDIE); rcu_read_unlock(); - /* Grab a reference to the victim for later before unlocking */ - get_task_struct(vtsk); - task_unlock(vtsk); - } - - /* Try to speed up the death process now that we can schedule again */ - for (i = 0; i < nr_to_kill; i++) { - struct task_struct *vtsk = victims[i].tsk; - /* Increase the victim's priority to make it die faster */ set_user_nice(vtsk, MIN_NICE); - /* Allow the victim to run on any CPU */ + /* Allow the victim to run on any CPU. This won't schedule. */ set_cpus_allowed_ptr(vtsk, cpu_all_mask); - /* Finally release the victim reference acquired earlier */ - put_task_struct(vtsk); + /* Finally release the victim's task lock acquired earlier */ + task_unlock(vtsk); } /* Wait until all the victims die */ From 0cefc5dab35e5fcf3a4be487c772fa1e5fd106b4 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 8 Feb 2020 03:21:01 -0800 Subject: [PATCH 22/44] simple_lmk: Ignore tasks that won't free memory Dying processes aren't going to help free memory, so ignore them. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 224299997dd4..77695a763a05 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -77,18 +77,22 @@ static unsigned long find_victims(int *vindex, short target_adj) struct task_struct *tsk; for_each_process(tsk) { + struct signal_struct *sig; struct task_struct *vtsk; /* - * Search for tasks with the targeted importance (adj). 
Since - * only tasks with a positive adj can be targeted, that + * Search for suitable tasks with the targeted importance (adj). + * Since only tasks with a positive adj can be targeted, that * naturally excludes tasks which shouldn't be killed, like init * and kthreads. Although oom_score_adj can still be changed * while this code runs, it doesn't really matter. We just need * to make sure that if the adj changes, we won't deadlock * trying to lock a task that we locked earlier. */ - if (READ_ONCE(tsk->signal->oom_score_adj) != target_adj || + sig = tsk->signal; + if (READ_ONCE(sig->oom_score_adj) != target_adj || + sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) || + (thread_group_empty(tsk) && tsk->flags & PF_EXITING) || vtsk_is_duplicate(*vindex, tsk)) continue; From 4a9d1300e980c8ebb049c6e53000d071e8d384a1 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 8 Feb 2020 03:22:44 -0800 Subject: [PATCH 23/44] simple_lmk: Add a timeout to stop waiting for victims to die Simple LMK tries to wait until all of the victims it kills have their memory freed; however, sometimes victims can take a while to die, which can block Simple LMK from killing more processes in time when needed. After the specified timeout elapses, Simple LMK will stop waiting and make itself available to kill more processes. Signed-off-by: Sultan Alsawaf --- drivers/android/Kconfig | 11 ++++++++++ drivers/android/simple_lmk.c | 42 +++++++++++++++++++++++++----------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index f126cf569529..7f65391e5e73 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -84,6 +84,17 @@ config ANDROID_SIMPLE_LMK_MINFREE help Simple LMK will try to free at least this much memory per reclaim. 
+config ANDROID_SIMPLE_LMK_TIMEOUT_MSEC + int "Reclaim timeout in milliseconds" + range 50 1000 + default 200 + help + Simple LMK tries to wait until all of the victims it kills have their + memory freed; however, sometimes victims can take a while to die, + which can block Simple LMK from killing more processes in time when + needed. After the specified timeout elapses, Simple LMK will stop + waiting and make itself available to kill more processes. + endif endif # if ANDROID diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 77695a763a05..21895409ff8f 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -18,6 +18,9 @@ /* Kill up to this many victims per reclaim */ #define MAX_VICTIMS 1024 +/* Timeout in jiffies for each reclaim */ +#define RECLAIM_EXPIRES msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_TIMEOUT_MSEC) + struct victim_info { struct task_struct *tsk; struct mm_struct *mm; @@ -47,8 +50,10 @@ static const short adj_prio[] = { static struct victim_info victims[MAX_VICTIMS]; static DECLARE_WAIT_QUEUE_HEAD(oom_waitq); static DECLARE_COMPLETION(reclaim_done); -static atomic_t victims_to_kill = ATOMIC_INIT(0); +static DEFINE_RWLOCK(mm_free_lock); +static int victims_to_kill; static atomic_t needs_reclaim = ATOMIC_INIT(0); +static atomic_t nr_killed = ATOMIC_INIT(0); static int victim_size_cmp(const void *lhs_ptr, const void *rhs_ptr) { @@ -152,7 +157,7 @@ static int process_victims(int vlen, unsigned long pages_needed) static void scan_and_kill(unsigned long pages_needed) { - int i, nr_to_kill = 0, nr_victims = 0; + int i, nr_to_kill = 0, nr_victims = 0, ret; unsigned long pages_found = 0; /* @@ -187,8 +192,12 @@ static void scan_and_kill(unsigned long pages_needed) /* Second round of victim processing to finally select the victims */ nr_to_kill = process_victims(nr_to_kill, pages_needed); + /* Store the final number of victims for simple_lmk_mm_freed() */ + write_lock(&mm_free_lock); + victims_to_kill = 
nr_to_kill; + write_unlock(&mm_free_lock); + /* Kill the victims */ - atomic_set_release(&victims_to_kill, nr_to_kill); for (i = 0; i < nr_to_kill; i++) { struct victim_info *victim = &victims[i]; struct task_struct *t, *vtsk = victim->tsk; @@ -216,8 +225,18 @@ static void scan_and_kill(unsigned long pages_needed) task_unlock(vtsk); } - /* Wait until all the victims die */ - wait_for_completion(&reclaim_done); + /* Wait until all the victims die or until the timeout is reached */ + ret = wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES); + write_lock(&mm_free_lock); + if (!ret) { + /* Extra clean-up is needed when the timeout is hit */ + reinit_completion(&reclaim_done); + for (i = 0; i < nr_to_kill; i++) + victims[i].mm = NULL; + } + victims_to_kill = 0; + nr_killed = (atomic_t)ATOMIC_INIT(0); + write_unlock(&mm_free_lock); } static int simple_lmk_reclaim_thread(void *data) @@ -246,20 +265,17 @@ void simple_lmk_decide_reclaim(int kswapd_priority) void simple_lmk_mm_freed(struct mm_struct *mm) { - static atomic_t nr_killed = ATOMIC_INIT(0); - int i, nr_to_kill; + int i; - nr_to_kill = atomic_read_acquire(&victims_to_kill); - for (i = 0; i < nr_to_kill; i++) { + read_lock(&mm_free_lock); + for (i = 0; i < victims_to_kill; i++) { if (cmpxchg(&victims[i].mm, mm, NULL) == mm) { - if (atomic_inc_return(&nr_killed) == nr_to_kill) { - atomic_set(&victims_to_kill, 0); - nr_killed = (atomic_t)ATOMIC_INIT(0); + if (atomic_inc_return(&nr_killed) == victims_to_kill) complete(&reclaim_done); - } break; } } + read_unlock(&mm_free_lock); } /* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */ From 7d8995436f4d1bdc84c43242480957bd638c00e1 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 8 Feb 2020 14:26:31 -0800 Subject: [PATCH 24/44] simple_lmk: Place victims onto SCHED_RR Just increasing the victim's priority to the maximum niceness isn't enough to make it totally preempt everything in SCHED_FAIR, which is important to make sure 
victims die quickly. Resource-wise, this isn't very burdensome since the RT priority is just set to zero, and because dying victims don't have much to do: they only need to finish whatever they're doing quickly. SCHED_RR is used over SCHED_FIFO so that CPU time between the victims is divided evenly to help them all finish at around the same time, as fast as possible. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 21895409ff8f..ce2d1872c2d3 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -199,6 +199,7 @@ static void scan_and_kill(unsigned long pages_needed) /* Kill the victims */ for (i = 0; i < nr_to_kill; i++) { + static const struct sched_param sched_zero_prio; struct victim_info *victim = &victims[i]; struct task_struct *t, *vtsk = victim->tsk; @@ -215,8 +216,8 @@ static void scan_and_kill(unsigned long pages_needed) set_tsk_thread_flag(t, TIF_MEMDIE); rcu_read_unlock(); - /* Increase the victim's priority to make it die faster */ - set_user_nice(vtsk, MIN_NICE); + /* Elevate the victim to SCHED_RR with zero RT priority */ + sched_setscheduler_nocheck(vtsk, SCHED_RR, &sched_zero_prio); /* Allow the victim to run on any CPU. This won't schedule. */ set_cpus_allowed_ptr(vtsk, cpu_all_mask); From 218dc60642274e1fa82c178a1b5ade0d85c3520f Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Tue, 18 Feb 2020 22:37:48 -0800 Subject: [PATCH 25/44] simple_lmk: Relax memory barriers and clean up some styling wake_up() executes a full memory barrier when waking a process up, so there's no need for the acquire in the wait event. Additionally, because of this, the atomic_cmpxchg() only needs a read barrier. The cmpxchg() in simple_lmk_mm_freed() is atomic when it doesn't need to be, so replace it with an extra line of code. 
The atomic_inc_return() in simple_lmk_mm_freed() lies within a lock, so it doesn't need explicit memory barriers. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index ce2d1872c2d3..3372fe21962d 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -145,11 +145,10 @@ static int process_victims(int vlen, unsigned long pages_needed) /* The victim's mm lock is taken in find_victims; release it */ if (pages_found >= pages_needed) { task_unlock(vtsk); - continue; + } else { + pages_found += victim->size; + nr_to_kill++; } - - pages_found += victim->size; - nr_to_kill++; } return nr_to_kill; @@ -249,7 +248,7 @@ static int simple_lmk_reclaim_thread(void *data) sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio); while (1) { - wait_event(oom_waitq, atomic_read_acquire(&needs_reclaim)); + wait_event(oom_waitq, atomic_read(&needs_reclaim)); scan_and_kill(MIN_FREE_PAGES); atomic_set_release(&needs_reclaim, 0); } @@ -260,7 +259,7 @@ static int simple_lmk_reclaim_thread(void *data) void simple_lmk_decide_reclaim(int kswapd_priority) { if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION && - !atomic_cmpxchg(&needs_reclaim, 0, 1)) + !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1)) wake_up(&oom_waitq); } @@ -270,11 +269,13 @@ void simple_lmk_mm_freed(struct mm_struct *mm) read_lock(&mm_free_lock); for (i = 0; i < victims_to_kill; i++) { - if (cmpxchg(&victims[i].mm, mm, NULL) == mm) { - if (atomic_inc_return(&nr_killed) == victims_to_kill) - complete(&reclaim_done); - break; - } + if (victims[i].mm != mm) + continue; + + victims[i].mm = NULL; + if (atomic_inc_return_relaxed(&nr_killed) == victims_to_kill) + complete(&reclaim_done); + break; } read_unlock(&mm_free_lock); } From eb372780cf31491a5d79c54286e9f05ea5202fb8 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: 
Tue, 18 Feb 2020 22:39:41 -0800 Subject: [PATCH 26/44] simple_lmk: Include swap memory usage in the size of victims Swap memory usage is important when determining what to kill, so include it in the victim size calculation. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 3372fe21962d..215ee674d82d 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -75,6 +75,17 @@ static bool vtsk_is_duplicate(int vlen, struct task_struct *vtsk) return false; } +static unsigned long get_total_mm_pages(struct mm_struct *mm) +{ + unsigned long pages = 0; + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) + pages += get_mm_counter(mm, i); + + return pages; +} + static unsigned long find_victims(int *vindex, short target_adj) { unsigned long pages_found = 0; @@ -108,7 +119,7 @@ static unsigned long find_victims(int *vindex, short target_adj) /* Store this potential victim away for later */ victims[*vindex].tsk = vtsk; victims[*vindex].mm = vtsk->mm; - victims[*vindex].size = get_mm_rss(vtsk->mm); + victims[*vindex].size = get_total_mm_pages(vtsk->mm); /* Keep track of the number of pages that have been found */ pages_found += victims[*vindex].size; From 3da2489ad176af993d7ada33c53a40d8f17e0d1b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 19 Feb 2020 14:47:13 -0800 Subject: [PATCH 27/44] mm: Stop kswapd early when nothing's waiting for it to free pages Keeping kswapd running when all the failed allocations that invoked it are satisfied incurs a high overhead due to unnecessary page eviction and writeback, as well as spurious VM pressure events to various registered shrinkers. When kswapd doesn't need to work to make an allocation succeed anymore, stop it prematurely to save resources. 
Signed-off-by: Sultan Alsawaf --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 17 ++++++++++++++--- mm/vmscan.c | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 71b7a8bc82ea..33259efac89d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -653,6 +653,7 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; + atomic_t kswapd_waiters; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab9b01f2bef8..aa3590cbd1f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3885,6 +3885,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int no_progress_loops; unsigned int cpuset_mems_cookie; int reserve_flags; + pg_data_t *pgdat = ac->preferred_zoneref->zone->zone_pgdat; + bool woke_kswapd = false; /* * We also sanity check to catch abuse of atomic reserves being used by @@ -3918,8 +3920,13 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) { + if (!woke_kswapd) { + atomic_inc(&pgdat->kswapd_waiters); + woke_kswapd = true; + } wake_all_kswapds(order, ac); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4113,9 +4120,12 @@ nopage: goto retry; } fail: - warn_alloc(gfp_mask, ac->nodemask, - "page allocation failure: order:%u", order); got_pg: + if (woke_kswapd) + atomic_dec(&pgdat->kswapd_waiters); + if (!page) + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); return page; } @@ -6059,6 +6069,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->kswapd_waiters = (atomic_t)ATOMIC_INIT(0); 
pgdat->per_cpu_nodestats = &boot_nodestats; diff --git a/mm/vmscan.c b/mm/vmscan.c index 03f8ff25a14b..04dfd572436e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3423,7 +3423,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + if (try_to_freeze() || kthread_should_stop() || + !atomic_read(&pgdat->kswapd_waiters)) break; /* From a02bb96bd8f57190bcd59cf35684da40c903df2f Mon Sep 17 00:00:00 2001 From: David Ng Date: Mon, 26 Mar 2018 12:46:49 -0700 Subject: [PATCH 28/44] mm, vmpressure: int cast vmpressure level/mode for -1 comparison Resolve -Wenum-compare issue when comparing vmpressure level/mode against -1 (invalid state). Change-Id: I1c76667ee8390e2d396c96e5ed73f30d0700ffa8 Signed-off-by: David Ng --- mm/vmpressure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce2d25d..afb5beadd219 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -400,7 +400,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, /* Find required level */ token = strsep(&spec, ","); level = str_to_level(token); - if (level == -1) { + if ((int)level == -1) { ret = -EINVAL; goto out; } @@ -409,7 +409,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, token = strsep(&spec, ","); if (token) { mode = str_to_mode(token); - if (mode == -1) { + if ((int)mode == -1) { ret = -EINVAL; goto out; } From d214e3a3cc7dffa3189c30a94368238842e1e0d2 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 4 Mar 2015 16:38:28 +0530 Subject: [PATCH 29/44] mm: vmpressure: allow in-kernel clients to subscribe for events Currently, vmpressure is tied to memcg and its events are available only to userspace clients.
This patch removes the dependency on CONFIG_MEMCG and adds a mechanism for in-kernel clients to subscribe for vmpressure events (in fact raw vmpressure values are delivered instead of vmpressure levels, to provide clients more flexibility to take actions on custom pressure levels which are not currently defined by vmpressure module). Change-Id: I38010f166546e8d7f12f5f355b5dbfd6ba04d587 Signed-off-by: Vinayak Menon --- include/linux/vmpressure.h | 12 ++-- mm/Makefile | 4 +- mm/vmpressure.c | 138 ++++++++++++++++++++++++++++++------- 3 files changed, 121 insertions(+), 33 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 61e6fddfb26f..1b8a21c3757a 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -29,11 +29,13 @@ struct vmpressure { struct mem_cgroup; -#ifdef CONFIG_MEMCG +extern int vmpressure_notifier_register(struct notifier_block *nb); +extern int vmpressure_notifier_unregister(struct notifier_block *nb); extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); +#ifdef CONFIG_MEMCG extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); @@ -44,9 +46,9 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) {} -static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, - int prio) {} +static inline struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + return NULL; +} #endif /* CONFIG_MEMCG */ #endif /* __LINUX_VMPRESSURE_H */ diff --git a/mm/Makefile b/mm/Makefile index 
e7ebd176fb93..89241bd579ec 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o $(mmu-y) vmpressure.o obj-y += init-mm.o @@ -77,7 +77,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o -obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG) += memcontrol.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/vmpressure.c b/mm/vmpressure.c index afb5beadd219..d3b694a89271 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include /* @@ -49,6 +51,24 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; +static struct vmpressure global_vmpressure; +static BLOCKING_NOTIFIER_HEAD(vmpressure_notifier); + +int vmpressure_notifier_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&vmpressure_notifier, nb); +} + +int vmpressure_notifier_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&vmpressure_notifier, nb); +} + +static void vmpressure_notify(unsigned long pressure) +{ + blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL); +} + /* * When there are too little pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size". 
@@ -75,6 +95,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) return container_of(work, struct vmpressure, work); } +#ifdef CONFIG_MEMCG static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); @@ -85,6 +106,12 @@ static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) return NULL; return memcg_to_vmpressure(memcg); } +#else +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + return NULL; +} +#endif enum vmpressure_levels { VMPRESSURE_LOW = 0, @@ -121,7 +148,7 @@ static enum vmpressure_levels vmpressure_level(unsigned long pressure) return VMPRESSURE_LOW; } -static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, +static unsigned long vmpressure_calc_pressure(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; @@ -148,7 +175,7 @@ out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); - return vmpressure_level(pressure); + return pressure; } struct vmpressure_event { @@ -186,6 +213,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + unsigned long pressure; enum vmpressure_levels level; bool ancestor = false; bool signalled = false; @@ -210,7 +238,8 @@ static void vmpressure_work_fn(struct work_struct *work) vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); - level = vmpressure_calc_level(scanned, reclaimed); + pressure = vmpressure_calc_pressure(scanned, reclaimed); + level = vmpressure_level(pressure); do { if (vmpressure_event(vmpr, level, ancestor, signalled)) @@ -219,28 +248,8 @@ static void vmpressure_work_fn(struct work_struct *work) } while ((vmpr = vmpressure_parent(vmpr))); } -/** - * vmpressure() - Account memory pressure through scanned/reclaimed ratio - * @gfp: reclaimer's gfp mask - * @memcg: cgroup memory controller 
handle - * @tree: legacy subtree mode - * @scanned: number of pages scanned - * @reclaimed: number of pages reclaimed - * - * This function should be called from the vmscan reclaim path to account - * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw - * pressure index is then further refined and averaged over time. - * - * If @tree is set, vmpressure is in traditional userspace reporting - * mode: @memcg is considered the pressure root and userspace is - * notified of the entire subtree's reclaim efficiency. - * - * If @tree is not set, reclaim efficiency is recorded for @memcg, and - * only in-kernel users are notified. - * - * This function does not return any value. - */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +#ifdef CONFIG_MEMCG +static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -281,6 +290,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, schedule_work(&vmpr->work); } else { enum vmpressure_levels level; + unsigned long pressure; /* For now, no users for root-level efficiency */ if (!memcg || memcg == root_mem_cgroup) @@ -296,7 +306,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, vmpr->scanned = vmpr->reclaimed = 0; spin_unlock(&vmpr->sr_lock); - level = vmpressure_calc_level(scanned, reclaimed); + pressure = vmpressure_calc_pressure(scanned, reclaimed); + level = vmpressure_level(pressure); if (level > VMPRESSURE_LOW) { /* @@ -311,6 +322,74 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, } } } +#else +static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, + unsigned long scanned, unsigned long reclaimed) +{ +} +#endif + +static void vmpressure_global(gfp_t gfp, unsigned long scanned, + unsigned long reclaimed) +{ + struct vmpressure *vmpr = &global_vmpressure; + unsigned long pressure; + + if (!(gfp & 
(__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + if (!scanned) + return; + + spin_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + spin_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win) + return; + + spin_lock(&vmpr->sr_lock); + vmpr->scanned = 0; + vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + pressure = vmpressure_calc_pressure(scanned, reclaimed); + vmpressure_notify(pressure); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * + * This function does not return any value. 
+ */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, + unsigned long scanned, unsigned long reclaimed) +{ + if (!memcg && tree) + vmpressure_global(gfp, scanned, reclaimed); + + if (IS_ENABLED(CONFIG_MEMCG)) + vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed); +} /** * vmpressure_prio() - Account memory pressure through reclaimer priority level @@ -491,3 +570,10 @@ void vmpressure_cleanup(struct vmpressure *vmpr) */ flush_work(&vmpr->work); } + +static int vmpressure_global_init(void) +{ + vmpressure_init(&global_vmpressure); + return 0; +} +late_initcall(vmpressure_global_init); From 204fdd9788d6ac7196e6b213a955f63a1cdbdb40 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Tue, 31 Mar 2015 11:06:29 +0530 Subject: [PATCH 30/44] mm: vmpressure: scale pressure based on reclaim context The existing calculation of vmpressure takes into account only the ratio of reclaimed to scanned pages, but not the time spent or the difficulty in reclaiming those pages. For e.g. when there are quite a number of file pages in the system, an allocation request can be satisfied by reclaiming the file pages alone. If such a reclaim is successful, the vmpressure value will remain low irrespective of the time spent by the reclaim code to free up the file pages. With a feature like lowmemorykiller, killing a task can be faster than reclaiming the file pages alone. So if the vmpressure values reflect the reclaim difficulty level, clients can make a decision based on that, for e.g. to kill a task early. This patch monitors the number of pages scanned in the direct reclaim path and scales the vmpressure level according to that. 
Signed-off-by: Vinayak Menon Change-Id: I6e643d29a9a1aa0814309253a8b690ad86ec0b13 --- include/linux/vmpressure.h | 1 + mm/vmpressure.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 1b8a21c3757a..de86c6b946c7 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -16,6 +16,7 @@ struct vmpressure { unsigned long tree_scanned; unsigned long tree_reclaimed; + unsigned long stall; /* The lock is used to keep the scanned/reclaimed above in sync. */ struct spinlock sr_lock; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d3b694a89271..e3098a56fafc 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -51,6 +52,10 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; +static unsigned long vmpressure_scale_max = 100; +module_param_named(vmpressure_scale_max, vmpressure_scale_max, + ulong, 0644); + static struct vmpressure global_vmpressure; static BLOCKING_NOTIFIER_HEAD(vmpressure_notifier); @@ -178,6 +183,15 @@ out: return pressure; } +static unsigned long vmpressure_account_stall(unsigned long pressure, + unsigned long stall, unsigned long scanned) +{ + unsigned long scale = + ((vmpressure_scale_max - pressure) * stall) / scanned; + + return pressure + scale; +} + struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; @@ -334,6 +348,7 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, { struct vmpressure *vmpr = &global_vmpressure; unsigned long pressure; + unsigned long stall; if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) return; @@ -344,6 +359,11 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, spin_lock(&vmpr->sr_lock); vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; + + if 
(!current_is_kswapd()) + vmpr->stall += scanned; + + stall = vmpr->stall; scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; spin_unlock(&vmpr->sr_lock); @@ -354,9 +374,11 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, spin_lock(&vmpr->sr_lock); vmpr->scanned = 0; vmpr->reclaimed = 0; + vmpr->stall = 0; spin_unlock(&vmpr->sr_lock); pressure = vmpressure_calc_pressure(scanned, reclaimed); + pressure = vmpressure_account_stall(pressure, stall, scanned); vmpressure_notify(pressure); } From 71d413a1d0db0344b21ce7225e219aa25fc5b02a Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 19 Aug 2015 16:16:39 +0530 Subject: [PATCH 31/44] mm: vmpressure: account allocstalls only on higher pressures At present any vmpressure value is scaled up if the pages are reclaimed through direct reclaim. This can result in false vmpressure values. Consider a case where a device is booted up and most of the memory is occupied by file pages. kswapd will make sure that high watermark is maintained. Now when a sudden huge allocation request comes in, the system will definitely have to get into direct reclaims. The vmpressures can be very low, but because of allocstall accounting logic even these low values will be scaled to values nearing 100. This can result in unnecessary LMK kills for example. So define a tunable threshold for vmpressure above which the allocstalls will be accounted.
Change-Id: Idd7c6724264ac89f1f68f2e9d70a32390ffca3e5 Signed-off-by: Vinayak Menon --- mm/vmpressure.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e3098a56fafc..027db670268e 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -56,6 +56,11 @@ static unsigned long vmpressure_scale_max = 100; module_param_named(vmpressure_scale_max, vmpressure_scale_max, ulong, 0644); +/* vmpressure values >= this will be scaled based on allocstalls */ +static unsigned long allocstall_threshold = 70; +module_param_named(allocstall_threshold, allocstall_threshold, + ulong, 0644); + static struct vmpressure global_vmpressure; static BLOCKING_NOTIFIER_HEAD(vmpressure_notifier); @@ -186,8 +191,12 @@ out: static unsigned long vmpressure_account_stall(unsigned long pressure, unsigned long stall, unsigned long scanned) { - unsigned long scale = - ((vmpressure_scale_max - pressure) * stall) / scanned; + unsigned long scale; + + if (pressure < allocstall_threshold) + return pressure; + + scale = ((vmpressure_scale_max - pressure) * stall) / scanned; return pressure + scale; } From b77c1b0f03e88db97835d7e780bd370f211f4f13 Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Mon, 19 Sep 2016 12:44:15 +0530 Subject: [PATCH 32/44] mm: vmpressure: make vmpressure window variable Right now the vmpressure window is of constant size 2MB, which works well with the following exceptions. 1) False vmpressure triggers are seen when the RAM size is greater than 3GB. This results in lowmemorykiller, which uses vmpressure events, killing tasks unnecessarily. 2) Vmpressure events are received late under memory pressure. This behaviour is seen prominently in <=2GB RAM targets. This results in lowmemorykiller kicking in late to kill tasks resulting in avoidable page cache reclaim. The problem analysis shows that the issue is with the constant size of the vmpressure window which does not adapt to the varying memory conditions. 
This patch recalculates the vmpressure window size at the end of each window. The chosen window size is proportional to the total of free and cached memory at that point. Change-Id: I7e9ef4ddd82e2c2dd04ce09ec8d58a8829cfb64d Signed-off-by: Vinayak Menon --- mm/vmpressure.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 027db670268e..679fe3020b77 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -41,7 +41,7 @@ * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). */ -static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; +static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; /* * These thresholds are used when we account memory pressure through @@ -352,6 +352,29 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, } #endif +static void calculate_vmpressure_win(void) +{ + long x; + + x = global_node_page_state(NR_FILE_PAGES) - + global_node_page_state(NR_SHMEM) - + total_swapcache_pages() + + global_zone_page_state(NR_FREE_PAGES); + if (x < 1) + x = 1; + /* + * For low (free + cached), vmpressure window should be + * small, and high for higher values of (free + cached). + * But it should not be linear either. This ensures + * timely vmpressure notifications when system is under + * memory pressure, and optimal number of events when + * cached is high. The square root function is empirically + * found to serve the purpose.
+ */ + x = int_sqrt(x); + vmpressure_win = x; +} + static void vmpressure_global(gfp_t gfp, unsigned long scanned, unsigned long reclaimed) { @@ -366,6 +389,9 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, return; spin_lock(&vmpr->sr_lock); + if (!vmpr->scanned) + calculate_vmpressure_win(); + vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; From d7995da0e7d14a021a4e67cbec7a5193c83cf920 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Mon, 24 Feb 2020 19:03:04 -0800 Subject: [PATCH 33/44] simple_lmk: Use vmpressure notifier to trigger kills Using kswapd's scan depth to trigger task kills is inconsistent and unreliable. When memory pressure quickly spikes, the kswapd scan depth trigger fails to kick off Simple LMK fast enough, causing severe lag. Additionally, kswapd could stop scanning prematurely before reaching the desired scan depth to trigger Simple LMK, which could also cause stalls. To remedy this, use the vmpressure framework instead, since it provides more consistent and accurate readings on memory pressure. This is not very tunable though, so remove CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION. Triggering Simple LMK to kill when the reported memory pressure is 100 should yield good results on all setups. Signed-off-by: Sultan Alsawaf --- drivers/android/Kconfig | 22 ---------------------- drivers/android/simple_lmk.c | 24 +++++++++++++++++------- include/linux/simple_lmk.h | 4 ---- mm/vmscan.c | 2 -- 4 files changed, 17 insertions(+), 35 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 7f65391e5e73..6a87d1298c5e 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -55,28 +55,6 @@ config ANDROID_SIMPLE_LMK if ANDROID_SIMPLE_LMK -config ANDROID_SIMPLE_LMK_AGGRESSION - int "Reclaim frequency selection" - range 1 3 - default 1 - help - This value determines how frequently Simple LMK will perform memory - reclaims. 
A lower value corresponds to less frequent reclaims, which - maximizes memory usage. The range of values has a logarithmic - correlation; 2 is twice as aggressive as 1, and 3 is twice as - aggressive as 2, which makes 3 four times as aggressive as 1. - - The aggression is set as a factor of kswapd's scan depth. This means - that a system with more memory will have a more expensive aggression - factor compared to a system with less memory. For example, setting an - aggression factor of 1 with 4 GiB of memory would be like setting a - factor of 2 with 8 GiB of memory; the more memory a system has, the - more expensive it is to use a lower value. - - Choosing a value of 1 here works well with systems that have 4 GiB of - memory. If the default doesn't work well, then this value should be - tweaked based on empirical results using different values. - config ANDROID_SIMPLE_LMK_MINFREE int "Minimum MiB of memory to free per reclaim" range 8 512 diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 215ee674d82d..2a3316100c79 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* The minimum number of pages to free per reclaim */ @@ -267,13 +268,6 @@ static int simple_lmk_reclaim_thread(void *data) return 0; } -void simple_lmk_decide_reclaim(int kswapd_priority) -{ - if (kswapd_priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION && - !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1)) - wake_up(&oom_waitq); -} - void simple_lmk_mm_freed(struct mm_struct *mm) { int i; @@ -291,6 +285,20 @@ void simple_lmk_mm_freed(struct mm_struct *mm) read_unlock(&mm_free_lock); } +static int simple_lmk_vmpressure_cb(struct notifier_block *nb, + unsigned long pressure, void *data) +{ + if (pressure == 100 && !atomic_cmpxchg_acquire(&needs_reclaim, 0, 1)) + wake_up(&oom_waitq); + + return NOTIFY_OK; +} + +static struct notifier_block vmpressure_notif = { + .notifier_call = 
simple_lmk_vmpressure_cb, + .priority = INT_MAX +}; + /* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */ static int simple_lmk_init_set(const char *val, const struct kernel_param *kp) { @@ -301,7 +309,9 @@ static int simple_lmk_init_set(const char *val, const struct kernel_param *kp) thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd"); BUG_ON(IS_ERR(thread)); + BUG_ON(vmpressure_notifier_register(&vmpressure_notif)); } + return 0; } diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h index 28103c1b1d4c..b02d1bec9731 100644 --- a/include/linux/simple_lmk.h +++ b/include/linux/simple_lmk.h @@ -8,12 +8,8 @@ struct mm_struct; #ifdef CONFIG_ANDROID_SIMPLE_LMK -void simple_lmk_decide_reclaim(int kswapd_priority); void simple_lmk_mm_freed(struct mm_struct *mm); #else -static inline void simple_lmk_decide_reclaim(int kswapd_priority) -{ -} static inline void simple_lmk_mm_freed(struct mm_struct *mm) { } diff --git a/mm/vmscan.c b/mm/vmscan.c index 04dfd572436e..ac6f0964f5ea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include @@ -3351,7 +3350,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - simple_lmk_decide_reclaim(sc.priority); sc.reclaim_idx = classzone_idx; /* From 2bf495cd34530ff82c9c61bafa1aa2a8c8224239 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 26 Feb 2020 10:14:18 -0800 Subject: [PATCH 34/44] simple_lmk: Update adj targeting for Android 10 Android 10 changed its adj assignments. Update Simple LMK to use the new adjs, which also requires looking at each pair of adjs as a range. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index 2a3316100c79..f502eb5da8d7 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -29,23 +29,21 @@ struct victim_info { }; /* Pulled from the Android framework. Lower adj means higher priority. */ -static const short adj_prio[] = { - 906, /* CACHED_APP_MAX_ADJ */ - 905, /* Cached app */ - 904, /* Cached app */ - 903, /* Cached app */ - 902, /* Cached app */ - 901, /* Cached app */ - 900, /* CACHED_APP_MIN_ADJ */ - 800, /* SERVICE_B_ADJ */ - 700, /* PREVIOUS_APP_ADJ */ - 600, /* HOME_APP_ADJ */ - 500, /* SERVICE_ADJ */ - 400, /* HEAVY_WEIGHT_APP_ADJ */ - 300, /* BACKUP_APP_ADJ */ - 200, /* PERCEPTIBLE_APP_ADJ */ - 100, /* VISIBLE_APP_ADJ */ - 0 /* FOREGROUND_APP_ADJ */ +static const short adjs[] = { + 1000, /* CACHED_APP_MAX_ADJ + 1 */ + 950, /* CACHED_APP_LMK_FIRST_ADJ */ + 900, /* CACHED_APP_MIN_ADJ */ + 800, /* SERVICE_B_ADJ */ + 700, /* PREVIOUS_APP_ADJ */ + 600, /* HOME_APP_ADJ */ + 500, /* SERVICE_ADJ */ + 400, /* HEAVY_WEIGHT_APP_ADJ */ + 300, /* BACKUP_APP_ADJ */ + 250, /* PERCEPTIBLE_LOW_APP_ADJ */ + 200, /* PERCEPTIBLE_APP_ADJ */ + 100, /* VISIBLE_APP_ADJ */ + 50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */ + 0 /* FOREGROUND_APP_ADJ */ }; static struct victim_info victims[MAX_VICTIMS]; @@ -87,7 +85,8 @@ static unsigned long get_total_mm_pages(struct mm_struct *mm) return pages; } -static unsigned long find_victims(int *vindex, short target_adj) +static unsigned long find_victims(int *vindex, short target_adj_min, + short target_adj_max) { unsigned long pages_found = 0; int old_vindex = *vindex; @@ -96,6 +95,7 @@ static unsigned long find_victims(int *vindex, short target_adj) for_each_process(tsk) { struct signal_struct *sig; struct task_struct *vtsk; + short adj; /* * Search for suitable tasks with 
the targeted importance (adj). @@ -107,7 +107,8 @@ static unsigned long find_victims(int *vindex, short target_adj) * trying to lock a task that we locked earlier. */ sig = tsk->signal; - if (READ_ONCE(sig->oom_score_adj) != target_adj || + adj = READ_ONCE(sig->oom_score_adj); + if (adj < target_adj_min || adj > target_adj_max - 1 || sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP) || (thread_group_empty(tsk) && tsk->flags & PF_EXITING) || vtsk_is_duplicate(*vindex, tsk)) @@ -177,8 +178,8 @@ static void scan_and_kill(unsigned long pages_needed) * is guaranteed to be up to date. */ read_lock(&tasklist_lock); - for (i = 0; i < ARRAY_SIZE(adj_prio); i++) { - pages_found += find_victims(&nr_victims, adj_prio[i]); + for (i = 1; i < ARRAY_SIZE(adjs); i++) { + pages_found += find_victims(&nr_victims, adjs[i], adjs[i - 1]); if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS) break; } From adfac547b4509f030accfac9f363230b565979f9 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 28 Feb 2020 12:43:54 -0800 Subject: [PATCH 35/44] mm: vmpressure: Don't exclude any allocation types Although userspace processes can't directly help with kernel memory pressure, killing userspace processes can relieve kernel memory if they are responsible for that pressure in the first place. It doesn't make sense to exclude any allocation types knowing that userspace can indeed affect all memory pressure, so don't exclude any allocation types from the pressure calculations. Signed-off-by: Sultan Alsawaf --- mm/vmpressure.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 679fe3020b77..2fdb0a6e25c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -277,20 +277,6 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); - /* - * Here we only want to account pressure that userland is able to - * help us with. 
For example, suppose that DMA zone is under - * pressure; if we notify userland about that kind of pressure, - * then it will be mostly a waste as it will trigger unnecessary - * freeing of memory by userland (since userland is more likely to - * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That - * is why we include only movable, highmem and FS/IO pages. - * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so - * we account it too. - */ - if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) - return; - /* * If we got here with no pages scanned, then that is an indicator * that reclaimer was unable to find any shrinkable LRUs at the @@ -382,9 +368,6 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, unsigned long pressure; unsigned long stall; - if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) - return; - if (!scanned) return; From cb62b63f33865280a8589f3117f10a0ba5bce816 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 28 Feb 2020 12:38:10 -0800 Subject: [PATCH 36/44] mm: vmpressure: Interpret zero scanned pages as 100% pressure When no pages are scanned, it usually means no zones were reclaimable and nothing could be done. In this case, the reported pressure should be 100 to elicit help from any listeners. This fixes the vmpressure framework not working when memory pressure is very high. 
Signed-off-by: Sultan Alsawaf --- mm/vmpressure.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 2fdb0a6e25c7..b69c37ea2feb 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -368,26 +368,25 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, unsigned long pressure; unsigned long stall; - if (!scanned) - return; + if (scanned) { + spin_lock(&vmpr->sr_lock); + if (!vmpr->scanned) + calculate_vmpressure_win(); - spin_lock(&vmpr->sr_lock); - if (!vmpr->scanned) - calculate_vmpressure_win(); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; + if (!current_is_kswapd()) + vmpr->stall += scanned; - if (!current_is_kswapd()) - vmpr->stall += scanned; + stall = vmpr->stall; + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + spin_unlock(&vmpr->sr_lock); - stall = vmpr->stall; - scanned = vmpr->scanned; - reclaimed = vmpr->reclaimed; - spin_unlock(&vmpr->sr_lock); - - if (scanned < vmpressure_win) - return; + if (scanned < vmpressure_win) + return; + } spin_lock(&vmpr->sr_lock); vmpr->scanned = 0; @@ -395,8 +394,12 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, vmpr->stall = 0; spin_unlock(&vmpr->sr_lock); - pressure = vmpressure_calc_pressure(scanned, reclaimed); - pressure = vmpressure_account_stall(pressure, stall, scanned); + if (scanned) { + pressure = vmpressure_calc_pressure(scanned, reclaimed); + pressure = vmpressure_account_stall(pressure, stall, scanned); + } else { + pressure = 100; + } vmpressure_notify(pressure); } From 969582499faf187352801b85be4d33b13908f9a5 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Fri, 28 Feb 2020 12:57:20 -0800 Subject: [PATCH 37/44] mm: vmpressure: Don't cache the window size Caching the window size can result in delayed or inaccurate pressure reports. 
Since calculating a fresh window size is cheap, do so all the time instead of relying on a stale, cached value. Signed-off-by: Sultan Alsawaf --- mm/vmpressure.c | 112 ++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 60 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b69c37ea2feb..cb7f3ef0004f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -27,22 +27,6 @@ #include #include -/* - * The window size (vmpressure_win) is the number of scanned pages before - * we try to analyze scanned/reclaimed ratio. So the window is used as a - * rate-limit tunable for the "low" level notification, and also for - * averaging the ratio for medium/critical levels. Using small window - * sizes can cause lot of false positives, but too big window size will - * delay the notifications. - * - * As the vmscan reclaimer logic works with chunks which are multiple of - * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. - * - * TODO: Make the window size depend on machine size, as we do for vmstat - * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). - */ -static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; - /* * These thresholds are used when we account memory pressure through * scanned/reclaimed ratio. The current values were chosen empirically. In @@ -271,9 +255,32 @@ static void vmpressure_work_fn(struct work_struct *work) } while ((vmpr = vmpressure_parent(vmpr))); } +static unsigned long calculate_vmpressure_win(void) +{ + long x; + + x = global_node_page_state(NR_FILE_PAGES) - + global_node_page_state(NR_SHMEM) - + total_swapcache_pages() + + global_zone_page_state(NR_FREE_PAGES); + if (x < 1) + return 1; + /* + * For low (free + cached), vmpressure window should be + * small, and high for higher values of (free + cached). + * But it should not be linear as well. 
This ensures + * timely vmpressure notifications when system is under + * memory pressure, and optimal number of events when + * cached is high. The sqaure root function is empirically + * found to serve the purpose. + */ + return int_sqrt(x); +} + #ifdef CONFIG_MEMCG -static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) +static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical, + bool tree, unsigned long scanned, + unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -285,7 +292,9 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, * (scanning depth) goes too high (deep), we will be notified * through vmpressure_prio(). But so far, keep calm. */ - if (!scanned) + if (critical) + scanned = calculate_vmpressure_win(); + else if (!scanned) return; if (tree) { @@ -294,7 +303,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, vmpr->tree_reclaimed += reclaimed; spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) + if (!critical && scanned < calculate_vmpressure_win()) return; schedule_work(&vmpr->work); } else { @@ -308,7 +317,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned += scanned; reclaimed = vmpr->reclaimed += reclaimed; - if (scanned < vmpressure_win) { + if (!critical && scanned < calculate_vmpressure_win()) { spin_unlock(&vmpr->sr_lock); return; } @@ -332,47 +341,23 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, } } #else -static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) -{ -} +static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical, + bool tree, unsigned long scanned, + unsigned long reclaimed) { } #endif -static void calculate_vmpressure_win(void) -{ 
- long x; - - x = global_node_page_state(NR_FILE_PAGES) - - global_node_page_state(NR_SHMEM) - - total_swapcache_pages() + - global_zone_page_state(NR_FREE_PAGES); - if (x < 1) - x = 1; - /* - * For low (free + cached), vmpressure window should be - * small, and high for higher values of (free + cached). - * But it should not be linear as well. This ensures - * timely vmpressure notifications when system is under - * memory pressure, and optimal number of events when - * cached is high. The sqaure root function is empirically - * found to serve the purpose. - */ - x = int_sqrt(x); - vmpressure_win = x; -} - -static void vmpressure_global(gfp_t gfp, unsigned long scanned, - unsigned long reclaimed) +static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical, + unsigned long reclaimed) { struct vmpressure *vmpr = &global_vmpressure; unsigned long pressure; unsigned long stall; + if (critical) + scanned = calculate_vmpressure_win(); + if (scanned) { spin_lock(&vmpr->sr_lock); - if (!vmpr->scanned) - calculate_vmpressure_win(); - vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; @@ -384,7 +369,7 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, reclaimed = vmpr->reclaimed; spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) + if (!critical && scanned < calculate_vmpressure_win()) return; } @@ -403,6 +388,17 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, vmpressure_notify(pressure); } +static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical, + bool tree, unsigned long scanned, + unsigned long reclaimed) +{ + if (!memcg && tree) + vmpressure_global(gfp, scanned, critical, reclaimed); + + if (IS_ENABLED(CONFIG_MEMCG)) + vmpressure_memcg(gfp, memcg, critical, tree, scanned, reclaimed); +} + /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask @@ -427,11 +423,7 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, void 
vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { - if (!memcg && tree) - vmpressure_global(gfp, scanned, reclaimed); - - if (IS_ENABLED(CONFIG_MEMCG)) - vmpressure_memcg(gfp, memcg, tree, scanned, reclaimed); + __vmpressure(gfp, memcg, false, tree, scanned, reclaimed); } /** @@ -461,7 +453,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, true, vmpressure_win, 0); + __vmpressure(gfp, memcg, true, true, 0, 0); } static enum vmpressure_levels str_to_level(const char *arg) From e9310a4713a05a88680de2366324ed04a89c5716 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 26 Mar 2020 21:42:01 -0700 Subject: [PATCH 38/44] mm: Adjust tsk_is_oom_victim() for Simple LMK The page allocator uses tsk_is_oom_victim() to determine when to fast-path memory allocations in order to get an allocating process out of the page allocator and into do_exit() quickly. Unfortunately, tsk_is_oom_victim()'s check to see if a process is killed for OOM purposes is to look for the presence of an OOM reaper artifact that only the OOM killer sets. This means that for processes killed by Simple LMK, there is no fast-pathing done in the page allocator to get them to die faster. Remedy this by changing tsk_is_oom_victim() to look for the existence of the TIF_MEMDIE flag, which Simple LMK sets for its victims. 
Signed-off-by: Sultan Alsawaf --- include/linux/oom.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6adac113e96d..da17f33bebf5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -63,7 +63,11 @@ static inline bool oom_task_origin(const struct task_struct *p) static inline bool tsk_is_oom_victim(struct task_struct * tsk) { +#ifdef CONFIG_ANDROID_SIMPLE_LMK + return test_ti_thread_flag(task_thread_info(tsk), TIF_MEMDIE); +#else return tsk->signal->oom_mm; +#endif } /* From c52571bf78b9909aad870236deb0a0ec28837c56 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 26 Mar 2020 21:44:28 -0700 Subject: [PATCH 39/44] mm: Don't warn on page allocation failures for OOM-killed processes It can be normal for a dying process to have its page allocation request fail when it has an OOM or LMK kill pending. In this case, it's actually detrimental to print out a massive allocation failure message because this means the running process needs to die quickly and release its memory, which is slowed down slightly by the massive kmsg splat. The allocation failure message is also a false positive in this case, since the failure is intentional rather than being the result of an inability to allocate memory. Suppress the allocation failure warning for processes that are killed to release memory in order to expedite their death and remedy the kmsg confusion from seeing spurious allocation failure messages. 
Signed-off-by: Sultan Alsawaf --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa3590cbd1f8..dd3f76a44814 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,8 +4065,10 @@ retry: /* Avoid allocations with no watermarks from looping endlessly */ if (tsk_is_oom_victim(current) && (alloc_flags == ALLOC_OOM || - (gfp_mask & __GFP_NOMEMALLOC))) + (gfp_mask & __GFP_NOMEMALLOC))) { + gfp_mask |= __GFP_NOWARN; goto nopage; + } /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { From 2ce5d5b6898f22ba649602735b0b5b80ba2e5313 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sat, 4 Apr 2020 17:48:40 -0700 Subject: [PATCH 40/44] mm: vmpressure: Ignore allocation orders above PAGE_ALLOC_COSTLY_ORDER PAGE_ALLOC_COSTLY_ORDER allocations can cause vmpressure to incorrectly think that memory pressure is high, when it's really just that the allocation's high order is difficult to satisfy. When this rare scenario occurs, ignore the input to vmpressure to avoid sending out a spurious high-pressure signal. 
Signed-off-by: Sultan Alsawaf --- include/linux/vmpressure.h | 3 ++- mm/vmpressure.c | 5 ++++- mm/vmscan.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index de86c6b946c7..31bffa69e864 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -33,7 +33,8 @@ struct mem_cgroup; extern int vmpressure_notifier_register(struct notifier_block *nb); extern int vmpressure_notifier_unregister(struct notifier_block *nb); extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed); + unsigned long scanned, unsigned long reclaimed, + int order); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); #ifdef CONFIG_MEMCG diff --git a/mm/vmpressure.c b/mm/vmpressure.c index cb7f3ef0004f..9c2c9d08718e 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -421,8 +421,11 @@ static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical, * This function does not return any value. 
*/ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, - unsigned long scanned, unsigned long reclaimed) + unsigned long scanned, unsigned long reclaimed, int order) { + if (order > PAGE_ALLOC_COSTLY_ORDER) + return; + __vmpressure(gfp, memcg, false, tree, scanned, reclaimed); } diff --git a/mm/vmscan.c b/mm/vmscan.c index ac6f0964f5ea..0c383b2ae625 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2644,7 +2644,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + sc->nr_reclaimed - reclaimed, sc->order); /* * Direct reclaim and kswapd have to scan all memory @@ -2680,7 +2680,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* Record the subtree's reclaim efficiency */ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + sc->nr_reclaimed - nr_reclaimed, sc->order); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; From 667691476c0d42362407cd3a5fba9e7a626406c5 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Thu, 14 May 2020 15:16:48 -0700 Subject: [PATCH 41/44] simple_lmk: Consider all positive adjs when finding victims We are allowed to kill any process with a positive adj, so we shouldn't exclude any processes with adjs greater than 999. This would present a problem with quirky applications that set their own adj score, such as stress-ng. In the case of stress-ng, it would set its adj score to 1000 and thus exempt itself from being killed by Simple LMK. This shouldn't be allowed; any process with a positive adj, up to the highest positive adj possible (32767) should be killable. 
Reported-by: Danny Lin Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index f502eb5da8d7..d89e5b1ce363 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -29,21 +29,21 @@ struct victim_info { }; /* Pulled from the Android framework. Lower adj means higher priority. */ -static const short adjs[] = { - 1000, /* CACHED_APP_MAX_ADJ + 1 */ - 950, /* CACHED_APP_LMK_FIRST_ADJ */ - 900, /* CACHED_APP_MIN_ADJ */ - 800, /* SERVICE_B_ADJ */ - 700, /* PREVIOUS_APP_ADJ */ - 600, /* HOME_APP_ADJ */ - 500, /* SERVICE_ADJ */ - 400, /* HEAVY_WEIGHT_APP_ADJ */ - 300, /* BACKUP_APP_ADJ */ - 250, /* PERCEPTIBLE_LOW_APP_ADJ */ - 200, /* PERCEPTIBLE_APP_ADJ */ - 100, /* VISIBLE_APP_ADJ */ - 50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */ - 0 /* FOREGROUND_APP_ADJ */ +static const unsigned short adjs[] = { + SHRT_MAX + 1, /* Include all positive adjs in the final range */ + 950, /* CACHED_APP_LMK_FIRST_ADJ */ + 900, /* CACHED_APP_MIN_ADJ */ + 800, /* SERVICE_B_ADJ */ + 700, /* PREVIOUS_APP_ADJ */ + 600, /* HOME_APP_ADJ */ + 500, /* SERVICE_ADJ */ + 400, /* HEAVY_WEIGHT_APP_ADJ */ + 300, /* BACKUP_APP_ADJ */ + 250, /* PERCEPTIBLE_LOW_APP_ADJ */ + 200, /* PERCEPTIBLE_APP_ADJ */ + 100, /* VISIBLE_APP_ADJ */ + 50, /* PERCEPTIBLE_RECENT_FOREGROUND_APP_ADJ */ + 0 /* FOREGROUND_APP_ADJ */ }; static struct victim_info victims[MAX_VICTIMS]; @@ -85,8 +85,8 @@ static unsigned long get_total_mm_pages(struct mm_struct *mm) return pages; } -static unsigned long find_victims(int *vindex, short target_adj_min, - short target_adj_max) +static unsigned long find_victims(int *vindex, unsigned short target_adj_min, + unsigned short target_adj_max) { unsigned long pages_found = 0; int old_vindex = *vindex; From 06f2951e89ea18ad9aea3b7fa3203349ee2ee22b Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: 
Wed, 20 May 2020 09:55:17 -0700 Subject: [PATCH 42/44] mm: Don't stop kswapd on a per-node basis when there are no waiters The page allocator wakes all kswapds in an allocation context's allowed nodemask in the slow path, so it doesn't make sense to have the kswapd- waiter count per each NUMA node. Instead, it should be a global counter to stop all kswapds when there are no failed allocation requests. Signed-off-by: Sultan Alsawaf --- include/linux/mmzone.h | 1 - mm/internal.h | 1 + mm/page_alloc.c | 8 ++++---- mm/vmscan.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 33259efac89d..71b7a8bc82ea 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -653,7 +653,6 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - atomic_t kswapd_waiters; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by diff --git a/mm/internal.h b/mm/internal.h index a182506242c4..b4d77565c358 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -167,6 +167,7 @@ extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern atomic_long_t kswapd_waiters; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd3f76a44814..a4ca21db64e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,8 @@ #include #include "internal.h" +atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -3885,7 +3887,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int no_progress_loops; unsigned int cpuset_mems_cookie; int 
reserve_flags; - pg_data_t *pgdat = ac->preferred_zoneref->zone->zone_pgdat; bool woke_kswapd = false; /* @@ -3922,7 +3923,7 @@ retry_cpuset: if (gfp_mask & __GFP_KSWAPD_RECLAIM) { if (!woke_kswapd) { - atomic_inc(&pgdat->kswapd_waiters); + atomic_long_inc(&kswapd_waiters); woke_kswapd = true; } wake_all_kswapds(order, ac); @@ -4124,7 +4125,7 @@ nopage: fail: got_pg: if (woke_kswapd) - atomic_dec(&pgdat->kswapd_waiters); + atomic_long_dec(&kswapd_waiters); if (!page) warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); @@ -6071,7 +6072,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); - pgdat->kswapd_waiters = (atomic_t)ATOMIC_INIT(0); pgdat->per_cpu_nodestats = &boot_nodestats; diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c383b2ae625..cecf47b1982c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3422,7 +3422,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop() || - !atomic_read(&pgdat->kswapd_waiters)) + !atomic_long_read(&kswapd_waiters)) break; /* From 9bdd0a8ccd976dfb9effab1f91243fa59c26ee0c Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 20 May 2020 19:52:05 -0700 Subject: [PATCH 43/44] simple_lmk: Hold an RCU read lock instead of the tasklist read lock We already check to see if each eligible process isn't already dying, so an RCU read lock can be used to speed things up instead of holding the tasklist read lock. 
Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index d89e5b1ce363..a08287e4fcfe 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -172,18 +172,14 @@ static void scan_and_kill(unsigned long pages_needed) int i, nr_to_kill = 0, nr_victims = 0, ret; unsigned long pages_found = 0; - /* - * Hold the tasklist lock so tasks don't disappear while scanning. This - * is preferred to holding an RCU read lock so that the list of tasks - * is guaranteed to be up to date. - */ - read_lock(&tasklist_lock); + /* Hold an RCU read lock while traversing the global process list */ + rcu_read_lock(); for (i = 1; i < ARRAY_SIZE(adjs); i++) { pages_found += find_victims(&nr_victims, adjs[i], adjs[i - 1]); if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS) break; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* Pretty unlikely but it can happen */ if (unlikely(!nr_victims)) { From 7b7d605b765a983ea792baed08dae688faac2165 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 10 Jun 2020 17:55:04 -0700 Subject: [PATCH 44/44] simple_lmk: Remove unnecessary clean-up when timeout is reached Zeroing out the mm struct pointers when the timeout is hit isn't needed because mm_free_lock prevents any readers from accessing the mm struct pointers while clean-up occurs, and since the simple_lmk_mm_freed() loop bound is set to zero during clean-up, there is no possibility of dying processes ever reading stale mm struct pointers. Therefore, it is unnecessary to clear out the mm struct pointers when the timeout is reached. Now the only step to do when the timeout is reached is to re-init the completion, but since reinit_completion() just sets a struct member to zero, call reinit_completion() unconditionally as it is faster than encapsulating it within a conditional statement. 
Also take this opportunity to rename some variables and tidy up some code indentation. Signed-off-by: Sultan Alsawaf --- drivers/android/simple_lmk.c | 40 +++++++++++++++--------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c index a08287e4fcfe..b0bffb991aa3 100644 --- a/drivers/android/simple_lmk.c +++ b/drivers/android/simple_lmk.c @@ -50,7 +50,7 @@ static struct victim_info victims[MAX_VICTIMS]; static DECLARE_WAIT_QUEUE_HEAD(oom_waitq); static DECLARE_COMPLETION(reclaim_done); static DEFINE_RWLOCK(mm_free_lock); -static int victims_to_kill; +static int nr_victims; static atomic_t needs_reclaim = ATOMIC_INIT(0); static atomic_t nr_killed = ATOMIC_INIT(0); @@ -169,26 +169,26 @@ static int process_victims(int vlen, unsigned long pages_needed) static void scan_and_kill(unsigned long pages_needed) { - int i, nr_to_kill = 0, nr_victims = 0, ret; + int i, nr_to_kill = 0, nr_found = 0; unsigned long pages_found = 0; /* Hold an RCU read lock while traversing the global process list */ rcu_read_lock(); for (i = 1; i < ARRAY_SIZE(adjs); i++) { - pages_found += find_victims(&nr_victims, adjs[i], adjs[i - 1]); - if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS) + pages_found += find_victims(&nr_found, adjs[i], adjs[i - 1]); + if (pages_found >= pages_needed || nr_found == MAX_VICTIMS) break; } rcu_read_unlock(); /* Pretty unlikely but it can happen */ - if (unlikely(!nr_victims)) { + if (unlikely(!nr_found)) { pr_err("No processes available to kill!\n"); return; } /* First round of victim processing to weed out unneeded victims */ - nr_to_kill = process_victims(nr_victims, pages_needed); + nr_to_kill = process_victims(nr_found, pages_needed); /* * Try to kill as few of the chosen victims as possible by sorting the @@ -202,7 +202,7 @@ static void scan_and_kill(unsigned long pages_needed) /* Store the final number of victims for simple_lmk_mm_freed() */ 
write_lock(&mm_free_lock); - victims_to_kill = nr_to_kill; + nr_victims = nr_to_kill; write_unlock(&mm_free_lock); /* Kill the victims */ @@ -235,15 +235,10 @@ static void scan_and_kill(unsigned long pages_needed) } /* Wait until all the victims die or until the timeout is reached */ - ret = wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES); + wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES); write_lock(&mm_free_lock); - if (!ret) { - /* Extra clean-up is needed when the timeout is hit */ - reinit_completion(&reclaim_done); - for (i = 0; i < nr_to_kill; i++) - victims[i].mm = NULL; - } - victims_to_kill = 0; + reinit_completion(&reclaim_done); + nr_victims = 0; nr_killed = (atomic_t)ATOMIC_INIT(0); write_unlock(&mm_free_lock); } @@ -270,14 +265,13 @@ void simple_lmk_mm_freed(struct mm_struct *mm) int i; read_lock(&mm_free_lock); - for (i = 0; i < victims_to_kill; i++) { - if (victims[i].mm != mm) - continue; - - victims[i].mm = NULL; - if (atomic_inc_return_relaxed(&nr_killed) == victims_to_kill) - complete(&reclaim_done); - break; + for (i = 0; i < nr_victims; i++) { + if (victims[i].mm == mm) { + victims[i].mm = NULL; + if (atomic_inc_return_relaxed(&nr_killed) == nr_victims) + complete(&reclaim_done); + break; + } } read_unlock(&mm_free_lock); }