From b61c5be1f6e228ba18468ccfaac99050614866fc Mon Sep 17 00:00:00 2001
From: "Angelo G. Del Regno"
Date: Mon, 24 Jun 2019 23:21:00 +0200
Subject: [PATCH] Backport new vmalloc for "large performance benefits"

This is a backport from Linux 5.2-rc1 of a patch series that greatly
improves vmalloc's performance, especially on embedded systems, plus all
of its dependencies that were missing in kernel 4.9.

For all the information, refer to LKML:
https://lkml.org/lkml/2018/10/19/786

In brief: currently, allocating a new VA area is done by iterating the
busy list until a suitable hole is found between two busy areas, so each
new allocation causes the list to grow. Because of the long list and the
different permissive parameters, an allocation can take a long time on
embedded devices (milliseconds).

This patch organizes the vmalloc memory layout into free areas of the
VMALLOC_START-VMALLOC_END range. It uses a red-black tree that keeps
blocks sorted by their offsets, paired with a linked list that keeps the
free areas sorted in increasing address order.

Quoting Phoronix: with this patch from Uladzislau Rezki, calling
vmalloc() can take up to 67% less time compared to the behavior on Linux
5.1 and prior, at least with tests done by the developer under QEMU.

Personal tests show that the device is more responsive under high memory
pressure, and that huge allocations are noticeably faster, for example
when starting Chrome with more than 100 open tabs after a system reboot
(i.e. a completely uncached load of it).

Shamelessly kanged from: https://github.com/sonyxperiadev/kernel / Pull Request 2016
---
 Documentation/dev-tools/kmemleak.rst | 1 + arch/Kconfig | 3 + arch/arm/include/asm/cacheflush.h | 1 + arch/arm/include/asm/set_memory.h | 32 + arch/arm/mm/ioremap.c | 3 +- arch/arm/mm/mmu.c | 41 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/arm64/include/asm/cacheflush.h | 1 + arch/cris/mm/init.c | 2 +- arch/parisc/kernel/module.c | 2 +- arch/powerpc/platforms/pseries/cmm.c | 10 +- arch/s390/include/asm/set_memory.h | 31 + arch/s390/mm/init.c | 2 +- arch/sparc/mm/init_32.c | 2 +- arch/tile/mm/init.c | 2 +- arch/um/kernel/mem.c | 4 +- arch/x86/include/asm/set_memory.h | 87 + arch/x86/kernel/cpu/microcode/core.c | 4 +- arch/x86/kernel/module.c | 2 +- arch/x86/kvm/lapic.c | 4 +- arch/x86/kvm/page_track.c | 4 +- arch/x86/kvm/x86.c | 4 +- arch/x86/mm/highmem_32.c | 2 +- arch/x86/mm/pageattr.c | 14 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/char/agp/backend.c | 4 +- drivers/hv/hv_balloon.c | 18 +- drivers/md/dm-bufio.c | 7 +- drivers/md/dm-stats.c | 9 +- drivers/media/platform/mtk-vpu/mtk_vpu.c | 2 +- drivers/parisc/ccio-dma.c | 4 +- drivers/parisc/sba_iommu.c | 2 +- drivers/staging/android/ion/ion_system_heap.c | 2 +- .../lustre/include/linux/libcfs/libcfs_prim.h | 2 +- .../lnet/libcfs/linux/linux-tracefile.c | 2 +- drivers/staging/lustre/lustre/include/obd.h | 4 +- drivers/staging/lustre/lustre/ldlm/ldlm_lib.c | 6 +- .../staging/lustre/lustre/llite/lproc_llite.c | 8 +- .../lustre/lustre/obdclass/class_obd.c | 6 +- .../lustre/obdclass/linux/linux-sysctl.c | 2 +- .../lustre/lustre/obdclass/lu_object.c | 2 +- drivers/staging/lustre/lustre/osc/lproc_osc.c | 2 +- .../lustre/lustre/ptlrpc/lproc_ptlrpc.c | 2 +- .../staging/lustre/lustre/ptlrpc/sec_bulk.c | 4 +- drivers/xen/balloon.c | 2 +- drivers/xen/xen-selfballoon.c | 6 +- fs/btrfs/free-space-tree.c | 3 +- fs/ext4/mballoc.c | 2 +- fs/ext4/super.c | 6 +- fs/file.c | 2 +- fs/file_table.c | 6 +- 
fs/fuse/inode.c | 2 +- fs/proc/base.c | 2 +- fs/seq_file.c | 19 +- fs/xfs/kmem.c | 2 +- include/asm-generic/pgtable.h | 4 + include/asm-generic/set_memory.h | 12 + include/drm/drm_mem_util.h | 9 +- include/linux/highmem.h | 28 +- include/linux/kmemleak.h | 7 + include/linux/kvm_host.h | 2 - include/linux/list.h | 13 + include/linux/llist.h | 19 + include/linux/mm.h | 44 +- include/linux/set_memory.h | 57 + include/linux/swap.h | 1 - include/linux/vmalloc.h | 46 +- ipc/util.c | 7 +- kernel/bpf/core.c | 9 +- kernel/bpf/syscall.c | 1 + kernel/fork.c | 6 +- kernel/groups.c | 2 +- kernel/kexec_core.c | 4 +- kernel/module.c | 2 +- kernel/power/snapshot.c | 2 +- mm/bootmem.c | 4 +- mm/highmem.c | 5 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 2 +- mm/kasan/kasan.c | 2 +- mm/kasan/quarantine.c | 2 +- mm/kmemleak.c | 136 +- mm/memblock.c | 2 +- mm/mm_init.c | 2 +- mm/nobootmem.c | 4 +- mm/nommu.c | 12 +- mm/oom_kill.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 20 +- mm/shmem.c | 6 +- mm/slab.c | 2 +- mm/swap.c | 2 +- mm/util.c | 48 +- mm/vmalloc.c | 1534 ++++++++++++----- mm/workingset.c | 4 +- net/ceph/ceph_common.c | 2 +- net/dccp/proto.c | 6 +- net/decnet/dn_route.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/netfilter/nf_conntrack_core.c | 6 +- net/netfilter/x_tables.c | 2 +- net/netfilter/xt_hashlimit.c | 4 +- net/sctp/protocol.c | 6 +- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/apparmor.h | 11 - security/apparmor/lib.c | 28 - security/apparmor/match.c | 2 +- virt/kvm/kvm_main.c | 18 +- 109 files changed, 1790 insertions(+), 771 deletions(-) create mode 100644 arch/arm/include/asm/set_memory.h create mode 100644 arch/s390/include/asm/set_memory.h create mode 100644 arch/x86/include/asm/set_memory.h create mode 100644 include/asm-generic/set_memory.h create mode 100644 include/linux/set_memory.h diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index b2391b829169..cb8862659178 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -150,6 +150,7 @@ See the include/linux/kmemleak.h header for the functions prototype. - ``kmemleak_init`` - initialize kmemleak - ``kmemleak_alloc`` - notify of a memory block allocation - ``kmemleak_alloc_percpu`` - notify of a percpu memory block allocation +- ``kmemleak_vmalloc`` - notify of a vmalloc() memory allocation - ``kmemleak_free`` - notify of a memory block freeing - ``kmemleak_free_part`` - notify of a partial memory block freeing - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing diff --git a/arch/Kconfig b/arch/Kconfig index eab393fb6ddb..6fc0c37d2145 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -227,6 +227,9 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. 
+config ARCH_HAS_SET_MEMORY + bool + config FORTIFY_COMPILE_CHECK depends on ARCH_HAS_FORTIFY_SOURCE bool diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 8e349ce8230c..4c825169668c 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -16,6 +16,7 @@ #include #include #include +#include #define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT) diff --git a/arch/arm/include/asm/set_memory.h b/arch/arm/include/asm/set_memory.h new file mode 100644 index 000000000000..5aa4315abe91 --- /dev/null +++ b/arch/arm/include/asm/set_memory.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 1999-2002 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASMARM_SET_MEMORY_H +#define _ASMARM_SET_MEMORY_H + +#ifdef CONFIG_MMU +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#ifdef CONFIG_STRICT_KERNEL_RWX +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); +#else +static inline void set_kernel_text_rw(void) { } +static inline void set_kernel_text_ro(void) { } +#endif + +#endif diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 3da63a6a2ac6..66e5d8765601 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -92,8 +92,7 @@ void __init add_static_vm_early(struct static_vm *svm) void *vaddr; vm = &svm->vm; - if (!vm_area_check_early(vm)) - vm_area_add_early(vm); + vm_area_add_early(vm); vaddr = vm->addr; list_for_each_entry(curr_svm, &static_vmlist, list) { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 21980d28c38d..bc000444aca2 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1462,21 +1462,12 @@ static void __init map_lowmem(void) phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); #endif phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); - struct static_vm *svm; - phys_addr_t start; - phys_addr_t end; - unsigned long vaddr; - unsigned long pfn; - unsigned long length; - unsigned int type; - int nr = 0; /* Map all the lowmem memory banks. 
*/ for_each_memblock(memory, reg) { + phys_addr_t start = reg->base; + phys_addr_t end = start + reg->size; struct map_desc map; - start = reg->base; - end = start + reg->size; - nr++; if (memblock_is_nomap(reg)) continue; @@ -1528,34 +1519,6 @@ static void __init map_lowmem(void) } } } - svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm)); - - for_each_memblock(memory, reg) { - struct vm_struct *vm; - - start = reg->base; - end = start + reg->size; - - if (end > arm_lowmem_limit) - end = arm_lowmem_limit; - if (start >= end) - break; - - vm = &svm->vm; - pfn = __phys_to_pfn(start); - vaddr = __phys_to_virt(start); - length = end - start; - type = MT_MEMORY_RW; - - vm->addr = (void *)(vaddr & PAGE_MASK); - vm->size = PAGE_ALIGN(length + (vaddr & ~PAGE_MASK)); - vm->phys_addr = __pfn_to_phys(pfn); - vm->flags = VM_LOWMEM; - vm->flags |= VM_ARM_MTYPE(type); - vm->caller = map_lowmem; - add_static_vm_early(svm++); - mark_vmalloc_reserved_area(vm->addr, vm->size); - } } #ifdef CONFIG_ARM_PV_FIXUP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f50f0ce7d3f4..bf195051af0a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_SET_MEMORY select ARM_AMBA select ARM_ARCH_TIMER select HAVE_KERNEL_GZIP diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 28196b18e394..08ff982cab85 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += rwsem.h generic-y += segment.h generic-y += sembuf.h generic-y += serial.h +generic-y += set_memory.h generic-y += shmbuf.h generic-y += simd.h generic-y += sizes.h diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 9377bec034fa..11ba1a570394 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -20,6 +20,7 @@ #define __ASM_CACHEFLUSH_H #include +#include /* * This flag is used to indicate that the page pointed to by a pte is clean diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 1e7fd45b60f8..c4e0c37200c3 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -43,7 +43,7 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); free_page(addr); - totalram_pages++; + totalram_pages_inc(); } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index a0ecdb4abcc8..3d4f5660a2e0 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -218,7 +218,7 @@ void *module_alloc(unsigned long size) * easier than trying to map the text, data, init_text and * init_data correctly */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_HIGHMEM, + GFP_KERNEL, PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 316eded255e7..44171fb34e23 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -174,7 +174,7 @@ static long cmm_alloc_pages(long nr) pa->page[pa->index++] = addr; loaned_pages++; - totalram_pages--; + totalram_pages_dec(); spin_unlock(&cmm_lock); nr--; } @@ -213,7 +213,7 @@ static long cmm_free_pages(long nr) free_page(addr); 
loaned_pages--; nr--; - totalram_pages++; + totalram_pages_inc(); } spin_unlock(&cmm_lock); cmm_dbg("End request with %ld pages unfulfilled\n", nr); @@ -257,7 +257,7 @@ static void cmm_get_mpp(void) int rc; struct hvcall_mpp_data mpp_data; signed long active_pages_target, page_loan_request, target; - signed long total_pages = totalram_pages + loaned_pages; + signed long total_pages = totalram_pages() + loaned_pages; signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -288,7 +288,7 @@ static void cmm_get_mpp(void) cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, - oom_freed_pages, totalram_pages); + oom_freed_pages, totalram_pages()); } static struct notifier_block cmm_oom_nb = { @@ -552,7 +552,7 @@ static int cmm_mem_going_offline(void *arg) free_page(pa_curr->page[idx]); freed++; loaned_pages--; - totalram_pages++; + totalram_pages_inc(); pa_curr->page[idx] = pa_last->page[--pa_last->index]; if (pa_last->index == 0) { if (pa_curr == pa_last) diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h new file mode 100644 index 000000000000..46a4db44c47a --- /dev/null +++ b/arch/s390/include/asm/set_memory.h @@ -0,0 +1,31 @@ +#ifndef _ASMS390_SET_MEMORY_H +#define _ASMS390_SET_MEMORY_H + +#define SET_MEMORY_RO 1UL +#define SET_MEMORY_RW 2UL +#define SET_MEMORY_NX 4UL +#define SET_MEMORY_X 8UL + +int __set_memory(unsigned long addr, int numpages, unsigned long flags); + +static inline int set_memory_ro(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_RO); +} + +static inline int set_memory_rw(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_RW); +} + +static inline int set_memory_nx(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_NX); +} + +static inline int set_memory_x(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_X); +} + +#endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index b3e9d18f2ec6..75b263e98378 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -56,7 +56,7 @@ static void __init setup_zero_pages(void) order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages >> 10) < (1UL << order)) + while (order > 2 && (totalram_pages() >> 10) < (1UL << order)) order--; empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 3b7092d9ea8f..4dbe42253e75 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -61,7 +61,7 @@ void show_mem(unsigned int filter) show_free_areas(filter); printk("Free swap: %6ldkB\n", get_nr_swap_pages() << (PAGE_SHIFT-10)); - printk("%ld pages of RAM\n", totalram_pages); + printk("%ld pages of RAM\n", totalram_pages()); printk("%ld free pages\n", nr_free_pages()); } diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index adce25462b0d..13c220db307c 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -708,7 +708,7 @@ static void __init set_non_bootmem_pages_init(void) #ifdef CONFIG_HIGHMEM if (idx == ZONE_HIGHMEM) - totalhigh_pages += z->spanned_pages; + totalhigh_pages_add(z->spanned_pages); #endif if (kdata_huge) { unsigned long percpu_pfn = node_percpu_pfn[nid]; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e7437ec62710..50813cae069b 100644 --- 
a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -53,8 +53,8 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ free_all_bootmem(); - max_low_pfn = totalram_pages; - max_pfn = totalram_pages; + max_low_pfn = totalram_pages(); + max_pfn = totalram_pages(); mem_init_print_info(NULL); kmalloc_ok = 1; } diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h new file mode 100644 index 000000000000..eaec6c364e42 --- /dev/null +++ b/arch/x86/include/asm/set_memory.h @@ -0,0 +1,87 @@ +#ifndef _ASM_X86_SET_MEMORY_H +#define _ASM_X86_SET_MEMORY_H + +#include +#include + +/* + * The set_memory_* API can be used to change various attributes of a virtual + * address range. The attributes include: + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXeutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operation on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules. + * The caller is required to take care of these. + */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); +int set_pages_array_wb(struct page **pages, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". + * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. 
+ * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. + * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. + */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + +extern int kernel_set_to_readonly; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); + +#endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 499bc79fc82a..55f1c48dba05 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -325,8 +325,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("too much data (max %ld pages)\n", totalram_pages); + if ((len >> PAGE_SHIFT) > totalram_pages()) { + pr_err("too much data (max %ld pages)\n", totalram_pages()); return ret; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 3c09ca384199..d035ec1af1d1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e5f92488c3cd..ff8a391e10c5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -168,8 +168,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 85024e0cfaa5..d5bab386c0fb 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -38,8 +38,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 007dc3298f88..5cbf52cec4dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8358,13 +8358,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c 
index 6d18b70ed5a9..8ac7132919a3 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,6 @@ #include #include -#include /* for totalram_pages */ +#include /* for totalram_pages() */ #include void *kmap(struct page *page) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1271bc9fa3c6..7d50d46c046f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1897,8 +1897,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -1937,6 +1935,17 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -1966,7 +1975,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index dece26f119d4..a804a4107fbc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -409,7 +409,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + GFP_NOIO | __GFP_ZERO, PAGE_KERNEL); if (!new_pages) return NULL; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 38ffb281df97..004a3ce8ba72 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -115,9 +115,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = totalram_pages >> (20 - PAGE_SHIFT); + memory = totalram_pages() >> (20 - PAGE_SHIFT); #else - memory = totalram_pages << (PAGE_SHIFT - 20); + memory = totalram_pages() << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index fdf8da929cbe..b1b7deaab49d 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1048,16 +1048,16 @@ static unsigned long compute_balloon_floor(void) * 8192 744 (1/16) * 32768 1512 (1/32) */ - if (totalram_pages < MB2PAGES(128)) - min_pages = MB2PAGES(8) + (totalram_pages >> 1); - else if (totalram_pages < MB2PAGES(512)) - min_pages = MB2PAGES(40) + (totalram_pages >> 2); - else if (totalram_pages < MB2PAGES(2048)) - min_pages = MB2PAGES(104) + (totalram_pages >> 3); - else if (totalram_pages < MB2PAGES(8192)) - min_pages = MB2PAGES(232) + (totalram_pages >> 4); + if (totalram_pages() < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (totalram_pages() >> 1); + else if (totalram_pages() < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (totalram_pages() >> 2); + else if (totalram_pages() < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (totalram_pages() >> 3); + else if (totalram_pages() < MB2PAGES(8192)) + min_pages = MB2PAGES(232) + (totalram_pages() >> 4); else - min_pages = MB2PAGES(488) + (totalram_pages >> 5); + min_pages = MB2PAGES(488) + (totalram_pages() >> 5); #undef MB2PAGES return min_pages; } diff --git a/drivers/md/dm-bufio.c 
b/drivers/md/dm-bufio.c index 48bb5a879e6f..14b6004653a7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -421,14 +421,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, - PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); } /* @@ -1907,7 +1906,7 @@ static int __init dm_bufio_init(void) memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index fdd4a840b30f..6ec1b8808311 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -84,7 +84,7 @@ static bool __check_shared_memory(size_t alloc_size) a = shared_memory_amount + alloc_size; if (a < shared_memory_amount) return false; - if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) return false; #ifdef CONFIG_MMU if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) @@ -146,12 +146,7 @@ static void *dm_kvzalloc(size_t alloc_size, int node) if (!claim_shared_memory(alloc_size)) return NULL; - if (alloc_size <= KMALLOC_MAX_SIZE) { - p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (p) - return p; - } - p = vzalloc_node(alloc_size, node); + p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); if (p) return p; diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 4d7e470c1715..58ee3e4c826b 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -841,7 +841,7 @@ static int mtk_vpu_probe(struct platform_device *pdev) /* Set PTCM to 96K and DTCM to 32K */ vpu_cfg_writel(vpu, 0x2, VPU_TCM_CFG); - vpu->enable_4GB = !!(totalram_pages > (SZ_2G >> PAGE_SHIFT)); + vpu->enable_4GB = !!(totalram_pages() > (SZ_2G >> PAGE_SHIFT)); dev_info(dev, "4GB mode %u\n", vpu->enable_4GB); if (vpu->enable_4GB) { diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 84a93ddcd57a..aba3897844c5 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1272,7 +1272,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). 
*/ - iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1311,7 +1311,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index c715af1b6c3c..f9ed5b1c9258 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1436,7 +1436,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index 502ba33e3270..95c9dba9d009 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -379,7 +379,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, if (align > PAGE_SIZE) return -EINVAL; - if (size / PAGE_SIZE > totalram_pages / 2) + if (size / PAGE_SIZE > totalram_pages() / 2) return -ENOMEM; data.size = 0; diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h index 8c75d5075590..a38af2a561ab 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h @@ -42,7 +42,7 @@ #if BITS_PER_LONG == 32 /* limit to lowmem on 32-bit systems */ #define NUM_CACHEPAGES \ - min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4) + min(totalram_pages(), 1UL << (30 - PAGE_SHIFT) * 3 / 4) #else #define NUM_CACHEPAGES totalram_pages #endif diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c index 8b551d2708ba..2d95784b660c 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c @@ -249,7 +249,7 @@ void cfs_print_to_console(struct ptldebug_header *hdr, int mask, int cfs_trace_max_debug_mb(void) { - int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + int total_mb = (totalram_pages() >> (20 - PAGE_SHIFT)); return max(512, (total_mb * 80) / 100); } diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h index 722c33f7eecc..9057f715c0b5 100644 --- a/drivers/staging/lustre/lustre/include/obd.h +++ b/drivers/staging/lustre/lustre/include/obd.h @@ -1217,8 +1217,8 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) cli->cl_dirty_max_pages = dirty_max; } - if (cli->cl_dirty_max_pages > totalram_pages / 8) - cli->cl_dirty_max_pages = totalram_pages / 8; + if (cli->cl_dirty_max_pages > totalram_pages() / 8) + cli->cl_dirty_max_pages = totalram_pages() / 8; } #endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c index 153e990c494e..1a2d41d99867 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -363,11 +363,11 @@ int client_obd_setup(struct 
obd_device *obddev, struct lustre_cfg *lcfg) if (!strcmp(name, LUSTRE_MDC_NAME)) { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { cli->cl_max_rpcs_in_flight = 3; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c index 13ec24d44b04..e1da3cf62a61 100644 --- a/drivers/staging/lustre/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -244,9 +244,9 @@ static ssize_t max_read_ahead_mb_store(struct kobject *kobj, pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - if (pages_number > totalram_pages / 2) { + if (pages_number > totalram_pages() / 2) { CERROR("can't set file readahead more than %lu MB\n", - totalram_pages >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ + totalram_pages() >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ return -ERANGE; } @@ -411,10 +411,10 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, return -ERANGE; pages_number = (long)val; - if (pages_number < 0 || pages_number > totalram_pages) { + if (pages_number < 0 || pages_number > totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - totalram_pages >> (20 - PAGE_SHIFT)); + totalram_pages() >> (20 - PAGE_SHIFT)); return -ERANGE; } diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c index 76e1ee83a723..ff3f738f01dc 100644 --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -477,10 +477,10 @@ static int __init obdclass_init(void) * For clients with less memory, a larger fraction is needed * for other purposes (mostly for BGL). 
*/ - if (totalram_pages <= 512 << (20 - PAGE_SHIFT)) - obd_max_dirty_pages = totalram_pages / 4; + if (totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = totalram_pages() / 4; else - obd_max_dirty_pages = totalram_pages / 2; + obd_max_dirty_pages = totalram_pages() / 2; err = obd_init_caches(); if (err) diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c index e6c785afceba..d7d2ea8c3c40 100644 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -112,7 +112,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - if (val > ((totalram_pages / 10) * 9)) { + if (val > ((totalram_pages() / 10) * 9)) { /* Somebody wants to assign too much memory to dirty pages */ return -EINVAL; } diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 054e567e6c8d..107f4362109c 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -884,7 +884,7 @@ static unsigned long lu_htable_order(struct lu_device *top) * * Size of lu_object is (arbitrary) taken as 1K (together with inode). */ - cache_size = totalram_pages; + cache_size = totalram_pages(); #if BITS_PER_LONG == 32 /* limit hashtable size for lowmem systems to low RAM */ diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c index f0062d44ee03..fc3fab479e16 100644 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -163,7 +163,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, if (pages_number <= 0 || pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > totalram_pages / 4) /* 1/4 of RAM */ + pages_number > totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; spin_lock(&cli->cl_loi_list_lock); diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c index 9bad57d65db4..84845c274a2e 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -306,7 +306,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, * far. */ bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > totalram_pages / (2 * bufpages)) + if (val > totalram_pages() / (2 * bufpages)) return -ERANGE; spin_lock(&svc->srv_lock); diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c index b2cc5ea6cb93..f9095a84fd15 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -140,7 +140,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "low free mark: %lu\n" "max waitqueue depth: %u\n" "max wait time: %ld/%lu\n", - totalram_pages, + totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, page_pools.epp_max_pools, @@ -378,7 +378,7 @@ int sptlrpc_enc_pool_init(void) * maximum capacity is 1/8 of total physical memory. * is the 1/8 a good number? 
*/ - page_pools.epp_max_pages = totalram_pages / 8; + page_pools.epp_max_pages = totalram_pages() / 8; page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); init_waitqueue_head(&page_pools.epp_waitq); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1b76e8a99c40..67a6f62e3313 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -747,7 +747,7 @@ static void __init balloon_add_region(unsigned long start_pfn, for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { page = pfn_to_page(pfn); - /* totalram_pages and totalhigh_pages do not + /* totalram_pages() and totalhigh_pages() do not include the boot-time balloon extension, so don't subtract from it. */ __balloon_append(page); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 66620713242a..21e96adb00b8 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -188,7 +188,7 @@ static void selfballoon_process(struct work_struct *work) bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = totalram_pages; + cur_pages = totalram_pages(); tgt_pages = cur_pages; /* default is no change */ goal_pages = vm_memory_committed() + totalreserve_pages + @@ -226,7 +226,7 @@ static void selfballoon_process(struct work_struct *work) if (tgt_pages < floor_pages) tgt_pages = floor_pages; balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages); + balloon_stats.current_pages - totalram_pages()); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -568,7 +568,7 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) * much more reliably and response faster in some cases. */ if (!selfballoon_reserved_mb) { - reserve_pages = totalram_pages / 10; + reserve_pages = totalram_pages() / 10; selfballoon_reserved_mb = PAGES2MB(reserve_pages); } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 57401b474ec6..14476074957a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size) if (mem) return mem; - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 89fbff1a9b2c..e32046f0848e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2387,7 +2387,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4384bbe61415..f5e8c1247af5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2130,8 +2130,8 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) if (size <= sbi->s_flex_groups_allocated) return 0; - new_groups = ext4_kvzalloc(roundup_pow_of_two(size * - sizeof(*sbi->s_flex_groups)), GFP_KERNEL); + new_groups = kvzalloc(roundup_pow_of_two(size * + sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); @@ -3993,7 +3993,7 @@ static int ext4_fill_super(struct super_block 
*sb, void *data, int silent) } } rcu_assign_pointer(sbi->s_group_desc, - ext4_kvmalloc(db_count * + kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { diff --git a/fs/file.c b/fs/file.c index be0792c0a231..33e39e40fe1e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size) if (data != NULL) return data; } - return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) diff --git a/fs/file_table.c b/fs/file_table.c index 251d54ee7ef7..be36504b8a01 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -334,10 +334,10 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + unsigned long memreserve = (totalram_pages() - nr_free_pages()) * 3/2; - memreserve = min(memreserve, totalram_pages - 1); - n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + memreserve = min(memreserve, totalram_pages() - 1); + n = ((totalram_pages() - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 75e0d1297775..5c68fa5a4c08 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -829,7 +829,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) diff --git a/fs/proc/base.c b/fs/proc/base.c index b6959f6dae5b..9f64c9d6a06c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -576,7 +576,7 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; points = oom_badness(task, NULL, NULL, totalpages) * diff --git a/fs/seq_file.c b/fs/seq_file.c index 3ade39e02bb7..023d92dfffa9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,24 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - if (unlikely(size > MAX_RW_COUNT)) - return NULL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. 
- */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index bb2beaef531a..0b770f861897 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -66,7 +66,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) noio_flag = memalloc_noio_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) memalloc_noio_restore(noio_flag); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b43fa9d95a7a..1fc64a5e2d79 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -841,6 +841,10 @@ static inline bool arch_has_pfn_modify_check(void) } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h new file mode 100644 index 000000000000..83e81f8996b2 --- /dev/null +++ b/include/asm-generic/set_memory.h @@ -0,0 +1,12 @@ +#ifndef __ASM_SET_MEMORY_H +#define __ASM_SET_MEMORY_H + +/* + * Functions to change memory attributes. + */ +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); + +#endif diff --git a/include/drm/drm_mem_util.h b/include/drm/drm_mem_util.h index 70d4e221a3ad..d0f6cf2e5324 100644 --- a/include/drm/drm_mem_util.h +++ b/include/drm/drm_mem_util.h @@ -37,8 +37,7 @@ static __inline__ void *drm_calloc_large(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kcalloc(nmemb, size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size * nmemb); } /* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. 
*/ @@ -50,8 +49,7 @@ static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kmalloc(nmemb * size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return vmalloc(size * nmemb); } static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) @@ -69,8 +67,7 @@ static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) return ptr; } - return __vmalloc(size * nmemb, - gfp | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size * nmemb, gfp, PAGE_KERNEL); } static __inline void drm_free_large(void *ptr) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 657b56524a8a..dcc6ec1d473b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -35,7 +35,31 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern unsigned long totalhigh_pages; +extern atomic_long_t _totalhigh_pages; +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_dec(void) +{ + atomic_long_dec(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +static inline void totalhigh_pages_set(long val) +{ + atomic_long_set(&_totalhigh_pages, val); +} void kmap_flush_unused(void); @@ -57,7 +81,7 @@ static inline struct page *kmap_to_page(void *addr) return virt_to_page(addr); } -#define totalhigh_pages 0UL +static inline unsigned long totalhigh_pages(void) { return 0UL; } #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 1c2a32829620..590343f6c1b1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -22,6 +22,7 @@ #define __KMEMLEAK_H #include +#include #ifdef CONFIG_DEBUG_KMEMLEAK @@ -30,6 +31,8 @@ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -81,6 +84,10 @@ static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { } +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} static inline void kmemleak_free(const void *ptr) { } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 05aa860daa5c..dad7fb0c949c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,8 +762,6 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void *kvm_kvzalloc(unsigned long size); - #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/include/linux/list.h b/include/linux/list.h index 3ef3ade9930e..85baa403499d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -556,6 +556,19 @@ static inline void list_splice_tail_init(struct list_head *list, for (; 
&pos->member != (head); \ pos = list_next_entry(pos, member)) +/** + * list_for_each_entry_from_reverse - iterate backwards over list of given type + * from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, continuing from current position. + */ +#define list_for_each_entry_from_reverse(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. diff --git a/include/linux/llist.h b/include/linux/llist.h index ac6796138ba0..9bbc3ed179c8 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -121,6 +121,25 @@ static inline void init_llist_head(struct llist_head *list) #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) +/** + * llist_for_each_safe - iterate over some deleted entries of a lock-less list + * safe against removal of list entry + * @pos: the &struct llist_node to use as a loop cursor + * @n: another &struct llist_node to use as temporary storage + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b722db541b0..10b6a3ee38db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,7 +44,32 @@ static inline void set_max_mapnr(unsigned long limit) static inline void set_max_mapnr(unsigned long limit) { } #endif -extern unsigned long totalram_pages; +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); +} + +static inline void totalram_pages_set(long val) +{ + atomic_long_set(&_totalram_pages, val); +} + extern void * high_memory; extern int page_cluster; @@ -484,16 +509,16 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. 
*/ - -#ifdef CONFIG_MMU -extern int is_vmalloc_addr(const void *x); -#else -static inline int is_vmalloc_addr(const void *x) +static inline bool is_vmalloc_addr(const void *x) { - return 0; -} -#endif +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + return addr >= VMALLOC_START && addr < VMALLOC_END; +#else + return false; +#endif +} #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else @@ -516,6 +541,7 @@ static inline void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } + extern void kvfree(const void *addr); /* diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h new file mode 100644 index 000000000000..b5071497b8cb --- /dev/null +++ b/include/linux/set_memory.h @@ -0,0 +1,57 @@ +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + */ +#ifndef _LINUX_SET_MEMORY_H_ +#define _LINUX_SET_MEMORY_H_ + +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +#include +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP +static inline int set_direct_map_invalid_noflush(struct page *page) +{ + return 0; +} +static inline int set_direct_map_default_noflush(struct page *page) +{ + return 0; +} +#endif + +#ifndef set_mce_nospec +static inline int set_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + +#ifndef clear_mce_nospec +static inline int clear_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + +#ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT +static inline int set_memory_encrypted(unsigned long addr, int numpages) +{ + return 0; +} + +static inline int set_memory_decrypted(unsigned long addr, int numpages) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#endif /* _LINUX_SET_MEMORY_H_ */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 72d03e816293..ab50f9e5ee64 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -286,7 +286,6 @@ static inline void workingset_node_shadows_dec(struct radix_tree_node *node) } /* linux/mm/page_alloc.c */ -extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index de477793ed60..d662eaebebeb 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -19,8 +19,11 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ -#define VM_LOWMEM 0x00000100 /* Tracking of direct mapped lowmem */ - +/* + * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with + * vfree_atomic(). 
+ */ +#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -45,12 +48,16 @@ struct vm_struct { struct vmap_area { unsigned long va_start; unsigned long va_end; + + /* + * Largest available free size in subtree. + */ + unsigned long subtree_max_size; unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; - struct rcu_head rcu_head; }; /* @@ -85,6 +92,17 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +#ifndef CONFIG_MMU +extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) +{ + return __vmalloc_node_flags(size, node, flags); +} +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); +#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); @@ -145,6 +163,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); +static inline void set_vm_flush_reset_perms(void *addr) +{ + struct vm_struct *vm = find_vm_area(addr); + + if (vm) + vm->flags |= VM_FLUSH_RESET_PERMS; +} #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, @@ -160,6 +185,9 @@ static inline void unmap_kernel_range(unsigned long addr, unsigned long size) { } +static inline void set_vm_flush_reset_perms(void *addr) +{ +} #endif /* Allocate/destroy a 'vmalloc' VM area. 
*/ @@ -176,13 +204,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); -extern __init int vm_area_check_early(struct vm_struct *vm); -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -extern void mark_vmalloc_reserved_area(void *addr, unsigned long size); -#else -static inline void mark_vmalloc_reserved_area(void *addr, unsigned long size) -{ }; -#endif #ifdef CONFIG_SMP # ifdef CONFIG_MMU @@ -208,12 +229,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #endif #ifdef CONFIG_MMU -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -extern unsigned long total_vmalloc_size; -#define VMALLOC_TOTAL total_vmalloc_size -#else #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#endif #else #define VMALLOC_TOTAL 0UL #endif diff --git a/ipc/util.c b/ipc/util.c index 76d4afcde7bb..721b96d8b9c3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -403,12 +403,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void *ipc_alloc(int size) { - void *out; - if (size > PAGE_SIZE) - out = vmalloc(size); - else - out = kmalloc(size, GFP_KERNEL); - return out; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5a2b9db4b966..5ef7e997f853 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -81,8 +81,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -198,8 +197,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog) struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; @@ -937,8 +935,7 @@ out: static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index df0092ae3313..ecc6ad6fb5a8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -152,6 +152,7 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, __builtin_return_address(0)); + /*return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);*/ } void *bpf_map_area_alloc(size_t size, int numa_node) diff --git a/kernel/fork.c b/kernel/fork.c index ae4d5fa88d03..6002350b1bfa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -199,7 +199,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, + THREADINFO_GFP, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -420,10 +420,10 @@ static void set_max_threads(unsigned int max_threads_suggested) * The number of threads shall be limited such that the thread * 
structures may only consume a small part of the available memory. */ - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + if (fls64(totalram_pages()) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + threads = div64_u64((u64) totalram_pages() * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) diff --git a/kernel/groups.c b/kernel/groups.c index 94bde5210e3d..b5bb6e88376e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); if (!gi) return NULL; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f5ab72ebda11..1a5ab1a64a14 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -223,13 +223,13 @@ int sanity_check_segment_list(struct kimage *image) * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { - if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages() / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } - if (total_pages > totalram_pages / 2) + if (total_pages > totalram_pages() / 2) return -EINVAL; /* diff --git a/kernel/module.c b/kernel/module.c index bdbd03fae41a..2d2199dc16ca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2879,7 +2879,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, /* Suck in entire file: we'll want most of it. */ info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); + GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02850cfc8ee..e14e98c5de2d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -101,7 +101,7 @@ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; } /* diff --git a/mm/bootmem.c b/mm/bootmem.c index d14efd6fda06..0b8c5b3ab621 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -162,7 +162,7 @@ void free_bootmem_late(unsigned long physaddr, unsigned long size) for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -278,7 +278,7 @@ unsigned long __init free_all_bootmem(void) list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); - totalram_pages += total_pages; + totalram_pages_add(total_pages); return total_pages; } diff --git a/mm/highmem.c b/mm/highmem.c index 50b4ca6787f0..a81bbfe69049 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,9 +104,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) } #endif -unsigned long totalhigh_pages __read_mostly; -EXPORT_SYMBOL(totalhigh_pages); - +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 34511b5b22b1..f9714777813a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -409,7 +409,7 @@ static int __init hugepage_init(void) * where the extra memory used 
could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b9128eaafffe..69c1194db950 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2220,7 +2220,7 @@ static void __init gather_bootmem_prealloc(void) prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need - * to restore the 'stolen' pages to totalram_pages in order to + * to restore the 'stolen' pages to totalram_pages() in order to * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 3c572104eb76..2d33a92a89fb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -691,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..daffe1da66d6 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -236,7 +236,7 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (totalram_pages() << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); new_quarantine_size = (total_size < percpu_quarantines) ? 
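The conversions above all follow one mechanical pattern: every reader of the old totalram_pages variable now calls the totalram_pages() accessor that wraps the atomic counter introduced in include/linux/mm.h by this series. A minimal sketch of a converted caller (hypothetical helper, not part of this series):

	#include <linux/mm.h>

	/*
	 * Hypothetical example only: size a cache to 1/16th of total RAM
	 * using the new atomic-backed accessor instead of reading the old
	 * totalram_pages variable directly.
	 */
	static unsigned long example_cache_pages(void)
	{
		return totalram_pages() / 16;
	}

Code that updates the counter uses totalram_pages_inc()/totalram_pages_add() from the same header, as the bootmem and page_alloc hunks below show.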
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3d822d1491c4..83ecadc66586 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -148,7 +148,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -157,6 +157,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -263,7 +265,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -272,9 +275,12 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ - const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ int min_count; /* minimum reference count */ + const void *ptr; /* allocated/freed memory block */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -403,7 +409,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); @@ -572,6 +578,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -804,6 +811,30 @@ out: put_object(object); } +/* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + /* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it @@ -918,7 +949,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). 
*/ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -961,6 +992,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); +/** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + /** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object @@ -1197,6 +1258,30 @@ static bool update_checksum(struct kmemleak_object *object) return object->checksum != old_csum; } +/* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + /* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. @@ -1234,6 +1319,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1269,25 +1355,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. 
- */ - object->count++; + /* only pass surplus references (object already gray) */ if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1994,6 +2082,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); diff --git a/mm/memblock.c b/mm/memblock.c index 9c96c537b68d..422cd907008f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1397,7 +1397,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..db3a196826da 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -151,7 +151,7 @@ static void __meminit mm_compute_batch(void) s32 batch = max_t(s32, nr*2, 32); /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index aa59572cbac6..88817aab4393 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -91,7 +91,7 @@ void free_bootmem_late(unsigned long addr, unsigned long size) for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -184,7 +184,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); - totalram_pages += pages; + totalram_pages_add(pages); return pages; } diff --git a/mm/nommu.c b/mm/nommu.c index b40ec74f364c..10be196aec37 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -236,12 +236,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc(size, flags, PAGE_KERNEL); +} + void *vmalloc_user(unsigned long size) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -359,10 +363,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a72d15dd5304..dadd8465729f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -237,7 +237,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) } /* Default to all available memory */ - 
oc->totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages() + total_swap_pages; if (!IS_ENABLED(CONFIG_NUMA)) return CONSTRAINT_NONE; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dedc1fac8f35..f3bfef0b3394 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2066,7 +2066,7 @@ static int page_writeback_cpu_online(unsigned int cpu) * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory (by subtracting - * totalhigh_pages from vm_total_pages), and as such we can't + * totalhigh_pages() from vm_total_pages), and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 532fa53c1316..fcb17aa08248 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -121,7 +122,8 @@ EXPORT_SYMBOL(node_states); /* Protect totalram_pages and zone->managed_pages */ static DEFINE_SPINLOCK(managed_page_count_lock); -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; @@ -4375,11 +4377,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -6641,10 +6643,10 @@ void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; - totalram_pages += count; + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif spin_unlock(&managed_page_count_lock); } @@ -6675,9 +6677,9 @@ EXPORT_SYMBOL(free_reserved_area); void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; + totalram_pages_inc(); page_zone(page)->managed_pages++; - totalhigh_pages++; + totalhigh_pages_inc(); } #endif @@ -6726,10 +6728,10 @@ void __init mem_init_print_info(const char *str) physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? 
str : ""); } diff --git a/mm/shmem.c b/mm/shmem.c index a612d765dc59..a06511d6ceaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,12 +101,12 @@ struct shmem_falloc { #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + return min(totalram_pages() - totalhigh_pages(), totalram_pages() / 2); } #endif @@ -3380,7 +3380,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= totalram_pages(); do_div(size, 100); rest++; } diff --git a/mm/slab.c b/mm/slab.c index a671328e0610..89fe1cf2d241 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1256,7 +1256,7 @@ void __init kmem_cache_init(void) * page orders on machines with more than 32MB of memory if * not overridden on the command line. */ - if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/swap.c b/mm/swap.c index 0b01f9d7c1e5..36a213f15002 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -981,7 +981,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); */ void __init swap_setup(void) { - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP int i; diff --git a/mm/util.c b/mm/util.c index 734b2d0e4a49..571f4100be6a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -381,6 +381,52 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +/** + * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + */ +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + gfp_t kmalloc_flags = flags; + void *ret; + + /* + * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) + * so the given set of flags has to be compatible. 
+ */ + WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + + /* + * Make sure that larger requests are not too disruptive - no OOM + * killer and no allocation failure warnings as we have a fallback + */ + if (size > PAGE_SIZE) + kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN; + + ret = kmalloc_node(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + if (ret || size <= PAGE_SIZE) + return ret; + + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kvmalloc_node); + void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) @@ -527,7 +573,7 @@ unsigned long vm_commit_limit(void) if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04529a37bed6..02787b96dd00 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -50,12 +52,10 @@ static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *llnode = llist_del_all(&p->list); - while (llnode) { - void *p = llnode; - llnode = llist_next(llnode); - __vunmap(p, 1); - } + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ @@ -286,6 +286,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 + +#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -293,65 +297,67 @@ static DEFINE_SPINLOCK(vmap_area_lock); LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; +static bool vmap_initialized __read_mostly; -/* The vmap cache globals are protected by vmap_area_lock */ -static struct rb_node *free_vmap_cache; -static unsigned long cached_hole_size; -static unsigned long cached_vstart; -static unsigned long cached_align; +/* + * This kmem_cache is used for vmap_area objects. Instead of + * allocating from slab we reuse an object from this cache to + * make things faster. Especially in "no edge" splitting of + * free block. + */ +static struct kmem_cache *vmap_area_cachep; -static unsigned long vmap_area_pcpu_hole; +/* + * This linked list is used in pair with free_vmap_area_root. + * It gives O(1) access to prev/next to perform fast coalescing. + */ +static LIST_HEAD(free_vmap_area_list); -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -#define POSSIBLE_VMALLOC_START PAGE_OFFSET +/* + * This augment red-black tree represents the free vmap space. + * All vmap_area objects in this tree are sorted by va->va_start + * address. It is used for allocation and merging when a vmap + * object is released. + * + * Each vmap_area node contains a maximum available free block + * of its sub-tree, right or left. Therefore it is possible to + * find a lowest match of free area. 
+ */ +static struct rb_root free_vmap_area_root = RB_ROOT; -#define VMALLOC_BITMAP_SIZE ((VMALLOC_END - PAGE_OFFSET) >> \ - PAGE_SHIFT) -#define VMALLOC_TO_BIT(addr) ((addr - PAGE_OFFSET) >> PAGE_SHIFT) -#define BIT_TO_VMALLOC(i) (PAGE_OFFSET + i * PAGE_SIZE) - -unsigned long total_vmalloc_size; -unsigned long vmalloc_reserved; - -DECLARE_BITMAP(possible_areas, VMALLOC_BITMAP_SIZE); - -void mark_vmalloc_reserved_area(void *x, unsigned long size) +static __always_inline unsigned long +va_size(struct vmap_area *va) { - unsigned long addr = (unsigned long)x; - - bitmap_set(possible_areas, VMALLOC_TO_BIT(addr), size >> PAGE_SHIFT); - vmalloc_reserved += size; + return (va->va_end - va->va_start); } -int is_vmalloc_addr(const void *x) +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) { - unsigned long addr = (unsigned long)x; + struct vmap_area *va; - if (addr < POSSIBLE_VMALLOC_START || addr >= VMALLOC_END) - return 0; - - if (test_bit(VMALLOC_TO_BIT(addr), possible_areas)) - return 0; - - return 1; + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; } -static void calc_total_vmalloc_size(void) +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) { - total_vmalloc_size = VMALLOC_END - POSSIBLE_VMALLOC_START - - vmalloc_reserved; -} -#else -int is_vmalloc_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= VMALLOC_START && addr < VMALLOC_END; + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); } -static void calc_total_vmalloc_size(void) { } -#endif -EXPORT_SYMBOL(is_vmalloc_addr); +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, + compute_subtree_max_size) + +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static unsigned long lazy_max_pages(void); static atomic_long_t nr_vmalloc_pages; @@ -379,41 +385,656 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) return NULL; } -static void __insert_vmap_area(struct vmap_area *va) +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + */ +static __always_inline struct rb_node ** +find_va_links(struct vmap_area *va, + struct rb_root *root, struct rb_node *from, + struct rb_node **parent) { - struct rb_node **p = &vmap_area_root.rb_node; - struct rb_node *parent = NULL; - struct rb_node *tmp; + struct vmap_area *tmp_va; + struct rb_node **link; - while (*p) { - struct vmap_area *tmp_va; - - parent = *p; - tmp_va = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp_va->va_end) - p = &(*p)->rb_left; - else if (va->va_end > tmp_va->va_start) - p = &(*p)->rb_right; - else - BUG(); + if (root) { + link = &root->rb_node; + if (unlikely(!*link)) { + *parent = NULL; + return link; + } + } else { + link = &from; } - rb_link_node(&va->rb_node, parent, p); - rb_insert_color(&va->rb_node, &vmap_area_root); + /* + * Go to the bottom of the tree. When we hit the last point + * we end up with parent rb_node and correct direction, i name + * it link, where the new va->rb_node will be attached to. 
+ */ + do { + tmp_va = rb_entry(*link, struct vmap_area, rb_node); - /* address-sort this list */ - tmp = rb_prev(&va->rb_node); - if (tmp) { - struct vmap_area *prev; - prev = rb_entry(tmp, struct vmap_area, rb_node); - list_add_rcu(&va->list, &prev->list); - } else - list_add_rcu(&va->list, &vmap_area_list); + /* + * During the traversal we also do some sanity check. + * Trigger the BUG() if there are sides(left/right) + * or full overlaps. + */ + if (va->va_start < tmp_va->va_end && + va->va_end <= tmp_va->va_start) + link = &(*link)->rb_left; + else if (va->va_end > tmp_va->va_start && + va->va_start >= tmp_va->va_end) + link = &(*link)->rb_right; + else + BUG(); + } while (*link); + + *parent = &tmp_va->rb_node; + return link; } -static void purge_vmap_area_lazy(void); +static __always_inline struct list_head * +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) +{ + struct list_head *list; -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + if (unlikely(!parent)) + /* + * The red-black tree where we try to find VA neighbors + * before merging or inserting is empty, i.e. it means + * there is no free vmap space. Normally it does not + * happen but we handle this case anyway. + */ + return NULL; + + list = &rb_entry(parent, struct vmap_area, rb_node)->list; + return (&parent->rb_right == link ? list->next : list); +} + +static __always_inline void +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, struct list_head *head) +{ + /* + * VA is still not in the list, but we can + * identify its future previous list_head node. + */ + if (likely(parent)) { + head = &rb_entry(parent, struct vmap_area, rb_node)->list; + if (&parent->rb_right != link) + head = head->prev; + } + + /* Insert to the rb-tree */ + rb_link_node(&va->rb_node, parent, link); + if (root == &free_vmap_area_root) { + /* + * Some explanation here. Just perform simple insertion + * to the tree. We do not set va->subtree_max_size to + * its current size before calling rb_insert_augmented(). + * It is because of we populate the tree from the bottom + * to parent levels when the node _is_ in the tree. + * + * Therefore we set subtree_max_size to zero after insertion, + * to let __augment_tree_propagate_from() puts everything to + * the correct order later on. + */ + rb_insert_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + va->subtree_max_size = 0; + } else { + rb_insert_color(&va->rb_node, root); + } + + /* Address-sort this list */ + list_add(&va->list, head); +} + +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + /* + * During merging a VA node can be empty, therefore + * not linked with the tree nor list. Just check it. 
+ */
+	if (!RB_EMPTY_NODE(&va->rb_node)) {
+		if (root == &free_vmap_area_root)
+			rb_erase_augmented(&va->rb_node,
+				root, &free_vmap_area_rb_augment_cb);
+		else
+			rb_erase(&va->rb_node, root);
+
+		list_del(&va->list);
+		RB_CLEAR_NODE(&va->rb_node);
+	}
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(struct rb_node *n)
+{
+	struct vmap_area *va;
+	struct rb_node *node;
+	unsigned long size;
+	bool found = false;
+
+	if (n == NULL)
+		return;
+
+	va = rb_entry(n, struct vmap_area, rb_node);
+	size = va->subtree_max_size;
+	node = n;
+
+	while (node) {
+		va = rb_entry(node, struct vmap_area, rb_node);
+
+		if (get_subtree_max_size(node->rb_left) == size) {
+			node = node->rb_left;
+		} else {
+			if (va_size(va) == size) {
+				found = true;
+				break;
+			}
+
+			node = node->rb_right;
+		}
+	}
+
+	if (!found) {
+		va = rb_entry(n, struct vmap_area, rb_node);
+		pr_emerg("tree is corrupted: %lu, %lu\n",
+			va_size(va), va->subtree_max_size);
+	}
+
+	augment_tree_propagate_check(n->rb_left);
+	augment_tree_propagate_check(n->rb_right);
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ *       4--8
+ *        /\
+ *       /  \
+ *      /    \
+ *    2--2  8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+	struct rb_node *node = &va->rb_node;
+	unsigned long new_va_sub_max_size;
+
+	while (node) {
+		va = rb_entry(node, struct vmap_area, rb_node);
+		new_va_sub_max_size = compute_subtree_max_size(va);
+
+		/*
+		 * If the newly calculated maximum available size of the
+		 * subtree is equal to the current one, then it means that
+		 * the tree is propagated correctly. So we have to stop at
+		 * this point to save cycles.
+ */ + if (va->subtree_max_size == new_va_sub_max_size) + break; + + va->subtree_max_size = new_va_sub_max_size; + node = rb_parent(&va->rb_node); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + */ +static __always_inline void +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + + /* + * Get next node of VA to check if merging can be done. + */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + sibling->va_end = va->va_end; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + return; + } + } + +insert: + if (!merged) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. 
*/ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = free_vmap_area_root.rb_node; + + /* Adjust the search size for alignment overhead. */ + length = size + align - 1; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the fist right sub-tree, + * that will satisfy the search criteria. It can happen + * only once due to "vstart" restriction. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + node = node->rb_right; + break; + } + } + } + } + + return NULL; +} + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK +#include + +static struct vmap_area * +find_vmap_lowest_linear_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + + list_for_each_entry(va, &free_vmap_area_list, list) { + if (!is_within_this_va(va, size, align, vstart)) + continue; + + return va; + } + + return NULL; +} + +static void +find_vmap_lowest_match_check(unsigned long size) +{ + struct vmap_area *va_1, *va_2; + unsigned long vstart; + unsigned int rnd; + + get_random_bytes(&rnd, sizeof(rnd)); + vstart = VMALLOC_START + rnd; + + va_1 = find_vmap_lowest_match(size, 1, vstart); + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + + if (va_1 != va_2) + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", + va_1, va_2, vstart); +} +#endif + +enum fit_type { + NOTHING_FIT = 0, + FL_FIT_TYPE = 1, /* full fit */ + LE_FIT_TYPE = 2, /* left edge fit */ + RE_FIT_TYPE = 3, /* right edge fit */ + NE_FIT_TYPE = 4 /* no edge fit */ +}; + +static __always_inline enum fit_type +classify_va_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size) +{ + enum fit_type type; + + /* Check if it is within VA. */ + if (nva_start_addr < va->va_start || + nva_start_addr + size > va->va_end) + return NOTHING_FIT; + + /* Now classify. */ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size, + enum fit_type type) +{ + struct vmap_area *lva; + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. 
+ * + * | | + * V NVA V + * |---------------| + */ + unlink_va(va, &free_vmap_area_root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (unlikely(!lva)) + return -1; + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. + */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (type == NE_FIT_TYPE) + insert_vmap_area_augment(lva, &va->rb_node, + &free_vmap_area_root, &free_vmap_area_list); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int node) +{ + unsigned long nva_start_addr; + struct vmap_area *va; + enum fit_type type; + int ret; + + va = find_vmap_lowest_match(size, align, vstart); + if (unlikely(!va)) + return vend; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Classify what we have found. */ + type = classify_va_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); + if (ret) + return vend; + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK + find_vmap_lowest_match_check(size); +#endif + + return nva_start_addr; +} /* * Allocate a region of KVA of the specified size and alignment, within the @@ -425,18 +1046,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; - struct rb_node *n; unsigned long addr; int purged = 0; - struct vmap_area *first; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + might_sleep(); - va = kmalloc_node(sizeof(struct vmap_area), + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -449,87 +1071,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: spin_lock(&vmap_area_lock); + /* - * Invalidate cache if we have more permissive parameters. - * cached_hole_size notes the largest hole noticed _below_ - * the vmap_area cached in free_vmap_cache: if size fits - * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_vmap_cache. - * Note that __free_vmap_area may update free_vmap_cache - * without updating cached_hole_size or cached_align. + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. 
*/ - if (!free_vmap_cache || - size < cached_hole_size || - vstart < cached_vstart || - align < cached_align) { -nocache: - cached_hole_size = 0; - free_vmap_cache = NULL; - } - /* record if we encounter less permissive parameters */ - cached_vstart = vstart; - cached_align = align; - - /* find starting point for our search */ - if (free_vmap_cache) { - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - addr = ALIGN(first->va_end, align); - if (addr < vstart) - goto nocache; - if (addr + size < addr) - goto overflow; - - } else { - addr = ALIGN(vstart, align); - if (addr + size < addr) - goto overflow; - - n = vmap_area_root.rb_node; - first = NULL; - - while (n) { - struct vmap_area *tmp; - tmp = rb_entry(n, struct vmap_area, rb_node); - if (tmp->va_end >= addr) { - first = tmp; - if (tmp->va_start <= addr) - break; - n = n->rb_left; - } else - n = n->rb_right; - } - - if (!first) - goto found; - } - - /* from the starting point, walk areas until a suitable hole is found */ - while (addr + size > first->va_start && addr + size <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; - addr = ALIGN(first->va_end, align); - if (addr + size < addr) - goto overflow; - - if (list_is_last(&first->list, &vmap_area_list)) - goto found; - - first = list_next_entry(first, list); - } - -found: - /* - * Check also calculated address against the vstart, - * because it can be 0 because of big align request. - */ - if (addr + size > vend || addr < vstart) + addr = __alloc_vmap_area(size, align, vstart, vend, node); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; - __insert_vmap_area(va); - free_vmap_cache = &va->rb_node; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -558,7 +1113,8 @@ overflow: if (printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", size); - kfree(va); + + kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } @@ -578,35 +1134,16 @@ static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - if (free_vmap_cache) { - if (va->va_end < cached_vstart) { - free_vmap_cache = NULL; - } else { - struct vmap_area *cache; - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - if (va->va_start <= cache->va_start) { - free_vmap_cache = rb_prev(&va->rb_node); - /* - * We don't try to update cached_hole_size or - * cached_align, but it won't go very wrong. - */ - } - } - } - rb_erase(&va->rb_node, &vmap_area_root); - RB_CLEAR_NODE(&va->rb_node); - list_del_rcu(&va->list); + /* + * Remove from the busy tree/list. + */ + unlink_va(va, &vmap_area_root); /* - * Track the highest possible candidate for pcpu area - * allocation. Areas outside of vmalloc area can be returned - * here too, consider only end addresses which fall inside - * vmalloc area proper. + * Merge VA with its neighbors, otherwise just add it. 
*/ - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); - - kfree_rcu(va, rcu_head); + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); } /* @@ -627,26 +1164,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -672,7 +1189,7 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual criticial section protected @@ -690,7 +1207,7 @@ static void purge_fragmented_blocks_allcpus(void); */ void set_iounmap_nonlazy(void) { - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } /* @@ -698,34 +1215,40 @@ void set_iounmap_nonlazy(void) */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { + unsigned long resched_threshold; struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - bool do_free = false; lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); + if (unlikely(valist == NULL)) + return false; + + /* + * TODO: to calculate a flush range without looping. + * The list can be up to lazy_max_pages() elements. 
+ */ llist_for_each_entry(va, valist, purge_list) { if (va->va_start < start) start = va->va_start; if (va->va_end > end) end = va->va_end; - do_free = true; } - if (!do_free) - return false; - flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; spin_lock(&vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; __free_vmap_area(va); - atomic_sub(nr, &vmap_lazy_nr); - cond_resched_lock(&vmap_area_lock); + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&vmap_area_lock); } spin_unlock(&vmap_area_lock); return true; @@ -761,10 +1284,10 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - int nr_lazy; + unsigned long nr_lazy; - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, - &vmap_lazy_nr); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); /* After this point, we may free va at any time */ llist_add(&va->purge_list, &vmap_purge_list); @@ -780,6 +1303,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -824,8 +1350,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) -static bool vmap_initialized __read_mostly = false; - struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1077,6 +1601,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1092,24 +1620,9 @@ static void vb_free(const void *addr, unsigned long size) spin_unlock(&vb->lock); } -/** - * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer - * - * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily - * to amortize TLB flushing overheads. What this means is that any page you - * have now, may, in a former life, have been mapped into kernel virtual - * address by the vmap layer and so there might be some CPUs with TLB entries - * still referencing that page (additional to the regular 1:1 kernel mapping). - * - * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can - * be sure that none of the pages we have control over will have any aliases - * from the vmap layer. - */ -void vm_unmap_aliases(void) +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { - unsigned long start = ULONG_MAX, end = 0; int cpu; - int flush = 0; if (unlikely(!vmap_initialized)) return; @@ -1146,6 +1659,27 @@ void vm_unmap_aliases(void) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. 
What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int flush = 0; + + _vm_unmap_aliases(start, end, flush); +} EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** @@ -1165,16 +1699,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1224,33 +1758,6 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; - -/** - * vm_area_check_early - check if vmap area is already mapped - * @vm: vm_struct to be checked - * - * This function is used to check if the vmap area has been - * mapped already. @vm->addr, @vm->size and @vm->flags should - * contain proper values. - * - */ -int __init vm_area_check_early(struct vm_struct *vm) -{ - struct vm_struct *tmp, **p; - - BUG_ON(vmap_initialized); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= vm->addr) { - if (tmp->addr < vm->addr + vm->size) - return 1; - } else { - if (tmp->addr + tmp->size > vm->addr) - return 1; - } - } - return 0; -} - /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1302,12 +1809,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm_area_add_early(vm); } +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; @@ -1322,16 +1875,21 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. 
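Editor's note on vmap_init_free_space() above: the initial free areas are simply the complement of the boot-time busy areas. A simplified userspace sketch of that gap-walk over a sorted array; the address ranges are made up, and the real code inserts each gap into the augmented red-black tree instead of printing it:

#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) */

int main(void)
{
	/* Busy areas, sorted by start address (made-up values). */
	struct range busy[] = {
		{ 0x1000, 0x3000 },
		{ 0x5000, 0x6000 },
		{ 0x9000, 0xa000 },
	};
	unsigned long vmap_start = 0x1;		/* mirrors the patch */
	const unsigned long vmap_end = ~0UL;
	size_t i;

	/* Every hole between two busy areas becomes a free area. */
	for (i = 0; i < sizeof(busy) / sizeof(busy[0]); i++) {
		if (busy[i].start - vmap_start > 0)
			printf("free: [%#lx, %#lx)\n", vmap_start, busy[i].start);
		vmap_start = busy[i].end;
	}

	/* Trailing hole up to the end of the address space. */
	if (vmap_end - vmap_start > 0)
		printf("free: [%#lx, %#lx)\n", vmap_start, vmap_end);

	return 0;
}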
*/ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - __insert_vmap_area(va); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = VMALLOC_END; - calc_total_vmalloc_size(); + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); vmap_initialized = true; } @@ -1558,9 +2116,9 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); @@ -1569,6 +2127,72 @@ struct vm_struct *remove_vm_area(const void *addr) return NULL; } +static inline void set_area_direct_map(const struct vm_struct *area, + int (*set_direct_map)(struct page *page)) +{ + int i; + + for (i = 0; i < area->nr_pages; i++) + if (page_address(area->pages[i])) + set_direct_map(area->pages[i]); +} + +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long start = ULONG_MAX, end = 0; + int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; + int i; + + /* + * The below block can be removed when all architectures that have + * direct map permissions also have set_direct_map_() implementations. + * This is concerned with resetting the direct map any an vm alias with + * execute permissions, without leaving a RW+X window. + */ + if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + set_memory_nx(addr, area->nr_pages); + set_memory_rw(addr, area->nr_pages); + } + + remove_vm_area(area->addr); + + /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ + if (!flush_reset) + return; + + /* + * If not deallocating pages, just do the flush of the VM area and + * return. + */ + if (!deallocate_pages) { + vm_unmap_aliases(); + return; + } + + /* + * If execution gets here, flush the vm mapping and reset the direct + * map. Find the start and end range of the direct mappings to make sure + * the vm_unmap_aliases() flush includes the direct map. + */ + for (i = 0; i < area->nr_pages; i++) { + if (page_address(area->pages[i])) { + start = min(addr, start); + end = max(addr, end); + } + } + + /* + * Set direct map to something invalid so that it won't be cached if + * there are any accesses after the TLB flush, then flush the TLB and + * reset the direct map permissions to the default. 
+ */ + set_area_direct_map(area, set_direct_map_invalid_noflush); + _vm_unmap_aliases(start, end, 1); + set_area_direct_map(area, set_direct_map_default_noflush); +} + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -1587,10 +2211,11 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + + vm_remove_mappings(area, deallocate_pages); - remove_vm_area(addr); if (deallocate_pages) { int i; @@ -1641,6 +2266,14 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } +static void __vfree(const void *addr) +{ + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1661,12 +2294,12 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + + __vfree(addr); } EXPORT_SYMBOL(vfree); @@ -1706,7 +2339,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > totalram_pages) + if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; @@ -1733,6 +2366,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? + 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1740,7 +2376,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1756,9 +2392,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1767,7 +2403,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -1780,7 +2416,7 @@ fail: warn_alloc(gfp_mask, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); - vfree(area->addr); + __vfree(area->addr); return NULL; } @@ -1810,7 +2446,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | @@ -1835,12 +2471,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; @@ -1850,6 +2481,15 @@ fail: return NULL; } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node_range); +#endif + /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size @@ -1862,6 +2502,13 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. 
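Editor's note on __vmalloc_area_node() above: a highmem_mask is derived once so that __GFP_HIGHMEM is only ORed into the page allocations when the caller did not ask for a DMA zone. A compile-and-run sketch of that flag selection; the flag constants below are invented stand-ins, not the kernel's values:

#include <stdio.h>

typedef unsigned int gfp_t;

/* Stand-in flag bits, invented for the example. */
#define GFP_DMA        0x01u
#define GFP_DMA32      0x02u
#define __GFP_HIGHMEM  0x04u
#define GFP_KERNEL     0x10u

static gfp_t highmem_mask_for(gfp_t gfp_mask)
{
	/* Never combine a DMA zone request with highmem pages. */
	return (gfp_mask & (GFP_DMA | GFP_DMA32)) ? 0 : __GFP_HIGHMEM;
}

int main(void)
{
	printf("GFP_KERNEL           -> %#x\n",
	       GFP_KERNEL | highmem_mask_for(GFP_KERNEL));
	printf("GFP_KERNEL | GFP_DMA -> %#x\n",
	       (GFP_KERNEL | GFP_DMA) | highmem_mask_for(GFP_KERNEL | GFP_DMA));
	return 0;
}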
+ * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1878,6 +2525,20 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1890,7 +2551,7 @@ EXPORT_SYMBOL(__vmalloc); void *vmalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM); + GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -1907,7 +2568,7 @@ EXPORT_SYMBOL(vmalloc); void *vzalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1920,18 +2581,10 @@ EXPORT_SYMBOL(vzalloc); */ void *vmalloc_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); @@ -1948,7 +2601,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1968,14 +2621,10 @@ EXPORT_SYMBOL(vmalloc_node); void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_user_node_flags - allocate memory for userspace on a specific node * @size: allocation size @@ -2010,16 +2659,21 @@ EXPORT_SYMBOL(vmalloc_user_node_flags); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else -#define GFP_VMALLOC32 GFP_KERNEL +/* + * 64b systems should always have either DMA or DMA32 zones. For others + * GFP_DMA32 should do the right thing and use the normal zone. 
+ */ +#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL #endif /** @@ -2045,16 +2699,10 @@ EXPORT_SYMBOL(vmalloc_32); */ void *vmalloc_32_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); @@ -2462,81 +3110,64 @@ static struct vmap_area *node_to_va(struct rb_node *n) } /** - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end - * @end: target address - * @pnext: out arg for the next vmap_area - * @pprev: out arg for the previous vmap_area + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to + * @addr: target address * - * Returns: %true if either or both of next and prev are found, - * %false if no vmap_area exists - * - * Find vmap_areas end addresses of which enclose @end. ie. if not - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + * Returns: vmap_area if it is found. If there is no such area + * the first highest(reverse order) vmap_area is returned + * i.e. va->va_start < addr && va->va_end < addr or NULL + * if there are no any areas before @addr. */ -static bool pvm_find_next_prev(unsigned long end, - struct vmap_area **pnext, - struct vmap_area **pprev) +static struct vmap_area * +pvm_find_va_enclose_addr(unsigned long addr) { - struct rb_node *n = vmap_area_root.rb_node; - struct vmap_area *va = NULL; + struct vmap_area *va, *tmp; + struct rb_node *n; + + n = free_vmap_area_root.rb_node; + va = NULL; while (n) { - va = rb_entry(n, struct vmap_area, rb_node); - if (end < va->va_end) - n = n->rb_left; - else if (end > va->va_end) + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_start <= addr) { + va = tmp; + if (tmp->va_end >= addr) + break; + n = n->rb_right; - else - break; + } else { + n = n->rb_left; + } } - if (!va) - return false; - - if (va->va_end > end) { - *pnext = va; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); - } else { - *pprev = va; - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); - } - return true; + return va; } /** - * pvm_determine_end - find the highest aligned address between two vmap_areas - * @pnext: in/out arg for the next vmap_area - * @pprev: in/out arg for the previous vmap_area - * @align: alignment + * pvm_determine_end_from_reverse - find the highest aligned address + * of free block below VMALLOC_END + * @va: + * in - the VA we start the search(reverse order); + * out - the VA with the highest aligned end address. * - * Returns: determined end address - * - * Find the highest aligned address between *@pnext and *@pprev below - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned - * down address is between the end addresses of the two vmap_areas. - * - * Please note that the address returned by this function may fall - * inside *@pnext vmap_area. The caller is responsible for checking - * that. 
+ * Returns: determined end address within vmap_area */ -static unsigned long pvm_determine_end(struct vmap_area **pnext, - struct vmap_area **pprev, - unsigned long align) +static unsigned long +pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; - if (*pnext) - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); - else - addr = vmalloc_end; - - while (*pprev && (*pprev)->va_end > addr) { - *pnext = *pprev; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + if (likely(*va)) { + list_for_each_entry_from_reverse((*va), + &free_vmap_area_list, list) { + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); + if ((*va)->va_start < addr) + return addr; + } } - return addr; + return 0; } /** @@ -2556,12 +3187,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * - * Despite its complicated look, this allocator is rather simple. It - * does everything top-down and scans areas from the end looking for - * matching slot. While scanning, if any of the areas overlaps with - * existing vmap_area, the base address is pulled down to fit the - * area. Scanning is repeated till all the areas fit and then all - * necessary data structres are inserted and the result is returned. + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans free blocks from the end looking + * for matching base. While scanning, if any of the areas do not fit the + * base address is pulled down to fit the area. Scanning is repeated till + * all the areas fit and then all necessary data structures are inserted + * and the result is returned. 
*/ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2569,11 +3200,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); - struct vmap_area **vas, *prev, *next; + struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; - unsigned long base, start, end, last_end; + unsigned long base, start, size, end, last_end; bool purged = false; + enum fit_type type; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); @@ -2589,15 +3221,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, if (start > offsets[last_area]) last_area = area; - for (area2 = 0; area2 < nr_vms; area2++) { + for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; - if (area2 == area) - continue; - - BUG_ON(start2 >= start && start2 < end); - BUG_ON(end2 <= end && end2 > start); + BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; @@ -2613,7 +3241,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free2; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; @@ -2626,49 +3254,29 @@ retry: start = offsets[area]; end = start + sizes[area]; - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { - base = vmalloc_end - last_end; - goto found; - } - base = pvm_determine_end(&next, &prev, align) - end; + va = pvm_find_va_enclose_addr(vmalloc_end); + base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { - BUG_ON(next && next->va_end <= base + end); - BUG_ON(prev && prev->va_end > base + end); - /* * base might have underflowed, add last_end before * comparing. */ - if (base + last_end < vmalloc_start + last_end) { - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = true; - goto retry; - } - goto err_free; - } + if (base + last_end < vmalloc_start + last_end) + goto overflow; /* - * If next overlaps, move base downwards so that it's - * right below next and then recheck. + * Fitting base has not been found. */ - if (next && next->va_start < base + end) { - base = pvm_determine_end(&next, &prev, align) - end; - term_area = area; - continue; - } + if (va == NULL) + goto overflow; /* - * If prev overlaps, shift down next and prev and move - * base so that it's right below new next and then - * recheck. + * If this VA does not fit, move base downwards and recheck. 
*/ - if (prev && prev->va_end > base + start) { - next = prev; - prev = node_to_va(rb_prev(&next->rb_node)); - base = pvm_determine_end(&next, &prev, align) - end; + if (base + start < va->va_start || base + end > va->va_end) { + va = node_to_va(rb_prev(&va->rb_node)); + base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } @@ -2680,22 +3288,41 @@ retry: area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; + start = offsets[area]; end = start + sizes[area]; - pvm_find_next_prev(base + end, &next, &prev); + va = pvm_find_va_enclose_addr(base + end); } -found: + /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { - struct vmap_area *va = vas[area]; + int ret; - va->va_start = base + offsets[area]; - va->va_end = va->va_start + sizes[area]; - __insert_vmap_area(va); + start = base + offsets[area]; + size = sizes[area]; + + va = pvm_find_va_enclose_addr(start); + if (WARN_ON_ONCE(va == NULL)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + type = classify_va_fit_type(va, start, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + ret = adjust_va_to_fit_type(va, start, size, type); + if (unlikely(ret)) + goto recovery; + + /* Allocated area. */ + va = vas[area]; + va->va_start = start; + va->va_end = start + size; + + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = base + offsets[last_area]; - spin_unlock(&vmap_area_lock); /* insert all vm's */ @@ -2706,9 +3333,38 @@ found: kfree(vas); return vms; +recovery: + /* Remove previously inserted areas. */ + while (area--) { + __free_vmap_area(vas[area]); + vas[area] = NULL; + } + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + + /* Before "retry", check if we recover. */ + for (area = 0; area < nr_vms; area++) { + if (vas[area]) + continue; + + vas[area] = kmem_cache_zalloc( + vmap_area_cachep, GFP_KERNEL); + if (!vas[area]) + goto err_free; + } + + goto retry; + } + err_free: for (area = 0; area < nr_vms; area++) { - kfree(vas[area]); + if (vas[area]) + kmem_cache_free(vmap_area_cachep, vas[area]); + kfree(vms[area]); } err_free2: @@ -2805,8 +3461,14 @@ static int s_show(struct seq_file *m, void *p) * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start, + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + return 0; + } v = va->vm; diff --git a/mm/workingset.c b/mm/workingset.c index a6976119b4a4..522f8e66d957 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -511,10 +511,10 @@ static int __init workingset_init(void) * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to - * double the initial memory by using totalram_pages as-is. + * double the initial memory by using totalram_pages() as-is. 
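Editor's note on the pcpu_get_vm_areas() rework above: each per-cpu area is now carved out of a free block via classify_va_fit_type()/adjust_va_to_fit_type(), which are introduced earlier in the series. A userspace sketch of the classification step, assuming the same four fit types; the helper below is illustrative, not the kernel implementation:

#include <stdio.h>

enum fit_type { NOTHING_FIT, FL_FIT, LE_FIT, RE_FIT, NE_FIT };

struct vmap_area { unsigned long va_start, va_end; };

static enum fit_type
classify_fit(const struct vmap_area *va, unsigned long start, unsigned long size)
{
	unsigned long end = start + size;

	if (start < va->va_start || end > va->va_end)
		return NOTHING_FIT;	/* request not inside the free block */
	if (start == va->va_start && end == va->va_end)
		return FL_FIT;		/* consumes the whole block */
	if (start == va->va_start)
		return LE_FIT;		/* left edge, shrink block from the front */
	if (end == va->va_end)
		return RE_FIT;		/* right edge, shrink block from the back */
	return NE_FIT;			/* middle, block must be split in two */
}

int main(void)
{
	struct vmap_area free_block = { 0x1000, 0x9000 };

	printf("%d\n", classify_fit(&free_block, 0x1000, 0x8000));	/* FL_FIT */
	printf("%d\n", classify_fit(&free_block, 0x2000, 0x2000));	/* NE_FIT */
	printf("%d\n", classify_fit(&free_block, 0x8000, 0x1000));	/* RE_FIT */
	return 0;
}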
*/ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; - max_order = fls_long(totalram_pages - 1); + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 18c4b34bd6e0..696f13b58e65 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -174,7 +174,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return ptr; } - return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, flags, PAGE_KERNEL); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 936dab12f99f..21db0a88b158 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1149,10 +1149,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (21 - PAGE_SHIFT); + if (totalram_pages() >= (128 * 1024)) + goal = totalram_pages() >> (21 - PAGE_SHIFT); else - goal = totalram_pages >> (23 - PAGE_SHIFT); + goal = totalram_pages() >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 403593bd2b83..86b7eec8470c 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1877,7 +1877,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = totalram_pages >> (26 - PAGE_SHIFT); + goal = totalram_pages() >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..5e0d05f852c0 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1142,7 +1142,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = tcpmhash_entries; if (!slots) { - if (totalram_pages >= 128 * 1024) + if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ba46d5fa646b..ca54273657bd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2002,11 +2002,11 @@ int nf_conntrack_init_start(void) * >= 4GB machines have 65536 buckets. 
*/ nf_conntrack_htable_size - = (((totalram_pages << PAGE_SHIFT) / 16384) + = (((totalram_pages() << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + if (totalram_pages() > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 65536; - else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + else if (totalram_pages() > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2820fa6f399c..5c021ffcb51f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1038,7 +1038,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) return NULL; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((size >> PAGE_SHIFT) + 2 > totalram_pages) + if ((size >> PAGE_SHIFT) + 2 > totalram_pages()) return NULL; if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 97df6f9dbdde..d8cfddf61b30 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -253,9 +253,9 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, if (cfg->size) { size = cfg->size; } else { - size = (totalram_pages << PAGE_SHIFT) / 16384 / + size = (totalram_pages() << PAGE_SHIFT) / 16384 / sizeof(struct list_head); - if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (totalram_pages() > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 02afbe571008..24321a504dc5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1433,10 +1433,10 @@ static __init int sctp_init(void) * The methodology is similar to that of the tcp hash tables. * Though not identical. 
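Editor's note on the networking hunks above: they only swap direct reads of the totalram_pages variable for the totalram_pages() accessor, so callers no longer depend on how the counter is stored. A toy sketch of the same pattern; names and numbers are illustrative:

#include <stdio.h>

/* Storage is private to this translation unit... */
static unsigned long _totalram_pages = 32768;

/* ...callers go through an accessor, mirroring the totalram_pages() change. */
static unsigned long totalram_pages(void)
{
	return _totalram_pages;
}

int main(void)
{
	unsigned long goal;

	/* Same shape as the dccp/sctp sizing heuristics in the hunks above. */
	if (totalram_pages() >= 128 * 1024)
		goal = totalram_pages() >> 9;
	else
		goal = totalram_pages() >> 11;

	printf("hash goal: %lu buckets\n", goal);
	return 0;
}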
Start by getting a goal size */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (22 - PAGE_SHIFT); + if (totalram_pages() >= (128 * 1024)) + goal = totalram_pages() >> (22 - PAGE_SHIFT); else - goal = totalram_pages >> (24 - PAGE_SHIFT); + goal = totalram_pages() >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 5923d5665209..0ebdadd9c103 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -100,7 +100,7 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf, return ERR_PTR(-EACCES); /* freed by caller to simple_write_to_buffer */ - data = kvmalloc(alloc_size); + data = kvmalloc(alloc_size, GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 5d721e990876..6c119d29da6b 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -66,17 +66,6 @@ extern int apparmor_initialized __initdata; /* fn's in lib */ char *aa_split_fqname(char *args, char **ns_name); void aa_info_message(const char *str); -void *__aa_kvmalloc(size_t size, gfp_t flags); - -static inline void *kvmalloc(size_t size) -{ - return __aa_kvmalloc(size, 0); -} - -static inline void *kvzalloc(size_t size) -{ - return __aa_kvmalloc(size, __GFP_ZERO); -} /* returns 0 if kref not incremented */ static inline int kref_get_not0(struct kref *kref) diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index c1827e068454..a4975a7a395c 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -76,31 +76,3 @@ void aa_info_message(const char *str) printk(KERN_INFO "AppArmor: %s\n", str); } -/** - * __aa_kvmalloc - do allocation preferring kmalloc but falling back to vmalloc - * @size: how many bytes of memory are required - * @flags: the type of memory to allocate (see kmalloc). - * - * Return: allocated buffer or NULL if failed - * - * It is possible that policy being loaded from the user is larger than - * what can be allocated by kmalloc, in those cases fall back to vmalloc. - */ -void *__aa_kvmalloc(size_t size, gfp_t flags) -{ - void *buffer = NULL; - - if (size == 0) - return NULL; - - /* do not attempt kmalloc if we need more than 16 pages at once */ - if (size <= (16*PAGE_SIZE)) - buffer = kmalloc(size, flags | GFP_NOIO | __GFP_NOWARN); - if (!buffer) { - if (flags & __GFP_ZERO) - buffer = vzalloc(size); - else - buffer = vmalloc(size); - } - return buffer; -} diff --git a/security/apparmor/match.c b/security/apparmor/match.c index 3f900fcca8fb..0ffbafa48fa5 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -61,7 +61,7 @@ static struct table_header *unpack_table(char *blob, size_t bsize) if (bsize < tsize) goto out; - table = kvzalloc(tsize); + table = kvzalloc(tsize, GFP_KERNEL); if (table) { table->td_id = th.td_id; table->td_flags = th.td_flags; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d251b718bf53..5acfb995e966 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -530,7 +530,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) return NULL; @@ -749,18 +749,6 @@ out_err_no_disable: return ERR_PTR(r); } -/* - * Avoid using vmalloc for a small buffer. 
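Editor's note on the AppArmor hunks above: the subsystem-private __aa_kvmalloc()/kvzalloc() wrappers are deleted because common kvmalloc()/kvzalloc() helpers taking an explicit gfp_t now exist in this tree. A rough userspace approximation of the fallback policy of the removed helper, with malloc() standing in for both kmalloc() and vmalloc(); only the 16-page cutoff mirrors the deleted AppArmor code, everything else is illustrative:

#include <stdlib.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size, for illustration only */

/*
 * Try the cheap allocator for small requests and fall back to a second
 * allocator otherwise.  Both paths are plain malloc() in this sketch.
 */
static void *kvmalloc_like(size_t size)
{
	void *buf = NULL;

	if (size == 0)
		return NULL;

	if (size <= 16 * PAGE_SIZE)
		buf = malloc(size);	/* stands in for kmalloc() */

	if (!buf)
		buf = malloc(size);	/* stands in for vmalloc() */

	return buf;
}

int main(void)
{
	void *small = kvmalloc_like(512);
	void *large = kvmalloc_like(1 << 20);

	printf("small=%p large=%p\n", small, large);
	free(small);
	free(large);
	return 0;
}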
- * Should not be used when the size is statically known. - */ -void *kvm_kvzalloc(unsigned long size) -{ - if (size > PAGE_SIZE) - return vzalloc(size); - else - return kzalloc(size, GFP_KERNEL); -} - static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; @@ -845,7 +833,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -1064,7 +1052,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));