diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index b2391b829169..cb8862659178 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -150,6 +150,7 @@ See the include/linux/kmemleak.h header for the functions prototype. - ``kmemleak_init`` - initialize kmemleak - ``kmemleak_alloc`` - notify of a memory block allocation - ``kmemleak_alloc_percpu`` - notify of a percpu memory block allocation +- ``kmemleak_vmalloc`` - notify of a vmalloc() memory allocation - ``kmemleak_free`` - notify of a memory block freeing - ``kmemleak_free_part`` - notify of a partial memory block freeing - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing diff --git a/arch/Kconfig b/arch/Kconfig index eab393fb6ddb..6fc0c37d2145 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -227,6 +227,9 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. +config ARCH_HAS_SET_MEMORY + bool + config FORTIFY_COMPILE_CHECK depends on ARCH_HAS_FORTIFY_SOURCE bool diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 8e349ce8230c..4c825169668c 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -16,6 +16,7 @@ #include #include #include +#include #define CACHE_COLOUR(vaddr) ((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT) diff --git a/arch/arm/include/asm/set_memory.h b/arch/arm/include/asm/set_memory.h new file mode 100644 index 000000000000..5aa4315abe91 --- /dev/null +++ b/arch/arm/include/asm/set_memory.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 1999-2002 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASMARM_SET_MEMORY_H +#define _ASMARM_SET_MEMORY_H + +#ifdef CONFIG_MMU +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#ifdef CONFIG_STRICT_KERNEL_RWX +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); +#else +static inline void set_kernel_text_rw(void) { } +static inline void set_kernel_text_ro(void) { } +#endif + +#endif diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 3da63a6a2ac6..66e5d8765601 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -92,8 +92,7 @@ void __init add_static_vm_early(struct static_vm *svm) void *vaddr; vm = &svm->vm; - if (!vm_area_check_early(vm)) - vm_area_add_early(vm); + vm_area_add_early(vm); vaddr = vm->addr; list_for_each_entry(curr_svm, &static_vmlist, list) { diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 21980d28c38d..bc000444aca2 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1462,21 +1462,12 @@ static void __init map_lowmem(void) phys_addr_t kernel_x_start = round_down(__pa(_stext), SECTION_SIZE); #endif phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); - struct static_vm *svm; - phys_addr_t start; - phys_addr_t end; - unsigned long vaddr; - unsigned long pfn; - unsigned long length; - unsigned int type; - int nr = 0; /* Map all the lowmem memory banks. 
*/ for_each_memblock(memory, reg) { + phys_addr_t start = reg->base; + phys_addr_t end = start + reg->size; struct map_desc map; - start = reg->base; - end = start + reg->size; - nr++; if (memblock_is_nomap(reg)) continue; @@ -1528,34 +1519,6 @@ static void __init map_lowmem(void) } } } - svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm)); - - for_each_memblock(memory, reg) { - struct vm_struct *vm; - - start = reg->base; - end = start + reg->size; - - if (end > arm_lowmem_limit) - end = arm_lowmem_limit; - if (start >= end) - break; - - vm = &svm->vm; - pfn = __phys_to_pfn(start); - vaddr = __phys_to_virt(start); - length = end - start; - type = MT_MEMORY_RW; - - vm->addr = (void *)(vaddr & PAGE_MASK); - vm->size = PAGE_ALIGN(length + (vaddr & ~PAGE_MASK)); - vm->phys_addr = __pfn_to_phys(pfn); - vm->flags = VM_LOWMEM; - vm->flags |= VM_ARM_MTYPE(type); - vm->caller = map_lowmem; - add_static_vm_early(svm++); - mark_vmalloc_reserved_area(vm->addr, vm->size); - } } #ifdef CONFIG_ARM_PV_FIXUP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f50f0ce7d3f4..bf195051af0a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_SET_MEMORY select ARM_AMBA select ARM_ARCH_TIMER select HAVE_KERNEL_GZIP diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 28196b18e394..08ff982cab85 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += rwsem.h generic-y += segment.h generic-y += sembuf.h generic-y += serial.h +generic-y += set_memory.h generic-y += shmbuf.h generic-y += simd.h generic-y += sizes.h diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 9377bec034fa..11ba1a570394 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -20,6 
+20,7 @@ #define __ASM_CACHEFLUSH_H #include +#include /* * This flag is used to indicate that the page pointed to by a pte is clean diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 1e7fd45b60f8..c4e0c37200c3 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -43,7 +43,7 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); free_page(addr); - totalram_pages++; + totalram_pages_inc(); } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index a0ecdb4abcc8..3d4f5660a2e0 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -218,7 +218,7 @@ void *module_alloc(unsigned long size) * easier than trying to map the text, data, init_text and * init_data correctly */ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_HIGHMEM, + GFP_KERNEL, PAGE_KERNEL_RWX, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 316eded255e7..44171fb34e23 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -174,7 +174,7 @@ static long cmm_alloc_pages(long nr) pa->page[pa->index++] = addr; loaned_pages++; - totalram_pages--; + totalram_pages_dec(); spin_unlock(&cmm_lock); nr--; } @@ -213,7 +213,7 @@ static long cmm_free_pages(long nr) free_page(addr); loaned_pages--; nr--; - totalram_pages++; + totalram_pages_inc(); } spin_unlock(&cmm_lock); cmm_dbg("End request with %ld pages unfulfilled\n", nr); @@ -257,7 +257,7 @@ static void cmm_get_mpp(void) int rc; struct hvcall_mpp_data mpp_data; signed long active_pages_target, page_loan_request, target; - signed long total_pages = totalram_pages + loaned_pages; + signed long total_pages = totalram_pages() + loaned_pages; signed long min_mem_pages = 
(min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -288,7 +288,7 @@ static void cmm_get_mpp(void) cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, - oom_freed_pages, totalram_pages); + oom_freed_pages, totalram_pages()); } static struct notifier_block cmm_oom_nb = { @@ -552,7 +552,7 @@ static int cmm_mem_going_offline(void *arg) free_page(pa_curr->page[idx]); freed++; loaned_pages--; - totalram_pages++; + totalram_pages_inc(); pa_curr->page[idx] = pa_last->page[--pa_last->index]; if (pa_last->index == 0) { if (pa_curr == pa_last) diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h new file mode 100644 index 000000000000..46a4db44c47a --- /dev/null +++ b/arch/s390/include/asm/set_memory.h @@ -0,0 +1,31 @@ +#ifndef _ASMS390_SET_MEMORY_H +#define _ASMS390_SET_MEMORY_H + +#define SET_MEMORY_RO 1UL +#define SET_MEMORY_RW 2UL +#define SET_MEMORY_NX 4UL +#define SET_MEMORY_X 8UL + +int __set_memory(unsigned long addr, int numpages, unsigned long flags); + +static inline int set_memory_ro(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_RO); +} + +static inline int set_memory_rw(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_RW); +} + +static inline int set_memory_nx(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_NX); +} + +static inline int set_memory_x(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, SET_MEMORY_X); +} + +#endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index b3e9d18f2ec6..75b263e98378 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -56,7 +56,7 @@ static void __init setup_zero_pages(void) order = 7; /* Limit number of empty zero pages for small memory sizes */ - while (order > 2 && (totalram_pages >> 10) < (1UL << order)) + while (order > 2 && 
(totalram_pages() >> 10) < (1UL << order)) order--; empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 3b7092d9ea8f..4dbe42253e75 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -61,7 +61,7 @@ void show_mem(unsigned int filter) show_free_areas(filter); printk("Free swap: %6ldkB\n", get_nr_swap_pages() << (PAGE_SHIFT-10)); - printk("%ld pages of RAM\n", totalram_pages); + printk("%ld pages of RAM\n", totalram_pages()); printk("%ld free pages\n", nr_free_pages()); } diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index adce25462b0d..13c220db307c 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -708,7 +708,7 @@ static void __init set_non_bootmem_pages_init(void) #ifdef CONFIG_HIGHMEM if (idx == ZONE_HIGHMEM) - totalhigh_pages += z->spanned_pages; + totalhigh_pages_add(z->spanned_pages); #endif if (kdata_huge) { unsigned long percpu_pfn = node_percpu_pfn[nid]; diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e7437ec62710..50813cae069b 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -53,8 +53,8 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ free_all_bootmem(); - max_low_pfn = totalram_pages; - max_pfn = totalram_pages; + max_low_pfn = totalram_pages(); + max_pfn = totalram_pages(); mem_init_print_info(NULL); kmalloc_ok = 1; } diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h new file mode 100644 index 000000000000..eaec6c364e42 --- /dev/null +++ b/arch/x86/include/asm/set_memory.h @@ -0,0 +1,87 @@ +#ifndef _ASM_X86_SET_MEMORY_H +#define _ASM_X86_SET_MEMORY_H + +#include +#include + +/* + * The set_memory_* API can be used to change various attributes of a virtual + * address range. 
The attributes include: + * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXecutable, NoteXecutable + * Read/Write : ReadOnly, ReadWrite + * Presence : NotPresent + * + * Within a category, the attributes are mutually exclusive. + * + * The implementation of this API will take care of various aspects that + * are associated with changing such attributes, such as: + * - Flushing TLBs + * - Flushing CPU caches + * - Making sure aliases of the memory behind the mapping don't violate + * coherency rules as defined by the CPU in the system. + * + * What this API does not do: + * - Provide exclusion between various callers - including callers that + * operate on other mappings of the same physical page + * - Restore default attributes when a page is freed + * - Guarantee that mappings other than the requested one are + * in any state, other than that these do not violate rules for + * the CPU you have. Do not depend on any effects on other mappings, + * CPUs other than the one you have may have more relaxed rules. + * The caller is required to take care of these. 
+ */ + +int _set_memory_uc(unsigned long addr, int numpages); +int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); +int _set_memory_wb(unsigned long addr, int numpages); +int set_memory_uc(unsigned long addr, int numpages); +int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); +int set_memory_wb(unsigned long addr, int numpages); +int set_memory_np(unsigned long addr, int numpages); +int set_memory_4k(unsigned long addr, int numpages); + +int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); +int set_memory_array_wb(unsigned long *addr, int addrinarray); + +int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); +int set_pages_array_wb(struct page **pages, int addrinarray); + +/* + * For legacy compatibility with the old APIs, a few functions + * are provided that work on a "struct page". + * These functions operate ONLY on the 1:1 kernel mapping of the + * memory that the struct page represents, and internally just + * call the set_memory_* function. See the description of the + * set_memory_* function for more details on conventions. + * + * These APIs should be considered *deprecated* and are likely going to + * be removed in the future. + * The reason for this is the implicit operation on the 1:1 mapping only, + * making this not a generally useful API. + * + * Specifically, many users of the old APIs had a virtual address, + * called virt_to_page() or vmalloc_to_page() on that address to + * get a struct page* that the old API required. + * To convert these cases, use set_memory_*() on the original + * virtual address, do not use these functions. 
+ */ + +int set_pages_uc(struct page *page, int numpages); +int set_pages_wb(struct page *page, int numpages); +int set_pages_x(struct page *page, int numpages); +int set_pages_nx(struct page *page, int numpages); +int set_pages_ro(struct page *page, int numpages); +int set_pages_rw(struct page *page, int numpages); + +extern int kernel_set_to_readonly; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); + +#endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 499bc79fc82a..55f1c48dba05 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -325,8 +325,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("too much data (max %ld pages)\n", totalram_pages); + if ((len >> PAGE_SHIFT) > totalram_pages()) { + pr_err("too much data (max %ld pages)\n", totalram_pages()); return ret; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 3c09ca384199..d035ec1af1d1 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -85,7 +85,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e5f92488c3cd..ff8a391e10c5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -168,8 +168,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * 
((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 85024e0cfaa5..d5bab386c0fb 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -38,8 +38,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 007dc3298f88..5cbf52cec4dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8358,13 +8358,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 6d18b70ed5a9..8ac7132919a3 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,6 @@ #include #include -#include /* for totalram_pages */ +#include /* for totalram_pages() */ #include void *kmap(struct page *page) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1271bc9fa3c6..7d50d46c046f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1897,8 +1897,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -1937,6 +1935,17 @@ static int __set_pages_np(struct page *page, int 
numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -1966,7 +1975,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index dece26f119d4..a804a4107fbc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -409,7 +409,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + GFP_NOIO | __GFP_ZERO, PAGE_KERNEL); if (!new_pages) return NULL; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 38ffb281df97..004a3ce8ba72 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -115,9 +115,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = totalram_pages >> (20 - PAGE_SHIFT); + memory = totalram_pages() >> (20 - PAGE_SHIFT); #else - memory = totalram_pages << (PAGE_SHIFT - 20); + memory = totalram_pages() << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index fdf8da929cbe..b1b7deaab49d 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1048,16 +1048,16 @@ static unsigned long compute_balloon_floor(void) * 8192 744 (1/16) * 32768 1512 (1/32) */ - if (totalram_pages < MB2PAGES(128)) - min_pages = MB2PAGES(8) + (totalram_pages >> 1); - else if (totalram_pages < MB2PAGES(512)) - min_pages = MB2PAGES(40) 
+ (totalram_pages >> 2); - else if (totalram_pages < MB2PAGES(2048)) - min_pages = MB2PAGES(104) + (totalram_pages >> 3); - else if (totalram_pages < MB2PAGES(8192)) - min_pages = MB2PAGES(232) + (totalram_pages >> 4); + if (totalram_pages() < MB2PAGES(128)) + min_pages = MB2PAGES(8) + (totalram_pages() >> 1); + else if (totalram_pages() < MB2PAGES(512)) + min_pages = MB2PAGES(40) + (totalram_pages() >> 2); + else if (totalram_pages() < MB2PAGES(2048)) + min_pages = MB2PAGES(104) + (totalram_pages() >> 3); + else if (totalram_pages() < MB2PAGES(8192)) + min_pages = MB2PAGES(232) + (totalram_pages() >> 4); else - min_pages = MB2PAGES(488) + (totalram_pages >> 5); + min_pages = MB2PAGES(488) + (totalram_pages() >> 5); #undef MB2PAGES return min_pages; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 48bb5a879e6f..14b6004653a7 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -421,14 +421,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, - PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); } /* @@ -1907,7 +1906,7 @@ static int __init dm_bufio_init(void) memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(), DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index fdd4a840b30f..6ec1b8808311 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -84,7 +84,7 @@ static bool __check_shared_memory(size_t 
alloc_size) a = shared_memory_amount + alloc_size; if (a < shared_memory_amount) return false; - if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR) return false; #ifdef CONFIG_MMU if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) @@ -146,12 +146,7 @@ static void *dm_kvzalloc(size_t alloc_size, int node) if (!claim_shared_memory(alloc_size)) return NULL; - if (alloc_size <= KMALLOC_MAX_SIZE) { - p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (p) - return p; - } - p = vzalloc_node(alloc_size, node); + p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); if (p) return p; diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 4d7e470c1715..58ee3e4c826b 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -841,7 +841,7 @@ static int mtk_vpu_probe(struct platform_device *pdev) /* Set PTCM to 96K and DTCM to 32K */ vpu_cfg_writel(vpu, 0x2, VPU_TCM_CFG); - vpu->enable_4GB = !!(totalram_pages > (SZ_2G >> PAGE_SHIFT)); + vpu->enable_4GB = !!(totalram_pages() > (SZ_2G >> PAGE_SHIFT)); dev_info(dev, "4GB mode %u\n", vpu->enable_4GB); if (vpu->enable_4GB) { diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 84a93ddcd57a..aba3897844c5 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1272,7 +1272,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). 
*/ - iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages() / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1311,7 +1311,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index c715af1b6c3c..f9ed5b1c9258 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1436,7 +1436,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages() >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index 502ba33e3270..95c9dba9d009 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -379,7 +379,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, if (align > PAGE_SIZE) return -EINVAL; - if (size / PAGE_SIZE > totalram_pages / 2) + if (size / PAGE_SIZE > totalram_pages() / 2) return -ENOMEM; data.size = 0; diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h index 8c75d5075590..a38af2a561ab 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h @@ -42,7 +42,7 @@ #if BITS_PER_LONG == 32 /* limit to lowmem on 32-bit systems */ #define NUM_CACHEPAGES \ - min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4) + min(totalram_pages(), 1UL << (30 - 
PAGE_SHIFT) * 3 / 4) #else -#define NUM_CACHEPAGES totalram_pages +#define NUM_CACHEPAGES totalram_pages() #endif diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c index 8b551d2708ba..2d95784b660c 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c @@ -249,7 +249,7 @@ void cfs_print_to_console(struct ptldebug_header *hdr, int mask, int cfs_trace_max_debug_mb(void) { - int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); + int total_mb = (totalram_pages() >> (20 - PAGE_SHIFT)); return max(512, (total_mb * 80) / 100); } diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h index 722c33f7eecc..9057f715c0b5 100644 --- a/drivers/staging/lustre/lustre/include/obd.h +++ b/drivers/staging/lustre/lustre/include/obd.h @@ -1217,8 +1217,8 @@ static inline void client_adjust_max_dirty(struct client_obd *cli) cli->cl_dirty_max_pages = dirty_max; } - if (cli->cl_dirty_max_pages > totalram_pages / 8) - cli->cl_dirty_max_pages = totalram_pages / 8; + if (cli->cl_dirty_max_pages > totalram_pages() / 8) + cli->cl_dirty_max_pages = totalram_pages() / 8; } #endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c index 153e990c494e..1a2d41d99867 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -363,11 +363,11 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) if (!strcmp(name, LUSTRE_MDC_NAME)) { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 256 /* MB */) { 
cli->cl_max_rpcs_in_flight = 3; - } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { + } else if (totalram_pages() >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c index 13ec24d44b04..e1da3cf62a61 100644 --- a/drivers/staging/lustre/lustre/llite/lproc_llite.c +++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -244,9 +244,9 @@ static ssize_t max_read_ahead_mb_store(struct kobject *kobj, pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - if (pages_number > totalram_pages / 2) { + if (pages_number > totalram_pages() / 2) { CERROR("can't set file readahead more than %lu MB\n", - totalram_pages >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ + totalram_pages() >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ return -ERANGE; } @@ -411,10 +411,10 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file, return -ERANGE; pages_number = (long)val; - if (pages_number < 0 || pages_number > totalram_pages) { + if (pages_number < 0 || pages_number > totalram_pages()) { CERROR("%s: can't set max cache more than %lu MB\n", ll_get_fsname(sb, NULL, 0), - totalram_pages >> (20 - PAGE_SHIFT)); + totalram_pages() >> (20 - PAGE_SHIFT)); return -ERANGE; } diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c index 76e1ee83a723..ff3f738f01dc 100644 --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -477,10 +477,10 @@ static int __init obdclass_init(void) * For clients with less memory, a larger fraction is needed * for other purposes (mostly for BGL). 
*/ - if (totalram_pages <= 512 << (20 - PAGE_SHIFT)) - obd_max_dirty_pages = totalram_pages / 4; + if (totalram_pages() <= 512 << (20 - PAGE_SHIFT)) + obd_max_dirty_pages = totalram_pages() / 4; else - obd_max_dirty_pages = totalram_pages / 2; + obd_max_dirty_pages = totalram_pages() / 2; err = obd_init_caches(); if (err) diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c index e6c785afceba..d7d2ea8c3c40 100644 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -112,7 +112,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - if (val > ((totalram_pages / 10) * 9)) { + if (val > ((totalram_pages() / 10) * 9)) { /* Somebody wants to assign too much memory to dirty pages */ return -EINVAL; } diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index 054e567e6c8d..107f4362109c 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -884,7 +884,7 @@ static unsigned long lu_htable_order(struct lu_device *top) * * Size of lu_object is (arbitrary) taken as 1K (together with inode). 
*/ - cache_size = totalram_pages; + cache_size = totalram_pages(); #if BITS_PER_LONG == 32 /* limit hashtable size for lowmem systems to low RAM */ diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c index f0062d44ee03..fc3fab479e16 100644 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -163,7 +163,7 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, if (pages_number <= 0 || pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > totalram_pages / 4) /* 1/4 of RAM */ + pages_number > totalram_pages() / 4) /* 1/4 of RAM */ return -ERANGE; spin_lock(&cli->cl_loi_list_lock); diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c index 9bad57d65db4..84845c274a2e 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c +++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -306,7 +306,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, * far. */ bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > totalram_pages / (2 * bufpages)) + if (val > totalram_pages() / (2 * bufpages)) return -ERANGE; spin_lock(&svc->srv_lock); diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c index b2cc5ea6cb93..f9095a84fd15 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -140,7 +140,7 @@ int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) "low free mark: %lu\n" "max waitqueue depth: %u\n" "max wait time: %ld/%lu\n", - totalram_pages, + totalram_pages(), PAGES_PER_POOL, page_pools.epp_max_pages, page_pools.epp_max_pools, @@ -378,7 +378,7 @@ int sptlrpc_enc_pool_init(void) * maximum capacity is 1/8 of total physical memory. * is the 1/8 a good number? 
*/ - page_pools.epp_max_pages = totalram_pages / 8; + page_pools.epp_max_pages = totalram_pages() / 8; page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); init_waitqueue_head(&page_pools.epp_waitq); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1b76e8a99c40..67a6f62e3313 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -747,7 +747,7 @@ static void __init balloon_add_region(unsigned long start_pfn, for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { page = pfn_to_page(pfn); - /* totalram_pages and totalhigh_pages do not + /* totalram_pages() and totalhigh_pages() do not include the boot-time balloon extension, so don't subtract from it. */ __balloon_append(page); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 66620713242a..21e96adb00b8 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -188,7 +188,7 @@ static void selfballoon_process(struct work_struct *work) bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = totalram_pages; + cur_pages = totalram_pages(); tgt_pages = cur_pages; /* default is no change */ goal_pages = vm_memory_committed() + totalreserve_pages + @@ -226,7 +226,7 @@ static void selfballoon_process(struct work_struct *work) if (tgt_pages < floor_pages) tgt_pages = floor_pages; balloon_set_new_target(tgt_pages + - balloon_stats.current_pages - totalram_pages); + balloon_stats.current_pages - totalram_pages()); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -568,7 +568,7 @@ int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) * much more reliably and response faster in some cases. 
*/ if (!selfballoon_reserved_mb) { - reserve_pages = totalram_pages / 10; + reserve_pages = totalram_pages() / 10; selfballoon_reserved_mb = PAGES2MB(reserve_pages); } schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 57401b474ec6..14476074957a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size) if (mem) return mem; - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 89fbff1a9b2c..e32046f0848e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2387,7 +2387,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4384bbe61415..f5e8c1247af5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2130,8 +2130,8 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) if (size <= sbi->s_flex_groups_allocated) return 0; - new_groups = ext4_kvzalloc(roundup_pow_of_two(size * - sizeof(*sbi->s_flex_groups)), GFP_KERNEL); + new_groups = kvzalloc(roundup_pow_of_two(size * + sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); @@ -3993,7 +3993,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } rcu_assign_pointer(sbi->s_group_desc, - ext4_kvmalloc(db_count * + kvmalloc(db_count * sizeof(struct 
buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { diff --git a/fs/file.c b/fs/file.c index be0792c0a231..33e39e40fe1e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size) if (data != NULL) return data; } - return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) diff --git a/fs/file_table.c b/fs/file_table.c index 251d54ee7ef7..be36504b8a01 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -334,10 +334,10 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + unsigned long memreserve = (totalram_pages() - nr_free_pages()) * 3/2; - memreserve = min(memreserve, totalram_pages - 1); - n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + memreserve = min(memreserve, totalram_pages() - 1); + n = ((totalram_pages() - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 75e0d1297775..5c68fa5a4c08 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -829,7 +829,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) diff --git a/fs/proc/base.c b/fs/proc/base.c index b6959f6dae5b..9f64c9d6a06c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -576,7 +576,7 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long totalpages = totalram_pages() + 
total_swap_pages; unsigned long points = 0; points = oom_badness(task, NULL, NULL, totalpages) * diff --git a/fs/seq_file.c b/fs/seq_file.c index 3ade39e02bb7..023d92dfffa9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,24 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - if (unlikely(size > MAX_RW_COUNT)) - return NULL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. - */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index bb2beaef531a..0b770f861897 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -66,7 +66,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) noio_flag = memalloc_noio_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) memalloc_noio_restore(noio_flag); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index b43fa9d95a7a..1fc64a5e2d79 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -841,6 +841,10 @@ static inline bool arch_has_pfn_modify_check(void) } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h new file mode 100644 index 000000000000..83e81f8996b2 --- /dev/null +++ b/include/asm-generic/set_memory.h 
@@ -0,0 +1,12 @@ +#ifndef __ASM_SET_MEMORY_H +#define __ASM_SET_MEMORY_H + +/* + * Functions to change memory attributes. + */ +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); + +#endif diff --git a/include/drm/drm_mem_util.h b/include/drm/drm_mem_util.h index 70d4e221a3ad..d0f6cf2e5324 100644 --- a/include/drm/drm_mem_util.h +++ b/include/drm/drm_mem_util.h @@ -37,8 +37,7 @@ static __inline__ void *drm_calloc_large(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kcalloc(nmemb, size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size * nmemb); } /* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. */ @@ -50,8 +49,7 @@ static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kmalloc(nmemb * size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return vmalloc(size * nmemb); } static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) @@ -69,8 +67,7 @@ static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) return ptr; } - return __vmalloc(size * nmemb, - gfp | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size * nmemb, gfp, PAGE_KERNEL); } static __inline void drm_free_large(void *ptr) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 657b56524a8a..dcc6ec1d473b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -35,7 +35,31 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern unsigned long totalhigh_pages; +extern atomic_long_t _totalhigh_pages; +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned 
long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_dec(void) +{ + atomic_long_dec(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +static inline void totalhigh_pages_set(long val) +{ + atomic_long_set(&_totalhigh_pages, val); +} void kmap_flush_unused(void); @@ -57,7 +81,7 @@ static inline struct page *kmap_to_page(void *addr) return virt_to_page(addr); } -#define totalhigh_pages 0UL +static inline unsigned long totalhigh_pages(void) { return 0UL; } #ifndef ARCH_HAS_KMAP static inline void *kmap(struct page *page) diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 1c2a32829620..590343f6c1b1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -22,6 +22,7 @@ #define __KMEMLEAK_H #include +#include #ifdef CONFIG_DEBUG_KMEMLEAK @@ -30,6 +31,8 @@ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -81,6 +84,10 @@ static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { } +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} static inline void kmemleak_free(const void *ptr) { } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 05aa860daa5c..dad7fb0c949c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,8 +762,6 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct 
kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void *kvm_kvzalloc(unsigned long size); - #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/include/linux/list.h b/include/linux/list.h index 3ef3ade9930e..85baa403499d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -556,6 +556,19 @@ static inline void list_splice_tail_init(struct list_head *list, for (; &pos->member != (head); \ pos = list_next_entry(pos, member)) +/** + * list_for_each_entry_from_reverse - iterate backwards over list of given type + * from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, continuing from current position. + */ +#define list_for_each_entry_from_reverse(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. diff --git a/include/linux/llist.h b/include/linux/llist.h index ac6796138ba0..9bbc3ed179c8 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -121,6 +121,25 @@ static inline void init_llist_head(struct llist_head *list) #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) +/** + * llist_for_each_safe - iterate over some deleted entries of a lock-less list + * safe against removal of list entry + * @pos: the &struct llist_node to use as a loop cursor + * @n: another &struct llist_node to use as temporary storage + * @node: the first entry of deleted list entries + * + * In general, some entries of the lock-less list can be traversed + * safely only after being deleted from list, so start with an entry + * instead of list head. 
+ * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_safe(pos, n, node) \ + for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) + /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b722db541b0..10b6a3ee38db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,7 +44,32 @@ static inline void set_max_mapnr(unsigned long limit) static inline void set_max_mapnr(unsigned long limit) { } #endif -extern unsigned long totalram_pages; +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); +} + +static inline void totalram_pages_set(long val) +{ + atomic_long_set(&_totalram_pages, val); +} + extern void * high_memory; extern int page_cluster; @@ -484,16 +509,16 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. 
*/ - -#ifdef CONFIG_MMU -extern int is_vmalloc_addr(const void *x); -#else -static inline int is_vmalloc_addr(const void *x) +static inline bool is_vmalloc_addr(const void *x) { - return 0; -} -#endif +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + return addr >= VMALLOC_START && addr < VMALLOC_END; +#else + return false; +#endif +} #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else @@ -516,6 +541,7 @@ static inline void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } + extern void kvfree(const void *addr); /* diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h new file mode 100644 index 000000000000..b5071497b8cb --- /dev/null +++ b/include/linux/set_memory.h @@ -0,0 +1,57 @@ +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + */ +#ifndef _LINUX_SET_MEMORY_H_ +#define _LINUX_SET_MEMORY_H_ + +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +#include +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP +static inline int set_direct_map_invalid_noflush(struct page *page) +{ + return 0; +} +static inline int set_direct_map_default_noflush(struct page *page) +{ + return 0; +} +#endif + +#ifndef set_mce_nospec +static inline int set_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + +#ifndef clear_mce_nospec +static inline int clear_mce_nospec(unsigned long pfn) +{ + return 0; +} +#endif + +#ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT +static inline int 
set_memory_encrypted(unsigned long addr, int numpages) +{ + return 0; +} + +static inline int set_memory_decrypted(unsigned long addr, int numpages) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#endif /* _LINUX_SET_MEMORY_H_ */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 72d03e816293..ab50f9e5ee64 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -286,7 +286,6 @@ static inline void workingset_node_shadows_dec(struct radix_tree_node *node) } /* linux/mm/page_alloc.c */ -extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index de477793ed60..d662eaebebeb 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -19,8 +19,11 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ -#define VM_LOWMEM 0x00000100 /* Tracking of direct mapped lowmem */ - +/* + * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with + * vfree_atomic(). + */ +#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -45,12 +48,16 @@ struct vm_struct { struct vmap_area { unsigned long va_start; unsigned long va_end; + + /* + * Largest available free size in subtree. 
+ */ + unsigned long subtree_max_size; unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; - struct rcu_head rcu_head; }; /* @@ -85,6 +92,17 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +#ifndef CONFIG_MMU +extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) +{ + return __vmalloc_node_flags(size, node, flags); +} +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); +#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); @@ -145,6 +163,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); +static inline void set_vm_flush_reset_perms(void *addr) +{ + struct vm_struct *vm = find_vm_area(addr); + + if (vm) + vm->flags |= VM_FLUSH_RESET_PERMS; +} #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, @@ -160,6 +185,9 @@ static inline void unmap_kernel_range(unsigned long addr, unsigned long size) { } +static inline void set_vm_flush_reset_perms(void *addr) +{ +} #endif /* Allocate/destroy a 'vmalloc' VM area. 
*/ @@ -176,13 +204,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); -extern __init int vm_area_check_early(struct vm_struct *vm); -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -extern void mark_vmalloc_reserved_area(void *addr, unsigned long size); -#else -static inline void mark_vmalloc_reserved_area(void *addr, unsigned long size) -{ }; -#endif #ifdef CONFIG_SMP # ifdef CONFIG_MMU @@ -208,12 +229,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #endif #ifdef CONFIG_MMU -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -extern unsigned long total_vmalloc_size; -#define VMALLOC_TOTAL total_vmalloc_size -#else #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#endif #else #define VMALLOC_TOTAL 0UL #endif diff --git a/ipc/util.c b/ipc/util.c index 76d4afcde7bb..721b96d8b9c3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -403,12 +403,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void *ipc_alloc(int size) { - void *out; - if (size > PAGE_SIZE) - out = vmalloc(size); - else - out = kmalloc(size, GFP_KERNEL); - return out; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5a2b9db4b966..5ef7e997f853 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -81,8 +81,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -198,8 +197,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog) struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | 
__GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; @@ -937,8 +935,7 @@ out: static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index df0092ae3313..ecc6ad6fb5a8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -152,6 +152,7 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, __builtin_return_address(0)); + /*return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL);*/ } void *bpf_map_area_alloc(size_t size, int numa_node) diff --git a/kernel/fork.c b/kernel/fork.c index ae4d5fa88d03..6002350b1bfa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -199,7 +199,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, + THREADINFO_GFP, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -420,10 +420,10 @@ static void set_max_threads(unsigned int max_threads_suggested) * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. 
*/ - if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) + if (fls64(totalram_pages()) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else - threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + threads = div64_u64((u64) totalram_pages() * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) diff --git a/kernel/groups.c b/kernel/groups.c index 94bde5210e3d..b5bb6e88376e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); if (!gi) return NULL; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f5ab72ebda11..1a5ab1a64a14 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -223,13 +223,13 @@ int sanity_check_segment_list(struct kimage *image) * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { - if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages() / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } - if (total_pages > totalram_pages / 2) + if (total_pages > totalram_pages() / 2) return -EINVAL; /* diff --git a/kernel/module.c b/kernel/module.c index bdbd03fae41a..2d2199dc16ca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2879,7 +2879,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, /* Suck in entire file: we'll want most of it. 
*/ info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL); + GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); if (!info->hdr) return -ENOMEM; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02850cfc8ee..e14e98c5de2d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -101,7 +101,7 @@ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; } /* diff --git a/mm/bootmem.c b/mm/bootmem.c index d14efd6fda06..0b8c5b3ab621 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -162,7 +162,7 @@ void free_bootmem_late(unsigned long physaddr, unsigned long size) for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -278,7 +278,7 @@ unsigned long __init free_all_bootmem(void) list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); - totalram_pages += total_pages; + totalram_pages_add(total_pages); return total_pages; } diff --git a/mm/highmem.c b/mm/highmem.c index 50b4ca6787f0..a81bbfe69049 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,9 +104,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) } #endif -unsigned long totalhigh_pages __read_mostly; -EXPORT_SYMBOL(totalhigh_pages); - +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 34511b5b22b1..f9714777813a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -409,7 +409,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. 
*/ - if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b9128eaafffe..69c1194db950 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2220,7 +2220,7 @@ static void __init gather_bootmem_prealloc(void) prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need - * to restore the 'stolen' pages to totalram_pages in order to + * to restore the 'stolen' pages to totalram_pages() in order to * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 3c572104eb76..2d33a92a89fb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -691,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 3a8ddf8baf7d..daffe1da66d6 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -236,7 +236,7 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (totalram_pages() << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); new_quarantine_size = (total_size < percpu_quarantines) ? 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3d822d1491c4..83ecadc66586 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -148,7 +148,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -157,6 +157,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -263,7 +265,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -272,9 +275,12 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ - const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ int min_count; /* minimum reference count */ + const void *ptr; /* allocated/freed memory block */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -403,7 +409,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); @@ -572,6 +578,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; 
object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -804,6 +811,30 @@ out: put_object(object); } +/* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). + */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + /* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it @@ -918,7 +949,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -961,6 +992,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); +/** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. 
+ */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + /** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object @@ -1197,6 +1258,30 @@ static bool update_checksum(struct kmemleak_object *object) return object->checksum != old_csum; } +/* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + /* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. 
@@ -1234,6 +1319,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1269,25 +1355,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. - */ - object->count++; + /* only pass surplus references (object already gray) */ if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1994,6 +2082,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); diff --git a/mm/memblock.c b/mm/memblock.c index 9c96c537b68d..422cd907008f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1397,7 +1397,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) for (; cursor < end; 
cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..db3a196826da 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -151,7 +151,7 @@ static void __meminit mm_compute_batch(void) s32 batch = max_t(s32, nr*2, 32); /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index aa59572cbac6..88817aab4393 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -91,7 +91,7 @@ void free_bootmem_late(unsigned long addr, unsigned long size) for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); - totalram_pages++; + totalram_pages_inc(); } } @@ -184,7 +184,7 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); - totalram_pages += pages; + totalram_pages_add(pages); return pages; } diff --git a/mm/nommu.c b/mm/nommu.c index b40ec74f364c..10be196aec37 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -236,12 +236,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc(size, flags, PAGE_KERNEL); +} + void *vmalloc_user(unsigned long size) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; @@ -359,10 +363,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: 
allocation size diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a72d15dd5304..dadd8465729f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -237,7 +237,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) } /* Default to all available memory */ - oc->totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages() + total_swap_pages; if (!IS_ENABLED(CONFIG_NUMA)) return CONSTRAINT_NONE; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dedc1fac8f35..f3bfef0b3394 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2066,7 +2066,7 @@ static int page_writeback_cpu_online(unsigned int cpu) * However, that was when we used "dirty_ratio" to scale with * all memory, and we don't do that any more. "dirty_ratio" * is now applied to total non-HIGHPAGE memory (by subtracting - * totalhigh_pages from vm_total_pages), and as such we can't + * totalhigh_pages() from vm_total_pages), and as such we can't * get into the old insane situation any more where we had * large amounts of dirty pages compared to a small amount of * non-HIGHMEM memory. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 532fa53c1316..fcb17aa08248 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -121,7 +122,8 @@ EXPORT_SYMBOL(node_states); /* Protect totalram_pages and zone->managed_pages */ static DEFINE_SPINLOCK(managed_page_count_lock); -unsigned long totalram_pages __read_mostly; +atomic_long_t _totalram_pages __read_mostly; +EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; @@ -4375,11 +4377,11 @@ EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { - val->totalram = totalram_pages; + val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); - val->totalhigh = totalhigh_pages; + val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } @@ -6641,10 +6643,10 @@ void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; - totalram_pages += count; + totalram_pages_add(count); #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) - totalhigh_pages += count; + totalhigh_pages_add(count); #endif spin_unlock(&managed_page_count_lock); } @@ -6675,9 +6677,9 @@ EXPORT_SYMBOL(free_reserved_area); void free_highmem_page(struct page *page) { __free_reserved_page(page); - totalram_pages++; + totalram_pages_inc(); page_zone(page)->managed_pages++; - totalhigh_pages++; + totalhigh_pages_inc(); } #endif @@ -6726,10 +6728,10 @@ void __init mem_init_print_info(const char *str) physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 
10), totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT - 10), + totalhigh_pages() << (PAGE_SHIFT - 10), #endif str ? ", " : "", str ? str : ""); } diff --git a/mm/shmem.c b/mm/shmem.c index a612d765dc59..a06511d6ceaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,12 +101,12 @@ struct shmem_falloc { #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + return min(totalram_pages() - totalhigh_pages(), totalram_pages() / 2); } #endif @@ -3380,7 +3380,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, size = memparse(value,&rest); if (*rest == '%') { size <<= PAGE_SHIFT; - size *= totalram_pages; + size *= totalram_pages(); do_div(size, 100); rest++; } diff --git a/mm/slab.c b/mm/slab.c index a671328e0610..89fe1cf2d241 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1256,7 +1256,7 @@ void __init kmem_cache_init(void) * page orders on machines with more than 32MB of memory if * not overridden on the command line. 
*/ - if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT) + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) slab_max_order = SLAB_MAX_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/swap.c b/mm/swap.c index 0b01f9d7c1e5..36a213f15002 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -981,7 +981,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); */ void __init swap_setup(void) { - unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP int i; diff --git a/mm/util.c b/mm/util.c index 734b2d0e4a49..571f4100be6a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -381,6 +381,52 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +/** + * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + */ +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + gfp_t kmalloc_flags = flags; + void *ret; + + /* + * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) + * so the given set of flags has to be compatible. 
+ */ + WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + + /* + * Make sure that larger requests are not too disruptive - no OOM + * killer and no allocation failure warnings as we have a fallback + */ + if (size > PAGE_SIZE) + kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN; + + ret = kmalloc_node(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + if (ret || size <= PAGE_SIZE) + return ret; + + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kvmalloc_node); + void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) @@ -527,7 +573,7 @@ unsigned long vm_commit_limit(void) if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else - allowed = ((totalram_pages - hugetlb_total_pages()) + allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 04529a37bed6..02787b96dd00 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -50,12 +52,10 @@ static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *llnode = llist_del_all(&p->list); - while (llnode) { - void *p = llnode; - llnode = llist_next(llnode); - __vunmap(p, 1); - } + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ @@ -286,6 +286,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 + +#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static 
DEFINE_SPINLOCK(vmap_area_lock); @@ -293,65 +297,67 @@ static DEFINE_SPINLOCK(vmap_area_lock); LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; +static bool vmap_initialized __read_mostly; -/* The vmap cache globals are protected by vmap_area_lock */ -static struct rb_node *free_vmap_cache; -static unsigned long cached_hole_size; -static unsigned long cached_vstart; -static unsigned long cached_align; +/* + * This kmem_cache is used for vmap_area objects. Instead of + * allocating from slab we reuse an object from this cache to + * make things faster. Especially in "no edge" splitting of + * free block. + */ +static struct kmem_cache *vmap_area_cachep; -static unsigned long vmap_area_pcpu_hole; +/* + * This linked list is used in pair with free_vmap_area_root. + * It gives O(1) access to prev/next to perform fast coalescing. + */ +static LIST_HEAD(free_vmap_area_list); -#ifdef CONFIG_ENABLE_VMALLOC_SAVING -#define POSSIBLE_VMALLOC_START PAGE_OFFSET +/* + * This augment red-black tree represents the free vmap space. + * All vmap_area objects in this tree are sorted by va->va_start + * address. It is used for allocation and merging when a vmap + * object is released. + * + * Each vmap_area node contains a maximum available free block + * of its sub-tree, right or left. Therefore it is possible to + * find a lowest match of free area. 
+ */ +static struct rb_root free_vmap_area_root = RB_ROOT; -#define VMALLOC_BITMAP_SIZE ((VMALLOC_END - PAGE_OFFSET) >> \ - PAGE_SHIFT) -#define VMALLOC_TO_BIT(addr) ((addr - PAGE_OFFSET) >> PAGE_SHIFT) -#define BIT_TO_VMALLOC(i) (PAGE_OFFSET + i * PAGE_SIZE) - -unsigned long total_vmalloc_size; -unsigned long vmalloc_reserved; - -DECLARE_BITMAP(possible_areas, VMALLOC_BITMAP_SIZE); - -void mark_vmalloc_reserved_area(void *x, unsigned long size) +static __always_inline unsigned long +va_size(struct vmap_area *va) { - unsigned long addr = (unsigned long)x; - - bitmap_set(possible_areas, VMALLOC_TO_BIT(addr), size >> PAGE_SHIFT); - vmalloc_reserved += size; + return (va->va_end - va->va_start); } -int is_vmalloc_addr(const void *x) +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) { - unsigned long addr = (unsigned long)x; + struct vmap_area *va; - if (addr < POSSIBLE_VMALLOC_START || addr >= VMALLOC_END) - return 0; - - if (test_bit(VMALLOC_TO_BIT(addr), possible_areas)) - return 0; - - return 1; + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; } -static void calc_total_vmalloc_size(void) +/* + * Gets called when remove the node and rotate. 
+ */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) { - total_vmalloc_size = VMALLOC_END - POSSIBLE_VMALLOC_START - - vmalloc_reserved; -} -#else -int is_vmalloc_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= VMALLOC_START && addr < VMALLOC_END; + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); } -static void calc_total_vmalloc_size(void) { } -#endif -EXPORT_SYMBOL(is_vmalloc_addr); +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, + compute_subtree_max_size) + +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static unsigned long lazy_max_pages(void); static atomic_long_t nr_vmalloc_pages; @@ -379,41 +385,656 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) return NULL; } -static void __insert_vmap_area(struct vmap_area *va) +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + */ +static __always_inline struct rb_node ** +find_va_links(struct vmap_area *va, + struct rb_root *root, struct rb_node *from, + struct rb_node **parent) { - struct rb_node **p = &vmap_area_root.rb_node; - struct rb_node *parent = NULL; - struct rb_node *tmp; + struct vmap_area *tmp_va; + struct rb_node **link; - while (*p) { - struct vmap_area *tmp_va; - - parent = *p; - tmp_va = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp_va->va_end) - p = &(*p)->rb_left; - else if (va->va_end > tmp_va->va_start) - p = &(*p)->rb_right; - else - BUG(); + if (root) { + link = &root->rb_node; + if (unlikely(!*link)) { + *parent = NULL; + return link; + } + } else { + link = &from; } - rb_link_node(&va->rb_node, parent, p); - rb_insert_color(&va->rb_node, &vmap_area_root); + /* + * Go to the bottom of the tree. 
When we hit the last point + * we end up with parent rb_node and correct direction, i name + * it link, where the new va->rb_node will be attached to. + */ + do { + tmp_va = rb_entry(*link, struct vmap_area, rb_node); - /* address-sort this list */ - tmp = rb_prev(&va->rb_node); - if (tmp) { - struct vmap_area *prev; - prev = rb_entry(tmp, struct vmap_area, rb_node); - list_add_rcu(&va->list, &prev->list); - } else - list_add_rcu(&va->list, &vmap_area_list); + /* + * During the traversal we also do some sanity check. + * Trigger the BUG() if there are sides(left/right) + * or full overlaps. + */ + if (va->va_start < tmp_va->va_end && + va->va_end <= tmp_va->va_start) + link = &(*link)->rb_left; + else if (va->va_end > tmp_va->va_start && + va->va_start >= tmp_va->va_end) + link = &(*link)->rb_right; + else + BUG(); + } while (*link); + + *parent = &tmp_va->rb_node; + return link; } -static void purge_vmap_area_lazy(void); +static __always_inline struct list_head * +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) +{ + struct list_head *list; -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + if (unlikely(!parent)) + /* + * The red-black tree where we try to find VA neighbors + * before merging or inserting is empty, i.e. it means + * there is no free vmap space. Normally it does not + * happen but we handle this case anyway. + */ + return NULL; + + list = &rb_entry(parent, struct vmap_area, rb_node)->list; + return (&parent->rb_right == link ? list->next : list); +} + +static __always_inline void +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, struct list_head *head) +{ + /* + * VA is still not in the list, but we can + * identify its future previous list_head node. 
+ */ + if (likely(parent)) { + head = &rb_entry(parent, struct vmap_area, rb_node)->list; + if (&parent->rb_right != link) + head = head->prev; + } + + /* Insert to the rb-tree */ + rb_link_node(&va->rb_node, parent, link); + if (root == &free_vmap_area_root) { + /* + * Some explanation here. Just perform simple insertion + * to the tree. We do not set va->subtree_max_size to + * its current size before calling rb_insert_augmented(). + * It is because of we populate the tree from the bottom + * to parent levels when the node _is_ in the tree. + * + * Therefore we set subtree_max_size to zero after insertion, + * to let __augment_tree_propagate_from() puts everything to + * the correct order later on. + */ + rb_insert_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + va->subtree_max_size = 0; + } else { + rb_insert_color(&va->rb_node, root); + } + + /* Address-sort this list */ + list_add(&va->list, head); +} + +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + /* + * During merging a VA node can be empty, therefore + * not linked with the tree nor list. Just check it. 
+ */ + if (!RB_EMPTY_NODE(&va->rb_node)) { + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); + + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); + } +} + +#if DEBUG_AUGMENT_PROPAGATE_CHECK +/* + * Debug-only consistency check. For the node @n, walk down its sub-tree + * looking for a vmap_area whose va_size() equals n's subtree_max_size; + * if none is found the mismatch is reported with pr_emerg(). The check + * is then repeated recursively for the left and right children. + */ +static void +augment_tree_propagate_check(struct rb_node *n) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long size; + bool found = false; + + if (n == NULL) + return; + + va = rb_entry(n, struct vmap_area, rb_node); + size = va->subtree_max_size; + node = n; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) == size) { + node = node->rb_left; + } else { + if (va_size(va) == size) { + found = true; + break; + } + + node = node->rb_right; + } + } + + if (!found) { + va = rb_entry(n, struct vmap_area, rb_node); + pr_emerg("tree is corrupted: %lu, %lu\n", + va_size(va), va->subtree_max_size); + } + + augment_tree_propagate_check(n->rb_left); + augment_tree_propagate_check(n->rb_right); +} +#endif + +/* + * This function populates subtree_max_size from bottom to upper + * levels starting from VA
point. The propagation must be done + * when VA size is modified by changing its va_start/va_end. Or + * in case of newly inserting of VA to the tree. + * + * It means that __augment_tree_propagate_from() must be called: + * - After VA has been inserted to the tree(free path); + * - After VA has been shrunk(allocation path); + * - After VA has been increased(merging path). + * + * Please note that, it does not mean that upper parent nodes + * and their subtree_max_size are recalculated all the time up + * to the root node. + * + * 4--8 + * /\ + * / \ + * / \ + * 2--2 8--8 + * + * For example if we modify the node 4, shrinking it to 2, then + * no any modification is required. If we shrink the node 2 to 1 + * its subtree_max_size is updated only, and set to 1. If we shrink + * the node 8 to 6, then its subtree_max_size is set to 6 and parent + * node becomes 4--6. + */ +static __always_inline void +augment_tree_propagate_from(struct vmap_area *va) +{ + struct rb_node *node = &va->rb_node; + unsigned long new_va_sub_max_size; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + new_va_sub_max_size = compute_subtree_max_size(va); + + /* + * If the newly calculated maximum available size of the + * subtree is equal to the current one, then it means that + * the tree is propagated correctly. So we have to stop at + * this point to save cycles. 
+ */ + if (va->subtree_max_size == new_va_sub_max_size) + break; + + va->subtree_max_size = new_va_sub_max_size; + node = rb_parent(&va->rb_node); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + */ +static __always_inline void +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + + /* + * Get next node of VA to check if merging can be done. + */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Check and update the tree if needed. 
*/ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + sibling->va_end = va->va_end; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + return; + } + } + +insert: + if (!merged) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. */ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = free_vmap_area_root.rb_node; + + /* Adjust the search size for alignment overhead. 
+ */ + length = size + align - 1; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the first right sub-tree, + * that will satisfy the search criteria. It can happen + * only once due to "vstart" restriction. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + node = node->rb_right; + break; + } + } + } + } + + return NULL; +} + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK +#include <linux/random.h> + +/* + * Debug reference lookup: walk free_vmap_area_list in order and return + * the first entry accepted by is_within_this_va(), or NULL if none is. + */ +static struct vmap_area * +find_vmap_lowest_linear_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + + list_for_each_entry(va, &free_vmap_area_list, list) { + if (!is_within_this_va(va, size, align, vstart)) + continue; + + return va; + } + + return NULL; +} + +/* + * Debug cross-check: for a random vstart, compare the result of the tree + * lookup with the linear list lookup and report a mismatch via pr_emerg(). + */ +static void +find_vmap_lowest_match_check(unsigned long size) +{ + struct vmap_area *va_1, *va_2; + unsigned long vstart; + unsigned int rnd; + + get_random_bytes(&rnd, sizeof(rnd)); + vstart = VMALLOC_START + rnd; + + va_1 = find_vmap_lowest_match(size, 1, vstart); + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + + if (va_1 != va_2) + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", + va_1, va_2, vstart); +} +#endif + +enum fit_type { + NOTHING_FIT = 0, + FL_FIT_TYPE = 1, /* full fit */ + LE_FIT_TYPE = 2, /* left edge fit */ + RE_FIT_TYPE = 3, /* right edge fit */ + NE_FIT_TYPE = 4 /*
no edge fit */ +}; + +static __always_inline enum fit_type +classify_va_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size) +{ + enum fit_type type; + + /* Check if it is within VA. */ + if (nva_start_addr < va->va_start || + nva_start_addr + size > va->va_end) + return NOTHING_FIT; + + /* Now classify. */ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size, + enum fit_type type) +{ + struct vmap_area *lva; + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. + * + * | | + * V NVA V + * |---------------| + */ + unlink_va(va, &free_vmap_area_root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (unlikely(!lva)) + return -1; + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. 
+ */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (type == NE_FIT_TYPE) + insert_vmap_area_augment(lva, &va->rb_node, + &free_vmap_area_root, &free_vmap_area_list); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int node) +{ + unsigned long nva_start_addr; + struct vmap_area *va; + enum fit_type type; + int ret; + + va = find_vmap_lowest_match(size, align, vstart); + if (unlikely(!va)) + return vend; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Classify what we have found. */ + type = classify_va_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + return vend; + + /* Update the free vmap_area. 
*/ + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); + if (ret) + return vend; + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK + find_vmap_lowest_match_check(size); +#endif + + return nva_start_addr; +} /* * Allocate a region of KVA of the specified size and alignment, within the @@ -425,18 +1046,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; - struct rb_node *n; unsigned long addr; int purged = 0; - struct vmap_area *first; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + might_sleep(); - va = kmalloc_node(sizeof(struct vmap_area), + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -449,87 +1071,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: spin_lock(&vmap_area_lock); + /* - * Invalidate cache if we have more permissive parameters. - * cached_hole_size notes the largest hole noticed _below_ - * the vmap_area cached in free_vmap_cache: if size fits - * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_vmap_cache. - * Note that __free_vmap_area may update free_vmap_cache - * without updating cached_hole_size or cached_align. + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. 
*/ - if (!free_vmap_cache || - size < cached_hole_size || - vstart < cached_vstart || - align < cached_align) { -nocache: - cached_hole_size = 0; - free_vmap_cache = NULL; - } - /* record if we encounter less permissive parameters */ - cached_vstart = vstart; - cached_align = align; - - /* find starting point for our search */ - if (free_vmap_cache) { - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - addr = ALIGN(first->va_end, align); - if (addr < vstart) - goto nocache; - if (addr + size < addr) - goto overflow; - - } else { - addr = ALIGN(vstart, align); - if (addr + size < addr) - goto overflow; - - n = vmap_area_root.rb_node; - first = NULL; - - while (n) { - struct vmap_area *tmp; - tmp = rb_entry(n, struct vmap_area, rb_node); - if (tmp->va_end >= addr) { - first = tmp; - if (tmp->va_start <= addr) - break; - n = n->rb_left; - } else - n = n->rb_right; - } - - if (!first) - goto found; - } - - /* from the starting point, walk areas until a suitable hole is found */ - while (addr + size > first->va_start && addr + size <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; - addr = ALIGN(first->va_end, align); - if (addr + size < addr) - goto overflow; - - if (list_is_last(&first->list, &vmap_area_list)) - goto found; - - first = list_next_entry(first, list); - } - -found: - /* - * Check also calculated address against the vstart, - * because it can be 0 because of big align request. 
- */ - if (addr + size > vend || addr < vstart) + addr = __alloc_vmap_area(size, align, vstart, vend, node); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; - __insert_vmap_area(va); - free_vmap_cache = &va->rb_node; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -558,7 +1113,8 @@ overflow: if (printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", size); - kfree(va); + + kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } @@ -578,35 +1134,16 @@ static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - if (free_vmap_cache) { - if (va->va_end < cached_vstart) { - free_vmap_cache = NULL; - } else { - struct vmap_area *cache; - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - if (va->va_start <= cache->va_start) { - free_vmap_cache = rb_prev(&va->rb_node); - /* - * We don't try to update cached_hole_size or - * cached_align, but it won't go very wrong. - */ - } - } - } - rb_erase(&va->rb_node, &vmap_area_root); - RB_CLEAR_NODE(&va->rb_node); - list_del_rcu(&va->list); + /* + * Remove from the busy tree/list. + */ + unlink_va(va, &vmap_area_root); /* - * Track the highest possible candidate for pcpu area - * allocation. Areas outside of vmalloc area can be returned - * here too, consider only end addresses which fall inside - * vmalloc area proper. + * Merge VA with its neighbors, otherwise just add it. 
*/ - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); - - kfree_rcu(va, rcu_head); + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); } /* @@ -627,26 +1164,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -672,7 +1189,7 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. 
There is no actual criticial section protected @@ -690,7 +1207,7 @@ static void purge_fragmented_blocks_allcpus(void); */ void set_iounmap_nonlazy(void) { - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } /* @@ -698,34 +1215,40 @@ void set_iounmap_nonlazy(void) */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { + unsigned long resched_threshold; struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - bool do_free = false; lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); + if (unlikely(valist == NULL)) + return false; + + /* + * TODO: to calculate a flush range without looping. + * The list can be up to lazy_max_pages() elements. + */ llist_for_each_entry(va, valist, purge_list) { if (va->va_start < start) start = va->va_start; if (va->va_end > end) end = va->va_end; - do_free = true; } - if (!do_free) - return false; - flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; spin_lock(&vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; __free_vmap_area(va); - atomic_sub(nr, &vmap_lazy_nr); - cond_resched_lock(&vmap_area_lock); + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&vmap_area_lock); } spin_unlock(&vmap_area_lock); return true; @@ -761,10 +1284,10 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - int nr_lazy; + unsigned long nr_lazy; - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, - &vmap_lazy_nr); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); /* After this point, we may free va at any time */ llist_add(&va->purge_list, &vmap_purge_list); @@ -780,6 +1303,9 @@ 
static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -824,8 +1350,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) -static bool vmap_initialized __read_mostly = false; - struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1077,6 +1601,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1092,24 +1620,9 @@ static void vb_free(const void *addr, unsigned long size) spin_unlock(&vb->lock); } -/** - * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer - * - * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily - * to amortize TLB flushing overheads. What this means is that any page you - * have now, may, in a former life, have been mapped into kernel virtual - * address by the vmap layer and so there might be some CPUs with TLB entries - * still referencing that page (additional to the regular 1:1 kernel mapping). - * - * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can - * be sure that none of the pages we have control over will have any aliases - * from the vmap layer. 
- */ -void vm_unmap_aliases(void) +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { - unsigned long start = ULONG_MAX, end = 0; int cpu; - int flush = 0; if (unlikely(!vmap_initialized)) return; @@ -1146,6 +1659,27 @@ void vm_unmap_aliases(void) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. 
+ */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int flush = 0; + + _vm_unmap_aliases(start, end, flush); +} EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** @@ -1165,16 +1699,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1224,33 +1758,6 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; - -/** - * vm_area_check_early - check if vmap area is already mapped - * @vm: vm_struct to be checked - * - * This function is used to check if the vmap area has been - * mapped already. @vm->addr, @vm->size and @vm->flags should - * contain proper values. 
- * - */ -int __init vm_area_check_early(struct vm_struct *vm) -{ - struct vm_struct *tmp, **p; - - BUG_ON(vmap_initialized); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= vm->addr) { - if (tmp->addr < vm->addr + vm->size) - return 1; - } else { - if (tmp->addr + tmp->size > vm->addr) - return 1; - } - } - return 0; -} - /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1302,12 +1809,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm_area_add_early(vm); } +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; @@ -1322,16 +1875,21 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. 
*/ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - __insert_vmap_area(va); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = VMALLOC_END; - calc_total_vmalloc_size(); + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); vmap_initialized = true; } @@ -1558,9 +2116,9 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); @@ -1569,6 +2127,72 @@ struct vm_struct *remove_vm_area(const void *addr) return NULL; } +static inline void set_area_direct_map(const struct vm_struct *area, + int (*set_direct_map)(struct page *page)) +{ + int i; + + for (i = 0; i < area->nr_pages; i++) + if (page_address(area->pages[i])) + set_direct_map(area->pages[i]); +} + +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long start = ULONG_MAX, end = 0; + int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; + int i; + + /* + * The below block can be removed when all architectures that have + * direct map permissions also have set_direct_map_() implementations. + * This is concerned with resetting the direct map and any vm alias with + * execute permissions, without leaving a RW+X window.
+ */ + if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + set_memory_nx(addr, area->nr_pages); + set_memory_rw(addr, area->nr_pages); + } + + remove_vm_area(area->addr); + + /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ + if (!flush_reset) + return; + + /* If not deallocating pages, just flush the VM area and return. */ + if (!deallocate_pages) { + vm_unmap_aliases(); + return; + } + + /* + * If execution gets here, flush the vm mapping and reset the direct + * map. Find the start and end range of the direct mappings to make sure + * the vm_unmap_aliases() flush includes the direct map. The range is + * built from each page's direct map address, not from the vmalloc + * address, and must extend to the end of the last page. + */ + for (i = 0; i < area->nr_pages; i++) { + addr = (unsigned long)page_address(area->pages[i]); + if (addr) { + start = min(addr, start); + end = max(addr + PAGE_SIZE, end); + } + } + + /* + * Set direct map to something invalid so that it won't be cached if + * there are any accesses after the TLB flush, then flush the TLB and + * reset the direct map permissions to the default.
+ */ + set_area_direct_map(area, set_direct_map_invalid_noflush); + _vm_unmap_aliases(start, end, 1); + set_area_direct_map(area, set_direct_map_default_noflush); +} + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -1587,10 +2211,11 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + + vm_remove_mappings(area, deallocate_pages); - remove_vm_area(addr); if (deallocate_pages) { int i; @@ -1641,6 +2266,14 @@ void vfree_atomic(const void *addr) __vfree_deferred(addr); } +static void __vfree(const void *addr) +{ + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else + __vunmap(addr, 1); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1661,12 +2294,12 @@ void vfree(const void *addr) kmemleak_free(addr); + might_sleep_if(!in_interrupt()); + if (!addr) return; - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + + __vfree(addr); } EXPORT_SYMBOL(vfree); @@ -1706,7 +2339,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > totalram_pages) + if (count > totalram_pages()) return NULL; size = (unsigned long)count << PAGE_SHIFT; @@ -1733,6 +2366,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? 
+ 0 : + __GFP_HIGHMEM; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1740,7 +2376,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, PAGE_KERNEL, node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); @@ -1756,9 +2392,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_page(alloc_mask|highmem_mask); else - page = alloc_pages_node(node, alloc_mask, 0); + page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1767,7 +2403,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask)) + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -1780,7 +2416,7 @@ fail: warn_alloc(gfp_mask, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); - vfree(area->addr); + __vfree(area->addr); return NULL; } @@ -1810,7 +2446,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | @@ -1835,12 +2471,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct 
allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; @@ -1850,6 +2481,15 @@ fail: return NULL; } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node_range); +#endif + /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size @@ -1862,6 +2502,13 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. 
+ * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1878,6 +2525,20 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1890,7 +2551,7 @@ EXPORT_SYMBOL(__vmalloc); void *vmalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM); + GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -1907,7 +2568,7 @@ EXPORT_SYMBOL(vmalloc); void *vzalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1920,18 +2581,10 @@ EXPORT_SYMBOL(vzalloc); */ void *vmalloc_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_user); @@ -1948,7 +2601,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1968,14 +2621,10 @@ 
EXPORT_SYMBOL(vmalloc_node); void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc_node); -#ifndef PAGE_KERNEL_EXEC -# define PAGE_KERNEL_EXEC PAGE_KERNEL -#endif - /** * vmalloc_user_node_flags - allocate memory for userspace on a specific node * @size: allocation size @@ -2010,16 +2659,21 @@ EXPORT_SYMBOL(vmalloc_user_node_flags); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) #else -#define GFP_VMALLOC32 GFP_KERNEL +/* + * 64b systems should always have either DMA or DMA32 zones. For others + * GFP_DMA32 should do the right thing and use the normal zone. 
+ */ +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** @@ -2045,16 +2699,10 @@ EXPORT_SYMBOL(vmalloc_32); */ void *vmalloc_32_user(unsigned long size) { - struct vm_struct *area; - void *ret; - - ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32_user); @@ -2462,81 +3110,64 @@ static struct vmap_area *node_to_va(struct rb_node *n) } /** - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end - * @end: target address - * @pnext: out arg for the next vmap_area - * @pprev: out arg for the previous vmap_area + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to + * @addr: target address * - * Returns: %true if either or both of next and prev are found, - * %false if no vmap_area exists - * - * Find vmap_areas end addresses of which enclose @end. ie. if not - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + * Returns: vmap_area if it is found. If there is no such area + * the first highest(reverse order) vmap_area is returned + * i.e. va->va_start < addr && va->va_end < addr or NULL + * if there are no areas before @addr.
*/ -static bool pvm_find_next_prev(unsigned long end, - struct vmap_area **pnext, - struct vmap_area **pprev) +static struct vmap_area * +pvm_find_va_enclose_addr(unsigned long addr) { - struct rb_node *n = vmap_area_root.rb_node; - struct vmap_area *va = NULL; + struct vmap_area *va, *tmp; + struct rb_node *n; + + n = free_vmap_area_root.rb_node; + va = NULL; while (n) { - va = rb_entry(n, struct vmap_area, rb_node); - if (end < va->va_end) - n = n->rb_left; - else if (end > va->va_end) + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_start <= addr) { + va = tmp; + if (tmp->va_end >= addr) + break; + n = n->rb_right; - else - break; + } else { + n = n->rb_left; + } } - if (!va) - return false; - - if (va->va_end > end) { - *pnext = va; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); - } else { - *pprev = va; - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); - } - return true; + return va; } /** - * pvm_determine_end - find the highest aligned address between two vmap_areas - * @pnext: in/out arg for the next vmap_area - * @pprev: in/out arg for the previous vmap_area - * @align: alignment + * pvm_determine_end_from_reverse - find the highest aligned address + * of free block below VMALLOC_END + * @va: + * in - the VA we start the search(reverse order); + * out - the VA with the highest aligned end address. * - * Returns: determined end address - * - * Find the highest aligned address between *@pnext and *@pprev below - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned - * down address is between the end addresses of the two vmap_areas. - * - * Please note that the address returned by this function may fall - * inside *@pnext vmap_area. The caller is responsible for checking - * that. 
+ * Returns: determined end address within vmap_area */ -static unsigned long pvm_determine_end(struct vmap_area **pnext, - struct vmap_area **pprev, - unsigned long align) +static unsigned long +pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; - if (*pnext) - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); - else - addr = vmalloc_end; - - while (*pprev && (*pprev)->va_end > addr) { - *pnext = *pprev; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + if (likely(*va)) { + list_for_each_entry_from_reverse((*va), + &free_vmap_area_list, list) { + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); + if ((*va)->va_start < addr) + return addr; + } } - return addr; + return 0; } /** @@ -2556,12 +3187,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * - * Despite its complicated look, this allocator is rather simple. It - * does everything top-down and scans areas from the end looking for - * matching slot. While scanning, if any of the areas overlaps with - * existing vmap_area, the base address is pulled down to fit the - * area. Scanning is repeated till all the areas fit and then all - * necessary data structres are inserted and the result is returned. + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans free blocks from the end looking + * for matching base. While scanning, if any of the areas do not fit the + * base address is pulled down to fit the area. Scanning is repeated till + * all the areas fit and then all necessary data structures are inserted + * and the result is returned. 
*/ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2569,11 +3200,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); - struct vmap_area **vas, *prev, *next; + struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; - unsigned long base, start, end, last_end; + unsigned long base, start, size, end, last_end; bool purged = false; + enum fit_type type; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); @@ -2589,15 +3221,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, if (start > offsets[last_area]) last_area = area; - for (area2 = 0; area2 < nr_vms; area2++) { + for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; - if (area2 == area) - continue; - - BUG_ON(start2 >= start && start2 < end); - BUG_ON(end2 <= end && end2 > start); + BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; @@ -2613,7 +3241,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free2; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; @@ -2626,49 +3254,29 @@ retry: start = offsets[area]; end = start + sizes[area]; - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { - base = vmalloc_end - last_end; - goto found; - } - base = pvm_determine_end(&next, &prev, align) - end; + va = pvm_find_va_enclose_addr(vmalloc_end); + base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { - BUG_ON(next && 
next->va_end <= base + end); - BUG_ON(prev && prev->va_end > base + end); - /* * base might have underflowed, add last_end before * comparing. */ - if (base + last_end < vmalloc_start + last_end) { - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = true; - goto retry; - } - goto err_free; - } + if (base + last_end < vmalloc_start + last_end) + goto overflow; /* - * If next overlaps, move base downwards so that it's - * right below next and then recheck. + * Fitting base has not been found. */ - if (next && next->va_start < base + end) { - base = pvm_determine_end(&next, &prev, align) - end; - term_area = area; - continue; - } + if (va == NULL) + goto overflow; /* - * If prev overlaps, shift down next and prev and move - * base so that it's right below new next and then - * recheck. + * If this VA does not fit, move base downwards and recheck. */ - if (prev && prev->va_end > base + start) { - next = prev; - prev = node_to_va(rb_prev(&next->rb_node)); - base = pvm_determine_end(&next, &prev, align) - end; + if (base + start < va->va_start || base + end > va->va_end) { + va = node_to_va(rb_prev(&va->rb_node)); + base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } @@ -2680,22 +3288,41 @@ retry: area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; + start = offsets[area]; end = start + sizes[area]; - pvm_find_next_prev(base + end, &next, &prev); + va = pvm_find_va_enclose_addr(base + end); } -found: + /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { - struct vmap_area *va = vas[area]; + int ret; - va->va_start = base + offsets[area]; - va->va_end = va->va_start + sizes[area]; - __insert_vmap_area(va); + start = base + offsets[area]; + size = sizes[area]; + + va = pvm_find_va_enclose_addr(start); + if (WARN_ON_ONCE(va == NULL)) + /* It is a BUG(), but trigger recovery instead. 
*/ + goto recovery; + + type = classify_va_fit_type(va, start, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + ret = adjust_va_to_fit_type(va, start, size, type); + if (unlikely(ret)) + goto recovery; + + /* Allocated area. */ + va = vas[area]; + va->va_start = start; + va->va_end = start + size; + + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = base + offsets[last_area]; - spin_unlock(&vmap_area_lock); /* insert all vm's */ @@ -2706,9 +3333,38 @@ found: kfree(vas); return vms; +recovery: + /* Remove previously inserted areas. */ + while (area--) { + __free_vmap_area(vas[area]); + vas[area] = NULL; + } + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + + /* Before "retry", check if we recover. */ + for (area = 0; area < nr_vms; area++) { + if (vas[area]) + continue; + + vas[area] = kmem_cache_zalloc( + vmap_area_cachep, GFP_KERNEL); + if (!vas[area]) + goto err_free; + } + + goto retry; + } + err_free: for (area = 0; area < nr_vms; area++) { - kfree(vas[area]); + if (vas[area]) + kmem_cache_free(vmap_area_cachep, vas[area]); + kfree(vms[area]); } err_free2: @@ -2805,8 +3461,14 @@ static int s_show(struct seq_file *m, void *p) * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start, + va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + return 0; + } v = va->vm; diff --git a/mm/workingset.c b/mm/workingset.c index a6976119b4a4..522f8e66d957 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -511,10 +511,10 @@ static int __init workingset_init(void) * actionable refault distance, which is currently half of * memory (totalram_pages/2). 
However, memory hotplug may add * some more pages at runtime, so keep working with up to - * double the initial memory by using totalram_pages as-is. + * double the initial memory by using totalram_pages() as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; - max_order = fls_long(totalram_pages - 1); + max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 18c4b34bd6e0..696f13b58e65 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -174,7 +174,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return ptr; } - return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, flags, PAGE_KERNEL); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 936dab12f99f..21db0a88b158 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1149,10 +1149,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. 
*/ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (21 - PAGE_SHIFT); + if (totalram_pages() >= (128 * 1024)) + goal = totalram_pages() >> (21 - PAGE_SHIFT); else - goal = totalram_pages >> (23 - PAGE_SHIFT); + goal = totalram_pages() >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 403593bd2b83..86b7eec8470c 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1877,7 +1877,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = totalram_pages >> (26 - PAGE_SHIFT); + goal = totalram_pages() >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..5e0d05f852c0 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1142,7 +1142,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = tcpmhash_entries; if (!slots) { - if (totalram_pages >= 128 * 1024) + if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ba46d5fa646b..ca54273657bd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2002,11 +2002,11 @@ int nf_conntrack_init_start(void) * >= 4GB machines have 65536 buckets. 
*/ nf_conntrack_htable_size - = (((totalram_pages << PAGE_SHIFT) / 16384) + = (((totalram_pages() << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + if (totalram_pages() > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 65536; - else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + else if (totalram_pages() > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2820fa6f399c..5c021ffcb51f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1038,7 +1038,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) return NULL; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((size >> PAGE_SHIFT) + 2 > totalram_pages) + if ((size >> PAGE_SHIFT) + 2 > totalram_pages()) return NULL; if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 97df6f9dbdde..d8cfddf61b30 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -253,9 +253,9 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, if (cfg->size) { size = cfg->size; } else { - size = (totalram_pages << PAGE_SHIFT) / 16384 / + size = (totalram_pages() << PAGE_SHIFT) / 16384 / sizeof(struct list_head); - if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (totalram_pages() > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 02afbe571008..24321a504dc5 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1433,10 +1433,10 @@ static __init int sctp_init(void) * The methodology is similar to that of the tcp hash tables. * Though not identical. 
Start by getting a goal size */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (22 - PAGE_SHIFT); + if (totalram_pages() >= (128 * 1024)) + goal = totalram_pages() >> (22 - PAGE_SHIFT); else - goal = totalram_pages >> (24 - PAGE_SHIFT); + goal = totalram_pages() >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 5923d5665209..0ebdadd9c103 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -100,7 +100,7 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf, return ERR_PTR(-EACCES); /* freed by caller to simple_write_to_buffer */ - data = kvmalloc(alloc_size); + data = kvmalloc(alloc_size, GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/security/apparmor/include/apparmor.h b/security/apparmor/include/apparmor.h index 5d721e990876..6c119d29da6b 100644 --- a/security/apparmor/include/apparmor.h +++ b/security/apparmor/include/apparmor.h @@ -66,17 +66,6 @@ extern int apparmor_initialized __initdata; /* fn's in lib */ char *aa_split_fqname(char *args, char **ns_name); void aa_info_message(const char *str); -void *__aa_kvmalloc(size_t size, gfp_t flags); - -static inline void *kvmalloc(size_t size) -{ - return __aa_kvmalloc(size, 0); -} - -static inline void *kvzalloc(size_t size) -{ - return __aa_kvmalloc(size, __GFP_ZERO); -} /* returns 0 if kref not incremented */ static inline int kref_get_not0(struct kref *kref) diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index c1827e068454..a4975a7a395c 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -76,31 +76,3 @@ void aa_info_message(const char *str) printk(KERN_INFO "AppArmor: %s\n", str); } -/** - * __aa_kvmalloc - do allocation preferring kmalloc but falling back to vmalloc - * @size: how many bytes of memory are required - * @flags: the type of memory to allocate (see 
kmalloc). - * - * Return: allocated buffer or NULL if failed - * - * It is possible that policy being loaded from the user is larger than - * what can be allocated by kmalloc, in those cases fall back to vmalloc. - */ -void *__aa_kvmalloc(size_t size, gfp_t flags) -{ - void *buffer = NULL; - - if (size == 0) - return NULL; - - /* do not attempt kmalloc if we need more than 16 pages at once */ - if (size <= (16*PAGE_SIZE)) - buffer = kmalloc(size, flags | GFP_NOIO | __GFP_NOWARN); - if (!buffer) { - if (flags & __GFP_ZERO) - buffer = vzalloc(size); - else - buffer = vmalloc(size); - } - return buffer; -} diff --git a/security/apparmor/match.c b/security/apparmor/match.c index 3f900fcca8fb..0ffbafa48fa5 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -61,7 +61,7 @@ static struct table_header *unpack_table(char *blob, size_t bsize) if (bsize < tsize) goto out; - table = kvzalloc(tsize); + table = kvzalloc(tsize, GFP_KERNEL); if (table) { table->td_id = th.td_id; table->td_flags = th.td_flags; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d251b718bf53..5acfb995e966 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -530,7 +530,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) return NULL; @@ -749,18 +749,6 @@ out_err_no_disable: return ERR_PTR(r); } -/* - * Avoid using vmalloc for a small buffer. - * Should not be used when the size is statically known. 
- */ -void *kvm_kvzalloc(unsigned long size) -{ - if (size > PAGE_SIZE) - return vzalloc(size); - else - return kzalloc(size, GFP_KERNEL); -} - static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; @@ -845,7 +833,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -1064,7 +1052,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));