diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d2ee033e95c7..9a661604bd8b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2694,6 +2694,8 @@ steal time is computed, but won't influence scheduler behaviour + nopti [X86-64] Disable kernel page table isolation + nolapic [X86-32,APIC] Do not enable or use the local APIC. nolapic_timer [X86-32,APIC] Do not use the local APIC timer. @@ -3262,6 +3264,12 @@ pt. [PARIDE] See Documentation/blockdev/paride.txt. + pti= [X86_64] + Control user/kernel address space isolation: + on - enable + off - disable + auto - default setting + pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 51101708a03a..ea91cb61a602 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,7 +12,9 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... -fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + vaddr_end for KASLR +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping +fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space @@ -29,20 +31,22 @@ Virtual memory map with 5 level page tables: hole caused by [56:63] sign extension ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory -ff90000000000000 - ff91ffffffffffff (=49 bits) hole -ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI +ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) ... unused hole ... -fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + vaddr_end for KASLR +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping +... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space +ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space [fixmap start] - ffffffffff5fffff kernel-internal fixmap range ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole @@ -66,9 +70,10 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. -The module mapping space size changes based on the CONFIG requirements for the -following fixmap section. - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. + +Be very careful vs. KASLR when changing anything here. The KASLR address +range must not overlap with anything except the KASAN shadow area, which is +correct as KASAN disables KASLR. diff --git a/Makefile b/Makefile index 38f73762d96d..0e9ec5d746a2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 10 +SUBLEVEL = 13 EXTRAVERSION = NAME = Petit Gorille @@ -817,6 +817,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) +# Make sure -fstack-check isn't enabled (like gentoo apparently did) +KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,) + # conserve stack if available KBUILD_CFLAGS += $(call cc-option,-fconserve-stack) diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index f35974ee7264..c9173c02081c 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) return 0; __asm__ __volatile__( + " mov lp_count, %5 \n" " lp 3f \n" "1: ldb.ab %3, [%2, 1] \n" " breq.d %3, 0, 3f \n" @@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) " .word 1b, 4b \n" " .previous \n" : "+r"(res), "+r"(dst), "+r"(src), "=r"(val) - : "g"(-EFAULT), "l"(count) - : "memory"); + : "g"(-EFAULT), "r"(count) + : "lp_count", "lp_start", "lp_end", "memory"); return res; } diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index dd5a08aaa4da..3eb4bfc1fb36 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -12,6 +12,7 @@ for the semaphore. */ #define __PA_LDCW_ALIGNMENT 16 +#define __PA_LDCW_ALIGN_ORDER 4 #define __ldcw_align(a) ({ \ unsigned long __ret = (unsigned long) &(a)->lock[0]; \ __ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \ @@ -29,6 +30,7 @@ ldcd). */ #define __PA_LDCW_ALIGNMENT 4 +#define __PA_LDCW_ALIGN_ORDER 2 #define __ldcw_align(a) (&(a)->slock) #define __LDCW "ldcw,co" diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index f3cecf5117cf..e95207c0565e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,14 @@ #endif .import pa_tlb_lock,data + .macro load_pa_tlb_lock reg +#if __PA_LDCW_ALIGNMENT > 4 + load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg + depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg +#else + load32 PA(pa_tlb_lock), \reg +#endif + .endm /* space_to_prot macro creates a prot id from a space id */ @@ -457,7 +466,7 @@ .macro tlb_lock spc,ptp,pte,tmp,tmp1,fault #ifdef CONFIG_SMP cmpib,COND(=),n 0,\spc,2f - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp 1: LDCW 0(\tmp),\tmp1 cmpib,COND(=) 0,\tmp1,1b nop @@ -480,7 +489,7 @@ /* Release pa_tlb_lock lock. */ .macro tlb_unlock1 spc,tmp #ifdef CONFIG_SMP - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp tlb_unlock0 \spc,\tmp #endif .endm diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index adf7187f8951..2d40c4ff3f69 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -36,6 +36,7 @@ #include #include #include +#include #include .text @@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local) .macro tlb_lock la,flags,tmp #ifdef CONFIG_SMP - ldil L%pa_tlb_lock,%r1 - ldo R%pa_tlb_lock(%r1),\la +#if __PA_LDCW_ALIGNMENT > 4 + load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la + depi 0,31,__PA_LDCW_ALIGN_ORDER, \la +#else + load32 pa_tlb_lock, \la +#endif rsm PSW_SM_I,\flags 1: LDCW 0(\la),\tmp cmpib,<>,n 0,\tmp,3f diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 30f92391a93e..cad3e8661cd6 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -183,6 +184,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r) return 1; } +/* + * Idle thread support + * + * Detect when running on QEMU with SeaBIOS PDC Firmware and let + * QEMU idle the host too. + */ + +int running_on_qemu __read_mostly; + +void __cpuidle arch_cpu_idle_dead(void) +{ + /* nop on real hardware, qemu will offline CPU. */ + asm volatile("or %%r31,%%r31,%%r31\n":::); +} + +void __cpuidle arch_cpu_idle(void) +{ + local_irq_enable(); + + /* nop on real hardware, qemu will idle sleep. */ + asm volatile("or %%r10,%%r10,%%r10\n":::); +} + +static int __init parisc_idle_init(void) +{ + const char *marker; + + /* check QEMU/SeaBIOS marker in PAGE0 */ + marker = (char *) &PAGE0->pad0; + running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0); + + if (!running_on_qemu) + cpu_idle_poll_ctrl(1); + + return 0; +} +arch_initcall(parisc_idle_init); + /* * Copy architecture-specific thread state */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 4797d08581ce..6e1e39035380 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address) return __bad_area(regs, address, SEGV_MAPERR); } +static noinline int bad_access(struct pt_regs *regs, unsigned long address) +{ + return __bad_area(regs, address, SEGV_ACCERR); +} + static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) { @@ -490,7 +495,7 @@ retry: good_area: if (unlikely(access_error(is_write, is_exec, vma))) - return bad_area(regs, address); + return bad_access(regs, address); /* * If for any reason at all we couldn't handle the fault, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 40d0a1a97889..b87a930c2201 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -794,11 +794,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) if (kvm->arch.use_cmma) { /* - * Get the last slot. They should be sorted by base_gfn, so the - * last slot is also the one at the end of the address space. - * We have verified above that at least one slot is present. + * Get the first slot. They are reverse sorted by base_gfn, so + * the first slot is also the one at the end of the address + * space. We have verified above that at least one slot is + * present. */ - ms = slots->memslots + slots->used_slots - 1; + ms = slots->memslots; /* round up so we only use full longs */ ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG); /* allocate enough bytes to store all the bits */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 5b25287f449b..7bd3a59232f0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1009,7 +1009,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { + if (orc && gfn < ms->bitmap_size) { /* increment only if we are really flipping the bit to 1 */ if (!test_and_set_bit(gfn, ms->pgste_bitmap)) atomic64_inc(&ms->dirty_pages); diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S index e5547b22cd18..0ddbbb031822 100644 --- a/arch/sparc/lib/hweight.S +++ b/arch/sparc/lib/hweight.S @@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32) .previous ENTRY(__arch_hweight64) - sethi %hi(__sw_hweight16), %g1 - jmpl %g1 + %lo(__sw_hweight16), %g0 + sethi %hi(__sw_hweight64), %g1 + jmpl %g1 + %lo(__sw_hweight64), %g0 nop ENDPROC(__arch_hweight64) EXPORT_SYMBOL(__arch_hweight64) diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 972319ff5b01..e691ff734cb5 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -23,6 +23,9 @@ */ #undef CONFIG_AMD_MEM_ENCRYPT +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3fd8bc560fae..45a63e00a6af 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,6 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include +#include +#include +#include /* @@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * halves: + */ +#define PTI_SWITCH_PGTABLES_MASK (1< #include #include -#include "calling.h" #include #include #include @@ -40,6 +39,8 @@ #include #include +#include "calling.h" + .code64 .section .entry.text, "ax" @@ -164,6 +165,9 @@ ENTRY(entry_SYSCALL_64_trampoline) /* Stash the user RSP. */ movq %rsp, RSP_SCRATCH + /* Note: using %rsp as a scratch reg. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + /* Load the top of the task stack into RSP */ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp @@ -203,6 +207,10 @@ ENTRY(entry_SYSCALL_64) */ swapgs + /* + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it + * is not required to switch CR3. + */ movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -399,6 +407,7 @@ syscall_return_via_sysret: * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi popq %rsp @@ -736,6 +745,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * We can do future final exit work right here. */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + /* Restore RDI. */ popq %rdi SWAPGS @@ -818,7 +829,9 @@ native_irq_return_ldt: */ pushq %rdi /* Stash user RDI */ - SWAPGS + SWAPGS /* to kernel GS */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -834,7 +847,6 @@ native_irq_return_ldt: /* Now RAX == RSP. */ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ - popq %rdi /* Restore user RDI */ /* * espfix_stack[31:16] == 0. The page tables are set up such that @@ -845,7 +857,11 @@ native_irq_return_ldt: * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax - SWAPGS + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + movq %rax, %rsp UNWIND_HINT_IRET_REGS offset=8 @@ -945,6 +961,8 @@ ENTRY(switch_to_thread_stack) UNWIND_HINT_FUNC pushq %rdi + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi movq %rsp, %rdi movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI @@ -1244,7 +1262,11 @@ ENTRY(paranoid_entry) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx, %ebx -1: ret + +1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + + ret END(paranoid_entry) /* @@ -1266,6 +1288,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: @@ -1293,6 +1316,8 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax .Lerror_entry_from_usermode_after_swapgs: /* Put us onto the real thread stack. */ @@ -1339,6 +1364,7 @@ ENTRY(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax jmp .Lerror_entry_done .Lbstep_iret: @@ -1348,10 +1374,11 @@ ENTRY(error_entry) .Lerror_bad_iret: /* - * We came from an IRET to user mode, so we have user gsbase. - * Switch to kernel gsbase: + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax /* * Pretend that the exception came from user mode: set up pt_regs @@ -1383,6 +1410,10 @@ END(error_exit) /* * Runs on exception stack. Xen PV does not go through this path at all, * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. */ ENTRY(nmi) UNWIND_HINT_IRET_REGS @@ -1446,6 +1477,7 @@ ENTRY(nmi) swapgs cld + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 @@ -1698,6 +1730,8 @@ end_repeat_nmi: movq $-1, %rsi call do_nmi + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 95ad40eb7eff..98d5358e4041 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -49,6 +49,10 @@ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ SWAPGS + + /* We are about to clobber %rsp anyway, clobbering here is OK */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -186,8 +190,13 @@ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ swapgs - /* Stash user ESP and switch to the kernel stack. */ + /* Stash user ESP */ movl %esp, %r8d + + /* Use %rsp as scratch reg. User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Switch to the kernel stack */ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ @@ -256,10 +265,22 @@ sysret32_from_system_call: * when the system call started, which is already known to user * code. We zero R8-R10 to avoid info leaks. */ + movq RSP-ORIG_RAX(%rsp), %rsp + + /* + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored + * on the process stack which is not mapped to userspace and + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 + * switch until after after the last reference to the process + * stack. + * + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. + */ + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 + xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 - movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl END(entry_SYSCALL_compat) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 1faf40f2dda9..577fa8adb785 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr) * vsyscalls but leave the page not present. If so, we skip calling * this. */ -static void __init set_vsyscall_pgtable_user_bits(void) +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; - pgd = pgd_offset_k(VSYSCALL_ADDR); + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 @@ -373,7 +373,7 @@ void __init map_vsyscall(void) vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); - set_vsyscall_pgtable_user_bits(); + set_vsyscall_pgtable_user_bits(swapper_pg_dir); } BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3674a4b6f8bd..8156e47da7ba 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -3,16 +3,19 @@ #include #include +#include #include +#include #include #include "../perf_event.h" +/* Waste a full page so it can be mapped into the cpu_entry_area */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* @@ -279,17 +282,67 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) +{ + unsigned long start = (unsigned long)cea; + phys_addr_t pa; + size_t msz = 0; + + pa = virt_to_phys(addr); + + preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, pa, prot); + + /* + * This is a cross-CPU update of the cpu_entry_area, we must shoot down + * all TLB entries for it. + */ + flush_tlb_kernel_range(start, start + size); + preempt_enable(); +} + +static void ds_clear_cea(void *cea, size_t size) +{ + unsigned long start = (unsigned long)cea; + size_t msz = 0; + + preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); + + flush_tlb_kernel_range(start, start + size); + preempt_enable(); +} + +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +{ + unsigned int order = get_order(size); + int node = cpu_to_node(cpu); + struct page *page; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + return page ? page_address(page) : NULL; +} + +static void dsfree_pages(const void *buffer, size_t size) +{ + if (buffer) + free_pages((unsigned long)buffer, get_order(size)); +} + static int alloc_pebs_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + size_t bsiz = x86_pmu.pebs_buffer_size; + int max, node = cpu_to_node(cpu); + void *buffer, *ibuffer, *cea; if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; @@ -300,25 +353,27 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree_pages(buffer, bsiz); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; } - - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_pebs_vaddr = buffer; + /* Update the cpu entry area mapping */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds->pebs_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; return 0; } static void release_pebs_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.pebs) return; @@ -326,73 +381,70 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->ds_pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *buffer, *cea; + int max; if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_bts_vaddr = buffer; + /* Update the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds->bts_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); + ds->bts_absolute_maximum = ds->bts_buffer_base + max; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); return 0; } static void release_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds_clear_cea(cea, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); + hwev->ds_bts_vaddr = NULL; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; - return 0; } static void release_ds_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f7aaadf9331f..8e4ea143ed96 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,8 @@ #include +#include + /* To enable MSR tracing please use the generic trace points. */ /* @@ -77,8 +79,6 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* @@ -95,23 +95,6 @@ struct amd_nb { PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - #define PEBS_REGS \ (PERF_REG_X86_AX | \ PERF_REG_X86_BX | \ @@ -216,6 +199,8 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; + void *ds_pebs_vaddr; + void *ds_bts_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index dbfd0854651f..cf5961ca8677 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".popsection" + ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR_2(oldinstr, 1, 2) \ @@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".popsection" + ".popsection\n" /* * Alternative instructions for different CPU types or capabilities. diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 2fbc69a0916e..4a7884b8dca5 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -5,6 +5,7 @@ #include #include +#include /* * cpu_entry_area is a percpu region that contains things needed by the CPU @@ -40,6 +41,18 @@ struct cpu_entry_area { */ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; #endif +#ifdef CONFIG_CPU_SUP_INTEL + /* + * Per CPU debug store for Intel performance monitoring. Wastes a + * full page at the moment. + */ + struct debug_store cpu_debug_store; + /* + * The actual PEBS/BTS buffers must be mapped to user space + * Reserve enough fixmap PTEs. + */ + struct debug_store_buffers cpu_debug_buffers; +#endif }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 800104c8a3ed..21ac898df2d8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -197,11 +197,12 @@ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ - +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ @@ -340,5 +341,6 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index bc359dd2f7f6..85e23bb7b34e 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->type = (info->read_exec_only ^ 1) << 1; desc->type |= info->contents << 2; + /* Set the ACCESS bit so it can be mapped RO */ + desc->type |= 1; desc->s = 1; desc->dpl = 0x3; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c10c9128f54e..e428e16dd822 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,6 +44,12 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define DISABLE_PTI 0 +#else +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -54,7 +60,7 @@ #define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 -#define DISABLED_MASK7 0 +#define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_MPX) #define DISABLED_MASK10 0 diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h new file mode 100644 index 000000000000..62a9f4966b42 --- /dev/null +++ b/arch/x86/include/asm/intel_ds.h @@ -0,0 +1,36 @@ +#ifndef _ASM_INTEL_DS_H +#define _ASM_INTEL_DS_H + +#include + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +} __aligned(PAGE_SIZE); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + +struct debug_store_buffers { + char bts_buffer[BTS_BUFFER_SIZE]; + char pebs_buffer[PEBS_BUFFER_SIZE]; +}; + +#endif diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5ede7cae1d67..c931b88982a0 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -50,10 +50,33 @@ struct ldt_struct { * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ - struct desc_struct *entries; - unsigned int nr_entries; + struct desc_struct *entries; + unsigned int nr_entries; + + /* + * If PTI is in use, then the entries array is not mapped while we're + * in user mode. The whole array will be aliased at the addressed + * given by ldt_slot_va(slot). We use two slots so that we can allocate + * and map, and enable a new LDT without invalidating the mapping + * of an older, still-in-use LDT. + * + * slot will be -1 if this LDT doesn't have an alias mapping. + */ + int slot; }; +/* This is a multiple of PAGE_SIZE. */ +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) + +static inline void *ldt_slot_va(int slot) +{ +#ifdef CONFIG_X86_64 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); +#else + BUG(); +#endif +} + /* * Used for LDT copy/destruction. */ @@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm) } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); +void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, @@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm, { return 0; } -static inline void destroy_context_ldt(struct mm_struct *mm) {} +static inline void destroy_context_ldt(struct mm_struct *mm) { } +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif static inline void load_mm_ldt(struct mm_struct *mm) @@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm) * that we can see. */ - if (unlikely(ldt)) - set_ldt(ldt->entries, ldt->nr_entries); - else + if (unlikely(ldt)) { + if (static_cpu_has(X86_FEATURE_PTI)) { + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { + /* + * Whoops -- either the new LDT isn't mapped + * (if slot == -1) or is mapped into a bogus + * slot (if slot > 1). + */ + clear_LDT(); + return; + } + + /* + * If page table isolation is enabled, ldt->entries + * will not be mapped in the userspace pagetables. + * Tell the CPU to access the LDT through the alias + * at ldt_slot_va(ldt->slot). + */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { clear_LDT(); + } #else clear_LDT(); #endif @@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); + ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 4b5e1eafada7..aff42e1da6ee 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Instead of one PGD, we acquire two PGDs. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + /* * Allocate and free page tables. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f02de8bc1f72..211368922cad 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX @@ -846,7 +847,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) static inline int p4d_bad(p4d_t p4d) { - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -880,7 +886,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + unsigned long ignore_flags = _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -909,7 +920,11 @@ static inline int pgd_none(pgd_t pgd) * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +/* + * a shortcut to get a pgd_t in a given mm + */ +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's @@ -1111,7 +1126,14 @@ static inline int pud_write(pud_t pud) */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + /* Clone the user space pgd as well */ + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), + count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e9f05331e732..81462e9a34f6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. + */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); +#else *p4dp = p4d; +#endif } static inline void native_p4d_clear(p4d_t *p4d) @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + *pgdp = pti_set_user_pgd(pgdp, pgd); +#else *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3d27831bc58d..6b8f73dcbc2c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -75,17 +75,27 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. + */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #ifdef CONFIG_X86_5LEVEL -# define VMALLOC_SIZE_TB _AC(16384, UL) -# define __VMALLOC_BASE _AC(0xff92000000000000, UL) +# define VMALLOC_SIZE_TB _AC(12800, UL) +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define LDT_PGD_ENTRY _AC(-112, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #else # define VMALLOC_SIZE_TB _AC(32, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define LDT_PGD_ENTRY _AC(-3, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif #ifdef CONFIG_RANDOMIZE_MEMORY @@ -100,13 +110,13 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +#define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define CPU_ENTRY_AREA_PGD _AC(-3, UL) +#define CPU_ENTRY_AREA_PGD _AC(-4, UL) #define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 43212a43ee69..6a60fea90b9d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -38,6 +38,11 @@ #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_SWITCH_BIT 11 +#endif + #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9e482d8b0b97..9c18da64daa9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x) #else /* - * User space process size. 47bits minus one guard page. The guard - * page is necessary on Intel CPUs: if a SYSCALL instruction is at - * the highest possible canonical userspace address, then that - * syscall will enter the kernel with a non-canonical return - * address, and SYSRET will explode dangerously. We avoid this - * particular problem by preventing anything from being mapped - * at the maximum canonical address. + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] */ #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h new file mode 100644 index 000000000000..0b5ef05b2d2d --- /dev/null +++ b/arch/x86/include/asm/pti.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _ASM_X86_PTI_H +#define _ASM_X86_PTI_H +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern void pti_init(void); +extern void pti_check_boottime_disable(void); +#else +static inline void pti_check_boottime_disable(void) { } +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 171b429f43a2..f9b48ce152eb 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,38 +10,90 @@ #include #include #include +#include +#include -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) -{ - /* - * Bump the generation count. This also serves as a full barrier - * that synchronizes with switch_mm(): callers are required to order - * their read of mm_cpumask after their writes to the paging - * structures. - */ - return atomic64_inc_return(&mm->context.tlb_gen); -} +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ /* There are 12 bits of space for ASIDS in CR3 */ #define CR3_HW_ASID_BITS 12 + /* * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#define PTI_CONSUMED_ASID_BITS 0 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) -#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) /* * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account - * for them being zero-based. Another -1 is because ASID 0 is reserved for + * for them being zero-based. Another -1 is because PCID 0 is reserved for * use by non-PCID-aware users. */ -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +/* + * Given @asid, compute kPCID + */ static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). */ - preempt_disable(); + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ native_write_cr3(__native_read_cr3()); - preempt_enable(); } /* @@ -256,6 +369,8 @@ static inline void __native_flush_tlb_global(void) /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -282,7 +397,21 @@ static inline void __native_flush_tlb_global(void) */ static inline void __native_flush_tlb_single(unsigned long addr) { + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); } /* @@ -298,14 +427,6 @@ static inline void __flush_tlb_all(void) */ __flush_tlb(); } - - /* - * Note: if we somehow had PCID but not PGE, then this wouldn't work -- - * we'd end up flushing kernel translations for the current ASID but - * we might fail to flush kernel translations for other cached ASIDs. - * - * To avoid this issue, we force PCID off if PGE is off. - */ } /* @@ -315,6 +436,16 @@ static inline void __flush_tlb_one(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * __flush_tlb_single() will have cleared the TLB entry for this ASID, + * but since kernel space is replicated across all, we must also + * invalidate all others. + */ + invalidate_other_asid(); } #define TLB_FLUSH_ALL -1UL @@ -375,6 +506,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + return atomic64_inc_return(&mm->context.tlb_gen); +} + static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index c1688c2d0a12..1f86e1b0a5cd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* - * WARNING: The entire pt_regs may not be safe to dereference. In some cases, - * only the iret frame registers are accessible. Use with caution! + * If 'partial' returns true, only the iret frame registers are valid. */ -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { if (unwind_done(state)) return NULL; + if (partial) { +#ifdef CONFIG_UNWINDER_ORC + *partial = !state->full_regs; +#else + *partial = false; +#endif + } + return state->regs; } #else -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { return NULL; } diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d9a7c659009c..b986b2ca688a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -7,6 +7,7 @@ #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); +extern void set_vsyscall_pgtable_user_bits(pgd_t *root); /* * Called on instruction fetch fault in vsyscall page. diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 53b4ca55ebb6..97abdaab9535 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -78,7 +78,12 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS 12 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 676b7cf4b62b..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -94,6 +95,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 90cb82dbba57..570e8bb1f386 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,7 +22,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o -obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 0ee83321a313..7eba34df54c3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,8 @@ #include #include +#include "cpu.h" + struct aperfmperf_sample { unsigned int khz; ktime_t time; @@ -24,7 +26,7 @@ struct aperfmperf_sample { static DEFINE_PER_CPU(struct aperfmperf_sample, samples); #define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_REFRESH_DELAY_MS 10 #define APERFMPERF_STALE_THRESHOLD_MS 1000 /* @@ -38,14 +40,8 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); - ktime_t now = ktime_get(); - s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return; - local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); @@ -61,31 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - s->time = now; + s->time = ktime_get(); s->aperf = aperf; s->mperf = mperf; - - /* If the previous iteration was too long ago, discard it. */ - if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) - s->khz = 0; - else - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); } -unsigned int arch_freq_get_on_cpu(int cpu) +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) { - unsigned int khz; + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. */ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +} + +unsigned int aperfmperf_get_khz(int cpu) +{ if (!cpu_khz) return 0; if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - khz = per_cpu(samples.khz, cpu); - if (khz) - return khz; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} + +void arch_freq_prepare_all(void) +{ + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8ddcfa4d4165..2d3bd2215e5b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + fpu__init_system(c); #ifdef CONFIG_X86_32 @@ -1335,7 +1339,10 @@ void syscall_init(void) (entry_SYSCALL_64_trampoline - _entry_trampoline); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f52a370b6c00..e806b11a99af 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c6daec4bdba5..330b8462d426 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6b7e17bf0b71..e7ecedafa1c8 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -5,6 +5,8 @@ #include #include +#include "cpu.h" + /* * Get CPU information for use by the procfs. */ @@ -78,8 +80,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); + unsigned int freq = aperfmperf_get_khz(cpu); + if (!freq) + freq = cpufreq_quick_get(cpu); if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 36b17e0febe8..afbecff161d1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs) regs->sp, regs->flags); } -static void show_regs_safe(struct stack_info *info, struct pt_regs *regs) +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial) { - if (on_stack(info, regs, sizeof(*regs))) + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { __show_regs(regs, 0); - else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, - IRET_FRAME_SIZE)) { + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { /* * When an interrupt or exception occurs in entry code, the * full pt_regs might not have been saved yet. In that case @@ -98,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct stack_info stack_info = {0}; unsigned long visit_mask = 0; int graph_idx = 0; + bool partial; printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); stack = stack ? : get_stack_pointer(task, regs); + regs = unwind_get_entry_regs(&state, &partial); /* * Iterate through the stacks, starting with the current stack pointer. @@ -120,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; if (get_stack_info(stack, task, &stack_info, &visit_mask)) { @@ -140,7 +153,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); /* * Scan the stack, printing any text addresses we find. At the @@ -164,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by show_regs_safe() below. + * by show_regs_if_on_stack(). */ if (regs && stack == ®s->ip) goto next; @@ -199,9 +212,9 @@ next: unwind_next_frame(&state); /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_safe(&stack_info, regs); + show_regs_if_on_stack(&stack_info, regs, partial); } if (stack_name) @@ -297,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7dca675fe78d..04a625f0fcda 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -350,13 +371,14 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts) .data #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #else -NEXT_PAGE(init_top_pgt) +NEXT_PGD_PAGE(init_top_pgt) .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a6b5d62f45a7..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -51,13 +52,11 @@ static void refresh_ldt_segments(void) static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); refresh_ldt_segments(); } @@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doens't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { @@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm) mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; @@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 517415978409..3cb2486c47e4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,7 +47,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 12bf07d44dfe..2651ca2112c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) { unsigned long flags; - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - /* * Paranoid: Set warm reset code and vector here back * to default values. diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8dabd7bf1673..60244bfaf88f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace, for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, NULL); if (regs) { /* * Kernel mode registers on the stack indicate an diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { + if (LDT_empty(info) || LDT_zero(info)) memset(desc, 0, sizeof(*desc)); - } else { + else fill_ldt(desc, info); - - /* - * Always set the accessed bit so that the CPU - * doesn't try to write to the (read-only) GDT. - */ - desc->type |= 1; - } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7c16fe0b60c2..b33e860d32fe 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d2a8b5a24a44..1e413a9326aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies; . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + #else #define X64_ALIGN_RODATA_BEGIN #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END + #endif PHDRS { @@ -102,8 +108,10 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT IRQENTRY_TEXT + ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2e0017af8f9b..52906808e277 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index fe814fd5e014..b9283cc27622 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + /* Setup the fixmap mappings only once per-processor */ static void __init setup_cpu_entry_area(int cpu) { @@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu) cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); #endif + percpu_setup_debug_store(cpu); } static __init void setup_cpu_entry_area_ptes(void) diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index bfcffdf6c577..421f2664ffa0 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -5,7 +5,7 @@ static int ptdump_show(struct seq_file *m, void *v) { - ptdump_walk_pgd_level(m, NULL); + ptdump_walk_pgd_level_debugfs(m, NULL, false); return 0; } @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -static struct dentry *pe; +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curknl, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curusr, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, - &ptdump_fops); - if (!pe) + dir = debugfs_create_dir("page_tables", NULL); + if (!dir) return -ENOMEM; + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, + &ptdump_fops); + if (!pe_knl) + goto err; + + pe_curknl = debugfs_create_file("current_kernel", 0400, + dir, NULL, &ptdump_curknl_fops); + if (!pe_curknl) + goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pe_curusr = debugfs_create_file("current_user", 0400, + dir, NULL, &ptdump_curusr_fops); + if (!pe_curusr) + goto err; +#endif return 0; +err: + debugfs_remove_recursive(dir); + return -ENOMEM; } static void __exit pt_dump_debug_exit(void) { - debugfs_remove_recursive(pe); + debugfs_remove_recursive(dir); } module_init(pt_dump_debug_init); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 43dedbfb7257..2a4849e92831 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -52,6 +52,9 @@ enum address_markers_idx { USER_SPACE_NR = 0, KERNEL_SPACE_NR, LOW_KERNEL_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif VMALLOC_START_NR, VMEMMAP_START_NR, #ifdef CONFIG_KASAN @@ -59,6 +62,9 @@ enum address_markers_idx { KASAN_SHADOW_END_NR, #endif CPU_ENTRY_AREA_NR, +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL) + LDT_NR, +#endif #ifdef CONFIG_X86_ESPFIX64 ESPFIX_START_NR, #endif @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" }, #endif [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, #ifdef CONFIG_X86_ESPFIX64 @@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx) } static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, - bool checkwx) + bool checkwx, bool dmesg) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_top_pgt; @@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, if (pgd) { start = pgd; - st.to_dmesg = true; + st.to_dmesg = dmesg; } st.check_wx = checkwx; @@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { - ptdump_walk_pgd_level_core(m, pgd, false); + ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && static_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif + ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +static void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = (pgd_t *) &init_top_pgt; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif } -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { - ptdump_walk_pgd_level_core(NULL, NULL, true); + ptdump_walk_pgd_level_core(NULL, NULL, true, false); + ptdump_walk_user_pgd_level_checkwx(); } static int __init pt_dump_init(void) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..6b462a472a7b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -161,6 +162,12 @@ struct map_range { static int page_size_mask; +static void enable_global_pages(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + __supported_pte_mask |= _PAGE_GLOBAL; +} + static void __init probe_page_size_mask(void) { /* @@ -179,11 +186,11 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); - __supported_pte_mask |= _PAGE_GLOBAL; - } else - __supported_pte_mask &= ~_PAGE_GLOBAL; + enable_global_pages(); + } /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { @@ -196,34 +203,44 @@ static void __init probe_page_size_mask(void) static void setup_pcid(void) { -#ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_PCID)) { - if (boot_cpu_has(X86_FEATURE_PGE)) { - /* - * This can't be cr4_set_bits_and_update_boot() -- - * the trampoline code can't handle CR4.PCIDE and - * it wouldn't do any good anyway. Despite the name, - * cr4_set_bits_and_update_boot() doesn't actually - * cause the bits in question to remain set all the - * way through the secondary boot asm. - * - * Instead, we brute-force it and set CR4.PCIDE - * manually in start_secondary(). - */ - cr4_set_bits(X86_CR4_PCIDE); - } else { - /* - * flush_tlb_all(), as currently implemented, won't - * work if PCID is on but PGE is not. Since that - * combination doesn't exist on real hardware, there's - * no reason to try to fully support it, but it's - * polite to avoid corrupting data if we're on - * an improperly configured VM. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); - } + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + + /* + * INVPCID's single-context modes (2/3) only work if we set + * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable + * on systems that have X86_CR4_PCIDE clear, or that have + * no INVPCID support at all. + */ + if (boot_cpu_has(X86_FEATURE_INVPCID)) + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); } -#endif } #ifdef CONFIG_X86_32 @@ -624,6 +641,7 @@ void __init init_mem_mapping(void) { unsigned long end; + pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); @@ -847,12 +865,12 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 879ef930e2c2..aedebd2ebf1e 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -34,25 +34,14 @@ #define TB_SHIFT 40 /* - * Virtual address start and end range for randomization. The end changes base - * on configuration to have the highest amount of space for randomization. - * It increases the possible random position for each randomized region. + * Virtual address start and end range for randomization. * - * You need to add an if/def entry if you introduce a new memory region - * compatible with KASLR. Your entry must be in logical order with memory - * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to - * ensure that this order is correct and won't be changed. + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; - -#if defined(CONFIG_X86_ESPFIX64) -static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; -#elif defined(CONFIG_EFI) -static const unsigned long vaddr_end = EFI_VA_END; -#else -static const unsigned long vaddr_end = __START_KERNEL_map; -#endif +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; @@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void) unsigned long remain_entropy; /* - * All these BUILD_BUG_ON checks ensures the memory layout is - * consistent with the vaddr_start/vaddr_end variables. + * These BUILD_BUG_ON checks ensure the memory layout is consistent + * with the vaddr_start/vaddr_end variables. These checks are very + * limited.... */ BUILD_BUG_ON(vaddr_start >= vaddr_end); - BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) && - vaddr_end >= EFI_VA_END); - BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) || - IS_ENABLED(CONFIG_EFI)) && - vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); if (!kaslr_memory_enabled()) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..9b7bcbd33cc2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else + static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000000..43d4a4a29037 --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,388 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner + * Signed-off-by: Moritz Lipp + * Signed-off-by: Daniel Gruss + * Signed-off-by: Michael Schwarz + * + * Major changes to the original code by: Dave Hansen + * Mostly rewritten by Thomas Gleixner and + * Andy Lutomirsky + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK 0 +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + +void __init pti_check_boottime_disable(void) +{ + char arg[5]; + int ret; + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) + goto autosel; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) { + pti_print_if_insecure("disabled on command line."); + return; + } + +autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return; +enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables. + * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (!new_p4d_page) + return NULL; + + if (pgd_none(*pgd)) { + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + new_p4d_page = 0; + } + if (new_p4d_page) + free_page(new_p4d_page); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d = pti_user_pagetable_walk_p4d(address); + pud_t *pud; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (!new_pud_page) + return NULL; + + if (p4d_none(*p4d)) { + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + new_pud_page = 0; + } + if (new_pud_page) + free_page(new_pud_page); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + new_pmd_page = 0; + } + if (new_pmd_page) + free_page(new_pmd_page); + } + + return pmd_offset(pud, address); +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd = pti_user_pagetable_walk_pmd(address); + pte_t *pte; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + new_pte_page = 0; + } + if (new_pte_page) + free_page(new_pte_page); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +static void __init +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes. + */ + for (addr = start; addr < end; addr += PMD_SIZE) { + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = pmd_clear_flags(*pmd, clear); + } +} + +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ + pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +/* + * Clone the ESPFIX P4D into the user space visinble page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void __init pti_clone_entry_text(void) +{ + pti_clone_pmds((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, + _PAGE_RW | _PAGE_GLOBAL); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); + + pti_clone_user_shared(); + pti_clone_entry_text(); + pti_setup_espfix64(); + pti_setup_vsyscall(); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0a1be3adc97e..a1561957dccb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,38 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages. + */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != next->context.ctx_id) @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - write_cr3(build_cr3(next->pgd, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, true); /* * NB: This gets called via leave_mm() in the idle path @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ - write_cr3(build_cr3_noflush(next->pgd, new_asid)); + load_new_mm_cr3(next->pgd, new_asid, false); /* See above wrt _rcuidle. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 20fb31579b69..39c4b35ac7a4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -195,6 +195,9 @@ static pgd_t *efi_pgd; * because we want to avoid inserting EFI region mappings (EFI_VA_END * to EFI_VA_START) into the standard kernel page tables. Everything * else can be shared, see efi_sync_low_kernel_mappings(). + * + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the + * allocation. */ int __init efi_alloc_page_tables(void) { @@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void) return 0; gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_page(gfp_mask); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); if (!efi_pgd) return -ENOMEM; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 8a99a2e96537..5b513ccffde4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, /* * Update the first page pointer to skip over the CSH header. */ - cap_info->pages[0] += csh->headersize; + cap_info->phys[0] += csh->headersize; + + /* + * cap_info->capsule should point at a virtual mapping of the entire + * capsule, starting at the capsule header. Our image has the Quark + * security header prepended, so we cannot rely on the default vmap() + * mapping created by the generic capsule code. + * Given that the Quark firmware does not appear to care about the + * virtual mapping, let's just point cap_info->capsule at our copy + * of the capsule header. + */ + cap_info->capsule = &cap_info->header; return 1; } diff --git a/block/blk-map.c b/block/blk-map.c index d5251edcc0dd..368daa02714e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,22 +12,29 @@ #include "blk.h" /* - * Append a bio to a passthrough request. Only works can be merged into - * the request based on the driver constraints. + * Append a bio to a passthrough request. Only works if the bio can be merged + * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio *bio) +int blk_rq_append_bio(struct request *rq, struct bio **bio) { - blk_queue_bounce(rq->q, &bio); + struct bio *orig_bio = *bio; + + blk_queue_bounce(rq->q, bio); if (!rq->bio) { - blk_rq_bio_prep(rq->q, rq, bio); + blk_rq_bio_prep(rq->q, rq, *bio); } else { - if (!ll_back_merge_fn(rq->q, rq, bio)) + if (!ll_back_merge_fn(rq->q, rq, *bio)) { + if (orig_bio != *bio) { + bio_put(*bio); + *bio = orig_bio; + } return -EINVAL; + } - rq->biotail->bi_next = bio; - rq->biotail = bio; - rq->__data_len += bio->bi_iter.bi_size; + rq->biotail->bi_next = *bio; + rq->biotail = *bio; + rq->__data_len += (*bio)->bi_iter.bi_size; } return 0; @@ -80,14 +87,12 @@ static int __blk_rq_map_user_iov(struct request *rq, * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ - ret = blk_rq_append_bio(rq, bio); - bio_get(bio); + ret = blk_rq_append_bio(rq, &bio); if (ret) { - bio_endio(bio); __blk_rq_unmap_user(orig_bio); - bio_put(bio); return ret; } + bio_get(bio); return 0; } @@ -220,7 +225,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; int do_copy = 0; - struct bio *bio; + struct bio *bio, *orig_bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -243,10 +248,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (do_copy) rq->rq_flags |= RQF_COPY_USER; - ret = blk_rq_append_bio(rq, bio); + orig_bio = bio; + ret = blk_rq_append_bio(rq, &bio); if (unlikely(ret)) { /* request is too big */ - bio_put(bio); + bio_put(orig_bio); return ret; } diff --git a/block/bounce.c b/block/bounce.c index fceb1a96480b..1d05c422c932 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, unsigned i = 0; bool bounce = false; int sectors = 0; + bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bounce) return; - if (sectors < bio_sectors(*bio_orig)) { + if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); bio_chain(bio, *bio_orig); generic_make_request(*bio_orig); *bio_orig = bio; } - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set); + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : + bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; diff --git a/crypto/chacha20poly1305.c b/crypto/chacha20poly1305.c index db1bc3147bc4..600afa99941f 100644 --- a/crypto/chacha20poly1305.c +++ b/crypto/chacha20poly1305.c @@ -610,6 +610,11 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb, algt->mask)); if (IS_ERR(poly)) return PTR_ERR(poly); + poly_hash = __crypto_hash_alg_common(poly); + + err = -EINVAL; + if (poly_hash->digestsize != POLY1305_DIGEST_SIZE) + goto out_put_poly; err = -ENOMEM; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); @@ -618,7 +623,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb, ctx = aead_instance_ctx(inst); ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize; - poly_hash = __crypto_hash_alg_common(poly); err = crypto_init_ahash_spawn(&ctx->poly, poly_hash, aead_crypto_instance(inst)); if (err) diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index ee9cfb99fe25..f8ec3d4ba4a8 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -254,6 +254,14 @@ static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) crypto_free_aead(ctx->child); } +static void pcrypt_free(struct aead_instance *inst) +{ + struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); + + crypto_drop_aead(&ctx->spawn); + kfree(inst); +} + static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { @@ -319,6 +327,8 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; + inst->free = pcrypt_free; + err = aead_register_instance(tmpl, inst); if (err) goto out_drop_aead; @@ -349,14 +359,6 @@ static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) return -EINVAL; } -static void pcrypt_free(struct crypto_instance *inst) -{ - struct pcrypt_instance_ctx *ctx = crypto_instance_ctx(inst); - - crypto_drop_aead(&ctx->spawn); - kfree(inst); -} - static int pcrypt_cpumask_change_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -469,7 +471,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt) static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, - .free = pcrypt_free, .module = THIS_MODULE, }; diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index eb3af2739537..07532d83be0b 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf) this_leaf->ways_of_associativity = (size / nr_sets) / line_size; } +static bool cache_node_is_unified(struct cacheinfo *this_leaf) +{ + return of_property_read_bool(this_leaf->of_node, "cache-unified"); +} + static void cache_of_override_properties(unsigned int cpu) { int index; @@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu) for (index = 0; index < cache_leaves(cpu); index++) { this_leaf = this_cpu_ci->info_list + index; + /* + * init_cache_level must setup the cache level correctly + * overriding the architecturally specified levels, so + * if type is NONE at this stage, it should be unified + */ + if (this_leaf->type == CACHE_TYPE_NOCACHE && + cache_node_is_unified(this_leaf)) + this_leaf->type = CACHE_TYPE_UNIFIED; cache_size(this_leaf); cache_get_line_size(this_leaf); cache_nr_sets(this_leaf); diff --git a/drivers/bus/sunxi-rsb.c b/drivers/bus/sunxi-rsb.c index 328ca93781cf..1b76d9585902 100644 --- a/drivers/bus/sunxi-rsb.c +++ b/drivers/bus/sunxi-rsb.c @@ -178,6 +178,7 @@ static struct bus_type sunxi_rsb_bus = { .match = sunxi_rsb_device_match, .probe = sunxi_rsb_device_probe, .remove = sunxi_rsb_device_remove, + .uevent = of_device_uevent_modalias, }; static void sunxi_rsb_dev_release(struct device *dev) diff --git a/drivers/crypto/chelsio/Kconfig b/drivers/crypto/chelsio/Kconfig index 3e104f5aa0c2..b56b3f711d94 100644 --- a/drivers/crypto/chelsio/Kconfig +++ b/drivers/crypto/chelsio/Kconfig @@ -5,6 +5,7 @@ config CRYPTO_DEV_CHELSIO select CRYPTO_SHA256 select CRYPTO_SHA512 select CRYPTO_AUTHENC + select CRYPTO_GF128MUL ---help--- The Chelsio Crypto Co-processor driver for T6 adapters. diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index a9fd8b9e86cd..699ee5a9a8f9 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -1625,6 +1625,7 @@ static int queue_cache_init(void) CWQ_ENTRY_SIZE, 0, NULL); if (!queue_cache[HV_NCS_QTYPE_CWQ - 1]) { kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]); + queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL; return -ENOMEM; } return 0; @@ -1634,6 +1635,8 @@ static void queue_cache_destroy(void) { kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]); kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]); + queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL; + queue_cache[HV_NCS_QTYPE_CWQ - 1] = NULL; } static long spu_queue_register_workfn(void *arg) diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index ec8ac5c4dd84..055e2e8f985a 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -20,10 +20,6 @@ #define NO_FURTHER_WRITE_ACTION -1 -#ifndef phys_to_page -#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT) -#endif - /** * efi_free_all_buff_pages - free all previous allocated buffer pages * @cap_info: pointer to current instance of capsule_info structure @@ -35,7 +31,7 @@ static void efi_free_all_buff_pages(struct capsule_info *cap_info) { while (cap_info->index > 0) - __free_page(phys_to_page(cap_info->pages[--cap_info->index])); + __free_page(cap_info->pages[--cap_info->index]); cap_info->index = NO_FURTHER_WRITE_ACTION; } @@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info) cap_info->pages = temp_page; + temp_page = krealloc(cap_info->phys, + pages_needed * sizeof(phys_addr_t *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) + return -ENOMEM; + + cap_info->phys = temp_page; + return 0; } @@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, **/ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) { + bool do_vunmap = false; int ret; - ret = efi_capsule_update(&cap_info->header, cap_info->pages); + /* + * cap_info->capsule may have been assigned already by a quirk + * handler, so only overwrite it if it is NULL + */ + if (!cap_info->capsule) { + cap_info->capsule = vmap(cap_info->pages, cap_info->index, + VM_MAP, PAGE_KERNEL); + if (!cap_info->capsule) + return -ENOMEM; + do_vunmap = true; + } + + ret = efi_capsule_update(cap_info->capsule, cap_info->phys); + if (do_vunmap) + vunmap(cap_info->capsule); if (ret) { pr_err("capsule update failed\n"); return ret; @@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, goto failed; } - cap_info->pages[cap_info->index++] = page_to_phys(page); + cap_info->pages[cap_info->index] = page; + cap_info->phys[cap_info->index] = page_to_phys(page); cap_info->page_bytes_remain = PAGE_SIZE; + cap_info->index++; } else { - page = phys_to_page(cap_info->pages[cap_info->index - 1]); + page = cap_info->pages[cap_info->index - 1]; } kbuff = kmap(page); @@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file) struct capsule_info *cap_info = file->private_data; kfree(cap_info->pages); + kfree(cap_info->phys); kfree(file->private_data); file->private_data = NULL; return 0; @@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file) return -ENOMEM; } + cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->phys) { + kfree(cap_info->pages); + kfree(cap_info); + return -ENOMEM; + } + file->private_data = cap_info; return 0; diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index eb4528c87c0b..d6f3d9ee1350 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip) } if (!chip->names) - devprop_gpiochip_set_names(chip); + devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent)); acpi_gpiochip_request_regions(acpi_gpio); acpi_gpiochip_scan_gpios(acpi_gpio); diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c index 27f383bda7d9..f748aa3e77f7 100644 --- a/drivers/gpio/gpiolib-devprop.c +++ b/drivers/gpio/gpiolib-devprop.c @@ -19,30 +19,27 @@ /** * devprop_gpiochip_set_names - Set GPIO line names using device properties * @chip: GPIO chip whose lines should be named, if possible + * @fwnode: Property Node containing the gpio-line-names property * * Looks for device property "gpio-line-names" and if it exists assigns * GPIO line names for the chip. The memory allocated for the assigned * names belong to the underlying firmware node and should not be released * by the caller. */ -void devprop_gpiochip_set_names(struct gpio_chip *chip) +void devprop_gpiochip_set_names(struct gpio_chip *chip, + const struct fwnode_handle *fwnode) { struct gpio_device *gdev = chip->gpiodev; const char **names; int ret, i; - if (!chip->parent) { - dev_warn(&gdev->dev, "GPIO chip parent is NULL\n"); - return; - } - - ret = device_property_read_string_array(chip->parent, "gpio-line-names", + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names", NULL, 0); if (ret < 0) return; if (ret != gdev->ngpio) { - dev_warn(chip->parent, + dev_warn(&gdev->dev, "names %d do not match number of GPIOs %d\n", ret, gdev->ngpio); return; @@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip) if (!names) return; - ret = device_property_read_string_array(chip->parent, "gpio-line-names", + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names", names, gdev->ngpio); if (ret < 0) { - dev_warn(chip->parent, "failed to read GPIO line names\n"); + dev_warn(&gdev->dev, "failed to read GPIO line names\n"); kfree(names); return; } diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index bfcd20699ec8..ba38f530e403 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip) /* If the chip defines names itself, these take precedence */ if (!chip->names) - devprop_gpiochip_set_names(chip); + devprop_gpiochip_set_names(chip, + of_fwnode_handle(chip->of_node)); of_node_get(chip->of_node); diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index d003ccb12781..3d4d0634c9dd 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -224,7 +224,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc) return desc - &desc->gdev->descs[0]; } -void devprop_gpiochip_set_names(struct gpio_chip *chip); +void devprop_gpiochip_set_names(struct gpio_chip *chip, + const struct fwnode_handle *fwnode); /* With descriptor prefix */ diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index c9bcc6c45012..ce2ed16f2a30 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -6944,6 +6944,7 @@ enum { #define RESET_PCH_HANDSHAKE_ENABLE (1<<4) #define GEN8_CHICKEN_DCPR_1 _MMIO(0x46430) +#define SKL_SELECT_ALTERNATE_DC_EXIT (1<<30) #define MASK_WAKEMEM (1<<13) #define SKL_DFSM _MMIO(0x51000) @@ -8475,6 +8476,7 @@ enum skl_power_gate { #define BXT_CDCLK_CD2X_DIV_SEL_2 (2<<22) #define BXT_CDCLK_CD2X_DIV_SEL_4 (3<<22) #define BXT_CDCLK_CD2X_PIPE(pipe) ((pipe)<<20) +#define CDCLK_DIVMUX_CD_OVERRIDE (1<<19) #define BXT_CDCLK_CD2X_PIPE_NONE BXT_CDCLK_CD2X_PIPE(3) #define BXT_CDCLK_SSA_PRECHARGE_ENABLE (1<<16) #define CDCLK_FREQ_DECIMAL_MASK (0x7ff) diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c index 1241e5891b29..26a8dcd2c549 100644 --- a/drivers/gpu/drm/i915/intel_cdclk.c +++ b/drivers/gpu/drm/i915/intel_cdclk.c @@ -859,16 +859,10 @@ static void skl_set_preferred_cdclk_vco(struct drm_i915_private *dev_priv, static void skl_dpll0_enable(struct drm_i915_private *dev_priv, int vco) { - int min_cdclk = skl_calc_cdclk(0, vco); u32 val; WARN_ON(vco != 8100000 && vco != 8640000); - /* select the minimum CDCLK before enabling DPLL 0 */ - val = CDCLK_FREQ_337_308 | skl_cdclk_decimal(min_cdclk); - I915_WRITE(CDCLK_CTL, val); - POSTING_READ(CDCLK_CTL); - /* * We always enable DPLL0 with the lowest link rate possible, but still * taking into account the VCO required to operate the eDP panel at the @@ -922,7 +916,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv, { int cdclk = cdclk_state->cdclk; int vco = cdclk_state->vco; - u32 freq_select, pcu_ack; + u32 freq_select, pcu_ack, cdclk_ctl; int ret; WARN_ON((cdclk == 24000) != (vco == 0)); @@ -939,7 +933,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv, return; } - /* set CDCLK_CTL */ + /* Choose frequency for this cdclk */ switch (cdclk) { case 450000: case 432000: @@ -967,10 +961,33 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv, dev_priv->cdclk.hw.vco != vco) skl_dpll0_disable(dev_priv); + cdclk_ctl = I915_READ(CDCLK_CTL); + + if (dev_priv->cdclk.hw.vco != vco) { + /* Wa Display #1183: skl,kbl,cfl */ + cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK); + cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk); + I915_WRITE(CDCLK_CTL, cdclk_ctl); + } + + /* Wa Display #1183: skl,kbl,cfl */ + cdclk_ctl |= CDCLK_DIVMUX_CD_OVERRIDE; + I915_WRITE(CDCLK_CTL, cdclk_ctl); + POSTING_READ(CDCLK_CTL); + if (dev_priv->cdclk.hw.vco != vco) skl_dpll0_enable(dev_priv, vco); - I915_WRITE(CDCLK_CTL, freq_select | skl_cdclk_decimal(cdclk)); + /* Wa Display #1183: skl,kbl,cfl */ + cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK); + I915_WRITE(CDCLK_CTL, cdclk_ctl); + + cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk); + I915_WRITE(CDCLK_CTL, cdclk_ctl); + + /* Wa Display #1183: skl,kbl,cfl */ + cdclk_ctl &= ~CDCLK_DIVMUX_CD_OVERRIDE; + I915_WRITE(CDCLK_CTL, cdclk_ctl); POSTING_READ(CDCLK_CTL); /* inform PCU of the change */ diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 49577eba8e7e..51cb5293bf43 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -598,6 +598,11 @@ void gen9_enable_dc5(struct drm_i915_private *dev_priv) DRM_DEBUG_KMS("Enabling DC5\n"); + /* Wa Display #1183: skl,kbl,cfl */ + if (IS_GEN9_BC(dev_priv)) + I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) | + SKL_SELECT_ALTERNATE_DC_EXIT); + gen9_set_dc_state(dev_priv, DC_STATE_EN_UPTO_DC5); } @@ -625,6 +630,11 @@ void skl_disable_dc6(struct drm_i915_private *dev_priv) { DRM_DEBUG_KMS("Disabling DC6\n"); + /* Wa Display #1183: skl,kbl,cfl */ + if (IS_GEN9_BC(dev_priv)) + I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) | + SKL_SELECT_ALTERNATE_DC_EXIT); + gen9_set_dc_state(dev_priv, DC_STATE_DISABLE); } @@ -1786,6 +1796,7 @@ void intel_display_power_put(struct drm_i915_private *dev_priv, GLK_DISPLAY_POWERWELL_2_POWER_DOMAINS | \ BIT_ULL(POWER_DOMAIN_MODESET) | \ BIT_ULL(POWER_DOMAIN_AUX_A) | \ + BIT_ULL(POWER_DOMAIN_GMBUS) | \ BIT_ULL(POWER_DOMAIN_INIT)) #define CNL_DISPLAY_POWERWELL_2_POWER_DOMAINS ( \ diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index feafdb961c48..59b2f96d986a 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev) if (ret) return ret; + if (!qp->qp_sec) + return 0; + mutex_lock(&real_qp->qp_sec->mutex); ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys, qp->qp_sec); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d8f540054392..93c1a57dbff1 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2085,8 +2085,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file, return -EOPNOTSUPP; if (ucore->inlen > sizeof(cmd)) { - if (ib_is_udata_cleared(ucore, sizeof(cmd), - ucore->inlen - sizeof(cmd))) + if (!ib_is_udata_cleared(ucore, sizeof(cmd), + ucore->inlen - sizeof(cmd))) return -EOPNOTSUPP; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index de57d6c11a25..9032f77cc38d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1400,7 +1400,8 @@ int ib_close_qp(struct ib_qp *qp) spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); atomic_dec(&real_qp->usecnt); - ib_close_shared_qp_security(qp->qp_sec); + if (qp->qp_sec) + ib_close_shared_qp_security(qp->qp_sec); kfree(qp); return 0; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index eae8ea81c6e2..514c1000ded1 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -586,10 +586,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, ret = -EAGAIN; goto skip_cqe; } - if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) { + if (unlikely(!CQE_STATUS(hw_cqe) && + CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) { t4_set_wq_in_error(wq); - hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN)); - goto proc_cqe; + hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN)); } goto proc_cqe; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6ff44dc606eb..3409eee16092 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1129,7 +1129,6 @@ struct hfi1_devdata { u16 pcie_lnkctl; u16 pcie_devctl2; u32 pci_msix0; - u32 pci_lnkctl3; u32 pci_tph2; /* diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 09e50fd2a08f..8c7e7a60b715 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd) if (ret) goto error; - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, - dd->pci_lnkctl3); - if (ret) - goto error; - - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); - if (ret) - goto error; - + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, + dd->pci_tph2); + if (ret) + goto error; + } return 0; error: @@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd) if (ret) goto error; - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, - &dd->pci_lnkctl3); - if (ret) - goto error; - - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); - if (ret) - goto error; - + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) { + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, + &dd->pci_tph2); + if (ret) + goto error; + } return 0; error: diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5aff1e33d984..30d479f87cb8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1415,6 +1415,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, } INIT_LIST_HEAD(&context->vma_private_list); + mutex_init(&context->vma_private_list_mutex); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); @@ -1576,7 +1577,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area) * mlx5_ib_disassociate_ucontext(). */ mlx5_ib_vma_priv_data->vma = NULL; + mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex); list_del(&mlx5_ib_vma_priv_data->list); + mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex); kfree(mlx5_ib_vma_priv_data); } @@ -1596,10 +1599,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, return -ENOMEM; vma_prv->vma = vma; + vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex; vma->vm_private_data = vma_prv; vma->vm_ops = &mlx5_ib_vm_ops; + mutex_lock(&ctx->vma_private_list_mutex); list_add(&vma_prv->list, vma_head); + mutex_unlock(&ctx->vma_private_list_mutex); return 0; } @@ -1642,6 +1648,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) * mlx5_ib_vma_close. */ down_write(&owning_mm->mmap_sem); + mutex_lock(&context->vma_private_list_mutex); list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { vma = vma_private->vma; @@ -1656,6 +1663,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) list_del(&vma_private->list); kfree(vma_private); } + mutex_unlock(&context->vma_private_list_mutex); up_write(&owning_mm->mmap_sem); mmput(owning_mm); put_task_struct(owning_process); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 189e80cd6b2f..754103372faa 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -115,6 +115,8 @@ enum { struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; + /* protect vma_private_list add/del */ + struct mutex *vma_private_list_mutex; }; struct mlx5_ib_ucontext { @@ -129,6 +131,8 @@ struct mlx5_ib_ucontext { /* Transport Domain number */ u32 tdn; struct list_head vma_private_list; + /* protect vma_private_list add/del */ + struct mutex vma_private_list_mutex; unsigned long upd_xlt_page; /* protect ODP/KSM */ diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index b84cd978fce2..a4aaa748e987 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1613,7 +1613,7 @@ static int elantech_set_properties(struct elantech_data *etd) case 5: etd->hw_version = 3; break; - case 6 ... 14: + case 6 ... 15: etd->hw_version = 4; break; default: diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index e67ba6c40faf..8f7a3c00b6cf 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1611,13 +1611,15 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; domain->geometry.aperture_end = (1UL << ias) - 1; domain->geometry.force_aperture = true; - smmu_domain->pgtbl_ops = pgtbl_ops; ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); - if (ret < 0) + if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); + return ret; + } - return ret; + smmu_domain->pgtbl_ops = pgtbl_ops; + return 0; } static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) @@ -1644,7 +1646,7 @@ static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec) { - int i; + int i, j; struct arm_smmu_master_data *master = fwspec->iommu_priv; struct arm_smmu_device *smmu = master->smmu; @@ -1652,6 +1654,13 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec) u32 sid = fwspec->ids[i]; __le64 *step = arm_smmu_get_step_for_sid(smmu, sid); + /* Bridged PCI devices may end up with duplicated IDs */ + for (j = 0; j < i; j++) + if (fwspec->ids[j] == sid) + break; + if (j < i) + continue; + arm_smmu_write_strtab_ent(smmu, sid, step, &master->ste); } } diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 85cff68643e0..125b744c9c28 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -950,6 +950,7 @@ static void prepare_start_command(struct pxa3xx_nand_info *info, int command) switch (command) { case NAND_CMD_READ0: + case NAND_CMD_READOOB: case NAND_CMD_PAGEPROG: info->use_ecc = 1; break; diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index d7b53d53c116..72d6ffbfd638 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable) reg = reg_readl(priv, REG_SPHY_CNTRL); if (enable) { reg |= PHY_RESET; - reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS); + reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS); reg_writel(priv, reg, REG_SPHY_CNTRL); udelay(21); reg = reg_readl(priv, REG_SPHY_CNTRL); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dc5de275352a..aa764c5e3c6b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) * here forever if we consistently cannot allocate * buffers. */ - else if (rc == -ENOMEM) + else if (rc == -ENOMEM && budget) rx_pkts++; else if (rc == -EBUSY) /* partial completion */ break; @@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR); rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); - if (likely(rc == -EIO)) + if (likely(rc == -EIO) && budget) rx_pkts++; else if (rc == -EBUSY) /* partial completion */ break; diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 656e6af70f0a..aef3fcf2f5b9 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu) /* Reset PHY, otherwise the read DMA engine will be in a mode that * breaks all requests to 256 bytes. */ - if (tg3_asic_rev(tp) == ASIC_REV_57766) + if (tg3_asic_rev(tp) == ASIC_REV_57766 || + tg3_asic_rev(tp) == ASIC_REV_5717 || + tg3_asic_rev(tp) == ASIC_REV_5719) reset_phy = true; err = tg3_restart_hw(tp, reset_phy); diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 3dc2d771a222..faf7cdc97ebf 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev) for (i = 0; i < txq->bd.ring_size; i++) { /* Initialize the BD for every fragment in the page. */ bdp->cbd_sc = cpu_to_fec16(0); + if (bdp->cbd_bufaddr && + !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(bdp->cbd_bufaddr), + fec16_to_cpu(bdp->cbd_datlen), + DMA_TO_DEVICE); if (txq->tx_skbuff[i]) { dev_kfree_skb_any(txq->tx_skbuff[i]); txq->tx_skbuff[i] = NULL; diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c index c9798210fa0f..0495487f7b42 100644 --- a/drivers/net/ethernet/marvell/mvmdio.c +++ b/drivers/net/ethernet/marvell/mvmdio.c @@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platform_device *pdev) dev->regs + MVMDIO_ERR_INT_MASK); } else if (dev->err_interrupt == -EPROBE_DEFER) { - return -EPROBE_DEFER; + ret = -EPROBE_DEFER; + goto out_mdio; } if (pdev->dev.of_node) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 1fffdebbc9e8..e9a1fbcc4adf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_QUERY_VPORT_COUNTER: case MLX5_CMD_OP_ALLOC_Q_COUNTER: case MLX5_CMD_OP_QUERY_Q_COUNTER: - case MLX5_CMD_OP_SET_RATE_LIMIT: + case MLX5_CMD_OP_SET_PP_RATE_LIMIT: case MLX5_CMD_OP_QUERY_RATE_LIMIT: case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: @@ -505,7 +505,7 @@ const char *mlx5_command_str(int command) MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER); MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER); - MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT); + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT); MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT); MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT); MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 13b5ef9d8703..5fa071620104 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -590,6 +590,7 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct mlx5e_tstamp *tstamp; int ix; + int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cc11bbbd0309..3cdb932cae76 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,11 +71,6 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; -static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) -{ - return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); -} - static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; - int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, node); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, - GFP_KERNEL, node); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, + cpu_to_node(c->cpu)); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + rqp->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, - mlx5e_get_node(c->priv, c->ix)); + GFP_KERNEL, cpu_to_node(c->cpu)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); if (err) goto err_sq_wq_destroy; @@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.buf_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = cpu_to_node(c->cpu); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } +static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) +{ + return cpumask_first(priv->mdev->priv.irq_info[ix].mask); +} + static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; + int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; unsigned int irq; int err; int eqn; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); if (!c) return -ENOMEM; @@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; + c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, - mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) @@ -3554,6 +3553,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, struct sk_buff *skb, netdev_features_t features) { + unsigned int offset = 0; struct udphdr *udph; u8 proto; u16 port; @@ -3563,7 +3563,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, proto = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): - proto = ipv6_hdr(skb)->nexthdr; + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL); break; default: goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c index 3c11d6e2160a..14962969c5ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c @@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size, u8 actual_size; int err; + if (!size) + return -EINVAL; + if (!fdev->mdev) return -ENOTCONN; @@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size, u8 actual_size; int err; + if (!size) + return -EINVAL; + if (!fdev->mdev) return -ENOTCONN; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 06562c9a6b9c..8bfc37e4ec87 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -316,9 +316,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; - struct irq_affinity irqdesc = { - .pre_vectors = MLX5_EQ_VEC_COMP_BASE, - }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; @@ -332,10 +329,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) if (!priv->irq_info) goto err_free_msix; - nvec = pci_alloc_irq_vectors_affinity(dev->pdev, + nvec = pci_alloc_irq_vectors(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1, nvec, - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, - &irqdesc); + PCI_IRQ_MSIX); if (nvec < 0) return nvec; @@ -621,6 +617,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } +static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { + mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); + return -ENOMEM; + } + + cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), + priv->irq_info[i].mask); + + if (IS_ENABLED(CONFIG_SMP) && + irq_set_affinity_hint(irq, priv->irq_info[i].mask)) + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); + + return 0; +} + +static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) +{ + struct mlx5_priv *priv = &mdev->priv; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); + + irq_set_affinity_hint(irq, NULL); + free_cpumask_var(priv->irq_info[i].mask); +} + +static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) +{ + int err; + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { + err = mlx5_irq_set_affinity_hint(mdev, i); + if (err) + goto err_out; + } + + return 0; + +err_out: + for (i--; i >= 0; i--) + mlx5_irq_clear_affinity_hint(mdev, i); + + return err; +} + +static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) +{ + int i; + + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) + mlx5_irq_clear_affinity_hint(mdev, i); +} + int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -1093,6 +1146,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } + err = mlx5_irq_set_affinity_hints(dev); + if (err) { + dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); + goto err_affinity_hints; + } + err = mlx5_init_fs(dev); if (err) { dev_err(&pdev->dev, "Failed to init flow steering\n"); @@ -1150,6 +1209,9 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: + mlx5_irq_clear_affinity_hints(dev); + +err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1218,6 +1280,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_sriov_detach(dev); mlx5_cleanup_fs(dev); + mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index db9e665ab104..889130edb715 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, err_cmd: memset(din, 0, sizeof(din)); memset(dout, 0, sizeof(dout)); - MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn); mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout)); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index e651e4c02867..d3c33e9eea72 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, return ret_entry; } -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev, +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, u32 rate, u16 index) { - u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0}; - MLX5_SET(set_rate_limit_in, in, opcode, - MLX5_CMD_OP_SET_RATE_LIMIT); - MLX5_SET(set_rate_limit_in, in, rate_limit_index, index); - MLX5_SET(set_rate_limit_in, in, rate_limit, rate); + MLX5_SET(set_pp_rate_limit_in, in, opcode, + MLX5_CMD_OP_SET_PP_RATE_LIMIT); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } @@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index) entry->refcount++; } else { /* new rate limit */ - err = mlx5_set_rate_limit_cmd(dev, rate, entry->index); + err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index); if (err) { mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n", rate, err); @@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate) entry->refcount--; if (!entry->refcount) { /* need to remove rate */ - mlx5_set_rate_limit_cmd(dev, 0, entry->index); + mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index); entry->rate = 0; } @@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) /* Clear all configured rates */ for (i = 0; i < table->max_size; i++) if (table->rl_entry[i].rate) - mlx5_set_rate_limit_cmd(dev, 0, - table->rl_entry[i].index); + mlx5_set_pp_rate_limit_cmd(dev, 0, + table->rl_entry[i].index); kfree(dev->priv.rl_table.rl_entry); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c index 07a9ba6cfc70..2f74953e4561 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port) struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; struct mlx5e_vxlan *vxlan; - spin_lock(&vxlan_db->lock); + spin_lock_bh(&vxlan_db->lock); vxlan = radix_tree_lookup(&vxlan_db->tree, port); - spin_unlock(&vxlan_db->lock); + spin_unlock_bh(&vxlan_db->lock); return vxlan; } @@ -88,8 +88,12 @@ static void mlx5e_vxlan_add_port(struct work_struct *work) struct mlx5e_vxlan *vxlan; int err; - if (mlx5e_vxlan_lookup_port(priv, port)) + mutex_lock(&priv->state_lock); + vxlan = mlx5e_vxlan_lookup_port(priv, port); + if (vxlan) { + atomic_inc(&vxlan->refcount); goto free_work; + } if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port)) goto free_work; @@ -99,10 +103,11 @@ static void mlx5e_vxlan_add_port(struct work_struct *work) goto err_delete_port; vxlan->udp_port = port; + atomic_set(&vxlan->refcount, 1); - spin_lock_irq(&vxlan_db->lock); + spin_lock_bh(&vxlan_db->lock); err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan); - spin_unlock_irq(&vxlan_db->lock); + spin_unlock_bh(&vxlan_db->lock); if (err) goto err_free; @@ -113,35 +118,39 @@ err_free: err_delete_port: mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); free_work: + mutex_unlock(&priv->state_lock); kfree(vxlan_work); } -static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port) -{ - struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; - struct mlx5e_vxlan *vxlan; - - spin_lock_irq(&vxlan_db->lock); - vxlan = radix_tree_delete(&vxlan_db->tree, port); - spin_unlock_irq(&vxlan_db->lock); - - if (!vxlan) - return; - - mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port); - - kfree(vxlan); -} - static void mlx5e_vxlan_del_port(struct work_struct *work) { struct mlx5e_vxlan_work *vxlan_work = container_of(work, struct mlx5e_vxlan_work, work); - struct mlx5e_priv *priv = vxlan_work->priv; + struct mlx5e_priv *priv = vxlan_work->priv; + struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; u16 port = vxlan_work->port; + struct mlx5e_vxlan *vxlan; + bool remove = false; - __mlx5e_vxlan_core_del_port(priv, port); + mutex_lock(&priv->state_lock); + spin_lock_bh(&vxlan_db->lock); + vxlan = radix_tree_lookup(&vxlan_db->tree, port); + if (!vxlan) + goto out_unlock; + if (atomic_dec_and_test(&vxlan->refcount)) { + radix_tree_delete(&vxlan_db->tree, port); + remove = true; + } + +out_unlock: + spin_unlock_bh(&vxlan_db->lock); + + if (remove) { + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); + kfree(vxlan); + } + mutex_unlock(&priv->state_lock); kfree(vxlan_work); } @@ -171,12 +180,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv) struct mlx5e_vxlan *vxlan; unsigned int port = 0; - spin_lock_irq(&vxlan_db->lock); + /* Lockless since we are the only radix-tree consumers, wq is disabled */ while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) { port = vxlan->udp_port; - spin_unlock_irq(&vxlan_db->lock); - __mlx5e_vxlan_core_del_port(priv, (u16)port); - spin_lock_irq(&vxlan_db->lock); + radix_tree_delete(&vxlan_db->tree, port); + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); + kfree(vxlan); } - spin_unlock_irq(&vxlan_db->lock); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h index 5def12c048e3..5ef6ae7d568a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h @@ -36,6 +36,7 @@ #include "en.h" struct mlx5e_vxlan { + atomic_t refcount; u16 udp_port; }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index db38880f54b4..3ead7439821c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4164,6 +4164,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port) { + u16 vid = 1; int err; err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true); @@ -4176,8 +4177,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port) true, false); if (err) goto err_port_vlan_set; + + for (; vid <= VLAN_N_VID - 1; vid++) { + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, false); + if (err) + goto err_vid_learning_set; + } + return 0; +err_vid_learning_set: + for (vid--; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true); err_port_vlan_set: mlxsw_sp_port_stp_set(mlxsw_sp_port, false); err_port_stp_set: @@ -4187,6 +4199,12 @@ err_port_stp_set: static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port) { + u16 vid; + + for (vid = VLAN_N_VID - 1; vid >= 1; vid--) + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, + vid, true); + mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1, false, false); mlxsw_sp_port_stp_set(mlxsw_sp_port, false); diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 32bf1fecf864..9b85cbd5a231 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -77,6 +77,7 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, } if (buffer->flags & EFX_TX_BUF_SKB) { + EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl); (*pkts_compl)++; (*bytes_compl) += buffer->skb->len; dev_consume_skb_any((struct sk_buff *)buffer->skb); @@ -426,12 +427,14 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb, static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) { struct efx_tx_buffer *buffer; + unsigned int bytes_compl = 0; + unsigned int pkts_compl = 0; /* Work backwards until we hit the original insert pointer value */ while (tx_queue->insert_count != tx_queue->write_count) { --tx_queue->insert_count; buffer = __efx_tx_queue_get_insert_buffer(tx_queue); - efx_dequeue_buffer(tx_queue, buffer, NULL, NULL); + efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); } } diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 4d02b27df044..a3f456b91c99 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -2069,7 +2069,7 @@ static struct phy_driver marvell_drivers[] = { .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, .config_init = &m88e1145_config_init, - .config_aneg = &marvell_config_aneg, + .config_aneg = &m88e1101_config_aneg, .read_status = &genphy_read_status, .ack_interrupt = &marvell_ack_interrupt, .config_intr = &marvell_config_intr, diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index fdb43dd9b5cd..6c45ff650ec7 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -622,6 +622,7 @@ static int ksz9031_read_status(struct phy_device *phydev) phydev->link = 0; if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev)) phydev->drv->config_intr(phydev); + return genphy_config_aneg(phydev); } return 0; diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index bcb4755bcd95..4b377b978a0b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -525,6 +525,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np, pl->link_config.pause = MLO_PAUSE_AN; pl->link_config.speed = SPEED_UNKNOWN; pl->link_config.duplex = DUPLEX_UNKNOWN; + pl->link_config.an_enabled = true; pl->ops = ops; __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); @@ -948,6 +949,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, mutex_lock(&pl->state_mutex); /* Configure the MAC to match the new settings */ linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising); + pl->link_config.interface = config.interface; pl->link_config.speed = our_kset.base.speed; pl->link_config.duplex = our_kset.base.duplex; pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE; diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 81394a4b2803..2092febfcb42 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a2f4e52fadb5..9e9202b50e73 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct net_device *dev, max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); + if (max_mtu < ETH_MIN_MTU) + max_mtu = ETH_MIN_MTU; + + if (!changelink && !conf->mtu) + dev->mtu = max_mtu; } if (dev->mtu > max_mtu) diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c index 4307bf0013e1..63e916d4d069 100644 --- a/drivers/phy/tegra/xusb.c +++ b/drivers/phy/tegra/xusb.c @@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match); static struct device_node * tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name) { - /* - * of_find_node_by_name() drops a reference, so make sure to grab one. - */ - struct device_node *np = of_node_get(padctl->dev->of_node); + struct device_node *pads, *np; - np = of_find_node_by_name(np, "pads"); - if (np) - np = of_find_node_by_name(np, name); + pads = of_get_child_by_name(padctl->dev->of_node, "pads"); + if (!pads) + return NULL; + + np = of_get_child_by_name(pads, name); + of_node_put(pads); return np; } @@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name) static struct device_node * tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index) { - /* - * of_find_node_by_name() drops a reference, so make sure to grab one. - */ - struct device_node *np = of_node_get(pad->dev.of_node); + struct device_node *np, *lanes; - np = of_find_node_by_name(np, "lanes"); - if (!np) + lanes = of_get_child_by_name(pad->dev.of_node, "lanes"); + if (!lanes) return NULL; - return of_find_node_by_name(np, pad->soc->lanes[index].name); + np = of_get_child_by_name(lanes, pad->soc->lanes[index].name); + of_node_put(lanes); + + return np; } static int @@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad, unsigned int i; int err; - children = of_find_node_by_name(pad->dev.of_node, "lanes"); + children = of_get_child_by_name(pad->dev.of_node, "lanes"); if (!children) return -ENODEV; @@ -444,21 +444,21 @@ static struct device_node * tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type, unsigned int index) { - /* - * of_find_node_by_name() drops a reference, so make sure to grab one. - */ - struct device_node *np = of_node_get(padctl->dev->of_node); + struct device_node *ports, *np; + char *name; - np = of_find_node_by_name(np, "ports"); - if (np) { - char *name; + ports = of_get_child_by_name(padctl->dev->of_node, "ports"); + if (!ports) + return NULL; - name = kasprintf(GFP_KERNEL, "%s-%u", type, index); - if (!name) - return ERR_PTR(-ENOMEM); - np = of_find_node_by_name(np, name); - kfree(name); + name = kasprintf(GFP_KERNEL, "%s-%u", type, index); + if (!name) { + of_node_put(ports); + return ERR_PTR(-ENOMEM); } + np = of_get_child_by_name(ports, name); + kfree(name); + of_node_put(ports); return np; } @@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl) static int tegra_xusb_padctl_probe(struct platform_device *pdev) { - struct device_node *np = of_node_get(pdev->dev.of_node); + struct device_node *np = pdev->dev.of_node; const struct tegra_xusb_padctl_soc *soc; struct tegra_xusb_padctl *padctl; const struct of_device_id *match; @@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev) int err; /* for backwards compatibility with old device trees */ - np = of_find_node_by_name(np, "pads"); + np = of_get_child_by_name(np, "pads"); if (!np) { dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n"); return tegra_xusb_padctl_legacy_probe(pdev); diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index f4c070ea8384..c90fba3ed861 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -154,6 +154,8 @@ struct m41t80_data { struct rtc_device *rtc; #ifdef CONFIG_COMMON_CLK struct clk_hw sqw; + unsigned long freq; + unsigned int sqwe; #endif }; @@ -443,43 +445,40 @@ static SIMPLE_DEV_PM_OPS(m41t80_pm, m41t80_suspend, m41t80_resume); #ifdef CONFIG_COMMON_CLK #define sqw_to_m41t80_data(_hw) container_of(_hw, struct m41t80_data, sqw) -static unsigned long m41t80_sqw_recalc_rate(struct clk_hw *hw, - unsigned long parent_rate) +static unsigned long m41t80_decode_freq(int setting) +{ + return (setting == 0) ? 0 : (setting == 1) ? M41T80_SQW_MAX_FREQ : + M41T80_SQW_MAX_FREQ >> setting; +} + +static unsigned long m41t80_get_freq(struct m41t80_data *m41t80) { - struct m41t80_data *m41t80 = sqw_to_m41t80_data(hw); struct i2c_client *client = m41t80->client; int reg_sqw = (m41t80->features & M41T80_FEATURE_SQ_ALT) ? M41T80_REG_WDAY : M41T80_REG_SQW; int ret = i2c_smbus_read_byte_data(client, reg_sqw); - unsigned long val = M41T80_SQW_MAX_FREQ; if (ret < 0) return 0; + return m41t80_decode_freq(ret >> 4); +} - ret >>= 4; - if (ret == 0) - val = 0; - else if (ret > 1) - val = val / (1 << ret); - - return val; +static unsigned long m41t80_sqw_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + return sqw_to_m41t80_data(hw)->freq; } static long m41t80_sqw_round_rate(struct clk_hw *hw, unsigned long rate, unsigned long *prate) { - int i, freq = M41T80_SQW_MAX_FREQ; - - if (freq <= rate) - return freq; - - for (i = 2; i <= ilog2(M41T80_SQW_MAX_FREQ); i++) { - freq /= 1 << i; - if (freq <= rate) - return freq; - } - - return 0; + if (rate >= M41T80_SQW_MAX_FREQ) + return M41T80_SQW_MAX_FREQ; + if (rate >= M41T80_SQW_MAX_FREQ / 4) + return M41T80_SQW_MAX_FREQ / 4; + if (!rate) + return 0; + return 1 << ilog2(rate); } static int m41t80_sqw_set_rate(struct clk_hw *hw, unsigned long rate, @@ -491,17 +490,12 @@ static int m41t80_sqw_set_rate(struct clk_hw *hw, unsigned long rate, M41T80_REG_WDAY : M41T80_REG_SQW; int reg, ret, val = 0; - if (rate) { - if (!is_power_of_2(rate)) - return -EINVAL; - val = ilog2(rate); - if (val == ilog2(M41T80_SQW_MAX_FREQ)) - val = 1; - else if (val < (ilog2(M41T80_SQW_MAX_FREQ) - 1)) - val = ilog2(M41T80_SQW_MAX_FREQ) - val; - else - return -EINVAL; - } + if (rate >= M41T80_SQW_MAX_FREQ) + val = 1; + else if (rate >= M41T80_SQW_MAX_FREQ / 4) + val = 2; + else if (rate) + val = 15 - ilog2(rate); reg = i2c_smbus_read_byte_data(client, reg_sqw); if (reg < 0) @@ -510,10 +504,9 @@ static int m41t80_sqw_set_rate(struct clk_hw *hw, unsigned long rate, reg = (reg & 0x0f) | (val << 4); ret = i2c_smbus_write_byte_data(client, reg_sqw, reg); - if (ret < 0) - return ret; - - return -EINVAL; + if (!ret) + m41t80->freq = m41t80_decode_freq(val); + return ret; } static int m41t80_sqw_control(struct clk_hw *hw, bool enable) @@ -530,7 +523,10 @@ static int m41t80_sqw_control(struct clk_hw *hw, bool enable) else ret &= ~M41T80_ALMON_SQWE; - return i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON, ret); + ret = i2c_smbus_write_byte_data(client, M41T80_REG_ALARM_MON, ret); + if (!ret) + m41t80->sqwe = enable; + return ret; } static int m41t80_sqw_prepare(struct clk_hw *hw) @@ -545,14 +541,7 @@ static void m41t80_sqw_unprepare(struct clk_hw *hw) static int m41t80_sqw_is_prepared(struct clk_hw *hw) { - struct m41t80_data *m41t80 = sqw_to_m41t80_data(hw); - struct i2c_client *client = m41t80->client; - int ret = i2c_smbus_read_byte_data(client, M41T80_REG_ALARM_MON); - - if (ret < 0) - return ret; - - return !!(ret & M41T80_ALMON_SQWE); + return sqw_to_m41t80_data(hw)->sqwe; } static const struct clk_ops m41t80_sqw_ops = { @@ -587,6 +576,7 @@ static struct clk *m41t80_sqw_register_clk(struct m41t80_data *m41t80) init.parent_names = NULL; init.num_parents = 0; m41t80->sqw.init = &init; + m41t80->freq = m41t80_get_freq(m41t80); /* optional override of the clockname */ of_property_read_string(node, "clock-output-names", &init.name); diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 5340efc673a9..92dd4aef21a3 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -564,9 +564,9 @@ enum qeth_cq { }; struct qeth_ipato { - int enabled; - int invert4; - int invert6; + bool enabled; + bool invert4; + bool invert6; struct list_head entries; }; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 330e5d3dadf3..7c7a244b6684 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1479,9 +1479,9 @@ static int qeth_setup_card(struct qeth_card *card) qeth_set_intial_options(card); /* IP address takeover */ INIT_LIST_HEAD(&card->ipato.entries); - card->ipato.enabled = 0; - card->ipato.invert4 = 0; - card->ipato.invert6 = 0; + card->ipato.enabled = false; + card->ipato.invert4 = false; + card->ipato.invert6 = false; /* init QDIO stuff */ qeth_init_qdio_info(card); INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work); @@ -5445,6 +5445,13 @@ out: } EXPORT_SYMBOL_GPL(qeth_poll); +static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd) +{ + if (!cmd->hdr.return_code) + cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code; + return cmd->hdr.return_code; +} + int qeth_setassparms_cb(struct qeth_card *card, struct qeth_reply *reply, unsigned long data) { @@ -6304,7 +6311,7 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card, (struct qeth_checksum_cmd *)reply->param; QETH_CARD_TEXT(card, 4, "chkdoccb"); - if (cmd->hdr.return_code) + if (qeth_setassparms_inspect_rc(cmd)) return 0; memset(chksum_cb, 0, sizeof(*chksum_cb)); diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h index 194ae9b577cc..e5833837b799 100644 --- a/drivers/s390/net/qeth_l3.h +++ b/drivers/s390/net/qeth_l3.h @@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *); int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *); void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions, const u8 *); -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *); +void qeth_l3_update_ipato(struct qeth_card *card); struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions); int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *); int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 27185ab38136..36dee176f8e2 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -163,8 +163,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len) } } -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, - struct qeth_ipaddr *addr) +static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, + struct qeth_ipaddr *addr) { struct qeth_ipato_entry *ipatoe; u8 addr_bits[128] = {0, }; @@ -173,6 +173,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card, if (!card->ipato.enabled) return 0; + if (addr->type != QETH_IP_TYPE_NORMAL) + return 0; qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits, (addr->proto == QETH_PROT_IPV4)? 4:16); @@ -289,8 +291,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr)); addr->ref_counter = 1; - if (addr->type == QETH_IP_TYPE_NORMAL && - qeth_l3_is_addr_covered_by_ipato(card, addr)) { + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) { QETH_CARD_TEXT(card, 2, "tkovaddr"); addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; } @@ -604,6 +605,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card) /* * IP address takeover related functions */ + +/** + * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs. + * + * Caller must hold ip_lock. + */ +void qeth_l3_update_ipato(struct qeth_card *card) +{ + struct qeth_ipaddr *addr; + unsigned int i; + + hash_for_each(card->ip_htable, i, addr, hnode) { + if (addr->type != QETH_IP_TYPE_NORMAL) + continue; + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG; + else + addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG; + } +} + static void qeth_l3_clear_ipato_list(struct qeth_card *card) { struct qeth_ipato_entry *ipatoe, *tmp; @@ -615,6 +637,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card) kfree(ipatoe); } + qeth_l3_update_ipato(card); spin_unlock_bh(&card->ip_lock); } @@ -639,8 +662,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card, } } - if (!rc) + if (!rc) { list_add_tail(&new->entry, &card->ipato.entries); + qeth_l3_update_ipato(card); + } spin_unlock_bh(&card->ip_lock); @@ -663,6 +688,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card, (proto == QETH_PROT_IPV4)? 4:16) && (ipatoe->mask_bits == mask_bits)) { list_del(&ipatoe->entry); + qeth_l3_update_ipato(card); kfree(ipatoe); } } diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index 7a829ad77783..1295dd8ec849 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -370,8 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); - struct qeth_ipaddr *addr; - int i, rc = 0; + bool enable; + int rc = 0; if (!card) return -EINVAL; @@ -384,25 +384,18 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev, } if (sysfs_streq(buf, "toggle")) { - card->ipato.enabled = (card->ipato.enabled)? 0 : 1; - } else if (sysfs_streq(buf, "1")) { - card->ipato.enabled = 1; - hash_for_each(card->ip_htable, i, addr, hnode) { - if ((addr->type == QETH_IP_TYPE_NORMAL) && - qeth_l3_is_addr_covered_by_ipato(card, addr)) - addr->set_flags |= - QETH_IPA_SETIP_TAKEOVER_FLAG; - } - } else if (sysfs_streq(buf, "0")) { - card->ipato.enabled = 0; - hash_for_each(card->ip_htable, i, addr, hnode) { - if (addr->set_flags & - QETH_IPA_SETIP_TAKEOVER_FLAG) - addr->set_flags &= - ~QETH_IPA_SETIP_TAKEOVER_FLAG; - } - } else + enable = !card->ipato.enabled; + } else if (kstrtobool(buf, &enable)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.enabled != enable) { + card->ipato.enabled = enable; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); + } out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; @@ -428,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + bool invert; int rc = 0; if (!card) return -EINVAL; mutex_lock(&card->conf_mutex); - if (sysfs_streq(buf, "toggle")) - card->ipato.invert4 = (card->ipato.invert4)? 0 : 1; - else if (sysfs_streq(buf, "1")) - card->ipato.invert4 = 1; - else if (sysfs_streq(buf, "0")) - card->ipato.invert4 = 0; - else + if (sysfs_streq(buf, "toggle")) { + invert = !card->ipato.invert4; + } else if (kstrtobool(buf, &invert)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.invert4 != invert) { + card->ipato.invert4 = invert; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); + } +out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; } @@ -607,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct qeth_card *card = dev_get_drvdata(dev); + bool invert; int rc = 0; if (!card) return -EINVAL; mutex_lock(&card->conf_mutex); - if (sysfs_streq(buf, "toggle")) - card->ipato.invert6 = (card->ipato.invert6)? 0 : 1; - else if (sysfs_streq(buf, "1")) - card->ipato.invert6 = 1; - else if (sysfs_streq(buf, "0")) - card->ipato.invert6 = 0; - else + if (sysfs_streq(buf, "toggle")) { + invert = !card->ipato.invert6; + } else if (kstrtobool(buf, &invert)) { rc = -EINVAL; + goto out; + } + + if (card->ipato.invert6 != invert) { + card->ipato.invert6 = invert; + spin_lock_bh(&card->ip_lock); + qeth_l3_update_ipato(card); + spin_unlock_bh(&card->ip_lock); + } +out: mutex_unlock(&card->conf_mutex); return rc ? rc : count; } diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index a4f28b7e4c65..e18877177f1b 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write, return req; for_each_bio(bio) { - ret = blk_rq_append_bio(req, bio); + struct bio *bounce_bio = bio; + + ret = blk_rq_append_bio(req, &bounce_bio); if (ret) return ERR_PTR(ret); } diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index eaa326c99057..5e0dca720518 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -415,7 +415,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf, mutex_lock(&buffer->lock); list_for_each_entry(a, &buffer->attachments, list) { dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents, - DMA_BIDIRECTIONAL); + direction); } mutex_unlock(&buffer->lock); @@ -437,7 +437,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf, mutex_lock(&buffer->lock); list_for_each_entry(a, &buffer->attachments, list) { dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents, - DMA_BIDIRECTIONAL); + direction); } mutex_unlock(&buffer->lock); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 7c69b4a9694d..0d99b242e82e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, " %d i: %d bio: %p, allocating another" " bio\n", bio->bi_vcnt, i, bio); - rc = blk_rq_append_bio(req, bio); + rc = blk_rq_append_bio(req, &bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; @@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, } if (bio) { - rc = blk_rq_append_bio(req, bio); + rc = blk_rq_append_bio(req, &bio); if (rc) { pr_err("pSCSI: failed to append bio\n"); goto fail; diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index bdf0e6e89991..faf50df81622 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1764,7 +1764,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old) { struct n_tty_data *ldata = tty->disc_data; - if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) { + if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) { bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE); ldata->line_start = ldata->read_tail; if (!L_ICANON(tty) || !read_cnt(ldata)) { @@ -2427,7 +2427,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file, return put_user(tty_chars_in_buffer(tty), (int __user *) arg); case TIOCINQ: down_write(&tty->termios_rwsem); - if (L_ICANON(tty)) + if (L_ICANON(tty) && !L_EXTPROC(tty)) retval = inq_canon(ldata); else retval = read_cnt(ldata); diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index f8eba1c5412f..677fa99b7747 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -446,7 +446,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string); * Callers other than flush_to_ldisc() need to exclude the kworker * from concurrent use of the line discipline, see paste_selection(). * - * Returns the number of bytes not processed + * Returns the number of bytes processed */ int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, char *f, int count) diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c index bb626120296f..53f3bf459dd1 100644 --- a/drivers/usb/chipidea/ci_hdrc_msm.c +++ b/drivers/usb/chipidea/ci_hdrc_msm.c @@ -251,7 +251,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev) if (ret) goto err_mux; - ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi"); + ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi"); if (ulpi_node) { phy_node = of_get_next_available_child(ulpi_node, NULL); ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy"); diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 843ef46d2537..9e3355b97396 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev) case USB_SSP_CAP_TYPE: ssp_cap = (struct usb_ssp_cap_descriptor *)buffer; ssac = (le32_to_cpu(ssp_cap->bmAttributes) & - USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1; + USB_SSP_SUBLINK_SPEED_ATTRIBS); if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac)) dev->bos->ssp_cap = ssp_cap; break; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 50010282c010..c05c4f877750 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -57,10 +57,11 @@ static const struct usb_device_id usb_quirk_list[] = { /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, - /* Logitech HD Pro Webcams C920, C920-C and C930e */ + /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, @@ -154,6 +155,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */ { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM }, + /* ELSA MicroLink 56K */ + { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */ { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM }, diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 876cdbec1307..c0491dd73f53 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3704,7 +3704,8 @@ static void ffs_closed(struct ffs_data *ffs) ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; ffs_dev_unlock(); - unregister_gadget_item(ci); + if (test_bit(FFS_FL_BOUND, &ffs->flags)) + unregister_gadget_item(ci); return; done: ffs_dev_unlock(); diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 76f392954733..abb8f19ae40f 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -189,6 +189,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) xhci->quirks |= XHCI_TRUST_TX_LENGTH; xhci->quirks |= XHCI_BROKEN_STREAMS; } + if (pdev->vendor == PCI_VENDOR_ID_RENESAS && + pdev->device == 0x0014) + xhci->quirks |= XHCI_TRUST_TX_LENGTH; if (pdev->vendor == PCI_VENDOR_ID_RENESAS && pdev->device == 0x0015) xhci->quirks |= XHCI_RESET_ON_RESUME; diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 49d1b2d4606d..d038e543c246 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1017,6 +1017,7 @@ static const struct usb_device_id id_table_combined[] = { .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) }, { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) }, + { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) }, { } /* Terminating entry */ }; diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 4faa09fe308c..8b4ecd2bd297 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -914,6 +914,12 @@ #define ICPDAS_I7561U_PID 0x0104 #define ICPDAS_I7563U_PID 0x0105 +/* + * Airbus Defence and Space + */ +#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */ +#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */ + /* * RT Systems programming cables for various ham radios */ diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 54e316b1892d..a9400458ccea 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -236,6 +236,8 @@ static void option_instat_callback(struct urb *urb); /* These Quectel products use Qualcomm's vendor ID */ #define QUECTEL_PRODUCT_UC20 0x9003 #define QUECTEL_PRODUCT_UC15 0x9090 +/* These Yuga products use Qualcomm's vendor ID */ +#define YUGA_PRODUCT_CLM920_NC5 0x9625 #define QUECTEL_VENDOR_ID 0x2c7c /* These Quectel products use Quectel's vendor ID */ @@ -283,6 +285,7 @@ static void option_instat_callback(struct urb *urb); #define TELIT_PRODUCT_LE922_USBCFG3 0x1043 #define TELIT_PRODUCT_LE922_USBCFG5 0x1045 #define TELIT_PRODUCT_ME910 0x1100 +#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101 #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 #define TELIT_PRODUCT_LE910_USBCFG4 0x1206 @@ -648,6 +651,11 @@ static const struct option_blacklist_info telit_me910_blacklist = { .reserved = BIT(1) | BIT(3), }; +static const struct option_blacklist_info telit_me910_dual_modem_blacklist = { + .sendsetup = BIT(0), + .reserved = BIT(3), +}; + static const struct option_blacklist_info telit_le910_blacklist = { .sendsetup = BIT(0), .reserved = BIT(1) | BIT(2), @@ -677,6 +685,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = { .reserved = BIT(4) | BIT(5), }; +static const struct option_blacklist_info yuga_clm920_nc5_blacklist = { + .reserved = BIT(1) | BIT(4), +}; + static const struct usb_device_id option_ids[] = { { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) }, @@ -1181,6 +1193,9 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)}, { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20), .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, + /* Yuga products use Qualcomm vendor ID */ + { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5), + .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist }, /* Quectel products using Quectel vendor ID */ { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21), .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, @@ -1247,6 +1262,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = (kernel_ulong_t)&telit_me910_blacklist }, + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), + .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = (kernel_ulong_t)&telit_le910_blacklist }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 9f9d3a904464..55a8fb25ce2b 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -166,6 +166,8 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */ {DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */ {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */ + {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */ + {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */ {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ @@ -346,6 +348,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id) break; case 2: dev_dbg(dev, "NMEA GPS interface found\n"); + sendsetup = true; break; case 3: dev_dbg(dev, "Modem port found\n"); diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c index c653ce533430..720408d39f11 100644 --- a/drivers/usb/usbip/stub_dev.c +++ b/drivers/usb/usbip/stub_dev.c @@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud) * step 1? */ if (ud->tcp_socket) { - dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n", - ud->tcp_socket); + dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c index 7170404e8979..6968c906fa29 100644 --- a/drivers/usb/usbip/stub_main.c +++ b/drivers/usb/usbip/stub_main.c @@ -251,11 +251,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev) struct stub_priv *priv; struct urb *urb; - dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev); + dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n"); while ((priv = stub_priv_pop(sdev))) { urb = priv->urb; - dev_dbg(&sdev->udev->dev, "free urb %p\n", urb); + dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n", + priv->seqnum); usb_kill_urb(urb); kmem_cache_free(stub_priv_cache, priv); diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 283a9be77a22..5b807185f79e 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -225,9 +225,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev, if (priv->seqnum != pdu->u.cmd_unlink.seqnum) continue; - dev_info(&priv->urb->dev->dev, "unlink urb %p\n", - priv->urb); - /* * This matched urb is not completed yet (i.e., be in * flight in usb hcd hardware/driver). Now we are @@ -266,8 +263,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev, ret = usb_unlink_urb(priv->urb); if (ret != -EINPROGRESS) dev_err(&priv->urb->dev->dev, - "failed to unlink a urb %p, ret %d\n", - priv->urb, ret); + "failed to unlink a urb # %lu, ret %d\n", + priv->seqnum, ret); return 0; } diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c index 87ff94be4235..96aa375b80d9 100644 --- a/drivers/usb/usbip/stub_tx.c +++ b/drivers/usb/usbip/stub_tx.c @@ -102,7 +102,7 @@ void stub_complete(struct urb *urb) /* link a urb to the queue of tx. */ spin_lock_irqsave(&sdev->priv_lock, flags); if (sdev->ud.tcp_socket == NULL) { - usbip_dbg_stub_tx("ignore urb for closed connection %p", urb); + usbip_dbg_stub_tx("ignore urb for closed connection\n"); /* It will be freed in stub_device_cleanup_urbs(). */ } else if (priv->unlinking) { stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status); @@ -204,8 +204,8 @@ static int stub_send_ret_submit(struct stub_device *sdev) /* 1. setup usbip_header */ setup_ret_submit_pdu(&pdu_header, urb); - usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n", - pdu_header.base.seqnum, urb); + usbip_dbg_stub_tx("setup txdata seqnum: %d\n", + pdu_header.base.seqnum); usbip_header_correct_endian(&pdu_header, 1); iov[iovnum].iov_base = &pdu_header; diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 2281f3562870..17b599b923f3 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -331,26 +331,20 @@ int usbip_recv(struct socket *sock, void *buf, int size) struct msghdr msg = {.msg_flags = MSG_NOSIGNAL}; int total = 0; + if (!sock || !buf || !size) + return -EINVAL; + iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size); usbip_dbg_xmit("enter\n"); - if (!sock || !buf || !size) { - pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf, - size); - return -EINVAL; - } - do { - int sz = msg_data_left(&msg); + msg_data_left(&msg); sock->sk->sk_allocation = GFP_NOIO; result = sock_recvmsg(sock, &msg, MSG_WAITALL); - if (result <= 0) { - pr_debug("receive sock %p buf %p size %u ret %d total %d\n", - sock, buf + total, sz, result, total); + if (result <= 0) goto err; - } total += result; } while (msg_data_left(&msg)); diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 1f0cf81cc145..692cfdef667e 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -670,9 +670,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag struct vhci_device *vdev; unsigned long flags; - usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n", - hcd, urb, mem_flags); - if (portnum > VHCI_HC_PORTS) { pr_err("invalid port number %d\n", portnum); return -ENODEV; @@ -836,8 +833,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) struct vhci_device *vdev; unsigned long flags; - pr_info("dequeue a urb %p\n", urb); - spin_lock_irqsave(&vhci->lock, flags); priv = urb->hcpriv; @@ -865,7 +860,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) /* tcp connection is closed */ spin_lock(&vdev->priv_lock); - pr_info("device %p seems to be disconnected\n", vdev); list_del(&priv->list); kfree(priv); urb->hcpriv = NULL; @@ -877,8 +871,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) * vhci_rx will receive RET_UNLINK and give back the URB. * Otherwise, we give back it here. */ - pr_info("gives back urb %p\n", urb); - usb_hcd_unlink_urb_from_ep(hcd, urb); spin_unlock_irqrestore(&vhci->lock, flags); @@ -906,8 +898,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) unlink->unlink_seqnum = priv->seqnum; - pr_info("device %p seems to be still connected\n", vdev); - /* send cmd_unlink and try to cancel the pending URB in the * peer */ list_add_tail(&unlink->list, &vdev->unlink_tx); @@ -989,7 +979,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud) /* need this? see stub_dev.c */ if (ud->tcp_socket) { - pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket); + pr_debug("shutdown tcp_socket %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c index ef2f2d5ca6b2..1343037d00f9 100644 --- a/drivers/usb/usbip/vhci_rx.c +++ b/drivers/usb/usbip/vhci_rx.c @@ -37,24 +37,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum) urb = priv->urb; status = urb->status; - usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n", - urb, priv, seqnum); + usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum); switch (status) { case -ENOENT: /* fall through */ case -ECONNRESET: - dev_info(&urb->dev->dev, - "urb %p was unlinked %ssynchronuously.\n", urb, - status == -ENOENT ? "" : "a"); + dev_dbg(&urb->dev->dev, + "urb seq# %u was unlinked %ssynchronuously\n", + seqnum, status == -ENOENT ? "" : "a"); break; case -EINPROGRESS: /* no info output */ break; default: - dev_info(&urb->dev->dev, - "urb %p may be in a error, status %d\n", urb, - status); + dev_dbg(&urb->dev->dev, + "urb seq# %u may be in a error, status %d\n", + seqnum, status); } list_del(&priv->list); @@ -81,8 +80,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { - pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum); - pr_info("max seqnum %d\n", + pr_err("cannot find a urb of seqnum %u max seqnum %d\n", + pdu->base.seqnum, atomic_read(&vhci_hcd->seqnum)); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; @@ -105,7 +104,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, if (usbip_dbg_flag_vhci_rx) usbip_dump_urb(urb); - usbip_dbg_vhci_rx("now giveback urb %p\n", urb); + usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); spin_lock_irqsave(&vhci->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb); @@ -172,7 +171,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, pr_info("the urb (seqnum %d) was already given back\n", pdu->base.seqnum); } else { - usbip_dbg_vhci_rx("now giveback urb %p\n", urb); + usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum); /* If unlink is successful, status is -ECONNRESET */ urb->status = pdu->u.ret_unlink.status; diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index 3e7878fe2fd4..a9a663a578b6 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -83,7 +83,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) memset(&msg, 0, sizeof(msg)); memset(&iov, 0, sizeof(iov)); - usbip_dbg_vhci_tx("setup txdata urb %p\n", urb); + usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n", + priv->seqnum); /* 1. setup usbip_header */ setup_cmd_submit_pdu(&pdu_header, urb); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e..0c4b690cf761 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -87,6 +87,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_lock(&root->inode_lock); node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + if (node) { if (btrfs_inode->delayed_node) { refcount_inc(&node->refs); /* can be accessed */ @@ -94,9 +95,30 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( spin_unlock(&root->inode_lock); return node; } - btrfs_inode->delayed_node = node; - /* can be accessed and cached in the inode */ - refcount_add(2, &node->refs); + + /* + * It's possible that we're racing into the middle of removing + * this node from the radix tree. In this case, the refcount + * was zero and it should never go back to one. Just return + * NULL like it was never in the radix at all; our release + * function is in the process of removing it. + * + * Some implementations of refcount_inc refuse to bump the + * refcount once it has hit zero. If we don't do this dance + * here, refcount_inc() may decide to just WARN_ONCE() instead + * of actually bumping the refcount. + * + * If this node is properly in the radix, we want to bump the + * refcount twice, once for the inode and once for this get + * operation. + */ + if (refcount_inc_not_zero(&node->refs)) { + refcount_inc(&node->refs); + btrfs_inode->delayed_node = node; + } else { + node = NULL; + } + spin_unlock(&root->inode_lock); return node; } @@ -254,17 +276,18 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (refcount_dec_and_test(&delayed_node->refs)) { - bool free = false; struct btrfs_root *root = delayed_node->root; + spin_lock(&root->inode_lock); - if (refcount_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - free = true; - } + /* + * Once our refcount goes to zero, nobody is allowed to bump it + * back up. We can delete it now. + */ + ASSERT(refcount_read(&delayed_node->refs) == 0); + radix_tree_delete(&root->delayed_nodes_tree, + delayed_node->inode_id); spin_unlock(&root->inode_lock); - if (free) - kmem_cache_free(delayed_node_cache, delayed_node); + kmem_cache_free(delayed_node_cache, delayed_node); } } diff --git a/fs/exec.c b/fs/exec.c index 0f1ae67a19ec..ea3c4924dd6b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1350,9 +1350,14 @@ void setup_new_exec(struct linux_binprm * bprm) current->sas_ss_sp = current->sas_ss_size = 0; - /* Figure out dumpability. */ + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. + */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || - bprm->secureexec) + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index e0f867cd8553..96f1087e372c 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include +__weak void arch_freq_prepare_all(void) +{ +} + extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { + arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2751476e6b6e..37658b6e83ee 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -39,45 +41,382 @@ #include "decompressor.h" #include "page_actor.h" -/* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. - */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static struct workqueue_struct *squashfs_read_wq; + +struct squashfs_read_request { + struct super_block *sb; + u64 index; + int length; + int compressed; + int offset; + u64 read_end; + struct squashfs_page_actor *output; + enum { + SQUASHFS_COPY, + SQUASHFS_DECOMPRESS, + SQUASHFS_METADATA, + } data_processing; + bool synchronous; + + /* + * If the read is synchronous, it is possible to retrieve information + * about the request by setting these pointers. + */ + int *res; + int *bytes_read; + int *bytes_uncompressed; + + int nr_buffers; + struct buffer_head **bh; + struct work_struct offload; +}; + +struct squashfs_bio_request { + struct buffer_head **bh; + int nr_buffers; +}; + +static int squashfs_bio_submit(struct squashfs_read_request *req); + +int squashfs_init_read_wq(void) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; + squashfs_read_wq = create_workqueue("SquashFS read wq"); + return !!squashfs_read_wq; +} - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; +void squashfs_destroy_read_wq(void) +{ + flush_workqueue(squashfs_read_wq); + destroy_workqueue(squashfs_read_wq); +} - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; +static void free_read_request(struct squashfs_read_request *req, int error) +{ + if (!req->synchronous) + squashfs_page_actor_free(req->output, error); + if (req->res) + *(req->res) = error; + kfree(req->bh); + kfree(req); +} - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; +static void squashfs_process_blocks(struct squashfs_read_request *req) +{ + int error = 0; + int bytes, i, length; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + struct squashfs_page_actor *actor = req->output; + struct buffer_head **bh = req->bh; + int nr_buffers = req->nr_buffers; + + for (i = 0; i < nr_buffers; ++i) { + if (!bh[i]) + continue; + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + error = -EIO; + } + if (error) + goto cleanup; + + if (req->data_processing == SQUASHFS_METADATA) { + /* Extract the length of the metadata block */ + if (req->offset != msblk->devblksize - 1) { + length = le16_to_cpup((__le16 *) + (bh[0]->b_data + req->offset)); + } else { + length = (unsigned char)bh[0]->b_data[req->offset]; + length |= (unsigned char)bh[1]->b_data[0] << 8; + } + req->compressed = SQUASHFS_COMPRESSED(length); + req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + length = SQUASHFS_COMPRESSED_SIZE(length); + if (req->index + length + 2 > req->read_end) { + for (i = 0; i < nr_buffers; ++i) + put_bh(bh[i]); + kfree(bh); + req->length = length; + req->index += 2; + squashfs_bio_submit(req); + return; + } + req->length = length; + req->offset = (req->offset + 2) % PAGE_SIZE; + if (req->offset < 2) { + put_bh(bh[0]); + ++bh; + --nr_buffers; + } + } + if (req->bytes_read) + *(req->bytes_read) = req->length; + + if (req->data_processing == SQUASHFS_COPY) { + squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset, + req->length, msblk->devblksize); + } else if (req->data_processing == SQUASHFS_DECOMPRESS) { + req->length = squashfs_decompress(msblk, bh, nr_buffers, + req->offset, req->length, actor); + if (req->length < 0) { + error = -EIO; + goto cleanup; } } - return bh; + /* Last page may have trailing bytes not filled */ + bytes = req->length % PAGE_SIZE; + if (bytes && actor->page[actor->pages - 1]) + zero_user_segment(actor->page[actor->pages - 1], bytes, + PAGE_SIZE); + +cleanup: + if (req->bytes_uncompressed) + *(req->bytes_uncompressed) = req->length; + if (error) { + for (i = 0; i < nr_buffers; ++i) + if (bh[i]) + put_bh(bh[i]); + } + free_read_request(req, error); } +static void read_wq_handler(struct work_struct *work) +{ + squashfs_process_blocks(container_of(work, + struct squashfs_read_request, offload)); +} + +static void squashfs_bio_end_io(struct bio *bio) +{ + int i; + int error = bio->bi_status; + struct squashfs_bio_request *bio_req = bio->bi_private; + + bio_put(bio); + + for (i = 0; i < bio_req->nr_buffers; ++i) { + if (!bio_req->bh[i]) + continue; + if (!error) + set_buffer_uptodate(bio_req->bh[i]); + else + clear_buffer_uptodate(bio_req->bh[i]); + unlock_buffer(bio_req->bh[i]); + } + kfree(bio_req); +} + +static int bh_is_optional(struct squashfs_read_request *req, int idx) +{ + int start_idx, end_idx; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT; + end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT; + if (start_idx >= req->output->pages) + return 1; + if (start_idx < 0) + start_idx = end_idx; + if (end_idx >= req->output->pages) + end_idx = start_idx; + return !req->output->page[start_idx] && !req->output->page[end_idx]; +} + +static int actor_getblks(struct squashfs_read_request *req, u64 block) +{ + int i; + + req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO); + if (!req->bh) + return -ENOMEM; + + for (i = 0; i < req->nr_buffers; ++i) { + /* + * When dealing with an uncompressed block, the actor may + * contains NULL pages. There's no need to read the buffers + * associated with these pages. + */ + if (!req->compressed && bh_is_optional(req, i)) { + req->bh[i] = NULL; + continue; + } + req->bh[i] = sb_getblk(req->sb, block + i); + if (!req->bh[i]) { + while (--i) { + if (req->bh[i]) + put_bh(req->bh[i]); + } + return -1; + } + } + return 0; +} + +static int squashfs_bio_submit(struct squashfs_read_request *req) +{ + struct bio *bio = NULL; + struct buffer_head *bh; + struct squashfs_bio_request *bio_req = NULL; + int b = 0, prev_block = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + u64 read_start = round_down(req->index, msblk->devblksize); + u64 read_end = round_up(req->index + req->length, msblk->devblksize); + sector_t block = read_start >> msblk->devblksize_log2; + sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(req->index, PAGE_SIZE); + int nr_buffers = block_end - block; + int blksz = msblk->devblksize; + int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? BIO_MAX_PAGES + : nr_buffers; + + /* Setup the request */ + req->read_end = read_end; + req->offset = req->index - read_start; + req->nr_buffers = nr_buffers; + if (actor_getblks(req, block) < 0) + goto getblk_failed; + + /* Create and submit the BIOs */ + for (b = 0; b < nr_buffers; ++b, offset += blksz) { + bh = req->bh[b]; + if (!bh || !trylock_buffer(bh)) + continue; + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + offset %= PAGE_SIZE; + + /* Append the buffer to the current BIO if it is contiguous */ + if (bio && bio_req && prev_block + 1 == b) { + if (bio_add_page(bio, bh->b_page, blksz, offset)) { + bio_req->nr_buffers += 1; + prev_block = b; + continue; + } + } + + /* Otherwise, submit the current BIO and create a new one */ + if (bio) + submit_bio(bio); + bio_req = kcalloc(1, sizeof(struct squashfs_bio_request), + GFP_NOIO); + if (!bio_req) + goto req_alloc_failed; + bio_req->bh = &req->bh[b]; + bio = bio_alloc(GFP_NOIO, bio_max_pages); + if (!bio) + goto bio_alloc_failed; + bio_set_dev(bio, req->sb->s_bdev); + bio->bi_iter.bi_sector = (block + b) + << (msblk->devblksize_log2 - 9); + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_private = bio_req; + bio->bi_end_io = squashfs_bio_end_io; + + bio_add_page(bio, bh->b_page, blksz, offset); + bio_req->nr_buffers += 1; + prev_block = b; + } + if (bio) + submit_bio(bio); + + if (req->synchronous) + squashfs_process_blocks(req); + else { + INIT_WORK(&req->offload, read_wq_handler); + schedule_work(&req->offload); + } + return 0; + +bio_alloc_failed: + kfree(bio_req); +req_alloc_failed: + unlock_buffer(bh); + while (--nr_buffers >= b) + if (req->bh[nr_buffers]) + put_bh(req->bh[nr_buffers]); + while (--b >= 0) + if (req->bh[b]) + wait_on_buffer(req->bh[b]); +getblk_failed: + free_read_request(req, -ENOMEM); + return -ENOMEM; +} + +static int read_metadata_block(struct squashfs_read_request *req, + u64 *next_index) +{ + int ret, error, bytes_read = 0, bytes_uncompressed = 0; + struct squashfs_sb_info *msblk = req->sb->s_fs_info; + + if (req->index + 2 > msblk->bytes_used) { + free_read_request(req, -EINVAL); + return -EINVAL; + } + req->length = 2; + + /* Do not read beyond the end of the device */ + if (req->index + req->length > msblk->bytes_used) + req->length = msblk->bytes_used - req->index; + req->data_processing = SQUASHFS_METADATA; + + /* + * Reading metadata is always synchronous because we don't know the + * length in advance and the function is expected to update + * 'next_index' and return the length. + */ + req->synchronous = true; + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + + TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", bytes_read, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (error) + return error; + if (next_index) + *next_index += 2 + bytes_read; + return bytes_uncompressed; +} + +static int read_data_block(struct squashfs_read_request *req, int length, + u64 *next_index, bool synchronous) +{ + int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0; + + req->compressed = SQUASHFS_COMPRESSED_BLOCK(length); + req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS + : SQUASHFS_COPY; + + req->synchronous = synchronous; + if (synchronous) { + req->res = &error; + req->bytes_read = &bytes_read; + req->bytes_uncompressed = &bytes_uncompressed; + } + + TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n", + req->index, req->compressed ? "" : "un", req->length, + req->output->length); + + ret = squashfs_bio_submit(req); + if (ret) + return ret; + if (synchronous) + ret = error ? error : bytes_uncompressed; + if (next_index) + *next_index += length; + return ret; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -88,128 +427,50 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) +static int __squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output, bool sync) { - struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; + struct squashfs_read_request *req; - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) + req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL); + if (!req) { + if (!sync) + squashfs_page_actor_free(output, -ENOMEM); return -ENOMEM; - - if (length) { - /* - * Datablock. - */ - bytes = -offset; - compressed = SQUASHFS_COMPRESSED_BLOCK(length); - length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); - } else { - /* - * Metadata block. - */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; - - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; - - bytes = msblk->devblksize - offset; - compressed = SQUASHFS_COMPRESSED(length); - length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; - - TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); } - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; + req->sb = sb; + req->index = index; + req->output = output; + + if (next_index) + *next_index = index; + + if (length) + length = read_data_block(req, length, next_index, sync); + else + length = read_metadata_block(req, next_index); + + if (length < 0) { + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long)index); + return -EIO; } - if (compressed) { - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); - } - squashfs_finish_page(output); - } - - kfree(bh); return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); - -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; +} + +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + return __squashfs_read_data(sb, index, length, next_index, output, + true); +} + +int squashfs_read_data_async(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) +{ + + return __squashfs_read_data(sb, index, length, next_index, output, + false); } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 23813c078cc9..05e42441d106 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -209,17 +209,14 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry) */ void squashfs_cache_delete(struct squashfs_cache *cache) { - int i, j; + int i; if (cache == NULL) return; for (i = 0; i < cache->entries; i++) { - if (cache->entry[i].data) { - for (j = 0; j < cache->pages; j++) - kfree(cache->entry[i].data[j]); - kfree(cache->entry[i].data); - } + if (cache->entry[i].page) + free_page_array(cache->entry[i].page, cache->pages); kfree(cache->entry[i].actor); } @@ -236,7 +233,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) struct squashfs_cache *squashfs_cache_init(char *name, int entries, int block_size) { - int i, j; + int i; struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) { @@ -268,22 +265,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, init_waitqueue_head(&cache->entry[i].wait_queue); entry->cache = cache; entry->block = SQUASHFS_INVALID_BLK; - entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); - if (entry->data == NULL) { + entry->page = alloc_page_array(cache->pages, GFP_KERNEL); + if (!entry->page) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; } - - for (j = 0; j < cache->pages; j++) { - entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (entry->data[j] == NULL) { - ERROR("Failed to allocate %s buffer\n", name); - goto cleanup; - } - } - - entry->actor = squashfs_page_actor_init(entry->data, - cache->pages, 0); + entry->actor = squashfs_page_actor_init(entry->page, + cache->pages, 0, NULL); if (entry->actor == NULL) { ERROR("Failed to allocate %s cache entry\n", name); goto cleanup; @@ -314,18 +302,20 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, return min(length, entry->length - offset); while (offset < entry->length) { - void *buff = entry->data[offset / PAGE_SIZE] - + (offset % PAGE_SIZE); + void *buff = kmap_atomic(entry->page[offset / PAGE_SIZE]) + + (offset % PAGE_SIZE); int bytes = min_t(int, entry->length - offset, PAGE_SIZE - (offset % PAGE_SIZE)); if (bytes >= remaining) { memcpy(buffer, buff, remaining); + kunmap_atomic(buff); remaining = 0; break; } memcpy(buffer, buff, bytes); + kunmap_atomic(buff); buffer += bytes; remaining -= bytes; offset += bytes; @@ -416,43 +406,38 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, void *squashfs_read_table(struct super_block *sb, u64 block, int length) { int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; - int i, res; - void *table, *buffer, **data; + struct page **page; + void *buff; + int res; struct squashfs_page_actor *actor; - table = buffer = kmalloc(length, GFP_KERNEL); - if (table == NULL) + page = alloc_page_array(pages, GFP_KERNEL); + if (!page) return ERR_PTR(-ENOMEM); - data = kcalloc(pages, sizeof(void *), GFP_KERNEL); - if (data == NULL) { + actor = squashfs_page_actor_init(page, pages, length, NULL); + if (actor == NULL) { res = -ENOMEM; goto failed; } - actor = squashfs_page_actor_init(data, pages, length); - if (actor == NULL) { - res = -ENOMEM; - goto failed2; - } - - for (i = 0; i < pages; i++, buffer += PAGE_SIZE) - data[i] = buffer; - res = squashfs_read_data(sb, block, length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); - kfree(data); - kfree(actor); - if (res < 0) - goto failed; + goto failed2; - return table; + buff = kmalloc(length, GFP_KERNEL); + if (!buff) + goto failed2; + squashfs_actor_to_buf(actor, buff, length); + squashfs_page_actor_free(actor, 0); + free_page_array(page, pages); + return buff; failed2: - kfree(data); + squashfs_page_actor_free(actor, 0); failed: - kfree(table); + free_page_array(page, pages); return ERR_PTR(res); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 836639810ea0..86831aaedb9e 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -24,7 +24,8 @@ #include #include #include -#include +#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -101,40 +102,44 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *buffer = NULL, *comp_opts; + void *comp_opts, *buffer = NULL; + struct page *page; struct squashfs_page_actor *actor = NULL; int length = 0; + if (!SQUASHFS_COMP_OPTS(flags)) + return squashfs_comp_opts(msblk, buffer, length); + /* * Read decompressor specific options from file system if present */ - if (SQUASHFS_COMP_OPTS(flags)) { - buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (buffer == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } - actor = squashfs_page_actor_init(&buffer, 1, 0); - if (actor == NULL) { - comp_opts = ERR_PTR(-ENOMEM); - goto out; - } + page = alloc_page(GFP_KERNEL); + if (!page) + return ERR_PTR(-ENOMEM); - length = squashfs_read_data(sb, - sizeof(struct squashfs_super_block), 0, NULL, actor); - - if (length < 0) { - comp_opts = ERR_PTR(length); - goto out; - } + actor = squashfs_page_actor_init(&page, 1, 0, NULL); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto actor_error; } - comp_opts = squashfs_comp_opts(msblk, buffer, length); + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); -out: - kfree(actor); - kfree(buffer); + if (length < 0) { + comp_opts = ERR_PTR(length); + goto read_error; + } + + buffer = kmap_atomic(page); + comp_opts = squashfs_comp_opts(msblk, buffer, length); + kunmap_atomic(buffer); + +read_error: + squashfs_page_actor_free(actor, 0); +actor_error: + __free_page(page); return comp_opts; } diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 13d80947bf9e..bb2e77ee4209 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -438,6 +439,21 @@ static int squashfs_readpage_fragment(struct page *page) return res; } +static int squashfs_readpages_fragment(struct page *page, + struct list_head *readahead_pages, struct address_space *mapping) +{ + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_fragment(page); +} + static int squashfs_readpage_sparse(struct page *page, int index, int file_end) { struct inode *inode = page->mapping->host; @@ -450,54 +466,105 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end) return 0; } -static int squashfs_readpage(struct file *file, struct page *page) +static int squashfs_readpages_sparse(struct page *page, + struct list_head *readahead_pages, int index, int file_end, + struct address_space *mapping) { - struct inode *inode = page->mapping->host; + if (!page) { + page = lru_to_page(readahead_pages); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { + put_page(page); + return 0; + } + } + return squashfs_readpage_sparse(page, index, file_end); +} + +static int __squashfs_readpages(struct file *file, struct page *page, + struct list_head *readahead_pages, unsigned int nr_pages, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int index = page->index >> (msblk->block_log - PAGE_SHIFT); int file_end = i_size_read(inode) >> msblk->block_log; int res; - void *pageaddr; + + do { + struct page *cur_page = page ? page + : lru_to_page(readahead_pages); + int page_index = cur_page->index; + int index = page_index >> (msblk->block_log - PAGE_SHIFT); + + if (page_index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT)) + return 1; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + + if (bsize < 0) + return -1; + + if (bsize == 0) { + res = squashfs_readpages_sparse(page, + readahead_pages, index, file_end, + mapping); + } else { + res = squashfs_readpages_block(page, + readahead_pages, &nr_pages, mapping, + page_index, block, bsize); + } + } else { + res = squashfs_readpages_fragment(page, + readahead_pages, mapping); + } + if (res) + return 0; + page = NULL; + } while (readahead_pages && !list_empty(readahead_pages)); + + return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + int ret; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); + page->index, squashfs_i(page->mapping->host)->start); - if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT)) - goto out; + get_page(page); - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) - res = squashfs_readpage_sparse(page, index, file_end); + ret = __squashfs_readpages(file, page, NULL, 1, page->mapping); + if (ret) { + flush_dcache_page(page); + if (ret < 0) + SetPageError(page); else - res = squashfs_readpage_block(page, block, bsize); - } else - res = squashfs_readpage_fragment(page); + SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + unlock_page(page); + put_page(page); + } - if (!res) - return 0; - -error_out: - SetPageError(page); -out: - pageaddr = kmap_atomic(page); - memset(pageaddr, 0, PAGE_SIZE); - kunmap_atomic(pageaddr); - flush_dcache_page(page); - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); + return 0; +} +static int squashfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n", + nr_pages, lru_to_page(pages)->index); + __squashfs_readpages(file, NULL, pages, nr_pages, mapping); return 0; } const struct address_space_operations squashfs_aops = { - .readpage = squashfs_readpage + .readpage = squashfs_readpage, + .readpages = squashfs_readpages, }; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index cb485d8e0e91..dc87f77ce11e 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,157 +21,136 @@ #include "squashfs.h" #include "page_actor.h" -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page); - -/* Read separately compressed datablock directly into page cache */ -int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) - +static void release_actor_pages(struct page **page, int pages, int error) { - struct inode *inode = target_page->mapping->host; - struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; - int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; - int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int start_index = target_page->index & ~mask; - int end_index = start_index | mask; - int i, n, pages, missing_pages, bytes, res = -ENOMEM; + for (i = 0; i < pages; i++) { + if (!page[i]) + continue; + flush_dcache_page(page[i]); + if (!error) + SetPageUptodate(page[i]); + else { + SetPageError(page[i]); + zero_user_segment(page[i], 0, PAGE_SIZE); + } + unlock_page(page[i]); + put_page(page[i]); + } + kfree(page); +} + +/* + * Create a "page actor" which will kmap and kunmap the + * page cache pages appropriately within the decompressor + */ +static struct squashfs_page_actor *actor_from_page_cache( + unsigned int actor_pages, struct page *target_page, + struct list_head *rpages, unsigned int *nr_pages, int start_index, + struct address_space *mapping) +{ struct page **page; struct squashfs_page_actor *actor; - void *pageaddr; + int i, n; + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - if (end_index > file_end) - end_index = file_end; + page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL); + if (!page) + return NULL; - pages = end_index - start_index + 1; + for (i = 0, n = start_index; i < actor_pages; i++, n++) { + if (target_page == NULL && rpages && !list_empty(rpages)) { + struct page *cur_page = lru_to_page(rpages); - page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); - if (page == NULL) - return res; + if (cur_page->index < start_index + actor_pages) { + list_del(&cur_page->lru); + --(*nr_pages); + if (add_to_page_cache_lru(cur_page, mapping, + cur_page->index, gfp)) + put_page(cur_page); + else + target_page = cur_page; + } else + rpages = NULL; + } - /* - * Create a "page actor" which will kmap and kunmap the - * page cache pages appropriately within the decompressor - */ - actor = squashfs_page_actor_init_special(page, pages, 0); - if (actor == NULL) - goto out; - - /* Try to grab all the pages covered by the Squashfs block */ - for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) { - page[i] = (n == target_page->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); - - if (page[i] == NULL) { - missing_pages++; - continue; + if (target_page && target_page->index == n) { + page[i] = target_page; + target_page = NULL; + } else { + page[i] = grab_cache_page_nowait(mapping, n); + if (page[i] == NULL) + continue; } if (PageUptodate(page[i])) { unlock_page(page[i]); put_page(page[i]); page[i] = NULL; - missing_pages++; } } - if (missing_pages) { - /* - * Couldn't get one or more pages, this page has either - * been VM reclaimed, but others are still in the page cache - * and uptodate, or we're racing with another thread in - * squashfs_readpage also trying to grab them. Fall back to - * using an intermediate buffer. - */ - res = squashfs_read_cache(target_page, block, bsize, pages, - page); - if (res < 0) - goto mark_errored; - - goto out; + actor = squashfs_page_actor_init(page, actor_pages, 0, + release_actor_pages); + if (!actor) { + release_actor_pages(page, actor_pages, -ENOMEM); + kfree(page); + return NULL; } - - /* Decompress directly into the page cache buffers */ - res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - if (res < 0) - goto mark_errored; - - /* Last page may have trailing bytes not filled */ - bytes = res % PAGE_SIZE; - if (bytes) { - pageaddr = kmap_atomic(page[pages - 1]); - memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap_atomic(pageaddr); - } - - /* Mark pages as uptodate, unlock and release */ - for (i = 0; i < pages; i++) { - flush_dcache_page(page[i]); - SetPageUptodate(page[i]); - unlock_page(page[i]); - if (page[i] != target_page) - put_page(page[i]); - } - - kfree(actor); - kfree(page); - - return 0; - -mark_errored: - /* Decompression failed, mark pages as errored. Target_page is - * dealt with by the caller - */ - for (i = 0; i < pages; i++) { - if (page[i] == NULL || page[i] == target_page) - continue; - flush_dcache_page(page[i]); - SetPageError(page[i]); - unlock_page(page[i]); - put_page(page[i]); - } - -out: - kfree(actor); - kfree(page); - return res; + return actor; } +int squashfs_readpages_block(struct page *target_page, + struct list_head *readahead_pages, + unsigned int *nr_pages, + struct address_space *mapping, + int page_index, u64 block, int bsize) -static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, - int pages, struct page **page) { - struct inode *i = target_page->mapping->host; - struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, - block, bsize); - int bytes = buffer->length, res = buffer->error, n, offset = 0; - void *pageaddr; + struct squashfs_page_actor *actor; + struct inode *inode = mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int start_index, end_index, file_end, actor_pages, res; + int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - if (res) { - ERROR("Unable to read page, block %llx, size %x\n", block, - bsize); - goto out; + /* + * If readpage() is called on an uncompressed datablock, we can just + * read the pages instead of fetching the whole block. + * This greatly improves the performance when a process keep doing + * random reads because we only fetch the necessary data. + * The readahead algorithm will take care of doing speculative reads + * if necessary. + * We can't read more than 1 block even if readahead provides use more + * pages because we don't know yet if the next block is compressed or + * not. + */ + if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) { + u64 block_end = block + msblk->block_size; + + block += (page_index & mask) * PAGE_SIZE; + actor_pages = (block_end - block) / PAGE_SIZE; + if (*nr_pages < actor_pages) + actor_pages = *nr_pages; + start_index = page_index; + bsize = min_t(int, bsize, (PAGE_SIZE * actor_pages) + | SQUASHFS_COMPRESSED_BIT_BLOCK); + } else { + file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; + start_index = page_index & ~mask; + end_index = start_index | mask; + if (end_index > file_end) + end_index = file_end; + actor_pages = end_index - start_index + 1; } - for (n = 0; n < pages && bytes > 0; n++, - bytes -= PAGE_SIZE, offset += PAGE_SIZE) { - int avail = min_t(int, bytes, PAGE_SIZE); + actor = actor_from_page_cache(actor_pages, target_page, + readahead_pages, nr_pages, start_index, + mapping); + if (!actor) + return -ENOMEM; - if (page[n] == NULL) - continue; - - pageaddr = kmap_atomic(page[n]); - squashfs_copy_data(pageaddr, buffer, offset, avail); - memset(pageaddr + avail, 0, PAGE_SIZE - avail); - kunmap_atomic(pageaddr); - flush_dcache_page(page[n]); - SetPageUptodate(page[n]); - unlock_page(page[n]); - if (page[n] != target_page) - put_page(page[n]); - } - -out: - squashfs_cache_put(buffer); - return res; + res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL, + actor); + return res < 0 ? res : 0; } diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 95da65366548..854d5ece7d08 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -94,39 +94,18 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { + int res; struct squashfs_lz4 *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; - - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = LZ4_decompress_safe(stream->input, stream->output, length, output->length); if (res < 0) return -EIO; - bytes = res; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); - break; - } - memcpy(data, buff, PAGE_SIZE); - buff += PAGE_SIZE; - bytes -= PAGE_SIZE; - data = squashfs_next_page(output); - } - squashfs_finish_page(output); + squashfs_buf_to_actor(stream->output, output, res); return res; } diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 934c17e96590..2c844d53a59e 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -79,45 +79,19 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_lzo *stream = strm; - void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int res; size_t out_len = output->length; + struct squashfs_lzo *stream = strm; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); - buff += avail; - bytes -= avail; - offset = 0; - put_bh(bh[i]); - } - + squashfs_bh_to_buf(bh, b, stream->input, offset, length, + msblk->devblksize); res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) - goto failed; + return -EIO; + squashfs_buf_to_actor(stream->output, output, out_len); - res = bytes = (int)out_len; - data = squashfs_first_page(output); - buff = stream->output; - while (data) { - if (bytes <= PAGE_SIZE) { - memcpy(data, buff, bytes); - break; - } else { - memcpy(data, buff, PAGE_SIZE); - buff += PAGE_SIZE; - bytes -= PAGE_SIZE; - data = squashfs_next_page(output); - } - } - squashfs_finish_page(output); - - return res; - -failed: - return -EIO; + return out_len; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 9b7b1b6a7892..e348f5647fbd 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -9,79 +9,11 @@ #include #include #include +#include #include "page_actor.h" -/* - * This file contains implementations of page_actor for decompressing into - * an intermediate buffer, and for decompressing directly into the - * page cache. - * - * Calling code should avoid sleeping between calls to squashfs_first_page() - * and squashfs_finish_page(). - */ - -/* Implementation of page_actor for decompressing into intermediate buffer */ -static void *cache_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->buffer[0]; -} - -static void *cache_next_page(struct squashfs_page_actor *actor) -{ - if (actor->next_page == actor->pages) - return NULL; - - return actor->buffer[actor->next_page++]; -} - -static void cache_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->buffer = buffer; - actor->pages = pages; - actor->next_page = 0; - actor->squashfs_first_page = cache_first_page; - actor->squashfs_next_page = cache_next_page; - actor->squashfs_finish_page = cache_finish_page; - return actor; -} - -/* Implementation of page_actor for decompressing directly into page cache. */ -static void *direct_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->pageaddr = kmap_atomic(actor->page[0]); -} - -static void *direct_next_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); - - return actor->pageaddr = actor->next_page == actor->pages ? NULL : - kmap_atomic(actor->page[actor->next_page++]); -} - -static void direct_finish_page(struct squashfs_page_actor *actor) -{ - if (actor->pageaddr) - kunmap_atomic(actor->pageaddr); -} - -struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, - int pages, int length) +struct squashfs_page_actor *squashfs_page_actor_init(struct page **page, + int pages, int length, void (*release_pages)(struct page **, int, int)) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -93,8 +25,129 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, actor->pages = pages; actor->next_page = 0; actor->pageaddr = NULL; - actor->squashfs_first_page = direct_first_page; - actor->squashfs_next_page = direct_next_page; - actor->squashfs_finish_page = direct_finish_page; + actor->release_pages = release_pages; return actor; } + +void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error) +{ + if (!actor) + return; + + if (actor->release_pages) + actor->release_pages(actor->page, actor->pages, error); + kfree(actor); +} + +void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(buf + pos, pageaddr, avail); + kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor, + int length) +{ + void *pageaddr; + int pos = 0, avail, i; + + for (i = 0; i < actor->pages && pos < length; ++i) { + avail = min_t(int, length - pos, PAGE_SIZE); + if (actor->page[i]) { + pageaddr = kmap_atomic(actor->page[i]); + memcpy(pageaddr, buf + pos, avail); + kunmap_atomic(pageaddr); + } + pos += avail; + } +} + +void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers, + struct squashfs_page_actor *actor, int offset, int length, int blksz) +{ + void *kaddr = NULL; + int bytes = 0, pgoff = 0, b = 0, p = 0, avail, i; + + while (bytes < length) { + if (actor->page[p]) { + kaddr = kmap_atomic(actor->page[p]); + while (pgoff < PAGE_SIZE && bytes < length) { + avail = min_t(int, blksz - offset, + PAGE_SIZE - pgoff); + memcpy(kaddr + pgoff, bh[b]->b_data + offset, + avail); + pgoff += avail; + bytes += avail; + offset = (offset + avail) % blksz; + if (!offset) { + put_bh(bh[b]); + ++b; + } + } + kunmap_atomic(kaddr); + pgoff = 0; + } else { + for (i = 0; i < PAGE_SIZE / blksz; ++i) { + if (bh[b]) + put_bh(bh[b]); + ++b; + } + bytes += PAGE_SIZE; + } + ++p; + } +} + +void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf, + int offset, int length, int blksz) +{ + int i, avail, bytes = 0; + + for (i = 0; i < nr_buffers && bytes < length; ++i) { + avail = min_t(int, length - bytes, blksz - offset); + if (bh[i]) { + memcpy(buf + bytes, bh[i]->b_data + offset, avail); + put_bh(bh[i]); + } + bytes += avail; + offset = 0; + } +} + +void free_page_array(struct page **page, int nr_pages) +{ + int i; + + for (i = 0; i < nr_pages; ++i) + __free_page(page[i]); + kfree(page); +} + +struct page **alloc_page_array(int nr_pages, int gfp_mask) +{ + int i; + struct page **page; + + page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); + if (!page) + return NULL; + for (i = 0; i < nr_pages; ++i) { + page[i] = alloc_page(gfp_mask); + if (!page[i]) { + free_page_array(page, i); + return NULL; + } + } + return page; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 98537eab27e2..3102987dccfd 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -49,33 +49,59 @@ static inline void squashfs_finish_page(struct squashfs_page_actor *actor) } #else struct squashfs_page_actor { - union { - void **buffer; - struct page **page; - }; + struct page **page; void *pageaddr; - void *(*squashfs_first_page)(struct squashfs_page_actor *); - void *(*squashfs_next_page)(struct squashfs_page_actor *); - void (*squashfs_finish_page)(struct squashfs_page_actor *); int pages; int length; int next_page; + void (*release_pages)(struct page **, int, int); }; -extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); -extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page - **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **, + int, int, void (*)(struct page **, int, int)); +extern void squashfs_page_actor_free(struct squashfs_page_actor *, int); + +extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int); +extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int); +extern void squashfs_bh_to_actor(struct buffer_head **, int, + struct squashfs_page_actor *, int, int, int); +extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int, + int); + +/* + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { - return actor->squashfs_first_page(actor); + actor->next_page = 1; + return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0]) + : NULL; } + static inline void *squashfs_next_page(struct squashfs_page_actor *actor) { - return actor->squashfs_next_page(actor); + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); + + if (actor->next_page == actor->pages) + return actor->pageaddr = ERR_PTR(-ENODATA); + + actor->pageaddr = actor->page[actor->next_page] ? + kmap_atomic(actor->page[actor->next_page]) : NULL; + ++actor->next_page; + return actor->pageaddr; } + static inline void squashfs_finish_page(struct squashfs_page_actor *actor) { - actor->squashfs_finish_page(actor); + if (!IS_ERR_OR_NULL(actor->pageaddr)) + kunmap_atomic(actor->pageaddr); } + #endif + +extern struct page **alloc_page_array(int, int); +extern void free_page_array(struct page **, int); + #endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 887d6d270080..f4faab52a879 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,8 +28,12 @@ #define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ +extern int squashfs_init_read_wq(void); +extern void squashfs_destroy_read_wq(void); extern int squashfs_read_data(struct super_block *, u64, int, u64 *, struct squashfs_page_actor *); +extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); @@ -70,8 +74,9 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *, void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, int); -/* file_xxx.c */ -extern int squashfs_readpage_block(struct page *, u64, int); +/* file_direct.c */ +extern int squashfs_readpages_block(struct page *, struct list_head *, + unsigned int *, struct address_space *, int, u64, int); /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1da565cb50c3..8a6995de0277 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -49,7 +49,7 @@ struct squashfs_cache_entry { int num_waiters; wait_queue_head_t wait_queue; struct squashfs_cache *cache; - void **data; + struct page **page; struct squashfs_page_actor *actor; }; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cf01e15a7b16..e2a0a7342bf8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -444,9 +444,15 @@ static int __init init_squashfs_fs(void) if (err) return err; + if (!squashfs_init_read_wq()) { + destroy_inodecache(); + return -ENOMEM; + } + err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); + squashfs_destroy_read_wq(); return err; } @@ -460,6 +466,7 @@ static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); + squashfs_destroy_read_wq(); } diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 6bfaef73d065..2f7be1fb167c 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, struct comp_opts *opts; int err = 0, n; - opts = kmalloc(sizeof(*opts), GFP_KERNEL); + opts = kmalloc(sizeof(*opts), GFP_ATOMIC); if (opts == NULL) { err = -ENOMEM; goto out2; @@ -136,6 +136,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, enum xz_ret xz_err; int avail, total = 0, k = 0; struct squashfs_xz *stream = strm; + void *buf = NULL; xz_dec_reset(stream->state); stream->buf.in_pos = 0; @@ -156,12 +157,20 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->buf.out_pos == stream->buf.out_size) { stream->buf.out = squashfs_next_page(output); - if (stream->buf.out != NULL) { + if (!IS_ERR(stream->buf.out)) { stream->buf.out_pos = 0; total += PAGE_SIZE; } } + if (!stream->buf.out) { + if (!buf) { + buf = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->buf.out = buf; + } xz_err = xz_dec_run(stream->state, &stream->buf); if (stream->buf.in_pos == stream->buf.in_size && k < b) @@ -173,11 +182,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, if (xz_err != XZ_STREAM_END || k < b) goto out; + kfree(buf); return total + stream->buf.out_pos; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 2ec24d128bce..d917c728422b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -66,6 +66,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { + void *buf = NULL; int zlib_err, zlib_init = 0, k = 0; z_stream *stream = strm; @@ -84,10 +85,19 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); - if (stream->next_out != NULL) + if (!IS_ERR(stream->next_out)) stream->avail_out = PAGE_SIZE; } + if (!stream->next_out) { + if (!buf) { + buf = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!buf) + goto out; + } + stream->next_out = buf; + } + if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { @@ -115,11 +125,13 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (k < b) goto out; + kfree(buf); return stream->total_out; out: for (; k < b; k++) put_bh(bh[k]); + kfree(buf); return -EIO; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d81d85394e32..5811bb06a0cc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -570,11 +570,14 @@ out: static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { + struct userfaultfd_ctx *release_new_ctx; + if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); + release_new_ctx = NULL; spin_lock(&ctx->event_wqh.lock); /* @@ -601,8 +604,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, new = (struct userfaultfd_ctx *) (unsigned long) ewq->msg.arg.reserved.reserved1; - - userfaultfd_ctx_put(new); + release_new_ctx = new; } break; } @@ -617,6 +619,20 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->event_wqh.lock); + if (release_new_ctx) { + struct vm_area_struct *vma; + struct mm_struct *mm = release_new_ctx->mm; + + /* the various vma->vm_userfaultfd_ctx still points to it */ + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + up_write(&mm->mmap_sem); + + userfaultfd_ctx_put(release_new_ctx); + } + /* * ctx may go away after this if the userfault pseudo fd is * already released. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2522533cd365..7a138a482f94 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -244,14 +244,24 @@ struct request { int lat_hist_enabled; }; +static inline bool blk_op_is_scsi(unsigned int op) +{ + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT; +} + +static inline bool blk_op_is_private(unsigned int op) +{ + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; +} + static inline bool blk_rq_is_scsi(struct request *rq) { - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT; + return blk_op_is_scsi(req_op(rq)); } static inline bool blk_rq_is_private(struct request *rq) { - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT; + return blk_op_is_private(req_op(rq)); } static inline bool blk_rq_is_passthrough(struct request *rq) @@ -259,6 +269,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq) return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } +static inline bool bio_is_passthrough(struct bio *bio) +{ + unsigned op = bio_op(bio); + + return blk_op_is_scsi(op) || blk_op_is_private(op); +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -955,7 +972,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern int blk_rq_append_bio(struct request *rq, struct bio *bio); +extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 28734ee185a7..065f3a8eb486 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -917,6 +917,7 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) } #endif +extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0df74b03b7b1..fe1250652515 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -87,7 +87,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/efi.h b/include/linux/efi.h index d813f7b04da7..29fdf8029cf6 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -140,11 +140,13 @@ struct efi_boot_memmap { struct capsule_info { efi_capsule_header_t header; + efi_capsule_header_t *capsule; int reset_type; long index; size_t count; size_t total_size; - phys_addr_t *pages; + struct page **pages; + phys_addr_t *phys; size_t page_bytes_remain; }; diff --git a/include/linux/fscache.h b/include/linux/fscache.h index f4ff47d4a893..fe0c349684fa 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -755,7 +755,7 @@ bool fscache_maybe_release_page(struct fscache_cookie *cookie, { if (fscache_cookie_valid(cookie) && PageFsCache(page)) return __fscache_maybe_release_page(cookie, page, gfp); - return false; + return true; } /** diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cfd0cc158326..3e3893f1b596 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -273,7 +273,8 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1; + autoflowlabel:1, + autoflowlabel_set:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 401c8972cc3a..8b3d0103c03a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -546,6 +546,7 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { + cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69772347f866..c8091f06eaa4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -147,7 +147,7 @@ enum { MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780, + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780, MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, @@ -7233,7 +7233,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits { u8 vxlan_udp_port[0x10]; }; -struct mlx5_ifc_set_rate_limit_out_bits { +struct mlx5_ifc_set_pp_rate_limit_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7242,7 +7242,7 @@ struct mlx5_ifc_set_rate_limit_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_set_rate_limit_in_bits { +struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; @@ -7255,6 +7255,8 @@ struct mlx5_ifc_set_rate_limit_in_bits { u8 reserved_at_60[0x20]; u8 rate_limit[0x20]; + + u8 reserved_at_a0[0x160]; }; struct mlx5_ifc_access_register_out_bits { diff --git a/include/linux/pti.h b/include/linux/pti.h new file mode 100644 index 000000000000..0174883a935a --- /dev/null +++ b/include/linux/pti.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _INCLUDE_PTI_H +#define _INCLUDE_PTI_H + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#include +#else +static inline void pti_init(void) { } +#endif + +#endif diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 37b4bb2545b3..6866df4f31b5 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. + * Callers are responsible for making sure pointer that is being queued + * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; + /* Make sure the pointer we are storing points to a valid data. */ + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + smp_wmb(); + r->queue[r->producer++] = ptr; if (unlikely(r->producer >= r->size)) r->producer = 0; @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) if (ptr) __ptr_ring_discard_one(r); + /* Make sure anyone accessing data through the pointer is up to date. */ + /* Pairs with smp_wmb in __ptr_ring_produce. */ + smp_read_barrier_depends(); return ptr; } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4aa40ef02d32..e8418fc77a43 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -214,7 +214,8 @@ struct tcp_sock { u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - unused:4; + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + unused:3; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ unused1 : 1, diff --git a/include/linux/tick.h b/include/linux/tick.h index 8ea41f658911..816b153e699e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern unsigned long tick_nohz_get_idle_calls(void); +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 2a8be87fc13a..ed788f76c3f3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -248,9 +248,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpu NULL #endif #endif diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..af8addbaa3c1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -34,6 +34,7 @@ #include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ +#define IPV4_MIN_MTU 68 /* RFC 791 */ struct sock; diff --git a/include/net/tcp.h b/include/net/tcp.h index 4ff89c179cb9..44d5d2e8204c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1086,7 +1086,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs); + bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK diff --git a/init/main.c b/init/main.c index 8a390f60ec81..b32ec72cdf3d 100644 --- a/init/main.c +++ b/init/main.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -506,6 +507,8 @@ static void __init mm_init(void) ioremap_huge_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); + /* Should be run after espfix64 is set up. */ + pti_init(); } asmlinkage __visible void __init start_kernel(void) diff --git a/kernel/acct.c b/kernel/acct.c index 6670fbd3e466..354578d253d5 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 17d7fcfcb22b..70b351843f99 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1335,9 +1335,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 25846a8db6b8..6e33b6b92b43 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -259,7 +259,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..1facff1dbbae 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); @@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) + /* + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. + */ + if (t->ptrace && sig != SIGKILL) return 0; - /* - * Tracers may want to know about even ignored signals. - */ - return !t->ptrace; + return sig_task_ignored(t, sig, force); } /* @@ -929,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c08f36459654..4c50006e4d62 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -679,6 +679,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -695,8 +700,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* @@ -1014,6 +1029,19 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + /** * tick_nohz_get_idle_calls - return the current idle calls counter value * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 00f943c222bc..9e809f8e4511 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -827,8 +827,8 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && (tflags & TIMER_DEFERRABLE)) { @@ -844,8 +844,8 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && (tflags & TIMER_DEFERRABLE)) { @@ -1003,8 +1003,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (!ret && pending_only) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1028,6 +1026,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1671,7 +1671,6 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { __run_timers(&timer_base_deferrable); - __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } } @@ -1830,6 +1829,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..0476a9372014 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -281,6 +281,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -332,7 +334,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } @@ -4439,8 +4443,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); unsigned long flags; + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4452,6 +4461,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a4a5828ae414..25990b511009 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6775,7 +6775,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -6829,14 +6829,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; @@ -7594,6 +7586,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } @@ -7617,7 +7610,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; diff --git a/mm/mprotect.c b/mm/mprotect.c index b20675ad6b70..fc969367ef60 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -166,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, next = pmd_addr_end(addr, end); if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) - continue; + goto next; /* invoke the mmu notifier if the pmd is populated */ if (!mni_start) { @@ -188,7 +188,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } /* huge pmd was handled */ - continue; + goto next; } } /* fall through, the trans huge pmd just split */ @@ -196,6 +196,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; +next: + cond_resched(); } while (pmd++, addr = next, addr != end); if (mni_start) diff --git a/mm/sparse.c b/mm/sparse.c index 60805abf98af..30e56a100ee8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -211,7 +211,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) if (unlikely(!mem_section)) { unsigned long size, align; - size = sizeof(struct mem_section) * NR_SECTION_ROOTS; + size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_virt_alloc(size, align); } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index de2152730809..08190db0a2dc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1223,19 +1223,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, struct net_bridge *br = netdev_priv(dev); int err; + err = register_netdevice(dev); + if (err) + return err; + if (tb[IFLA_ADDRESS]) { spin_lock_bh(&br->lock); br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS])); spin_unlock_bh(&br->lock); } - err = register_netdevice(dev); - if (err) - return err; - err = br_changelink(dev, tb, data, extack); if (err) - unregister_netdevice(dev); + br_dev_delete(dev, NULL); + return err; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6cfdc7c84c48..0dd6359e5924 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -266,7 +266,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) - get_net(peer); + peer = maybe_get_net(peer); spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e140ba49b30a..15fa5baa8fae 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1181,12 +1181,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) int i, new_frags; u32 d_off; - if (!num_frags) - return 0; - if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!num_frags) + goto release; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); @@ -1242,6 +1242,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; +release: skb_zcopy_clear(skb, false); return 0; } @@ -3657,8 +3658,6 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) - goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -3684,6 +3683,8 @@ normal: if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) goto err; + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; *nskb_frag = *frag; __skb_frag_ref(nskb_frag); @@ -4296,7 +4297,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk = skb->sk; if (!skb_may_tx_timestamp(sk, false)) - return; + goto err; /* Take a reference to prevent skb_orphan() from freeing the socket, * but only if the socket refcount is not zero. @@ -4305,7 +4306,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, *skb_hwtstamps(skb) = *hwtstamps; __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); sock_put(sk); + return; } + +err: + kfree_skb(skb); } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..bffa88ecc534 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1420,7 +1420,7 @@ skip: static bool inetdev_valid_mtu(unsigned int mtu) { - return mtu >= 68; + return mtu >= IPV4_MIN_MTU; } static void inetdev_send_gratuitous_arp(struct net_device *dev, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..d72874150905 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1274,14 +1274,19 @@ err_table_hash_alloc: static void ip_fib_net_exit(struct net *net) { - unsigned int i; + int i; rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + /* Destroy the tables in reverse order to guarantee that the + * local table, ID 255, is destroyed before the main table, ID + * 254. This is necessary as the local table may contain + * references to data contained in the main table. + */ + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 01ed22139ac2..aff3751df950 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -706,7 +706,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); - u32 val; + u32 fi_val, val; if (!type) continue; @@ -723,7 +723,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) val = nla_get_u32(nla); } - if (fi->fib_metrics->metrics[type - 1] != val) + fi_val = fi->fib_metrics->metrics[type - 1]; + if (type == RTAX_FEATURES) + fi_val &= ~DST_FEATURE_ECN_CA; + + if (fi_val != val) return false; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ab183af0b5b6..c621266e0306 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } +/* source address selection per RFC 3376 section 4.2.13 */ +static __be32 igmpv3_get_srcaddr(struct net_device *dev, + const struct flowi4 *fl4) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return htonl(INADDR_ANY); + + for_ifa(in_dev) { + if (inet_ifa_match(fl4->saddr, ifa)) + return fl4->saddr; + } endfor_ifa(in_dev); + + return htonl(INADDR_ANY); +} + static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; - pip->saddr = fl4.saddr; + pip->saddr = igmpv3_get_srcaddr(dev, &fl4); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); @@ -404,16 +422,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, - int type, struct igmpv3_grec **ppgr) + int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; - if (!skb) - skb = igmpv3_newpack(dev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = igmpv3_newpack(dev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -436,12 +455,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV4_MIN_MTU) + return skb; + isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || @@ -462,7 +486,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); } } first = 1; @@ -498,12 +522,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); - skb = igmpv3_newpack(dev, dev->mtu); + skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -538,7 +562,7 @@ empty_source: igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..4e90082b23a6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); - if (mtu < 68) - mtu = 68; + if (mtu < IPV4_MIN_MTU) + mtu = IPV4_MIN_MTU; return mtu; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33b70bfd1122..125c1eab3eaa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; struct ip_options_data opt_copy; struct raw_frag_vec rfv; + int hdrincl; err = -EMSGSIZE; if (len > 0xFFFF) goto out; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields + */ + hdrincl = inet->hdrincl; /* * Check the flags. */ @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Linux does not mangle headers on raw sockets, * so that IP options + IP_HDRINCL is non-sense. */ - if (inet->hdrincl) + if (hdrincl) goto done; if (ipc.opt->opt.srr) { if (!daddr) @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); - if (!inet->hdrincl) { + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, &rt, msg->msg_flags, &ipc.sockc); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..a0c72b09cefc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2356,6 +2356,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; tcp_clear_retrans(tp); inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 69ee877574d0..8322f26e770e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -110,7 +110,8 @@ struct bbr { u32 lt_last_lost; /* LT intvl start: tp->lost */ u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_cnt:3, /* number of rounds without large bw gains */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ unused_b:5; @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk) { const struct bbr *bbr = inet_csk_ca(sk); - return bbr->full_bw_cnt >= bbr_full_bw_cnt; + return bbr->full_bw_reached; } /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk, return; } ++bbr->full_bw_cnt; + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; } /* If pipe is probably full, drain the queue and then enter steady-state. */ @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk) bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; + bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; bbr->cycle_mstamp = 0; @@ -871,6 +874,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d0b61ddfd912..c54d7a2c7fef 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -522,9 +522,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; - if (m == 0) - m = 1; - if (new_sample != 0) { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially @@ -561,6 +558,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); + if (!delta_us) + delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: @@ -577,8 +576,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + u32 delta_us; + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); tcp_rcv_rtt_update(tp, delta_us, 0); } } @@ -1976,6 +1978,8 @@ void tcp_enter_loss(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; } tcp_clear_all_retrans_hints(tp); @@ -2429,6 +2433,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) return true; } tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; return false; } @@ -2460,8 +2465,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; - if (frto_undo || tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); + tp->is_sack_reneg = 0; + } return true; } return false; @@ -3552,6 +3559,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct tcp_sacktag_state sack_state; struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; + bool is_sack_reneg = tp->is_sack_reneg; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; @@ -3667,7 +3675,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, sack_state.rate); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a5ed4f14678..cab4b935e474 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -844,7 +844,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 3330a370d306..c61240e43923 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs) + bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available. */ - if (!rs->prior_mstamp) { + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..e9af1879cd53 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.pingpong = 0; icsk->icsk_ack.ato = TCP_ATO_MIN; } + tcp_mstamp_refresh(tcp_sk(sk)); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } @@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigned long data) goto out; } + tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 068e5e69120e..e0af5560061b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -232,7 +232,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .proxy_ndp = 0, .accept_source_route = 0, /* we do not accept RH0 by default. */ .disable_ipv6 = 0, - .accept_dad = 1, + .accept_dad = 0, .suppress_frag_ndisc = 1, .accept_ra_mtu = 1, .stable_secret = { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b3c91857646e..3ddb89acf44b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -226,7 +226,6 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->autoflowlabel = ip6_default_np_autolabel(net); np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 5d6bee070871..7a2df6646486 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1020,6 +1020,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev) eth_random_addr(dev->perm_addr); } +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + +static void ip6gre_tnl_init_features(struct net_device *dev) +{ + struct ip6_tnl *nt = netdev_priv(dev); + + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + nt->encap.type == TUNNEL_ENCAP_NONE) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ + dev->features |= NETIF_F_LLTX; + } +} + static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; @@ -1054,6 +1084,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + ip6gre_tnl_init_features(dev); + return 0; } @@ -1302,11 +1334,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_get_iflink = ip6_tnl_get_iflink, }; -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) - static void ip6gre_tap_setup(struct net_device *dev) { @@ -1386,26 +1413,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - (nt->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } - err = register_netdevice(dev); if (err) goto out; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5110a418cc4d..f7dd51c42314 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +{ + if (!np->autoflowlabel_set) + return ip6_default_np_autolabel(net); + else + return np->autoflowlabel; +} + /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel, fl6)); + ip6_autoflowlabel(net, np), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a1c24443cd9e..ef958d50746b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -912,7 +912,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, if (t->parms.collect_md) { tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); if (!tun_dst) - return 0; + goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..90dbfa78a390 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -878,6 +878,7 @@ pref_skip_coa: break; case IPV6_AUTOFLOWLABEL: np->autoflowlabel = valbool; + np->autoflowlabel_set = 1; retv = 0; break; case IPV6_RECVFRAGSIZE: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 12b7c27ce5ce..9a38a2c641fa 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, struct mld2_grec **ppgr) + int type, struct mld2_grec **ppgr, unsigned int mtu) { - struct net_device *dev = pmc->idev->dev; struct mld2_report *pmr; struct mld2_grec *pgr; - if (!skb) - skb = mld_newpack(pmc->idev, dev->mtu); - if (!skb) - return NULL; + if (!skb) { + skb = mld_newpack(pmc->idev, mtu); + if (!skb) + return NULL; + } pgr = skb_put(skb, sizeof(struct mld2_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_grec *pgr = NULL; struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; + unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) return skb; + mtu = READ_ONCE(dev->mtu); + if (mtu < IPV6_MIN_MTU) + return skb; + isquery = type == MLD2_MODE_IS_INCLUDE || type == MLD2_MODE_IS_EXCLUDE; truncate = type == MLD2_MODE_IS_EXCLUDE || @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); } } first = 1; @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, pgr->grec_nsrcs = htons(scount); if (skb) mld_sendpack(skb); - skb = mld_newpack(idev, dev->mtu); + skb = mld_newpack(idev, mtu); first = 1; scount = 0; } if (first) { - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) @@ -1814,7 +1819,7 @@ empty_source: mld_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } - skb = add_grhead(skb, pmc, type, &pgr); + skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 56ed6594970b..696ed09a08b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3672,19 +3672,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - if (!fibmatch) - dst = ip6_route_input_lookup(net, dev, &fl6, flags); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_input_lookup(net, dev, &fl6, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; - if (!fibmatch) - dst = ip6_route_output(net, NULL, &fl6); - else - dst = ip6_route_lookup(net, &fl6, 0); + dst = ip6_route_output(net, NULL, &fl6); } @@ -3701,6 +3695,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + if (fibmatch && rt->dst.from) { + struct rt6_info *ort = container_of(rt->dst.from, + struct rt6_info, dst); + + dst_hold(&ort->dst); + ip6_rt_put(rt); + rt = ort; + } + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 32ded300633d..237cc6187c5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -988,7 +988,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, sk->sk_bound_dev_if, - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr), 0, 0); } diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index 661a496f2c04..c6a51d91f26f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -28,6 +28,7 @@ #include #include #include +#include #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) #include @@ -1085,7 +1086,7 @@ static int ipx_proto(const struct sk_buff *skb, { int thoff = 0, tproto; - switch (par->family) { + switch (par->state->pf) { case NFPROTO_IPV6: tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) @@ -1180,27 +1181,29 @@ static void get_dev_and_dir(const struct sk_buff *skb, enum ifs_tx_rx *direction, const struct net_device **el_dev) { + const struct nf_hook_state *parst = par->state; + BUG_ON(!direction || !el_dev); - if (par->in) { - *el_dev = par->in; + if (parst->in) { + *el_dev = parst->in; *direction = IFS_RX; - } else if (par->out) { - *el_dev = par->out; + } else if (parst->out) { + *el_dev = parst->out; *direction = IFS_TX; } else { - pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", - par->hooknum, __func__); + pr_err("qtaguid[%d]: %s(): no par->state->in/out?!!\n", + parst->hook, __func__); BUG(); } if (unlikely(!(*el_dev)->name)) { pr_err("qtaguid[%d]: %s(): no dev->name?!!\n", - par->hooknum, __func__); + parst->hook, __func__); BUG(); } if (skb->dev && *el_dev != skb->dev) { MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n", - par->hooknum, skb->dev, skb->dev->name, + parst->hook, skb->dev, skb->dev->name, *direction == IFS_RX ? "in" : "out", *el_dev, (*el_dev)->name); } @@ -1215,6 +1218,7 @@ static void get_dev_and_dir(const struct sk_buff *skb, static void iface_stat_update_from_skb(const struct sk_buff *skb, struct xt_action_param *par) { + const struct nf_hook_state *parst = par->state; struct iface_stat *entry; const struct net_device *el_dev; enum ifs_tx_rx direction; @@ -1225,19 +1229,19 @@ static void iface_stat_update_from_skb(const struct sk_buff *skb, proto = ipx_proto(skb, par); MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): " "type=%d fam=%d proto=%d dir=%d\n", - par->hooknum, __func__, el_dev->name, el_dev->type, - par->family, proto, direction); + parst->hook, __func__, el_dev->name, el_dev->type, + parst->pf, proto, direction); spin_lock_bh(&iface_stat_list_lock); entry = get_iface_entry(el_dev->name); if (entry == NULL) { IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n", - par->hooknum, __func__, el_dev->name); + parst->hook, __func__, el_dev->name); spin_unlock_bh(&iface_stat_list_lock); return; } - IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__, + IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", parst->hook, __func__, el_dev->name, entry); data_counters_update(&entry->totals_via_skb, 0, direction, proto, @@ -1583,11 +1587,12 @@ err: static struct sock *qtaguid_find_sk(const struct sk_buff *skb, struct xt_action_param *par) { + const struct nf_hook_state *parst = par->state; struct sock *sk; - unsigned int hook_mask = (1 << par->hooknum); + unsigned int hook_mask = (1 << parst->hook); MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n", - par->hooknum, skb, par->family); + parst->hook, skb, parst->pf); /* * Let's not abuse the the xt_socket_get*_sk(), or else it will @@ -1596,12 +1601,12 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS)) return NULL; - switch (par->family) { + switch (parst->pf) { case NFPROTO_IPV6: - sk = nf_sk_lookup_slow_v6(dev_net(skb->dev), skb, par->in); + sk = nf_sk_lookup_slow_v6(dev_net(skb->dev), skb, parst->in); break; case NFPROTO_IPV4: - sk = nf_sk_lookup_slow_v4(dev_net(skb->dev), skb, par->in); + sk = nf_sk_lookup_slow_v4(dev_net(skb->dev), skb, parst->in); break; default: return NULL; @@ -1609,7 +1614,7 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, if (sk) { MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n", - par->hooknum, sk, sk->sk_protocol, sk->sk_state); + parst->hook, sk, sk->sk_protocol, sk->sk_state); } return sk; } @@ -1625,8 +1630,8 @@ static void account_for_uid(const struct sk_buff *skb, get_dev_and_dir(skb, par, &direction, &el_dev); proto = ipx_proto(skb, par); MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto, direction); + par->state->hook, el_dev->name, el_dev->type, + par->state->pf, proto, direction); if_tag_stat_update(el_dev->name, uid, skb->sk ? skb->sk : alternate_sk, @@ -1637,6 +1642,7 @@ static void account_for_uid(const struct sk_buff *skb, static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_qtaguid_match_info *info = par->matchinfo; + const struct nf_hook_state *parst = par->state; const struct file *filp; bool got_sock = false; struct sock *sk; @@ -1653,7 +1659,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) return (info->match ^ info->invert) == 0; MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n", - par->hooknum, skb, par->in, par->out, par->family); + parst->hook, skb, parst->in, parst->out, parst->pf); atomic64_inc(&qtu_events.match_calls); if (skb == NULL) { @@ -1661,7 +1667,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) goto ret_res; } - switch (par->hooknum) { + switch (parst->hook) { case NF_INET_PRE_ROUTING: case NF_INET_POST_ROUTING: atomic64_inc(&qtu_events.match_calls_prepost); @@ -1718,7 +1724,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) atomic64_inc(&qtu_events.match_found_sk); } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", - par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); + parst->hook, sk, got_sock, parst->pf, ipx_proto(skb, par)); if (!sk) { /* @@ -1728,7 +1734,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (do_tag_stat) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", parst->hook); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1755,7 +1761,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) uid_lte(sock_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", - par->hooknum); + parst->hook); res = false; goto put_sock_ret_res; } @@ -1766,7 +1772,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) set_sk_callback_lock = true; read_lock_bh(&sk->sk_callback_lock); MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, + parst->hook, sk, sk->sk_socket, sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); filp = sk->sk_socket ? sk->sk_socket->file : NULL; if (!filp) { @@ -1776,18 +1782,18 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) goto put_sock_ret_res; } MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? + parst->hook, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { MT_DEBUG("qtaguid[%d]: leaving gid not matching\n", - par->hooknum); + parst->hook); res = false; goto put_sock_ret_res; } } - MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum); + MT_DEBUG("qtaguid[%d]: leaving matched\n", parst->hook); res = true; put_sock_ret_res: @@ -1796,7 +1802,7 @@ put_sock_ret_res: if (set_sk_callback_lock) read_unlock_bh(&sk->sk_callback_lock); ret_res: - MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res); + MT_DEBUG("qtaguid[%d]: left %d\n", parst->hook, res); return res; } @@ -1930,7 +1936,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) uid, sock_tag_entry->pid ); - sk_ref_count = atomic_read( + sk_ref_count = refcount_read( &sock_tag_entry->sk->sk_refcnt); seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u " "f_count=%d\n", @@ -2233,7 +2239,7 @@ static int ctrl_cmd_tag(const char *input) goto err; } CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n", - input, atomic_read(&el_socket->sk->sk_refcnt), + input, refcount_read(&el_socket->sk->sk_refcnt), el_socket->sk); if (argc < 3) { acct_tag = make_atag_from_value(0); @@ -2279,7 +2285,7 @@ static int ctrl_cmd_tag(const char *input) CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " "st@%p ...->sk_refcnt=%d\n", input, el_socket->sk, sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); + refcount_read(&el_socket->sk->sk_refcnt)); prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &uid_tag_data_entry); BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); @@ -2334,7 +2340,7 @@ static int ctrl_cmd_tag(const char *input) /* We keep the ref to the sk until it is untagged */ CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n", input, sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); + refcount_read(&el_socket->sk->sk_refcnt)); sockfd_put(el_socket); return 0; @@ -2344,7 +2350,7 @@ err_tag_unref_put: free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry); err_put: CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n", - input, atomic_read(&el_socket->sk->sk_refcnt) - 1); + input, refcount_read(&el_socket->sk->sk_refcnt) - 1); /* Release the sock_fd that was grabbed by sockfd_lookup(). */ sockfd_put(el_socket); return res; @@ -2443,7 +2449,7 @@ int qtaguid_untag(struct socket *el_socket, bool kernel) sock_put(sock_tag_entry->sk); CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n", sock_tag_entry, - atomic_read(&el_socket->sk->sk_refcnt)); + refcount_read(&el_socket->sk->sk_refcnt)); kfree(sock_tag_entry); atomic64_inc(&qtu_events.sockets_untagged); diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c index 2a7190d285e6..cab478eba9c8 100644 --- a/net/netfilter/xt_qtaguid_print.c +++ b/net/netfilter/xt_qtaguid_print.c @@ -239,8 +239,7 @@ char *pp_sock_tag(struct sock_tag *st) "sock_node=rb_node{...}, " "sk=%p (f_count=%d), list=list_head{...}, " "pid=%u, tag=%s}", - st, st->sk, atomic_read( - &st->sk->sk_refcnt), + st, st->sk, refcount_read(&st->sk->sk_refcnt), st->pid, tag_str); _bug_on_err_or_null(res); kfree(tag_str); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 15c99dfa3d72..aac9d68b4636 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -254,6 +254,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct sock *sk = skb->sk; int ret = -ENOMEM; + if (!net_eq(dev_net(dev), sock_net(sk))) + return 0; + dev_hold(dev); if (is_vmalloc_addr(skb->head)) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index cfb652a4e007..dbe1079a1651 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -EINVAL; skb_reset_network_header(skb); + key->eth.type = skb->protocol; } else { eth = eth_hdr(skb); ether_addr_copy(key->eth.src, eth->h_source); @@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(parse_vlan(skb, key))) return -ENOMEM; - skb->protocol = parse_ethertype(skb); - if (unlikely(skb->protocol == htons(0))) + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) return -ENOMEM; + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; + skb_reset_network_header(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); } skb_reset_mac_len(skb); - key->eth.type = skb->protocol; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { diff --git a/net/rds/send.c b/net/rds/send.c index b52cdc8ae428..f72466c63f0c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) continue; if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + if (cmsg->cmsg_len < + CMSG_LEN(sizeof(struct rds_rdma_args))) + return -EINVAL; args = CMSG_DATA(cmsg); *rdma_bytes += args->remote_vec.bytes; } diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 44de4ee51ce9..a08a32fa0949 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -59,11 +59,12 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + err = tcf_block_get(&q->block, &dev->ingress_cl_list); if (err) return err; - net_inc_ingress_queue(); sch->flags |= TCQ_F_CPUSTATS; return 0; @@ -153,6 +154,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; + net_inc_ingress_queue(); + net_inc_egress_queue(); + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); if (err) return err; @@ -161,9 +165,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) if (err) return err; - net_inc_ingress_queue(); - net_inc_egress_queue(); - sch->flags |= TCQ_F_CPUSTATS; return 0; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d6163f7aefb1..df806b8819aa 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3874,13 +3874,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_association *asoc; int retval = -EINVAL; - if (optlen < sizeof(struct sctp_reset_streams)) + if (optlen < sizeof(*params)) return -EINVAL; params = memdup_user(optval, optlen); if (IS_ERR(params)) return PTR_ERR(params); + if (params->srs_number_streams * sizeof(__u16) > + optlen - sizeof(*params)) + goto out; + asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) goto out; @@ -4413,7 +4417,7 @@ static int sctp_init_sock(struct sock *sk) SCTP_DBG_OBJCNT_INC(sock); local_bh_disable(); - percpu_counter_inc(&sctp_sockets_allocated); + sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); /* Nothing can fail after this block, otherwise @@ -4457,7 +4461,7 @@ static void sctp_destroy_sock(struct sock *sk) } sctp_endpoint_free(sp->ep); local_bh_disable(); - percpu_counter_dec(&sctp_sockets_allocated); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d50edd6e0019..98a44ecb11e7 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: + case TIPC_CONNECTING: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: - case TIPC_CONNECTING: if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= (POLLIN | POLLRDNORM); break; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ca2af3757bc3..33f58160a6cf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1362,29 +1362,36 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct net *net = xp_net(policy); int nx; int i, error; + xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); + xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; - xfrm_address_t *local; - xfrm_address_t *remote; + xfrm_address_t *remote = daddr; + xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; - remote = &tmpl->id.daddr; - local = &tmpl->saddr; - if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, fl->flowi_oif, - &tmp, remote, - tmpl->encap_family, 0); - if (error) - goto fail; - local = &tmp; + if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_BEET) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; + if (xfrm_addr_any(local, tmpl->encap_family)) { + error = xfrm_get_saddr(net, fl->flowi_oif, + &tmp, remote, + tmpl->encap_family, 0); + if (error) + goto fail; + local = &tmp; + } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; + daddr = remote; + saddr = local; continue; } if (x) { diff --git a/security/Kconfig b/security/Kconfig index c501710cc69b..5f4946505cfa 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -63,6 +63,17 @@ config SECURITY_NETWORK implement socket and networking access controls. If you are unsure how to answer this question, answer N. +config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" + depends on X86_64 && !UML + default y + help + This feature reduces the number of hardware side channels by + ensuring that the majority of kernel addresses are not mapped + into userspace. + + See Documentation/x86/pagetable-isolation.txt for more details. + config SECURITY_INFINIBAND bool "Infiniband Security Hooks" depends on SECURITY && INFINIBAND diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 82a64b58041d..e395137ecff1 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -330,6 +330,9 @@ static int match_mnt_path_str(struct aa_profile *profile, AA_BUG(!mntpath); AA_BUG(!buffer); + if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + return 0; + error = aa_path_name(mntpath, path_flags(profile, mntpath), buffer, &mntpnt, &info, profile->disconnected); if (error) @@ -381,6 +384,9 @@ static int match_mnt(struct aa_profile *profile, const struct path *path, AA_BUG(!profile); AA_BUG(devpath && !devbuffer); + if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + return 0; + if (devpath) { error = aa_path_name(devpath, path_flags(profile, devpath), devbuffer, &devname, &info, @@ -559,6 +565,9 @@ static int profile_umount(struct aa_profile *profile, struct path *path, AA_BUG(!profile); AA_BUG(!path); + if (!PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) + return 0; + error = aa_path_name(path, path_flags(profile, path), buffer, &name, &info, profile->disconnected); if (error) @@ -614,7 +623,8 @@ static struct aa_label *build_pivotroot(struct aa_profile *profile, AA_BUG(!new_path); AA_BUG(!old_path); - if (profile_unconfined(profile)) + if (profile_unconfined(profile) || + !PROFILE_MEDIATES(profile, AA_CLASS_MOUNT)) return aa_get_newest_label(&profile->label); error = aa_path_name(old_path, path_flags(profile, old_path), diff --git a/security/commoncap.c b/security/commoncap.c index 8a824e8a0b01..901d4cb27293 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -380,21 +380,18 @@ static __u32 sansflags(__u32 m) return m & ~VFS_CAP_FLAGS_EFFECTIVE; } -static bool is_v2header(size_t size, __le32 magic) +static bool is_v2header(size_t size, const struct vfs_cap_data *cap) { - __u32 m = le32_to_cpu(magic); if (size != XATTR_CAPS_SZ_2) return false; - return sansflags(m) == VFS_CAP_REVISION_2; + return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } -static bool is_v3header(size_t size, __le32 magic) +static bool is_v3header(size_t size, const struct vfs_cap_data *cap) { - __u32 m = le32_to_cpu(magic); - if (size != XATTR_CAPS_SZ_3) return false; - return sansflags(m) == VFS_CAP_REVISION_3; + return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* @@ -437,7 +434,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; - if (is_v2header((size_t) ret, cap->magic_etc)) { + if (is_v2header((size_t) ret, cap)) { /* If this is sizeof(vfs_cap_data) then we're ok with the * on-disk value, so return that. */ if (alloc) @@ -445,7 +442,7 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, else kfree(tmpbuf); return ret; - } else if (!is_v3header((size_t) ret, cap->magic_etc)) { + } else if (!is_v3header((size_t) ret, cap)) { kfree(tmpbuf); return -EINVAL; } @@ -502,9 +499,9 @@ static kuid_t rootid_from_xattr(const void *value, size_t size, return make_kuid(task_ns, rootid); } -static bool validheader(size_t size, __le32 magic) +static bool validheader(size_t size, const struct vfs_cap_data *cap) { - return is_v2header(size, magic) || is_v3header(size, magic); + return is_v2header(size, cap) || is_v3header(size, cap); } /* @@ -527,7 +524,7 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) if (!*ivalue) return -EINVAL; - if (!validheader(size, cap->magic_etc)) + if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index 038a180d3f81..cbe818eda336 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -325,7 +325,7 @@ static int hdac_component_master_match(struct device *dev, void *data) */ int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops) { - if (WARN_ON(!hdac_acomp)) + if (!hdac_acomp) return -ENODEV; hdac_acomp->audio_ops = aops; diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index a81aacf684b2..37e1cf8218ff 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -271,6 +271,8 @@ enum { CXT_FIXUP_HP_SPECTRE, CXT_FIXUP_HP_GATE_MIC, CXT_FIXUP_MUTE_LED_GPIO, + CXT_FIXUP_HEADSET_MIC, + CXT_FIXUP_HP_MIC_NO_PRESENCE, }; /* for hda_fixup_thinkpad_acpi() */ @@ -350,6 +352,18 @@ static void cxt_fixup_headphone_mic(struct hda_codec *codec, } } +static void cxt_fixup_headset_mic(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + struct conexant_spec *spec = codec->spec; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + spec->parse_flags |= HDA_PINCFG_HEADSET_MIC; + break; + } +} + /* OPLC XO 1.5 fixup */ /* OLPC XO-1.5 supports DC input mode (e.g. for use with analog sensors) @@ -880,6 +894,19 @@ static const struct hda_fixup cxt_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_mute_led_gpio, }, + [CXT_FIXUP_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = cxt_fixup_headset_mic, + }, + [CXT_FIXUP_HP_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1a, 0x02a1113c }, + { } + }, + .chained = true, + .chain_id = CXT_FIXUP_HEADSET_MIC, + }, }; static const struct snd_pci_quirk cxt5045_fixups[] = { @@ -934,6 +961,8 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC), SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO), SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO), + SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN), SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO), SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9ac4b9076ee2..acdb196ddb44 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -324,8 +324,12 @@ static void alc_fill_eapd_coef(struct hda_codec *codec) case 0x10ec0292: alc_update_coef_idx(codec, 0x4, 1<<15, 0); break; - case 0x10ec0215: case 0x10ec0225: + case 0x10ec0295: + case 0x10ec0299: + alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000); + /* fallthrough */ + case 0x10ec0215: case 0x10ec0233: case 0x10ec0236: case 0x10ec0255: @@ -336,10 +340,8 @@ static void alc_fill_eapd_coef(struct hda_codec *codec) case 0x10ec0286: case 0x10ec0288: case 0x10ec0285: - case 0x10ec0295: case 0x10ec0298: case 0x10ec0289: - case 0x10ec0299: alc_update_coef_idx(codec, 0x10, 1<<9, 0); break; case 0x10ec0275: @@ -6305,6 +6307,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION), + SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION), SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), @@ -6557,6 +6560,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x1b, 0x01011020}, {0x21, 0x02211010}), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x12, 0x90a60130}, + {0x14, 0x90170110}, + {0x1b, 0x01011020}, + {0x21, 0x0221101f}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x12, 0x90a60160}, {0x14, 0x90170120}, diff --git a/sound/soc/codecs/da7218.c b/sound/soc/codecs/da7218.c index b2d42ec1dcd9..56564ce90cb6 100644 --- a/sound/soc/codecs/da7218.c +++ b/sound/soc/codecs/da7218.c @@ -2520,7 +2520,7 @@ static struct da7218_pdata *da7218_of_to_pdata(struct snd_soc_codec *codec) } if (da7218->dev_id == DA7218_DEV_ID) { - hpldet_np = of_find_node_by_name(np, "da7218_hpldet"); + hpldet_np = of_get_child_by_name(np, "da7218_hpldet"); if (!hpldet_np) return pdata; diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c index 18933bf6473f..8c7063e1aa46 100644 --- a/sound/soc/codecs/msm8916-wcd-analog.c +++ b/sound/soc/codecs/msm8916-wcd-analog.c @@ -267,7 +267,7 @@ #define MSM8916_WCD_ANALOG_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\ SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000) #define MSM8916_WCD_ANALOG_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\ - SNDRV_PCM_FMTBIT_S24_LE) + SNDRV_PCM_FMTBIT_S32_LE) static int btn_mask = SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_BTN_4; diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c index 66df8f810f0d..694db27b11fa 100644 --- a/sound/soc/codecs/msm8916-wcd-digital.c +++ b/sound/soc/codecs/msm8916-wcd-digital.c @@ -194,7 +194,7 @@ SNDRV_PCM_RATE_32000 | \ SNDRV_PCM_RATE_48000) #define MSM8916_WCD_DIGITAL_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\ - SNDRV_PCM_FMTBIT_S24_LE) + SNDRV_PCM_FMTBIT_S32_LE) struct msm8916_wcd_digital_priv { struct clk *ahbclk, *mclk; @@ -645,7 +645,7 @@ static int msm8916_wcd_digital_hw_params(struct snd_pcm_substream *substream, RX_I2S_CTL_RX_I2S_MODE_MASK, RX_I2S_CTL_RX_I2S_MODE_16); break; - case SNDRV_PCM_FORMAT_S24_LE: + case SNDRV_PCM_FORMAT_S32_LE: snd_soc_update_bits(dai->codec, LPASS_CDC_CLK_TX_I2S_CTL, TX_I2S_CTL_TX_I2S_MODE_MASK, TX_I2S_CTL_TX_I2S_MODE_32); diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h index 730fb2058869..1ff3edb7bbb6 100644 --- a/sound/soc/codecs/tlv320aic31xx.h +++ b/sound/soc/codecs/tlv320aic31xx.h @@ -116,7 +116,7 @@ struct aic31xx_pdata { /* INT2 interrupt control */ #define AIC31XX_INT2CTRL AIC31XX_REG(0, 49) /* GPIO1 control */ -#define AIC31XX_GPIO1 AIC31XX_REG(0, 50) +#define AIC31XX_GPIO1 AIC31XX_REG(0, 51) #define AIC31XX_DACPRB AIC31XX_REG(0, 60) /* ADC Instruction Set Register */ diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c index c482b2e7a7d2..cfe72b9d4356 100644 --- a/sound/soc/codecs/twl4030.c +++ b/sound/soc/codecs/twl4030.c @@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec) struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev); struct device_node *twl4030_codec_node = NULL; - twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node, + twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node, "codec"); if (!pdata && twl4030_codec_node) { @@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec) GFP_KERNEL); if (!pdata) { dev_err(codec->dev, "Can not allocate memory\n"); + of_node_put(twl4030_codec_node); return NULL; } twl4030_setup_pdata_of(pdata, twl4030_codec_node); + of_node_put(twl4030_codec_node); } return pdata; diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 65c059b5ffd7..66e32f5d2917 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -1733,7 +1733,7 @@ static int wm_adsp_load(struct wm_adsp *dsp) le64_to_cpu(footer->timestamp)); while (pos < firmware->size && - pos - firmware->size > sizeof(*region)) { + sizeof(*region) < firmware->size - pos) { region = (void *)&(firmware->data[pos]); region_name = "Unknown"; reg = 0; @@ -1782,8 +1782,8 @@ static int wm_adsp_load(struct wm_adsp *dsp) regions, le32_to_cpu(region->len), offset, region_name); - if ((pos + le32_to_cpu(region->len) + sizeof(*region)) > - firmware->size) { + if (le32_to_cpu(region->len) > + firmware->size - pos - sizeof(*region)) { adsp_err(dsp, "%s.%d: %s region len %d bytes exceeds file length %zu\n", file, regions, region_name, @@ -2253,7 +2253,7 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp) blocks = 0; while (pos < firmware->size && - pos - firmware->size > sizeof(*blk)) { + sizeof(*blk) < firmware->size - pos) { blk = (void *)(&firmware->data[pos]); type = le16_to_cpu(blk->type); @@ -2327,8 +2327,8 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp) } if (reg) { - if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) > - firmware->size) { + if (le32_to_cpu(blk->len) > + firmware->size - pos - sizeof(*blk)) { adsp_err(dsp, "%s.%d: %s region len %d bytes exceeds file length %zu\n", file, blocks, region_name, diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c index 64598d1183f8..3ffbb498cc70 100644 --- a/sound/soc/fsl/fsl_ssi.c +++ b/sound/soc/fsl/fsl_ssi.c @@ -1452,12 +1452,6 @@ static int fsl_ssi_probe(struct platform_device *pdev) sizeof(fsl_ssi_ac97_dai)); fsl_ac97_data = ssi_private; - - ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev); - if (ret) { - dev_err(&pdev->dev, "could not set AC'97 ops\n"); - return ret; - } } else { /* Initialize this copy of the CPU DAI driver structure */ memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template, @@ -1568,6 +1562,14 @@ static int fsl_ssi_probe(struct platform_device *pdev) return ret; } + if (fsl_ssi_is_ac97(ssi_private)) { + ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev); + if (ret) { + dev_err(&pdev->dev, "could not set AC'97 ops\n"); + goto error_ac97_ops; + } + } + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component, &ssi_private->cpu_dai_drv, 1); if (ret) { @@ -1651,6 +1653,10 @@ error_sound_card: fsl_ssi_debugfs_remove(&ssi_private->dbg_stats); error_asoc_register: + if (fsl_ssi_is_ac97(ssi_private)) + snd_soc_set_ac97_ops(NULL); + +error_ac97_ops: if (ssi_private->soc->imx) fsl_ssi_imx_clean(pdev, ssi_private); diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 0304ffb714f2..1aef72df20a1 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt, * NB: Different Linux versions do different things with the * accessed bit in set_thread_area(). */ - if (ar != expected_ar && - (ldt || ar != (expected_ar | AR_ACCESSED))) { + if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) { printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", (ldt ? "LDT" : "GDT"), index, ar, expected_ar); nerrs++; diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c index 2b3d6d235015..3d7b42e77299 100644 --- a/tools/usb/usbip/src/utils.c +++ b/tools/usb/usbip/src/utils.c @@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add) char command[SYSFS_BUS_ID_SIZE + 4]; char match_busid_attr_path[SYSFS_PATH_MAX]; int rc; + int cmd_size; snprintf(match_busid_attr_path, sizeof(match_busid_attr_path), "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME, @@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add) attr_name); if (add) - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid); + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", + busid); else - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid); + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", + busid); rc = write_sysfs_attribute(match_busid_attr_path, command, - sizeof(command)); + cmd_size); if (rc < 0) { dbg("failed to write match_busid: %s", strerror(errno)); return -1;