Merge 4.9.176 into android-4.9

Changes in 4.9.176
	x86/MCE: Save microcode revision in machine check records
	x86/cpufeatures: Hide AMD-specific speculation flags
	x86/bugs: Add AMD's variant of SSB_NO
	x86/bugs: Add AMD's SPEC_CTRL MSR usage
	x86/bugs: Switch the selection of mitigation from CPU vendor to CPU features
	x86/bugs: Fix the AMD SSBD usage of the SPEC_CTRL MSR
	x86/microcode/intel: Add a helper which gives the microcode revision
	x86/microcode/intel: Check microcode revision before updating sibling threads
	x86/microcode: Make sure boot_cpu_data.microcode is up-to-date
	x86/microcode: Update the new microcode revision unconditionally
	x86/mm: Use WRITE_ONCE() when setting PTEs
	bitops: avoid integer overflow in GENMASK(_ULL)
	x86/speculation: Simplify the CPU bug detection logic
	locking/atomics, asm-generic: Move some macros from <linux/bitops.h> to a new <linux/bits.h> file
	x86/speculation: Remove SPECTRE_V2_IBRS in enum spectre_v2_mitigation
	x86/cpu: Sanitize FAM6_ATOM naming
	Documentation/l1tf: Fix small spelling typo
	x86/speculation: Apply IBPB more strictly to avoid cross-process data leak
	x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation
	x86/speculation: Propagate information about RSB filling mitigation to sysfs
	x86/speculation/l1tf: Drop the swap storage limit restriction when l1tf=off
	x86/speculation: Update the TIF_SSBD comment
	x86/speculation: Clean up spectre_v2_parse_cmdline()
	x86/speculation: Remove unnecessary ret variable in cpu_show_common()
	x86/speculation: Move STIPB/IBPB string conditionals out of cpu_show_common()
	x86/speculation: Disable STIBP when enhanced IBRS is in use
	x86/speculation: Rename SSBD update functions
	x86/speculation: Reorganize speculation control MSRs update
	x86/Kconfig: Select SCHED_SMT if SMP enabled
	sched: Add sched_smt_active()
	x86/speculation: Rework SMT state change
	x86/l1tf: Show actual SMT state
	x86/speculation: Reorder the spec_v2 code
	x86/speculation: Mark string arrays const correctly
	x86/speculataion: Mark command line parser data __initdata
	x86/speculation: Unify conditional spectre v2 print functions
	x86/speculation: Add command line control for indirect branch speculation
	x86/speculation: Prepare for per task indirect branch speculation control
	x86/process: Consolidate and simplify switch_to_xtra() code
	x86/speculation: Avoid __switch_to_xtra() calls
	x86/speculation: Prepare for conditional IBPB in switch_mm()
	x86/speculation: Split out TIF update
	x86/speculation: Prepare arch_smt_update() for PRCTL mode
	x86/speculation: Prevent stale SPEC_CTRL msr content
	x86/speculation: Add prctl() control for indirect branch speculation
	x86/speculation: Enable prctl mode for spectre_v2_user
	x86/speculation: Add seccomp Spectre v2 user space protection mode
	x86/speculation: Provide IBPB always command line options
	kvm: x86: Report STIBP on GET_SUPPORTED_CPUID
	x86/msr-index: Cleanup bit defines
	x86/speculation: Consolidate CPU whitelists
	x86/speculation/mds: Add basic bug infrastructure for MDS
	x86/speculation/mds: Add BUG_MSBDS_ONLY
	x86/kvm: Expose X86_FEATURE_MD_CLEAR to guests
	x86/speculation/mds: Add mds_clear_cpu_buffers()
	x86/speculation/mds: Clear CPU buffers on exit to user
	x86/kvm/vmx: Add MDS protection when L1D Flush is not active
	x86/speculation/mds: Conditionally clear CPU buffers on idle entry
	x86/speculation/mds: Add mitigation control for MDS
	x86/speculation/mds: Add sysfs reporting for MDS
	x86/speculation/mds: Add mitigation mode VMWERV
	Documentation: Move L1TF to separate directory
	Documentation: Add MDS vulnerability documentation
	x86/speculation/mds: Add mds=full,nosmt cmdline option
	x86/speculation: Move arch_smt_update() call to after mitigation decisions
	x86/speculation/mds: Add SMT warning message
	x86/speculation/mds: Fix comment
	x86/speculation/mds: Print SMT vulnerable on MSBDS with mitigations off
	cpu/speculation: Add 'mitigations=' cmdline option
	x86/speculation: Support 'mitigations=' cmdline option
	x86/speculation/mds: Add 'mitigations=' support for MDS
	x86/mds: Add MDSUM variant to the MDS documentation
	Documentation: Correct the possible MDS sysfs values
	x86/speculation/mds: Fix documentation typo
	x86: stop exporting msr-index.h to userland
	x86/cpu/bugs: Use __initconst for 'const' init data
	Linux 4.9.176

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
Greg Kroah-Hartman
2019-05-14 20:43:04 +02:00
73 changed files with 2063 additions and 422 deletions

View File

@@ -357,6 +357,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/l1tf
/sys/devices/system/cpu/vulnerabilities/mds
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@@ -369,8 +370,7 @@ Description: Information about CPU vulnerabilities
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect
Details about the l1tf file can be found in
Documentation/admin-guide/l1tf.rst
See also: Documentation/hw-vuln/index.rst
What: /sys/devices/system/cpu/smt
/sys/devices/system/cpu/smt/active

View File

@@ -0,0 +1,13 @@
========================
Hardware vulnerabilities
========================
This section describes CPU vulnerabilities and provides an overview of the
possible mitigations along with guidance for selecting mitigations if they
are configurable at compile, boot or run time.
.. toctree::
:maxdepth: 1
l1tf
mds

View File

@@ -405,6 +405,9 @@ time with the option "l1tf=". The valid arguments for this option are:
off Disables hypervisor mitigations and doesn't emit any
warnings.
It also drops the swap size and available RAM limit restrictions
on both hypervisor and bare metal.
============ =============================================================
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
@@ -442,6 +445,7 @@ The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
module parameter is ignored and writes to the sysfs file are rejected.
.. _mitigation_selection:
Mitigation selection guide
--------------------------
@@ -553,7 +557,7 @@ When nested virtualization is in use, three operating systems are involved:
the bare metal hypervisor, the nested hypervisor and the nested virtual
machine. VMENTER operations from the nested hypervisor into the nested
guest will always be processed by the bare metal hypervisor. If KVM is the
bare metal hypervisor it wiil:
bare metal hypervisor it will:
- Flush the L1D cache on every switch from the nested hypervisor to the
nested virtual machine, so that the nested hypervisor's secrets are not
@@ -576,7 +580,8 @@ Default mitigations
The kernel default mitigations for vulnerable processors are:
- PTE inversion to protect against malicious user space. This is done
unconditionally and cannot be controlled.
unconditionally and cannot be controlled. The swap storage is limited
to ~16TB.
- L1D conditional flushing on VMENTER when EPT is enabled for
a guest.

View File

@@ -0,0 +1,308 @@
MDS - Microarchitectural Data Sampling
======================================
Microarchitectural Data Sampling is a hardware vulnerability which allows
unprivileged speculative access to data which is available in various CPU
internal buffers.
Affected processors
-------------------
This vulnerability affects a wide range of Intel processors. The
vulnerability is not present on:
- Processors from AMD, Centaur and other non Intel vendors
- Older processor models, where the CPU family is < 6
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
IA32_ARCH_CAPABILITIES MSR.
Whether a processor is affected or not can be read out from the MDS
vulnerability file in sysfs. See :ref:`mds_sys_info`.
Not all processors are affected by all variants of MDS, but the mitigation
is identical for all of them so the kernel treats them as a single
vulnerability.
Related CVEs
------------
The following CVE entries are related to the MDS vulnerability:
============== ===== ===================================================
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
============== ===== ===================================================
Problem
-------
When performing store, load, L1 refill operations, processors write data
into temporary microarchitectural structures (buffers). The data in the
buffer can be forwarded to load operations as an optimization.
Under certain conditions, usually a fault/assist caused by a load
operation, data unrelated to the load memory address can be speculatively
forwarded from the buffers. Because the load operation causes a fault or
assist and its result will be discarded, the forwarded data will not cause
incorrect program execution or state changes. But a malicious operation
may be able to forward this speculative data to a disclosure gadget which
allows in turn to infer the value via a cache side channel attack.
Because the buffers are potentially shared between Hyper-Threads cross
Hyper-Thread attacks are possible.
Deeper technical information is available in the MDS specific x86
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
Attack scenarios
----------------
Attacks against the MDS vulnerabilities can be mounted from malicious non
priviledged user space applications running on hosts or guest. Malicious
guest OSes can obviously mount attacks as well.
Contrary to other speculation based vulnerabilities the MDS vulnerability
does not allow the attacker to control the memory target address. As a
consequence the attacks are purely sampling based, but as demonstrated with
the TLBleed attack samples can be postprocessed successfully.
Web-Browsers
^^^^^^^^^^^^
It's unclear whether attacks through Web-Browsers are possible at
all. The exploitation through Java-Script is considered very unlikely,
but other widely used web technologies like Webassembly could possibly be
abused.
.. _mds_sys_info:
MDS system information
-----------------------
The Linux kernel provides a sysfs interface to enumerate the current MDS
status of the system: whether the system is vulnerable, and which
mitigations are active. The relevant sysfs file is:
/sys/devices/system/cpu/vulnerabilities/mds
The possible values in this file are:
.. list-table::
* - 'Not affected'
- The processor is not vulnerable
* - 'Vulnerable'
- The processor is vulnerable, but no mitigation enabled
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
- The processor is vulnerable but microcode is not updated.
The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
* - 'Mitigation: Clear CPU buffers'
- The processor is vulnerable and the CPU buffer clearing mitigation is
enabled.
If the processor is vulnerable then the following information is appended
to the above information:
======================== ============================================
'SMT vulnerable' SMT is enabled
'SMT mitigated' SMT is enabled and mitigated
'SMT disabled' SMT is disabled
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
======================== ============================================
.. _vmwerv:
Best effort mitigation mode
^^^^^^^^^^^^^^^^^^^^^^^^^^^
If the processor is vulnerable, but the availability of the microcode based
mitigation mechanism is not advertised via CPUID the kernel selects a best
effort mitigation mode. This mode invokes the mitigation instructions
without a guarantee that they clear the CPU buffers.
This is done to address virtualization scenarios where the host has the
microcode update applied, but the hypervisor is not yet updated to expose
the CPUID to the guest. If the host has updated microcode the protection
takes effect otherwise a few cpu cycles are wasted pointlessly.
The state in the mds sysfs file reflects this situation accordingly.
Mitigation mechanism
-------------------------
The kernel detects the affected CPUs and the presence of the microcode
which is required.
If a CPU is affected and the microcode is available, then the kernel
enables the mitigation by default. The mitigation can be controlled at boot
time via a kernel command line option. See
:ref:`mds_mitigation_control_command_line`.
.. _cpu_buffer_clear:
CPU buffer clearing
^^^^^^^^^^^^^^^^^^^
The mitigation for MDS clears the affected CPU buffers on return to user
space and when entering a guest.
If SMT is enabled it also clears the buffers on idle entry when the CPU
is only affected by MSBDS and not any other MDS variant, because the
other variants cannot be protected against cross Hyper-Thread attacks.
For CPUs which are only affected by MSBDS the user space, guest and idle
transition mitigations are sufficient and SMT is not affected.
.. _virt_mechanism:
Virtualization mitigation
^^^^^^^^^^^^^^^^^^^^^^^^^
The protection for host to guest transition depends on the L1TF
vulnerability of the CPU:
- CPU is affected by L1TF:
If the L1D flush mitigation is enabled and up to date microcode is
available, the L1D flush mitigation is automatically protecting the
guest transition.
If the L1D flush mitigation is disabled then the MDS mitigation is
invoked explicit when the host MDS mitigation is enabled.
For details on L1TF and virtualization see:
:ref:`Documentation/hw-vuln//l1tf.rst <mitigation_control_kvm>`.
- CPU is not affected by L1TF:
CPU buffers are flushed before entering the guest when the host MDS
mitigation is enabled.
The resulting MDS protection matrix for the host to guest transition:
============ ===== ============= ============ =================
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
Don't care No Don't care N/A Not affected
Yes Yes Disabled Off Vulnerable
Yes Yes Disabled Full Mitigated
Yes Yes Enabled Don't care Mitigated
No Yes N/A Off Vulnerable
No Yes N/A Full Mitigated
============ ===== ============= ============ =================
This only covers the host to guest transition, i.e. prevents leakage from
host to guest, but does not protect the guest internally. Guests need to
have their own protections.
.. _xeon_phi:
XEON PHI specific considerations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The XEON PHI processor family is affected by MSBDS which can be exploited
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
to use MWAIT in user space (Ring 3) which opens an potential attack vector
for malicious user space. The exposure can be disabled on the kernel
command line with the 'ring3mwait=disable' command line option.
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
before the CPU enters a idle state. As XEON PHI is not affected by L1TF
either disabling SMT is not required for full protection.
.. _mds_smt_control:
SMT control
^^^^^^^^^^^
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
disable SMT for full protection. These are most of the affected CPUs; the
exception is XEON PHI, see :ref:`xeon_phi`.
Disabling SMT can have a significant performance impact, but the impact
depends on the type of workloads.
See the relevant chapter in the L1TF mitigation documentation for details:
:ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`.
.. _mds_mitigation_control_command_line:
Mitigation control on the kernel command line
---------------------------------------------
The kernel command line allows to control the MDS mitigations at boot
time with the option "mds=". The valid arguments for this option are:
============ =============================================================
full If the CPU is vulnerable, enable all available mitigations
for the MDS vulnerability, CPU buffer clearing on exit to
userspace and when entering a VM. Idle transitions are
protected as well if SMT is enabled.
It does not automatically disable SMT.
full,nosmt The same as mds=full, with SMT disabled on vulnerable
CPUs. This is the complete mitigation.
off Disables MDS mitigations completely.
============ =============================================================
Not specifying this option is equivalent to "mds=full".
Mitigation selection guide
--------------------------
1. Trusted userspace
^^^^^^^^^^^^^^^^^^^^
If all userspace applications are from a trusted source and do not
execute untrusted code which is supplied externally, then the mitigation
can be disabled.
2. Virtualization with trusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The same considerations as above versus trusted user space apply.
3. Virtualization with untrusted guests
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The protection depends on the state of the L1TF mitigations.
See :ref:`virt_mechanism`.
If the MDS mitigation is enabled and SMT is disabled, guest to host and
guest to guest attacks are prevented.
.. _mds_default_mitigations:
Default mitigations
-------------------
The kernel default mitigations for vulnerable processors are:
- Enable CPU buffer clearing
The kernel does not by default enforce the disabling of SMT, which leaves
SMT systems vulnerable when running untrusted code. The same rationale as
for L1TF applies.
See :ref:`Documentation/hw-vuln//l1tf.rst <default_mitigations>`.

View File

@@ -12,7 +12,6 @@ Contents:
:maxdepth: 2
kernel-documentation
l1tf
development-process/index
dev-tools/tools
driver-api/index
@@ -20,6 +19,24 @@ Contents:
gpu/index
80211/index
This section describes CPU vulnerabilities and their mitigations.
.. toctree::
:maxdepth: 1
hw-vuln/index
Architecture-specific documentation
-----------------------------------
These books provide programming details about architecture-specific
implementation.
.. toctree::
:maxdepth: 2
x86/index
Indices and tables
==================

View File

@@ -2088,10 +2088,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
off
Disables hypervisor mitigations and doesn't
emit any warnings.
It also drops the swap size and available
RAM limit restriction on both hypervisor and
bare metal.
Default is 'flush'.
For details see: Documentation/admin-guide/l1tf.rst
For details see: Documentation/hw-vuln/l1tf.rst
l2cr= [PPC]
@@ -2334,6 +2337,32 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Format: <first>,<last>
Specifies range of consoles to be captured by the MDA.
mds= [X86,INTEL]
Control mitigation for the Micro-architectural Data
Sampling (MDS) vulnerability.
Certain CPUs are vulnerable to an exploit against CPU
internal buffers which can forward information to a
disclosure gadget under certain conditions.
In vulnerable processors, the speculatively
forwarded data can be used in a cache side channel
attack, to access data to which the attacker does
not have direct access.
This parameter controls the MDS mitigation. The
options are:
full - Enable MDS mitigation on vulnerable CPUs
full,nosmt - Enable MDS mitigation and disable
SMT on vulnerable CPUs
off - Unconditionally disable MDS mitigation
Not specifying this option is equivalent to
mds=full.
For details see: Documentation/hw-vuln/mds.rst
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
@@ -2456,6 +2485,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
in the "bleeding edge" mini2440 support kernel at
http://repo.or.cz/w/linux-2.6/mini2440.git
mitigations=
[X86] Control optional mitigations for CPU
vulnerabilities. This is a set of curated,
arch-independent options, each of which is an
aggregation of existing arch-specific options.
off
Disable all optional CPU mitigations. This
improves system performance, but it may also
expose users to several CPU vulnerabilities.
Equivalent to: nopti [X86]
nospectre_v2 [X86]
spectre_v2_user=off [X86]
spec_store_bypass_disable=off [X86]
l1tf=off [X86]
mds=off [X86]
auto (default)
Mitigate all CPU vulnerabilities, but leave SMT
enabled, even if it's vulnerable. This is for
users who don't want to be surprised by SMT
getting disabled across kernel upgrades, or who
have other ways of avoiding SMT-based attacks.
Equivalent to: (default behavior)
auto,nosmt
Mitigate all CPU vulnerabilities, disabling SMT
if needed. This is for users who always want to
be fully mitigated, even if it means losing SMT.
Equivalent to: l1tf=flush,nosmt [X86]
mds=full,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
parameter allows control of the logging verbosity for
@@ -4054,9 +4115,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
spectre_v2= [X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
The default operation protects the kernel from
user space attacks.
on - unconditionally enable
off - unconditionally disable
on - unconditionally enable, implies
spectre_v2_user=on
off - unconditionally disable, implies
spectre_v2_user=off
auto - kernel detects whether your CPU model is
vulnerable
@@ -4066,6 +4131,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
CONFIG_RETPOLINE configuration option, and the
compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
Selecting 'off' will disable both the kernel and
the user space protections.
Specific mitigations can also be selected manually:
retpoline - replace indirect branches
@@ -4075,6 +4146,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Not specifying this option is equivalent to
spectre_v2=auto.
spectre_v2_user=
[X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability between
user space tasks
on - Unconditionally enable mitigations. Is
enforced by spectre_v2=on
off - Unconditionally disable mitigations. Is
enforced by spectre_v2=off
prctl - Indirect branch speculation is enabled,
but mitigation can be enabled via prctl
per thread. The mitigation control state
is inherited on fork.
prctl,ibpb
- Like "prctl" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different user
space processes.
seccomp
- Same as "prctl" above, but all seccomp
threads will enable the mitigation unless
they explicitly opt out.
seccomp,ibpb
- Like "seccomp" above, but only STIBP is
controlled per thread. IBPB is issued
always when switching between different
user space processes.
auto - Kernel selects the mitigation depending on
the available CPU features and vulnerability.
Default mitigation:
If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
Not specifying this option is equivalent to
spectre_v2_user=auto.
spec_store_bypass_disable=
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
(Speculative Store Bypass vulnerability)

View File

@@ -92,3 +92,12 @@ Speculation misfeature controls
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes
(Mitigate Spectre V2 style attacks against user processes)
Invocations:
* prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);

10
Documentation/x86/conf.py Normal file
View File

@@ -0,0 +1,10 @@
# -*- coding: utf-8; mode: python -*-
project = "X86 architecture specific documentation"
tags.add("subproject")
latex_documents = [
('index', 'x86.tex', project,
'The kernel development community', 'manual'),
]

View File

@@ -0,0 +1,8 @@
==========================
x86 architecture specifics
==========================
.. toctree::
:maxdepth: 1
mds

225
Documentation/x86/mds.rst Normal file
View File

@@ -0,0 +1,225 @@
Microarchitectural Data Sampling (MDS) mitigation
=================================================
.. _mds:
Overview
--------
Microarchitectural Data Sampling (MDS) is a family of side channel attacks
on internal buffers in Intel CPUs. The variants are:
- Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
- Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
- Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
- Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
dependent load (store-to-load forwarding) as an optimization. The forward
can also happen to a faulting or assisting load operation for a different
memory address, which can be exploited under certain conditions. Store
buffers are partitioned between Hyper-Threads so cross thread forwarding is
not possible. But if a thread enters or exits a sleep state the store
buffer is repartitioned which can expose data from one thread to the other.
MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
L1 miss situations and to hold data which is returned or sent in response
to a memory or I/O operation. Fill buffers can forward data to a load
operation and also write data to the cache. When the fill buffer is
deallocated it can retain the stale data of the preceding operations which
can then be forwarded to a faulting or assisting load operation, which can
be exploited under certain conditions. Fill buffers are shared between
Hyper-Threads so cross thread leakage is possible.
MLPDS leaks Load Port Data. Load ports are used to perform load operations
from memory or I/O. The received data is then forwarded to the register
file or a subsequent operation. In some implementations the Load Port can
contain stale data from a previous operation which can be forwarded to
faulting or assisting loads under certain conditions, which again can be
exploited eventually. Load ports are shared between Hyper-Threads so cross
thread leakage is possible.
MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
memory that takes a fault or assist can leave data in a microarchitectural
structure that may later be observed using one of the same methods used by
MSBDS, MFBDS or MLPDS.
Exposure assumptions
--------------------
It is assumed that attack code resides in user space or in a guest with one
exception. The rationale behind this assumption is that the code construct
needed for exploiting MDS requires:
- to control the load to trigger a fault or assist
- to have a disclosure gadget which exposes the speculatively accessed
data for consumption through a side channel.
- to control the pointer through which the disclosure gadget exposes the
data
The existence of such a construct in the kernel cannot be excluded with
100% certainty, but the complexity involved makes it extremly unlikely.
There is one exception, which is untrusted BPF. The functionality of
untrusted BPF is limited, but it needs to be thoroughly investigated
whether it can be used to create such a construct.
Mitigation strategy
-------------------
All variants have the same mitigation strategy at least for the single CPU
thread case (SMT off): Force the CPU to clear the affected buffers.
This is achieved by using the otherwise unused and obsolete VERW
instruction in combination with a microcode update. The microcode clears
the affected CPU buffers when the VERW instruction is executed.
For virtualization there are two ways to achieve CPU buffer
clearing. Either the modified VERW instruction or via the L1D Flush
command. The latter is issued when L1TF mitigation is enabled so the extra
VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
be issued.
If the VERW instruction with the supplied segment selector argument is
executed on a CPU without the microcode update there is no side effect
other than a small number of pointlessly wasted CPU cycles.
This does not protect against cross Hyper-Thread attacks except for MSBDS
which is only exploitable cross Hyper-thread when one of the Hyper-Threads
enters a C-state.
The kernel provides a function to invoke the buffer clearing:
mds_clear_cpu_buffers()
The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
(idle) transitions.
As a special quirk to address virtualization scenarios where the host has
the microcode updated, but the hypervisor does not (yet) expose the
MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
hope that it might actually clear the buffers. The state is reflected
accordingly.
According to current knowledge additional mitigations inside the kernel
itself are not required because the necessary gadgets to expose the leaked
data cannot be controlled in a way which allows exploitation from malicious
user space or VM guests.
Kernel internal mitigation modes
--------------------------------
======= ============================================================
off Mitigation is disabled. Either the CPU is not affected or
mds=off is supplied on the kernel command line
full Mitigation is enabled. CPU is affected and MD_CLEAR is
advertised in CPUID.
vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not
advertised in CPUID. That is mainly for virtualization
scenarios where the host has the updated microcode but the
hypervisor does not expose MD_CLEAR in CPUID. It's a best
effort approach without guarantee.
======= ============================================================
If the CPU is affected and mds=off is not supplied on the kernel command
line then the kernel selects the appropriate mitigation mode depending on
the availability of the MD_CLEAR CPUID bit.
Mitigation points
-----------------
1. Return to user space
^^^^^^^^^^^^^^^^^^^^^^^
When transitioning from kernel to user space the CPU buffers are flushed
on affected CPUs when the mitigation is not disabled on the kernel
command line. The migitation is enabled through the static key
mds_user_clear.
The mitigation is invoked in prepare_exit_to_usermode() which covers
most of the kernel to user space transitions. There are a few exceptions
which are not invoking prepare_exit_to_usermode() on return to user
space. These exceptions use the paranoid exit code.
- Non Maskable Interrupt (NMI):
Access to sensible data like keys, credentials in the NMI context is
mostly theoretical: The CPU can do prefetching or execute a
misspeculated code path and thereby fetching data which might end up
leaking through a buffer.
But for mounting other attacks the kernel stack address of the task is
already valuable information. So in full mitigation mode, the NMI is
mitigated on the return from do_nmi() to provide almost complete
coverage.
- Double fault (#DF):
A double fault is usually fatal, but the ESPFIX workaround, which can
be triggered from user space through modify_ldt(2) is a recoverable
double fault. #DF uses the paranoid exit path, so explicit mitigation
in the double fault handler is required.
- Machine Check Exception (#MC):
Another corner case is a #MC which hits between the CPU buffer clear
invocation and the actual return to user. As this still is in kernel
space it takes the paranoid exit path which does not clear the CPU
buffers. So the #MC handler repopulates the buffers to some
extent. Machine checks are not reliably controllable and the window is
extremly small so mitigation would just tick a checkbox that this
theoretical corner case is covered. To keep the amount of special
cases small, ignore #MC.
- Debug Exception (#DB):
This takes the paranoid exit path only when the INT1 breakpoint is in
kernel space. #DB on a user space address takes the regular exit path,
so no extra mitigation required.
2. C-State transition
^^^^^^^^^^^^^^^^^^^^^
When a CPU goes idle and enters a C-State the CPU buffers need to be
cleared on affected CPUs when SMT is active. This addresses the
repartitioning of the store buffer when one of the Hyper-Threads enters
a C-State.
When SMT is inactive, i.e. either the CPU does not support it or all
sibling threads are offline CPU buffer clearing is not required.
The idle clearing is enabled on CPUs which are only affected by MSBDS
and not by any other MDS variant. The other MDS variants cannot be
protected against cross Hyper-Thread attacks because the Fill Buffer and
the Load Ports are shared. So on CPUs affected by other variants, the
idle clearing would be a window dressing exercise and is therefore not
activated.
The invocation is controlled by the static key mds_idle_clear which is
switched depending on the chosen mitigation mode and the SMT state of
the system.
The buffer clear is only invoked before entering the C-State to prevent
that stale data from the idling CPU from spilling to the Hyper-Thread
sibling after the store buffer got repartitioned and all entries are
available to the non idle sibling.
When coming out of idle the store buffer is partitioned again so each
sibling has half of it available. The back from idle CPU could be then
speculatively exposed to contents of the sibling. The buffers are
flushed either on exit to user space or on VMENTER so malicious code
in user space or the guest cannot speculatively access them.
The mitigation is hooked into all variants of halt()/mwait(), but does
not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
has been superseded by the intel_idle driver around 2010 and is
preferred on all affected CPUs which are expected to gain the MD_CLEAR
functionality in microcode. Aside of that the IO-Port mechanism is a
legacy interface which is only used on older systems which are either
not affected or do not receive microcode updates anymore.

View File

@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 9
SUBLEVEL = 175
SUBLEVEL = 176
EXTRAVERSION =
NAME = Roaring Lionus

View File

@@ -937,13 +937,7 @@ config NR_CPUS
approximately eight kilobytes to the kernel image.
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
cost of slightly increased overhead in some places. If unsure say
N here.
def_bool y if SMP
config SCHED_MC
def_bool y

View File

@@ -29,6 +29,7 @@
#include <asm/vdso.h>
#include <asm/uaccess.h>
#include <asm/cpufeature.h>
#include <asm/nospec-branch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -209,6 +210,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
#endif
user_enter_irqoff();
mds_user_clear_cpu_buffers();
}
#define SYSCALL_EXIT_WORK_FLAGS \

View File

@@ -3750,11 +3750,11 @@ __init int intel_pmu_init(void)
pr_cont("Nehalem events, ");
break;
case INTEL_FAM6_ATOM_PINEVIEW:
case INTEL_FAM6_ATOM_LINCROFT:
case INTEL_FAM6_ATOM_PENWELL:
case INTEL_FAM6_ATOM_CLOVERVIEW:
case INTEL_FAM6_ATOM_CEDARVIEW:
case INTEL_FAM6_ATOM_BONNELL:
case INTEL_FAM6_ATOM_BONNELL_MID:
case INTEL_FAM6_ATOM_SALTWELL:
case INTEL_FAM6_ATOM_SALTWELL_MID:
case INTEL_FAM6_ATOM_SALTWELL_TABLET:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -3766,9 +3766,11 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
case INTEL_FAM6_ATOM_SILVERMONT1:
case INTEL_FAM6_ATOM_SILVERMONT2:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_X:
case INTEL_FAM6_ATOM_SILVERMONT_MID:
case INTEL_FAM6_ATOM_AIRMONT:
case INTEL_FAM6_ATOM_AIRMONT_MID:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -3785,7 +3787,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_DENVERTON:
case INTEL_FAM6_ATOM_GOLDMONT_X:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,

View File

@@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),

View File

@@ -61,8 +61,8 @@ static bool test_intel(int idx)
case INTEL_FAM6_BROADWELL_GT3E:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_ATOM_SILVERMONT1:
case INTEL_FAM6_ATOM_SILVERMONT2:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_X:
case INTEL_FAM6_ATOM_AIRMONT:
if (idx == PERF_MSR_SMI)
return true;

View File

@@ -271,10 +271,12 @@
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -315,6 +317,7 @@
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
@@ -352,5 +355,7 @@
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
#endif /* _ASM_X86_CPUFEATURES_H */

View File

@@ -50,19 +50,23 @@
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
#define INTEL_FAM6_ATOM_LINCROFT 0x26
#define INTEL_FAM6_ATOM_PENWELL 0x27
#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
/* Xeon Phi */

View File

@@ -5,6 +5,8 @@
#ifndef __ASSEMBLY__
#include <asm/nospec-branch.h>
/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
#define __cpuidle __attribute__((__section__(".cpuidle.text")))
@@ -53,11 +55,13 @@ static inline void native_irq_enable(void)
static inline __cpuidle void native_safe_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
static inline __cpuidle void native_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
}

View File

@@ -52,6 +52,21 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
static inline u32 intel_get_microcode_revision(void)
{
u32 rev, dummy;
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
return rev;
}
extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
extern int microcode_sanity_check(void *mc, int print_err);
extern int find_matching_signature(void *mc, unsigned int csig, int cpf);

View File

@@ -1,6 +1,8 @@
#ifndef _ASM_X86_MSR_INDEX_H
#define _ASM_X86_MSR_INDEX_H
#include <linux/bits.h>
/*
* CPU model specific register (MSR) numbers.
*
@@ -38,13 +40,14 @@
/* Intel MSRs. Some also available on other CPUs */
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
@@ -61,20 +64,25 @@
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
#define ARCH_CAP_SSB_NO (1 << 4) /*
* Not susceptible to Speculative Store Bypass
* attack, so no Speculative Store Bypass
* control required.
*/
#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
#define ARCH_CAP_SSB_NO BIT(4) /*
* Not susceptible to Speculative Store Bypass
* attack, so no Speculative Store Bypass
* control required.
*/
#define ARCH_CAP_MDS_NO BIT(5) /*
* Not susceptible to
* Microarchitectural Data
* Sampling (MDS) vulnerabilities.
*/
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH (1 << 0) /*
* Writeback and invalidate the
* L1 data cache.
*/
#define L1D_FLUSH BIT(0) /*
* Writeback and invalidate the
* L1 data cache.
*/
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e

View File

@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <asm/cpufeature.h>
#include <asm/nospec-branch.h>
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
@@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
@@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
unsigned long ecx)
{
/* No MDS buffer clear as this is AMD/HYGON only */
/* "mwaitx %eax, %ebx, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xfb;"
:: "a" (eax), "b" (ebx), "c" (ecx));
@@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
trace_hardirqs_on();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"

View File

@@ -3,6 +3,8 @@
#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_
#include <linux/static_key.h>
#include <asm/alternative.h>
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
@@ -214,10 +216,17 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
SPECTRE_V2_IBRS,
SPECTRE_V2_IBRS_ENHANCED,
};
/* The indirect branch speculation control variants */
enum spectre_v2_user_mitigation {
SPECTRE_V2_USER_NONE,
SPECTRE_V2_USER_STRICT,
SPECTRE_V2_USER_PRCTL,
SPECTRE_V2_USER_SECCOMP,
};
/* The Speculative Store Bypass disable variants */
enum ssb_mitigation {
SPEC_STORE_BYPASS_NONE,
@@ -295,6 +304,60 @@ do { \
preempt_enable(); \
} while (0)
DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
DECLARE_STATIC_KEY_FALSE(mds_user_clear);
DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
#include <asm/segment.h>
/**
* mds_clear_cpu_buffers - Mitigation for MDS vulnerability
*
* This uses the otherwise unused and obsolete VERW instruction in
* combination with microcode which triggers a CPU buffer flush when the
* instruction is executed.
*/
static inline void mds_clear_cpu_buffers(void)
{
static const u16 ds = __KERNEL_DS;
/*
* Has to be the memory-operand variant because only that
* guarantees the CPU buffer flush functionality according to
* documentation. The register-operand variant does not.
* Works with any segment selector, but a valid writable
* data segment is the fastest variant.
*
* "cc" clobber is required because VERW modifies ZF.
*/
asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}
/**
* mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
*
* Clear CPU buffers if the corresponding static key is enabled
*/
static inline void mds_user_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_user_clear))
mds_clear_cpu_buffers();
}
/**
* mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
*
* Clear CPU buffers if the corresponding static key is enabled
*/
static inline void mds_idle_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_idle_clear))
mds_clear_cpu_buffers();
}
#endif /* __ASSEMBLY__ */
/*

View File

@@ -44,15 +44,15 @@ struct mm_struct;
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
WRITE_ONCE(*ptep, pte);
}
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
*ptep = native_make_pte(0);
}
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
*pmdp = pmd;
WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
@@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
*pudp = pud;
WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)

View File

@@ -874,4 +874,10 @@ enum l1tf_mitigations {
extern enum l1tf_mitigations l1tf_mitigation;
enum mds_mitigations {
MDS_MITIGATION_OFF,
MDS_MITIGATION_FULL,
MDS_MITIGATION_VMWERV,
};
#endif /* _ASM_X86_PROCESSOR_H */

View File

@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
{
BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}
static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}
static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
{
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
static inline void speculative_store_bypass_ht_init(void) { }
#endif
extern void speculative_store_bypass_update(unsigned long tif);
static inline void speculative_store_bypass_update_current(void)
{
speculative_store_bypass_update(current_thread_info()->flags);
}
extern void speculation_ctrl_update(unsigned long tif);
extern void speculation_ctrl_update_current(void);
#endif

View File

@@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *prev,

View File

@@ -83,10 +83,12 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Reduced data speculation */
#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -112,6 +114,8 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -142,8 +146,18 @@ struct thread_info {
_TIF_NOHZ | _TIF_FSCHECK)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
#define _TIF_WORK_CTXSW_BASE \
(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \
_TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
/*
* Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
*/
#ifdef CONFIG_SMP
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
#else
# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)

View File

@@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void)
struct tlb_state {
struct mm_struct *active_mm;
int state;
/* last user mm's ctx id */
u64 last_ctx_id;
/* Last user mm for optimizing IBPB */
union {
struct mm_struct *last_user_mm;
unsigned long last_user_mm_ibpb;
};
/*
* Access to this CR4 shadow and to H/W CR4 is protected by

View File

@@ -27,7 +27,6 @@ header-y += ldt.h
header-y += mce.h
header-y += mman.h
header-y += msgbuf.h
header-y += msr-index.h
header-y += msr.h
header-y += mtrr.h
header-y += param.h

View File

@@ -28,6 +28,8 @@ struct mce {
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
__u64 ppin; /* Protected Processor Inventory Number */
__u32 microcode;/* Microcode revision */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)

View File

@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/smt.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -24,6 +25,7 @@
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
#include <asm/hypervisor.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/intel-family.h>
@@ -32,13 +34,12 @@
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
/*
* Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
* writes to SPEC_CTRL contain whatever reserved bits have been set.
*/
u64 __ro_after_init x86_spec_ctrl_base;
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
static DEFINE_MUTEX(spec_ctrl_mutex);
/*
* The vendor and possibly platform specific bits which can be modified in
@@ -53,6 +54,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
u64 __ro_after_init x86_amd_ls_cfg_base;
u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
/* Control conditional STIPB in switch_to() */
DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
/* Control conditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
/* Control MDS CPU buffer clear before returning to user space */
DEFINE_STATIC_KEY_FALSE(mds_user_clear);
EXPORT_SYMBOL_GPL(mds_user_clear);
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -91,6 +106,10 @@ void __init check_bugs(void)
l1tf_select_mitigation();
mds_select_mitigation();
arch_smt_update();
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@@ -123,31 +142,6 @@ void __init check_bugs(void)
#endif
}
/* The kernel command line selection */
enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_NONE,
SPECTRE_V2_CMD_AUTO,
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
SPECTRE_V2_CMD_RETPOLINE_AMD,
};
static const char *spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
[SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
[SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
[SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
[SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
};
#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
@@ -165,9 +159,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
/* SSBD controlled in MSR_SPEC_CTRL */
if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
static_cpu_has(X86_FEATURE_AMD_SSBD))
hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
/* Conditional STIBP enabled? */
if (static_branch_unlikely(&switch_to_cond_stibp))
hostval |= stibp_tif_to_spec_ctrl(ti->flags);
if (hostval != guestval) {
msrval = setguest ? guestval : hostval;
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
@@ -201,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
ssbd_spec_ctrl_to_tif(hostval);
speculative_store_bypass_update(tif);
speculation_ctrl_update(tif);
}
}
EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
@@ -216,6 +215,70 @@ static void x86_amd_ssb_disable(void)
wrmsrl(MSR_AMD64_LS_CFG, msrval);
}
#undef pr_fmt
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
[MDS_MITIGATION_OFF] = "Vulnerable",
[MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
mds_mitigation = MDS_MITIGATION_OFF;
return;
}
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
static_branch_enable(&mds_user_clear);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
}
pr_info("%s\n", mds_strings[mds_mitigation]);
}
static int __init mds_cmdline(char *str)
{
if (!boot_cpu_has_bug(X86_BUG_MDS))
return 0;
if (!str)
return -EINVAL;
if (!strcmp(str, "off"))
mds_mitigation = MDS_MITIGATION_OFF;
else if (!strcmp(str, "full"))
mds_mitigation = MDS_MITIGATION_FULL;
else if (!strcmp(str, "full,nosmt")) {
mds_mitigation = MDS_MITIGATION_FULL;
mds_nosmt = true;
}
return 0;
}
early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Spectre V2 : " fmt
static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
SPECTRE_V2_NONE;
static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
SPECTRE_V2_USER_NONE;
#ifdef RETPOLINE
static bool spectre_v2_bad_module;
@@ -237,15 +300,193 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
static void __init spec2_print_if_insecure(const char *reason)
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
pr_info("%s selected on command line.\n", reason);
int len = strlen(opt);
return len == arglen && !strncmp(arg, opt, len);
}
static void __init spec2_print_if_secure(const char *reason)
/* The kernel command line selection for spectre v2 */
enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_NONE,
SPECTRE_V2_CMD_AUTO,
SPECTRE_V2_CMD_FORCE,
SPECTRE_V2_CMD_RETPOLINE,
SPECTRE_V2_CMD_RETPOLINE_GENERIC,
SPECTRE_V2_CMD_RETPOLINE_AMD,
};
enum spectre_v2_user_cmd {
SPECTRE_V2_USER_CMD_NONE,
SPECTRE_V2_USER_CMD_AUTO,
SPECTRE_V2_USER_CMD_FORCE,
SPECTRE_V2_USER_CMD_PRCTL,
SPECTRE_V2_USER_CMD_PRCTL_IBPB,
SPECTRE_V2_USER_CMD_SECCOMP,
SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
};
static const char * const spectre_v2_user_strings[] = {
[SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
[SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
[SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
[SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
};
static const struct {
const char *option;
enum spectre_v2_user_cmd cmd;
bool secure;
} v2_user_options[] __initconst = {
{ "auto", SPECTRE_V2_USER_CMD_AUTO, false },
{ "off", SPECTRE_V2_USER_CMD_NONE, false },
{ "on", SPECTRE_V2_USER_CMD_FORCE, true },
{ "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
{ "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
{ "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
{ "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
};
static void __init spec_v2_user_print_cond(const char *reason, bool secure)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
static enum spectre_v2_user_cmd __init
spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
{
char arg[20];
int ret, i;
switch (v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
case SPECTRE_V2_CMD_FORCE:
return SPECTRE_V2_USER_CMD_FORCE;
default:
break;
}
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
return SPECTRE_V2_USER_CMD_AUTO;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
spec_v2_user_print_cond(v2_user_options[i].option,
v2_user_options[i].secure);
return v2_user_options[i].cmd;
}
}
pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
return SPECTRE_V2_USER_CMD_AUTO;
}
static void __init
spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
bool smt_possible = IS_ENABLED(CONFIG_SMP);
enum spectre_v2_user_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
smt_possible = false;
cmd = spectre_v2_parse_user_cmdline(v2_cmd);
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
goto set_mode;
case SPECTRE_V2_USER_CMD_FORCE:
mode = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
mode = SPECTRE_V2_USER_PRCTL;
break;
case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_SECCOMP:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
if (IS_ENABLED(CONFIG_SECCOMP))
mode = SPECTRE_V2_USER_SECCOMP;
else
mode = SPECTRE_V2_USER_PRCTL;
break;
}
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
switch (cmd) {
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
static_branch_enable(&switch_mm_always_ibpb);
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_SECCOMP:
static_branch_enable(&switch_mm_cond_ibpb);
break;
default:
break;
}
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
}
/* If enhanced IBRS is enabled no STIPB required */
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
* If SMT is not possible or STIBP is not available clear the STIPB
* mode.
*/
if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
mode = SPECTRE_V2_USER_NONE;
set_mode:
spectre_v2_user = mode;
/* Only print the STIBP mode when SMT possible */
if (smt_possible)
pr_info("%s\n", spectre_v2_user_strings[mode]);
}
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
[SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
[SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
[SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
[SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
};
static const struct {
const char *option;
enum spectre_v2_mitigation_cmd cmd;
bool secure;
} mitigation_options[] __initconst = {
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
{ "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
};
static void __init spec_v2_print_cond(const char *reason, bool secure)
{
if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
pr_info("%s selected on command line.\n", reason);
}
@@ -254,50 +495,30 @@ static inline bool retp_compiler(void)
return __is_defined(RETPOLINE);
}
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt);
return len == arglen && !strncmp(arg, opt, len);
}
static const struct {
const char *option;
enum spectre_v2_mitigation_cmd cmd;
bool secure;
} mitigation_options[] = {
{ "off", SPECTRE_V2_CMD_NONE, false },
{ "on", SPECTRE_V2_CMD_FORCE, true },
{ "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
{ "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
{ "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
{ "auto", SPECTRE_V2_CMD_AUTO, false },
};
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
char arg[20];
int ret, i;
enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
else {
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
return SPECTRE_V2_CMD_AUTO;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
continue;
cmd = mitigation_options[i].cmd;
break;
}
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
return SPECTRE_V2_CMD_AUTO;
if (i >= ARRAY_SIZE(mitigation_options)) {
pr_err("unknown option (%s). Switching to AUTO select\n", arg);
return SPECTRE_V2_CMD_AUTO;
}
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
continue;
cmd = mitigation_options[i].cmd;
break;
}
if (i >= ARRAY_SIZE(mitigation_options)) {
pr_err("unknown option (%s). Switching to AUTO select\n", arg);
return SPECTRE_V2_CMD_AUTO;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -314,11 +535,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
if (mitigation_options[i].secure)
spec2_print_if_secure(mitigation_options[i].option);
else
spec2_print_if_insecure(mitigation_options[i].option);
spec_v2_print_cond(mitigation_options[i].option,
mitigation_options[i].secure);
return cmd;
}
@@ -400,12 +618,6 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
}
/*
* Retpoline means the kernel is safe because it has no indirect
* branches. Enhanced IBRS protects firmware too, so, enable restricted
@@ -421,6 +633,99 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
/* Set up IBPB and STIBP depending on the general spectre V2 command */
spectre_v2_user_select_mitigation(cmd);
}
static void update_stibp_msr(void * __unused)
{
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
static void update_stibp_strict(void)
{
u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
if (sched_smt_active())
mask |= SPEC_CTRL_STIBP;
if (mask == x86_spec_ctrl_base)
return;
pr_info("Update user space SMT mitigation: STIBP %s\n",
mask & SPEC_CTRL_STIBP ? "always-on" : "off");
x86_spec_ctrl_base = mask;
on_each_cpu(update_stibp_msr, NULL, 1);
}
/* Update the static key controlling the evaluation of TIF_SPEC_IB */
static void update_indir_branch_cond(void)
{
if (sched_smt_active())
static_branch_enable(&switch_to_cond_stibp);
else
static_branch_disable(&switch_to_cond_stibp);
}
#undef pr_fmt
#define pr_fmt(fmt) fmt
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
*
* The other variants cannot be mitigated when SMT is enabled, so
* clearing the buffers on idle just to prevent the Store Buffer
* repartitioning leak would be a window dressing exercise.
*/
if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
return;
if (sched_smt_active())
static_branch_enable(&mds_idle_clear);
else
static_branch_disable(&mds_idle_clear);
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
void arch_smt_update(void)
{
/* Enhanced IBRS implies STIBP. No update required. */
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
mutex_lock(&spec_ctrl_mutex);
switch (spectre_v2_user) {
case SPECTRE_V2_USER_NONE:
break;
case SPECTRE_V2_USER_STRICT:
update_stibp_strict();
break;
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
update_indir_branch_cond();
break;
}
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
update_mds_branch_idle();
break;
case MDS_MITIGATION_OFF:
break;
}
mutex_unlock(&spec_ctrl_mutex);
}
#undef pr_fmt
@@ -437,7 +742,7 @@ enum ssb_mitigation_cmd {
SPEC_STORE_BYPASS_CMD_SECCOMP,
};
static const char *ssb_strings[] = {
static const char * const ssb_strings[] = {
[SPEC_STORE_BYPASS_NONE] = "Vulnerable",
[SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
[SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
@@ -447,7 +752,7 @@ static const char *ssb_strings[] = {
static const struct {
const char *option;
enum ssb_mitigation_cmd cmd;
} ssb_mitigation_options[] = {
} ssb_mitigation_options[] __initconst = {
{ "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
{ "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
{ "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
@@ -461,7 +766,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
char arg[20];
int ret, i;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
} else {
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
@@ -531,18 +837,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
if (mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
* a completely different MSR and bit dependent on family.
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
* use a completely different MSR and bit dependent on family.
*/
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
!static_cpu_has(X86_FEATURE_AMD_SSBD)) {
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
break;
case X86_VENDOR_AMD:
x86_amd_ssb_disable();
break;
}
}
@@ -560,10 +864,25 @@ static void ssb_select_mitigation(void)
#undef pr_fmt
#define pr_fmt(fmt) "Speculation prctl: " fmt
static void task_update_spec_tif(struct task_struct *tsk)
{
/* Force the update of the real TIF bits */
set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
/*
* Immediately update the speculation control MSRs for the current
* task, but for a non-current task delay setting the CPU
* mitigation until it is scheduled next.
*
* This can only happen for SECCOMP mitigation. For PRCTL it's
* always the current task.
*/
if (tsk == current)
speculation_ctrl_update_current();
}
static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
{
bool update;
if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
return -ENXIO;
@@ -574,28 +893,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
if (task_spec_ssb_force_disable(task))
return -EPERM;
task_clear_spec_ssb_disable(task);
update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
task_update_spec_tif(task);
break;
case PR_SPEC_DISABLE:
task_set_spec_ssb_disable(task);
update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
task_update_spec_tif(task);
break;
case PR_SPEC_FORCE_DISABLE:
task_set_spec_ssb_disable(task);
task_set_spec_ssb_force_disable(task);
update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
task_update_spec_tif(task);
break;
default:
return -ERANGE;
}
return 0;
}
/*
* If being set on non-current task, delay setting the CPU
* mitigation until it is next scheduled.
*/
if (task == current && update)
speculative_store_bypass_update_current();
static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
case PR_SPEC_ENABLE:
if (spectre_v2_user == SPECTRE_V2_USER_NONE)
return 0;
/*
* Indirect branch speculation is always disabled in strict
* mode.
*/
if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
return -EPERM;
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
break;
case PR_SPEC_DISABLE:
case PR_SPEC_FORCE_DISABLE:
/*
* Indirect branch speculation is always allowed when
* mitigation is force disabled.
*/
if (spectre_v2_user == SPECTRE_V2_USER_NONE)
return -EPERM;
if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
return 0;
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
break;
default:
return -ERANGE;
}
return 0;
}
@@ -605,6 +952,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_set(task, ctrl);
case PR_SPEC_INDIRECT_BRANCH:
return ib_prctl_set(task, ctrl);
default:
return -ENODEV;
}
@@ -615,6 +964,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -637,11 +988,35 @@ static int ssb_prctl_get(struct task_struct *task)
}
}
static int ib_prctl_get(struct task_struct *task)
{
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
return PR_SPEC_NOT_AFFECTED;
switch (spectre_v2_user) {
case SPECTRE_V2_USER_NONE:
return PR_SPEC_ENABLE;
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
case SPECTRE_V2_USER_STRICT:
return PR_SPEC_DISABLE;
default:
return PR_SPEC_NOT_AFFECTED;
}
}
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_get(task);
case PR_SPEC_INDIRECT_BRANCH:
return ib_prctl_get(task);
default:
return -ENODEV;
}
@@ -713,6 +1088,11 @@ static void __init l1tf_select_mitigation(void)
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
if (cpu_mitigations_off())
l1tf_mitigation = L1TF_MITIGATION_OFF;
else if (cpu_mitigations_auto_nosmt())
l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
@@ -735,12 +1115,13 @@ static void __init l1tf_select_mitigation(void)
#endif
half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
half_pa);
pr_info("However, doing so will make a part of your RAM unusable.\n");
pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
return;
}
@@ -773,13 +1154,14 @@ static int __init l1tf_cmdline(char *str)
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) fmt
#ifdef CONFIG_SYSFS
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
#if IS_ENABLED(CONFIG_KVM_INTEL)
static const char *l1tf_vmx_states[] = {
static const char * const l1tf_vmx_states[] = {
[VMENTER_L1D_FLUSH_AUTO] = "auto",
[VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
[VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
@@ -795,13 +1177,14 @@ static ssize_t l1tf_show_state(char *buf)
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
cpu_smt_control == CPU_SMT_ENABLED))
sched_smt_active())) {
return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation]);
}
return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation],
cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
sched_smt_active() ? "vulnerable" : "disabled");
}
#else
static ssize_t l1tf_show_state(char *buf)
@@ -810,6 +1193,55 @@ static ssize_t l1tf_show_state(char *buf)
}
#endif
static ssize_t mds_show_state(char *buf)
{
#ifdef CONFIG_HYPERVISOR_GUEST
if (x86_hyper) {
return sprintf(buf, "%s; SMT Host state unknown\n",
mds_strings[mds_mitigation]);
}
#endif
if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
(mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
sched_smt_active() ? "mitigated" : "disabled"));
}
return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
static char *stibp_state(void)
{
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return "";
switch (spectre_v2_user) {
case SPECTRE_V2_USER_NONE:
return ", STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
return ", STIBP: forced";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
return ", STIBP: conditional";
}
return "";
}
static char *ibpb_state(void)
{
if (boot_cpu_has(X86_FEATURE_IBPB)) {
if (static_key_enabled(&switch_mm_always_ibpb))
return ", IBPB: always-on";
if (static_key_enabled(&switch_mm_cond_ibpb))
return ", IBPB: conditional";
return ", IBPB: disabled";
}
return "";
}
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -827,9 +1259,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
case X86_BUG_SPECTRE_V2:
return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
ibpb_state(),
boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
stibp_state(),
boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
spectre_v2_module_string());
case X86_BUG_SPEC_STORE_BYPASS:
@@ -839,6 +1273,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
return l1tf_show_state(buf);
break;
case X86_BUG_MDS:
return mds_show_state(buf);
default:
break;
}
@@ -870,4 +1308,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
{
return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
}
ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
{
return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
}
#endif

View File

@@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_STIBP);
set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
}
if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
set_cpu_cap(c, X86_FEATURE_SSBD);
set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
}
}
void get_cpu_cap(struct cpuinfo_x86 *c)
@@ -885,84 +891,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
c->x86_cache_bits = c->x86_phys_bits;
}
static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
{ X86_VENDOR_CENTAUR, 5 },
{ X86_VENDOR_INTEL, 5 },
{ X86_VENDOR_NSC, 5 },
{ X86_VENDOR_ANY, 4 },
#define NO_SPECULATION BIT(0)
#define NO_MELTDOWN BIT(1)
#define NO_SSB BIT(2)
#define NO_L1TF BIT(3)
#define NO_MDS BIT(4)
#define MSBDS_ONLY BIT(5)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
#define VULNWL_INTEL(model, whitelist) \
VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(CORE_YONAH, NO_SSB),
VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY),
VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF),
VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF),
VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF),
/* AMD Family 0xf - 0x12 */
VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS),
{}
};
static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
{ X86_VENDOR_AMD },
{}
};
static bool __init cpu_matches(unsigned long which)
{
const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
{ X86_VENDOR_CENTAUR, 5, },
{ X86_VENDOR_INTEL, 5, },
{ X86_VENDOR_NSC, 5, },
{ X86_VENDOR_AMD, 0x12, },
{ X86_VENDOR_AMD, 0x11, },
{ X86_VENDOR_AMD, 0x10, },
{ X86_VENDOR_AMD, 0xf, },
{ X86_VENDOR_ANY, 4, },
{}
};
static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
/* in addition to cpu_no_speculation */
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
{}
};
return m && !!(m->driver_data & which);
}
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
!(ia32_cap & ARCH_CAP_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (x86_match_cpu(cpu_no_speculation))
if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (x86_match_cpu(cpu_no_meltdown))
if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
}
if (cpu_matches(NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -971,7 +988,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
if (x86_match_cpu(cpu_no_l1tf))
if (cpu_matches(NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);

View File

@@ -14,6 +14,7 @@
#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/intel-family.h>
#include <asm/microcode_intel.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
unsigned lower_word;
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* Required by the SDM */
sync_core();
rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
}
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
/* Now if any of them are set, check the blacklist and clear the lot */
if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||

View File

@@ -139,6 +139,8 @@ void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
m->microcode = boot_cpu_data.microcode;
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -309,7 +311,7 @@ static void print_mce(struct mce *m)
*/
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
m->microcode);
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

View File

@@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu)
return -1;
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
c->microcode = rev;
uci->cpu_sig.rev = rev;
return 0;
}
if (rev >= mc_amd->hdr.patch_id)
goto out;
if (__apply_microcode_amd(mc_amd)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return -1;
}
pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
mc_amd->hdr.patch_id);
uci->cpu_sig.rev = mc_amd->hdr.patch_id;
c->microcode = mc_amd->hdr.patch_id;
rev = mc_amd->hdr.patch_id;
pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
/* Update boot_cpu_data's revision too, if we're on the BSP: */
if (c->cpu_index == boot_cpu_data.cpu_index)
boot_cpu_data.microcode = rev;
return 0;
}

View File

@@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
csig.rev = val[1];
csig.rev = intel_get_microcode_revision();
uci->cpu_sig = csig;
uci->valid = 1;
@@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc;
unsigned int val[2];
u32 rev;
mc = uci->mc;
if (!mc)
return 0;
/*
* Save us the MSR write below - which is a particular expensive
* operation - when the other hyperthread has updated the microcode
* already.
*/
rev = intel_get_microcode_revision();
if (rev >= mc->hdr.rev) {
uci->cpu_sig.rev = rev;
return 0;
}
/* write microcode via MSR 0x79 */
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc->hdr.rev)
rev = intel_get_microcode_revision();
if (rev != mc->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = val[1];
uci->cpu_sig.rev = rev;
if (early)
print_ucode(uci);
@@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc;
struct ucode_cpu_info *uci;
struct cpuinfo_x86 *c;
unsigned int val[2];
struct cpuinfo_x86 *c = &cpu_data(cpu);
static int prev_rev;
u32 rev;
/* We should bind the task to the CPU */
if (WARN_ON(raw_smp_processor_id() != cpu))
@@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu)
if (!get_matching_mc(mc, cpu))
return 0;
/*
* Save us the MSR write below - which is a particular expensive
* operation - when the other hyperthread has updated the microcode
* already.
*/
rev = intel_get_microcode_revision();
if (rev >= mc->hdr.rev)
goto out;
/* write microcode via MSR 0x79 */
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
sync_core();
rev = intel_get_microcode_revision();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
if (val[1] != mc->hdr.rev) {
if (rev != mc->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
cpu, mc->hdr.rev);
return -1;
}
if (val[1] != prev_rev) {
if (rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
val[1],
rev,
mc->hdr.date & 0xffff,
mc->hdr.date >> 24,
(mc->hdr.date >> 16) & 0xff);
prev_rev = val[1];
prev_rev = rev;
}
c = &cpu_data(cpu);
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
uci->cpu_sig.rev = val[1];
c->microcode = val[1];
/* Update boot_cpu_data's revision too, if we're on the BSP: */
if (c->cpu_index == boot_cpu_data.cpu_index)
boot_cpu_data.microcode = rev;
return 0;
}

View File

@@ -32,6 +32,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -544,6 +545,9 @@ nmi_restart:
write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
if (user_mode(regs))
mds_user_clear_cpu_buffers();
}
NOKPROBE_SYMBOL(do_nmi);

View File

@@ -35,6 +35,8 @@
#include <asm/switch_to.h>
#include <asm/spec-ctrl.h>
#include "process.h"
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -170,11 +172,12 @@ int set_tsc_mode(unsigned int val)
return 0;
}
static inline void switch_to_bitmap(struct tss_struct *tss,
struct thread_struct *prev,
static inline void switch_to_bitmap(struct thread_struct *prev,
struct thread_struct *next,
unsigned long tifp, unsigned long tifn)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
if (tifn & _TIF_IO_BITMAP) {
/*
* Copy the relevant range of the IO bitmap.
@@ -308,32 +311,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
static __always_inline void intel_set_ssb_state(unsigned long tifn)
/*
* Update the MSRs managing speculation control, during context switch.
*
* tifp: Previous task's thread flags
* tifn: Next task's thread flags
*/
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
unsigned long tifn)
{
u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
unsigned long tif_diff = tifp ^ tifn;
u64 msr = x86_spec_ctrl_base;
bool updmsr = false;
wrmsrl(MSR_IA32_SPEC_CTRL, msr);
/*
* If TIF_SSBD is different, select the proper mitigation
* method. Note that if SSBD mitigation is disabled or permanentely
* enabled this branch can't be taken because nothing can set
* TIF_SSBD.
*/
if (tif_diff & _TIF_SSBD) {
if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
amd_set_ssb_virt_state(tifn);
} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
amd_set_core_ssb_state(tifn);
} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
static_cpu_has(X86_FEATURE_AMD_SSBD)) {
msr |= ssbd_tif_to_spec_ctrl(tifn);
updmsr = true;
}
}
/*
* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
* otherwise avoid the MSR write.
*/
if (IS_ENABLED(CONFIG_SMP) &&
static_branch_unlikely(&switch_to_cond_stibp)) {
updmsr |= !!(tif_diff & _TIF_SPEC_IB);
msr |= stibp_tif_to_spec_ctrl(tifn);
}
if (updmsr)
wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
amd_set_ssb_virt_state(tifn);
else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
amd_set_core_ssb_state(tifn);
else
intel_set_ssb_state(tifn);
if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
if (task_spec_ssb_disable(tsk))
set_tsk_thread_flag(tsk, TIF_SSBD);
else
clear_tsk_thread_flag(tsk, TIF_SSBD);
if (task_spec_ib_disable(tsk))
set_tsk_thread_flag(tsk, TIF_SPEC_IB);
else
clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
}
/* Return the updated threadinfo flags*/
return task_thread_info(tsk)->flags;
}
void speculative_store_bypass_update(unsigned long tif)
void speculation_ctrl_update(unsigned long tif)
{
/* Forced update. Make sure all relevant TIF flags are different */
preempt_disable();
__speculative_store_bypass_update(tif);
__speculation_ctrl_update(~tif, tif);
preempt_enable();
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
preempt_disable();
speculation_ctrl_update(speculation_ctrl_update_tif(current));
preempt_enable();
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev, *next;
unsigned long tifp, tifn;
@@ -343,7 +399,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
switch_to_bitmap(tss, prev, next, tifp, tifn);
switch_to_bitmap(prev, next, tifp, tifn);
propagate_user_return_notify(prev_p, next_p);
@@ -361,8 +417,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOTSC)
cr4_toggle_bits(X86_CR4_TSD);
if ((tifp ^ tifn) & _TIF_SSBD)
__speculative_store_bypass_update(tifn);
if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
__speculation_ctrl_update(tifp, tifn);
} else {
speculation_ctrl_update_tif(prev_p);
tifn = speculation_ctrl_update_tif(next_p);
/* Enforce MSR update to ensure consistent state */
__speculation_ctrl_update(~tifn, tifn);
}
}
/*

39
arch/x86/kernel/process.h Normal file
View File

@@ -0,0 +1,39 @@
// SPDX-License-Identifier: GPL-2.0
//
// Code shared between 32 and 64 bit
#include <asm/spec-ctrl.h>
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
/*
* This needs to be inline to optimize for the common case where no extra
* work needs to be done.
*/
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long prev_tif = task_thread_info(prev)->flags;
if (IS_ENABLED(CONFIG_SMP)) {
/*
* Avoid __switch_to_xtra() invocation when conditional
* STIPB is disabled and the only different bit is
* TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
* in the TIF_WORK_CTXSW masks.
*/
if (!static_branch_likely(&switch_to_cond_stibp)) {
prev_tif &= ~_TIF_SPEC_IB;
next_tif &= ~_TIF_SPEC_IB;
}
}
/*
* __switch_to_xtra() handles debug registers, i/o bitmaps,
* speculation mitigations etc.
*/
if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
prev_tif & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev, next);
}

View File

@@ -55,6 +55,8 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
#include "process.h"
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -264,12 +266,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
/*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.

View File

@@ -51,6 +51,8 @@
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include "process.h"
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
@@ -454,12 +456,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
switch_to_extra(prev_p, next_p);
#ifdef CONFIG_XEN
/*

View File

@@ -62,6 +62,7 @@
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/trace/mpx.h>
#include <asm/nospec-branch.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
@@ -340,6 +341,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
/*
* This situation can be triggered by userspace via
* modify_ldt(2) and the return does not take the regular
* user space exit, so a CPU buffer clear is required when
* MDS mitigation is enabled.
*/
mds_user_clear_cpu_buffers();
return;
}
#endif

View File

@@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
case INTEL_FAM6_ATOM_DENVERTON:
case INTEL_FAM6_ATOM_GOLDMONT_X:
crystal_khz = 25000; /* 25.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT:

View File

@@ -355,7 +355,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -380,7 +381,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) |
F(INTEL_STIBP) | F(MD_CLEAR);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -633,7 +635,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx |= F(VIRT_SSBD);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
/*
* The preference is to use SPEC CTRL MSR instead of the
* VIRT_SPEC MSR.
*/
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
!boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(VIRT_SSBD);
break;
}

View File

@@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu)
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS)))
if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS | bit(X86_FEATURE_AMD_SSBD)))))
return true;
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD)));

View File

@@ -3704,7 +3704,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
svm->spec_ctrl = data;

View File

@@ -9205,8 +9205,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
/* L1D Flush includes CPU buffer clear to mitigate MDS */
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
asm(
/* Store host registers */
@@ -9565,8 +9568,8 @@ free_vcpu:
return ERR_PTR(err);
}
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
static int vmx_vm_init(struct kvm *kvm)
{

View File

@@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void)
pages = generic_max_swapfile_size();
if (boot_cpu_has_bug(X86_BUG_L1TF)) {
if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
unsigned long long l1tf_limit = l1tf_pfn_limit();
/*

View File

@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void)
goto skip;
}
if (cmdline_find_option_bool(boot_command_line, "nopti"))
if (cmdline_find_option_bool(boot_command_line, "nopti") ||
cpu_mitigations_off())
goto disable;
skip:

View File

@@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
pgdp[i] = native_make_pgd(0);
pgd_clear(&pgdp[i]);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
*ptep = entry;
set_pte(ptep, entry);
pte_update(vma->vm_mm, address, ptep);
}
@@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
*pmdp = entry;
set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,

View File

@@ -30,6 +30,12 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
/*
* Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
* stored in cpu_tlb_state.last_user_mm_ibpb.
*/
#define LAST_USER_MM_IBPB 0x1UL
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
struct flush_tlb_info {
@@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
return (unsigned long)next->mm | ibpb;
}
static void cond_ibpb(struct task_struct *next)
{
if (!next || !next->mm)
return;
/*
* Both, the conditional and the always IBPB mode use the mm
* pointer to avoid the IBPB when switching between tasks of the
* same process. Using the mm pointer instead of mm->context.ctx_id
* opens a hypothetical hole vs. mm_struct reuse, which is more or
* less impossible to control by an attacker. Aside of that it
* would only affect the first schedule so the theoretically
* exposed data is not really interesting.
*/
if (static_branch_likely(&switch_mm_cond_ibpb)) {
unsigned long prev_mm, next_mm;
/*
* This is a bit more complex than the always mode because
* it has to handle two cases:
*
* 1) Switch from a user space task (potential attacker)
* which has TIF_SPEC_IB set to a user space task
* (potential victim) which has TIF_SPEC_IB not set.
*
* 2) Switch from a user space task (potential attacker)
* which has TIF_SPEC_IB not set to a user space task
* (potential victim) which has TIF_SPEC_IB set.
*
* This could be done by unconditionally issuing IBPB when
* a task which has TIF_SPEC_IB set is either scheduled in
* or out. Though that results in two flushes when:
*
* - the same user space task is scheduled out and later
* scheduled in again and only a kernel thread ran in
* between.
*
* - a user space task belonging to the same process is
* scheduled in after a kernel thread ran in between
*
* - a user space task belonging to the same process is
* scheduled in immediately.
*
* Optimize this with reasonably small overhead for the
* above cases. Mangle the TIF_SPEC_IB bit into the mm
* pointer of the incoming task which is stored in
* cpu_tlbstate.last_user_mm_ibpb for comparison.
*/
next_mm = mm_mangle_tif_spec_ib(next);
prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
/*
* Issue IBPB only if the mm's are different and one or
* both have the IBPB bit set.
*/
if (next_mm != prev_mm &&
(next_mm | prev_mm) & LAST_USER_MM_IBPB)
indirect_branch_prediction_barrier();
this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
}
if (static_branch_unlikely(&switch_mm_always_ibpb)) {
/*
* Only flush when switching to a user space task with a
* different context than the user space task which ran
* last on this CPU.
*/
if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
indirect_branch_prediction_barrier();
this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
}
}
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
*
* As an optimization, flush indirect branches only when
* switching into processes that disable dumping. This
* protects high value processes like gpg, without having
* too high performance overhead. IBPB is *expensive*!
*
* This will not flush branches when switching into kernel
* threads. It will also not flush if we switch to idle
* thread and back to the same process. It will flush if we
* switch to a different non-dumpable process.
*/
if (tsk && tsk->mm &&
tsk->mm->context.ctx_id != last_ctx_id &&
get_dumpable(tsk->mm) != SUID_DUMP_USER)
indirect_branch_prediction_barrier();
cond_ibpb(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
}
/*
* Record last user mm's context id, so we can avoid
* flushing branch buffer with IBPB if we switch back
* to the same user.
*/
if (next != &init_mm)
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);

View File

@@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void)
(kernel_ulong_t)&drv_data }
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
{}
};

View File

@@ -243,7 +243,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = {
#define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
static const struct x86_cpu_id lpss_cpu_ids[] = {
ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */
ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */
ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */
{}
};

View File

@@ -531,11 +531,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,
return sprintf(buf, "Not affected\n");
}
ssize_t __weak cpu_show_mds(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "Not affected\n");
}
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
@@ -543,6 +550,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_spectre_v2.attr,
&dev_attr_spec_store_bypass.attr,
&dev_attr_l1tf.attr,
&dev_attr_mds.attr,
NULL
};

View File

@@ -1413,7 +1413,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
ICPU(INTEL_FAM6_SANDYBRIDGE, core_params),
ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params),
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params),
ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_params),
ICPU(INTEL_FAM6_IVYBRIDGE, core_params),
ICPU(INTEL_FAM6_HASWELL_CORE, core_params),
ICPU(INTEL_FAM6_BROADWELL_CORE, core_params),

View File

@@ -1107,14 +1107,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem),
ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem),
ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem),
ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft),
ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft),
ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem),
ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb),
ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt),
ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier),
ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
@@ -1122,7 +1122,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw),
ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw),
ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw),
ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn),
ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn),
ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw),
ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw),
ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw),
@@ -1134,7 +1134,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv),
{}
};

View File

@@ -127,7 +127,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
static bool sdhci_acpi_byt(void)
{
static const struct x86_cpu_id byt[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
{}
};

View File

@@ -71,8 +71,8 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = {
* arch/x86/platform/intel-mid/pwr.c.
*/
static const struct x86_cpu_id lpss_cpu_ids[] = {
ICPU(INTEL_FAM6_ATOM_PENWELL),
ICPU(INTEL_FAM6_ATOM_MERRIFIELD),
ICPU(INTEL_FAM6_ATOM_SALTWELL_MID),
ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID),
{}
};

View File

@@ -1175,12 +1175,12 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt),
RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt),
RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht),
RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng),
RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann),
RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng),
RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, rapl_defaults_ann),
RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server),
{}

View File

@@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data)
}
static const struct x86_cpu_id soc_thermal_ids[] = {
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0,
{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0,
BYT_SOC_DTS_APIC_IRQ},
{}
};

View File

@@ -1,28 +1,9 @@
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H
#include <asm/types.h>
#include <linux/bits.h>
#ifdef __KERNEL__
#define BIT(nr) (1UL << (nr))
#define BIT_ULL(nr) (1ULL << (nr))
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
#define BITS_PER_BYTE 8
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
#endif
/*
* Create a contiguous bitmask starting at bit position @l and ending at
* position @h. For example
* GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/
#define GENMASK(h, l) \
(((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
#define GENMASK_ULL(h, l) \
(((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);

26
include/linux/bits.h Normal file
View File

@@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITS_H
#define __LINUX_BITS_H
#include <asm/bitsperlong.h>
#define BIT(nr) (1UL << (nr))
#define BIT_ULL(nr) (1ULL << (nr))
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
#define BITS_PER_BYTE 8
/*
* Create a contiguous bitmask starting at bit position @l and ending at
* position @h. For example
* GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/
#define GENMASK(h, l) \
(((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
#define GENMASK_ULL(h, l) \
(((~0ULL) - (1ULL << (l)) + 1) & \
(~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
#endif /* __LINUX_BITS_H */

View File

@@ -54,6 +54,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_l1tf(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_mds(struct device *dev,
struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
@@ -276,6 +278,30 @@ static inline void cpu_smt_check_topology_early(void) { }
static inline void cpu_smt_check_topology(void) { }
#endif
/*
* These are used for a global "mitigations=" cmdline option for toggling
* optional CPU mitigations.
*/
enum cpu_mitigations {
CPU_MITIGATIONS_OFF,
CPU_MITIGATIONS_AUTO,
CPU_MITIGATIONS_AUTO_NOSMT,
};
extern enum cpu_mitigations cpu_mitigations;
/* mitigations=off */
static inline bool cpu_mitigations_off(void)
{
return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
/* mitigations=auto,nosmt */
static inline bool cpu_mitigations_auto_nosmt(void)
{
return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
#define IDLE_START 1
#define IDLE_END 2

View File

@@ -60,14 +60,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
#define PTRACE_MODE_READ 0x01
#define PTRACE_MODE_ATTACH 0x02
#define PTRACE_MODE_NOAUDIT 0x04
#define PTRACE_MODE_FSCREDS 0x08
#define PTRACE_MODE_REALCREDS 0x10
#define PTRACE_MODE_FSCREDS 0x08
#define PTRACE_MODE_REALCREDS 0x10
#define PTRACE_MODE_SCHED 0x20
#define PTRACE_MODE_IBPB 0x40
/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB)
/**
* ptrace_may_access - check whether the caller is permitted to access
@@ -85,6 +88,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
*/
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
/**
* ptrace_may_access - check whether the caller is permitted to access
* a target task.
* @task: target task
* @mode: selects type of access and caller credentials
*
* Returns true on success, false on denial.
*
* Similar to ptrace_may_access(). Only to be called from context switch
* code. Does not call into audit and the regular LSM hooks due to locking
* constraints.
*/
extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode);
static inline int ptrace_reparented(struct task_struct *child)
{
return !same_thread_group(child->real_parent, child->parent);

View File

@@ -2460,6 +2460,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
#define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */
#define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled*/
#define PFA_SPEC_IB_DISABLE 6 /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE 7 /* Indirect branch speculation permanently restricted */
#define TASK_PFA_TEST(name, func) \
@@ -2493,6 +2495,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
/*
* task->jobctl flags
*/

20
include/linux/sched/smt.h Normal file
View File

@@ -0,0 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SMT_H
#define _LINUX_SCHED_SMT_H
#include <linux/atomic.h>
#ifdef CONFIG_SCHED_SMT
extern atomic_t sched_smt_present;
static __always_inline bool sched_smt_active(void)
{
return atomic_read(&sched_smt_present);
}
#else
static inline bool sched_smt_active(void) { return false; }
#endif
void arch_smt_update(void);
#endif

View File

@@ -202,6 +202,7 @@ struct prctl_mm_map {
#define PR_SET_SPECULATION_CTRL 53
/* Speculation control variants */
# define PR_SPEC_STORE_BYPASS 0
# define PR_SPEC_INDIRECT_BRANCH 1
/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
# define PR_SPEC_NOT_AFFECTED 0
# define PR_SPEC_PRCTL (1UL << 0)

View File

@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
@@ -356,6 +357,12 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
/*
* Architectures that need SMT-specific errata handling during SMT hotplug
* should override this.
*/
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
EXPORT_SYMBOL_GPL(cpu_smt_control);
@@ -1058,6 +1065,7 @@ out:
/* This post dead nonsense must die */
if (!ret && hasdied)
cpu_notify_nofail(CPU_POST_DEAD, cpu);
arch_smt_update();
return ret;
}
@@ -1177,6 +1185,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpu_hotplug_done();
arch_smt_update();
return ret;
}
@@ -2019,8 +2028,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
*/
cpuhp_offline_cpu_device(cpu);
}
if (!ret)
if (!ret) {
cpu_smt_control = ctrlval;
arch_smt_update();
}
cpu_maps_update_done();
return ret;
}
@@ -2031,6 +2042,7 @@ static int cpuhp_smt_enable(void)
cpu_maps_update_begin();
cpu_smt_control = CPU_SMT_ENABLED;
arch_smt_update();
for_each_present_cpu(cpu) {
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
@@ -2230,6 +2242,21 @@ void __init boot_cpu_hotplug_init(void)
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
static int __init mitigations_parse_cmdline(char *arg)
{
if (!strcmp(arg, "off"))
cpu_mitigations = CPU_MITIGATIONS_OFF;
else if (!strcmp(arg, "auto"))
cpu_mitigations = CPU_MITIGATIONS_AUTO;
else if (!strcmp(arg, "auto,nosmt"))
cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
return 0;
}
early_param("mitigations", mitigations_parse_cmdline);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)

View File

@@ -258,6 +258,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
if (mode & PTRACE_MODE_SCHED)
return false;
if (mode & PTRACE_MODE_NOAUDIT)
return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
else
@@ -325,9 +328,16 @@ ok:
!ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
if (mode & PTRACE_MODE_SCHED)
return 0;
return security_ptrace_access_check(task, mode);
}
bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
{
return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
int err;

View File

@@ -7539,11 +7539,22 @@ static int cpuset_cpu_inactive(unsigned int cpu)
return 0;
}
#ifdef CONFIG_SCHED_SMT
atomic_t sched_smt_present = ATOMIC_INIT(0);
#endif
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
atomic_inc(&sched_smt_present);
#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -7592,6 +7603,14 @@ int sched_cpu_deactivate(unsigned int cpu)
else
synchronize_rcu();
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
atomic_dec(&sched_smt_present);
#endif
if (!sched_smp_initialized)
return 0;

View File

@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/smt.h>
#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
#include <linux/kernel_stat.h>

View File

@@ -8,7 +8,7 @@ ifeq ("$(origin O)", "command line")
endif
turbostat : turbostat.c
CFLAGS += -Wall
CFLAGS += -Wall -I../../../include
CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
%: %.c