Merge 4.19.220 into android-4.19-stable
Linux 4.19.220
ipmi: msghandler: Make symbol 'remove_work_wq' static
parisc: Mark cr16 CPU clocksource unstable on all SMP machines
* serial: core: fix transmit-buffer reset and memleak
drivers/tty/serial/serial_core.c
serial: pl011: Add ACPI SBSA UART match id
tty: serial: msm_serial: Deactivate RX DMA for polling support
x86/64/mm: Map all kernel memory into trampoline_pgd
usb: typec: tcpm: Wait in SNK_DEBOUNCED until disconnect
* USB: NO_LPM quirk Lenovo Powered USB-C Travel Hub
drivers/usb/core/quirks.c
* xhci: Fix commad ring abort, write all 64 bits to CRCR register.
drivers/usb/host/xhci-ring.c
vgacon: Propagate console boot parameters before calling `vc_resize'
parisc: Fix "make install" on newer debian releases
parisc: Fix KBUILD_IMAGE for self-extracting kernel
drm/msm: Do hw_init() before capturing GPU state
net/smc: Keep smc_close_final rc during active close
net/rds: correct socket tunable error in rds_tcp_tune()
* net: annotate data-races on txq->xmit_lock_owner
include/linux/netdevice.h
net/core/dev.c
net: usb: lan78xx: lan78xx_phy_init(): use PHY_POLL instead of "0" if no IRQ is available
rxrpc: Fix rxrpc_local leak in rxrpc_lookup_peer()
net/mlx4_en: Fix an use-after-free bug in mlx4_en_try_alloc_resources()
* siphash: use _unaligned version by default
include/linux/siphash.h
lib/siphash.c
net: mpls: Fix notifications when deleting a device
net: qlogic: qlcnic: Fix a NULL pointer dereference in qlcnic_83xx_add_rings()
natsemi: xtensa: fix section mismatch warnings
i2c: stm32f7: stop dma transfer in case of NACK
i2c: stm32f7: recover the bus on access timeout
* fget: check that the fd still exists after getting a ref to it
fs/file.c
* fs: add fget_many() and fput_many()
fs/file.c
fs/file_table.c
include/linux/file.h
include/linux/fs.h
sata_fsl: fix warning in remove_proc_entry when rmmod sata_fsl
sata_fsl: fix UAF in sata_fsl_port_stop when rmmod sata_fsl
ipmi: Move remove_work to dedicated workqueue
* kprobes: Limit max data_size of the kretprobe instances
include/linux/kprobes.h
vrf: Reset IPCB/IP6CB when processing outbound pkts in vrf dev xmit
perf hist: Fix memory leak of a perf_hpp_fmt
net: ethernet: dec: tulip: de4x5: fix possible array overflows in type3_infoblock()
net: tulip: de4x5: fix the problem that the array 'lp->phy[8]' may be out of bound
ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port()
ata: ahci: Add Green Sardine vendor ID as board_ahci_mobile
scsi: iscsi: Unblock session then wake up error handler
* thermal: core: Reset previous low and high trip during thermal zone init
drivers/thermal/thermal_core.c
btrfs: check-integrity: fix a warning on write caching disabled disk
s390/setup: avoid using memblock_enforce_memory_limit
platform/x86: thinkpad_acpi: Fix WWAN device disabled issue after S3 deep
* net: return correct error code
net/ipv4/devinet.c
atlantic: Fix OOB read and write in hw_atl_utils_fw_rpc_wait
gfs2: Fix length of holes reported at end-of-file
* of: clk: Make <linux/of_clk.h> self-contained
include/linux/of_clk.h
NFSv42: Fix pagecache invalidation after COPY/CLONE
* shm: extend forced shm destroy to support objects from several IPC nses
include/linux/ipc_namespace.h
include/linux/sched/task.h
* BACKPORT: arm64: vdso32: suppress error message for 'make mrproper'
arch/arm64/kernel/vdso32/Makefile
Merge 4.19.219 into android-4.19-stable
Linux 4.19.219
tty: hvc: replace BUG_ON() with negative return value
xen/netfront: don't trust the backend response data blindly
xen/netfront: disentangle tx_skb_freelist
xen/netfront: don't read data from request on the ring page
xen/netfront: read response from backend only once
xen/blkfront: don't trust the backend response data blindly
xen/blkfront: don't take local copy of a request from the ring page
xen/blkfront: read response from backend only once
* xen: sync include/xen/interface/io/ring.h with Xen's newest version
include/xen/interface/io/ring.h
* fuse: release pipe buf after last use
fs/fuse/dev.c
* NFC: add NCI_UNREG flag to eliminate the race
include/net/nfc/nci_core.h
* hugetlbfs: flush TLBs correctly after huge_pmd_unshare
include/asm-generic/tlb.h
mm/memory.c
s390/mm: validate VMA in PGSTE manipulation functions
* tracing: Check pid filtering when creating events
kernel/trace/trace_events.c
vhost/vsock: fix incorrect used length reported to the guest
net: hns3: fix VF RSS failed problem after PF enable multi-TCs
net/smc: Don't call clcsock shutdown twice when smc shutdown
MIPS: use 3-level pgtable for 64KB page size on MIPS_VA_BITS_48
* tcp_cubic: fix spurious Hystart ACK train detections for not-cwnd-limited flows
net/ipv4/tcp_cubic.c
PM: hibernate: use correct mode for swsusp_close()
net/smc: Ensure the active closing peer first closes clcsock
* ipv6: fix typos in __ip6_finish_output()
net/ipv6/ip6_output.c
drm/vc4: fix error code in vc4_create_object()
scsi: mpt3sas: Fix kernel panic during drive powercycle test
ARM: socfpga: Fix crash with CONFIG_FORTIRY_SOURCE
NFSv42: Don't fail clone() unless the OP_CLONE operation failed
firmware: arm_scmi: pm: Propagate return value to caller
* net: ieee802154: handle iftypes as u32
include/net/nl802154.h
ASoC: topology: Add missing rwsem around snd_ctl_remove() calls
ASoC: qdsp6: q6routing: Conditionally reset FrontEnd Mixer
ARM: dts: BCM5301X: Add interrupt properties to GPIO node
ARM: dts: BCM5301X: Fix I2C controller interrupt
netfilter: ipvs: Fix reuse connection if RS weight is 0
arm64: dts: marvell: armada-37xx: Set pcie_reset_pin to gpio function
arm64: dts: marvell: armada-37xx: declare PCIe reset pin
pinctrl: armada-37xx: Correct PWM pins definitions
pinctrl: armada-37xx: add missing pin: PCIe1 Wakeup
pinctrl: armada-37xx: Correct mpp definitions
PCI: aardvark: Fix checking for link up via LTSSM state
PCI: aardvark: Fix link training
PCI: aardvark: Fix PCIe Max Payload Size setting
PCI: aardvark: Configure PCIe resources from 'ranges' DT property
PCI: aardvark: Update comment about disabling link training
PCI: aardvark: Move PCIe reset card code to advk_pcie_train_link()
PCI: aardvark: Fix compilation on s390
PCI: aardvark: Don't touch PCIe registers if no card connected
PCI: aardvark: Indicate error in 'val' when config read fails
PCI: aardvark: Replace custom macros by standard linux/pci_regs.h macros
PCI: aardvark: Issue PERST via GPIO
PCI: aardvark: Improve link training
PCI: aardvark: Train link immediately after enabling training
PCI: aardvark: Wait for endpoint to be ready before training link
PCI: aardvark: Fix a leaked reference by adding missing of_node_put()
proc/vmcore: fix clearing user buffer by properly using clear_user()
xtensa: use CONFIG_USE_OF instead of CONFIG_OF
* tracing: Fix pid filtering when triggers are attached
kernel/trace/trace.h
xen: detect uninitialized xenbus in xenbus_init
xen: don't continue xenstore initialization in case of errors
* fuse: fix page stealing
fs/fuse/dev.c
staging: rtl8192e: Fix use after free in _rtl92e_pci_disconnect()
HID: wacom: Use "Confidence" flag to prevent reporting invalid contacts
media: cec: copy sequence field for the reply
ALSA: ctxfi: Fix out-of-range access
* binder: fix test regression due to sender_euid change
drivers/android/binder.c
* usb: hub: Fix locking issues with address0_mutex
drivers/usb/core/hub.c
* usb: hub: Fix usb enumeration issue due to address0 race
drivers/usb/core/hub.c
usb: dwc2: hcd_queue: Fix use of floating point literal
USB: serial: option: add Fibocom FM101-GL variants
USB: serial: option: add Telit LE910S1 0x9200 composition
* Revert "net: sched: update default qdisc visibility after Tx queue cnt changes"
include/net/sch_generic.h
net/core/dev.c
net/sched/sch_generic.c
net/sched/sch_mq.c
* Revert "serial: core: Fix initializing and restoring termios speed"
drivers/tty/serial/serial_core.c
include/linux/console.h
ANDROID: GKI: disable CONFIG_FORTIFY_SOURCE
Merge 4.19.218 into android-4.19-stable
Linux 4.19.218
soc/tegra: pmc: Fix imbalanced clock disabling in error code path
usb: max-3421: Use driver data instead of maintaining a list of bound devices
* ASoC: DAPM: Cover regression by kctl change notification fix
sound/soc/soc-dapm.c
* RDMA/netlink: Add __maybe_unused to static inline in C file
include/rdma/rdma_netlink.h
batman-adv: Don't always reallocate the fragmentation skb head
batman-adv: Reserve needed_*room for fragments
batman-adv: Consider fragmentation for needed_headroom
batman-adv: mcast: fix duplicate mcast packets in BLA backbone from LAN
* perf/core: Avoid put_page() when GUP fails
kernel/events/core.c
drm/amdgpu: fix set scaling mode Full/Full aspect/Center not works on vga and dvi connectors
drm/udl: fix control-message timeout
* cfg80211: call cfg80211_stop_ap when switch from P2P_GO type
net/wireless/util.c
parisc/sticon: fix reverse colors
btrfs: fix memory ordering between normal and ordered work functions
udf: Fix crash after seekdir
x86/hyperv: Fix NULL deref in set_hv_tscchange_cb() if Hyper-V setup fails
* mm: kmemleak: slob: respect SLAB_NOLEAKTRACE flag
mm/slab.h
ipc: WARN if trying to remove ipc object which is absent
hexagon: export raw I/O routines for modules
* tun: fix bonding active backup with arp monitoring
drivers/net/tun.c
perf/x86/intel/uncore: Fix IIO event constraints for Skylake Server
perf/x86/intel/uncore: Fix filter_tid mask for CHA events on Skylake Server
NFC: reorder the logic in nfc_{un,}register_device
NFC: reorganize the functions in nci_request
i40e: Fix display error code in dmesg
i40e: Fix changing previously set num_queue_pairs for PFs
i40e: Fix NULL ptr dereference on VSI filter sync
i40e: Fix correct max_pkt_size on VF RX queue
* net: virtio_net_hdr_to_skb: count transport header in UFO
include/linux/virtio_net.h
platform/x86: hp_accel: Fix an error handling path in 'lis3lv02d_probe()'
mips: lantiq: add support for clk_get_parent()
mips: bcm63xx: add support for clk_get_parent()
MIPS: generic/yamon-dt: fix uninitialized variable error
iavf: Fix for the false positive ASQ/ARQ errors while issuing VF reset
iavf: check for null in iavf_fix_features
net: bnx2x: fix variable dereferenced before check
drm/nouveau: hdmigv100.c: fix corrupted HDMI Vendor InfoFrame
* sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain()
kernel/sched/core.c
mips: BCM63XX: ensure that CPU_SUPPORTS_32BIT_KERNEL is set
sh: define __BIG_ENDIAN for math-emu
sh: fix kconfig unmet dependency warning for FRAME_POINTER
* f2fs: fix up f2fs_lookup tracepoints
include/trace/events/f2fs.h
maple: fix wrong return value of maple_bus_init().
sh: check return code of request_irq
powerpc/dcr: Use cmplwi instead of 3-argument cmpli
ALSA: gus: fix null pointer dereference on pointer block
powerpc/5200: dts: fix memory node unit name
scsi: target: Fix alua_tg_pt_gps_count tracking
* scsi: target: Fix ordered tag handling
include/target/target_core_base.h
MIPS: sni: Fix the build
* tty: tty_buffer: Fix the softlockup issue in flush_to_ldisc
drivers/tty/tty_buffer.c
* ALSA: ISA: not for M68K
sound/core/Makefile
sound/isa/Kconfig
sound/pci/Kconfig
usb: host: ohci-tmio: check return value after calling platform_get_resource()
ARM: dts: omap: fix gpmc,mux-add-data type
* firmware_loader: fix pre-allocated buf built-in firmware use
drivers/base/firmware_loader/main.c
scsi: advansys: Fix kernel pointer leak
ASoC: nau8824: Add DMI quirk mechanism for active-high jack-detect
arm64: dts: freescale: fix arm,sp805 compatible string
usb: typec: tipd: Remove WARN_ON in tps6598x_block_read
usb: musb: tusb6010: check return value after calling platform_get_resource()
arm64: dts: hisilicon: fix arm,sp805 compatible string
scsi: lpfc: Fix list_add() corruption in lpfc_drain_txq()
arm64: zynqmp: Fix serial compatible string
arm64: zynqmp: Do not duplicate flash partition label property
erofs: fix unsafe pagevec reuse of hooked pclusters
erofs: remove the occupied parameter from z_erofs_pagevec_enqueue()
* PCI: Add MSI masking quirk for Nvidia ION AHCI
drivers/pci/quirks.c
* PCI/MSI: Deal with devices lying about their MSI mask capability
drivers/pci/msi.c
include/linux/pci.h
* PCI/MSI: Destroy sysfs before freeing entries
drivers/pci/msi.c
parisc/entry: fix trace test in syscall exit path
* fortify: Explicitly disable Clang support
security/Kconfig
* ext4: fix lazy initialization next schedule time computation in more granular unit
fs/ext4/super.c
x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
* fuse: truncate pagecache on atomic_o_trunc
fs/fuse/file.c
* PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros
include/uapi/linux/pci_regs.h
s390/tape: fix timer initialization in tape_std_assign()
s390/cio: check the subchannel validity for dev_busid
* video: backlight: Drop maximum brightness override for brightness zero
drivers/video/backlight/backlight.c
backlight: gpio-backlight: Correct initial power state handling
* mm, oom: do not trigger out_of_memory from the #PF
mm/oom_kill.c
* mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks
mm/oom_kill.c
powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC
powerpc/security: Add a helper to query stf_barrier type
powerpc/bpf: Fix BPF_SUB when imm == 0x80000000
powerpc/bpf: Validate branch ranges
powerpc/lib: Add helper to check if offset is within conditional branch range
9p/net: fix missing error check in p9_check_errors
* f2fs: should use GFP_NOFS for directory inodes
fs/f2fs/inode.c
fs/f2fs/namei.c
ARM: 9156/1: drop cc-option fallbacks for architecture selection
ARM: 9155/1: fix early early_iounmap()
USB: chipidea: fix interrupt deadlock
cxgb4: fix eeprom len when diagnostics not implemented
vsock: prevent unnecessary refcnt inc for nonblocking connect
* arm64: pgtable: make __pte_to_phys/__phys_to_pte_val inline functions
arch/arm64/include/asm/pgtable.h
nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails
* llc: fix out-of-bound array index in llc_sk_dev_hash()
include/net/llc.h
* zram: off by one in read_block_state()
drivers/block/zram/zram_drv.c
* mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration()
mm/zsmalloc.c
* bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed
drivers/net/bonding/bond_sysfs_slave.c
ACPI: PMIC: Fix intel_pmic_regs_handler() read accesses
net: davinci_emac: Fix interrupt pacing disable
xen-pciback: Fix return in pm_ctrl_init()
i2c: xlr: Fix a resource leak in the error handling path of 'xlr_i2c_probe()'
scsi: qla2xxx: Turn off target reset during issue_lip
scsi: qla2xxx: Fix gnl list corruption
* ar7: fix kernel builds for compiler test
drivers/watchdog/Kconfig
watchdog: f71808e_wdt: fix inaccurate report in WDIOC_GETTIMEOUT
m68k: set a default value for MEMORY_RESERVE
* dmaengine: dmaengine_desc_callback_valid(): Check for `callback_result`
drivers/dma/dmaengine.h
* netfilter: nfnetlink_queue: fix OOB when mac header was cleared
net/netfilter/nfnetlink_queue.c
auxdisplay: ht16k33: Fix frame buffer device blanking
auxdisplay: ht16k33: Connect backlight to fbdev
auxdisplay: img-ascii-lcd: Fix lock-up when displaying empty string
dmaengine: at_xdmac: fix AT_XDMAC_CC_PERID() macro
mtd: spi-nor: hisi-sfc: Remove excessive clk_disable_unprepare()
fs: orangefs: fix error return code of orangefs_revalidate_lookup()
NFS: Fix deadlocks in nfs_scan_commit_list()
PCI: aardvark: Don't spam about PIO Response Status
* drm/plane-helper: fix uninitialized variable reference
drivers/gpu/drm/drm_plane_helper.c
pnfs/flexfiles: Fix misplaced barrier in nfs4_ff_layout_prepare_ds
* rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined
include/linux/rpmsg.h
apparmor: fix error check
power: supply: bq27xxx: Fix kernel crash on IRQ handler register error
mips: cm: Convert to bitfield API to fix out-of-bounds access
serial: xilinx_uartps: Fix race condition causing stuck TX
phy: qcom-qusb2: Fix a memory leak on probe
ASoC: cs42l42: Defer probe if request_threaded_irq() returns EPROBE_DEFER
ASoC: cs42l42: Correct some register default values
RDMA/mlx4: Return missed an error if device doesn't support steering
scsi: csiostor: Uninitialized data in csio_ln_vnp_read_cbfn()
power: supply: rt5033_battery: Change voltage values to µV
usb: gadget: hid: fix error code in do_config()
serial: 8250_dw: Drop wrong use of ACPI_PTR()
video: fbdev: chipsfb: use memset_io() instead of memset()
memory: fsl_ifc: fix leak of irq and nand_irq in fsl_ifc_ctrl_probe
soc/tegra: Fix an error handling path in tegra_powergate_power_up()
arm: dts: omap3-gta04a4: accelerometer irq fix
ALSA: hda: Reduce udelay() at SKL+ position reporting
JFS: fix memleak in jfs_mount
MIPS: loongson64: make CPU_LOONGSON64 depends on MIPS_FP_SUPPORT
scsi: dc395: Fix error case unwinding
ARM: dts: at91: tse850: the emac<->phy interface is rmii
RDMA/bnxt_re: Fix query SRQ failure
arm64: dts: rockchip: Fix GPU register width for RK3328
ARM: s3c: irq-s3c24xx: Fix return value check for s3c24xx_init_intc()
RDMA/rxe: Fix wrong port_cap_flags
ibmvnic: Process crqs after enabling interrupts
selftests/bpf: Fix fclose/pclose mismatch in test_progs
crypto: pcrypt - Delay write to padata->info
net: phylink: avoid mvneta warning when setting pause parameters
net: amd-xgbe: Toggle PLL settings during rate change
wcn36xx: add proper DMA memory barriers in rx path
libertas: Fix possible memory leak in probe and disconnect
libertas_tf: Fix possible memory leak in probe and disconnect
KVM: s390: Fix handle_sske page fault handling
samples/kretprobes: Fix return value if register_kretprobe() failed
* tcp: don't free a FIN sk_buff in tcp_remove_empty_skb()
net/ipv4/tcp.c
irq: mips: avoid nested irq_enter()
s390/gmap: don't unconditionally call pte_unmap_unlock() in __gmap_zap()
smackfs: use netlbl_cfg_cipsov4_del() for deleting cipso_v4_doi
drm/msm: Fix potential NULL dereference in DPU SSPP
* clocksource/drivers/timer-ti-dm: Select TIMER_OF
drivers/clocksource/Kconfig
PM: hibernate: fix sparse warnings
nvme-rdma: fix error code in nvme_rdma_setup_ctrl
phy: micrel: ksz8041nl: do not use power down mode
mwifiex: Send DELBA requests according to spec
rsi: stop thread firstly in rsi_91x_init() error handling
platform/x86: thinkpad_acpi: Fix bitwise vs. logical warning
mmc: mxs-mmc: disable regulator on error and in the remove function
* net: stream: don't purge sk_error_queue in sk_stream_kill_queues()
net/core/stream.c
drm/msm: uninitialized variable in msm_gem_import()
ath10k: fix max antenna gain unit
hwmon: (pmbus/lm25066) Let compiler determine outer dimension of lm25066_coeff
* hwmon: Fix possible memleak in __hwmon_device_register()
drivers/hwmon/hwmon.c
memstick: jmb38x_ms: use appropriate free function in jmb38x_ms_alloc_host()
memstick: avoid out-of-range warning
mmc: sdhci-omap: Fix NULL pointer exception if regulator is not configured
b43: fix a lower bounds test
b43legacy: fix a lower bounds test
hwrng: mtk - Force runtime pm ops for sleep ops
crypto: qat - disregard spurious PFVF interrupts
crypto: qat - detect PFVF collision after ACK
media: dvb-frontends: mn88443x: Handle errors of clk_prepare_enable()
ath9k: Fix potential interrupt storm on queue reset
media: em28xx: Don't use ops->suspend if it is NULL
* cpuidle: Fix kobject memory leaks in error paths
drivers/cpuidle/sysfs.c
media: cx23885: Fix snd_card_free call on null card pointer
media: si470x: Avoid card name truncation
media: mtk-vpu: Fix a resource leak in the error handling path of 'mtk_vpu_probe()'
media: dvb-usb: fix ununit-value in az6027_rc_query
media: em28xx: add missing em28xx_close_extension
drm/amdgpu: fix warning for overflow check
net: dsa: rtl8366rb: Fix off-by-one bug
* cgroup: Make rebind_subsystems() disable v2 controllers all at once
kernel/cgroup/cgroup.c
* Bluetooth: fix init and cleanup of sco_conn.timeout_work
net/bluetooth/sco.c
parisc/kgdb: add kgdb_roundup() to make kgdb work with idle polling
parisc/unwind: fix unwinder when CONFIG_64BIT is enabled
* task_stack: Fix end_of_stack() for architectures with upwards-growing stack
include/linux/sched/task_stack.h
parisc: fix warning in flush_tlb_all
x86/hyperv: Protect set_hv_tscchange_cb() against getting preempted
spi: bcm-qspi: Fix missing clk_disable_unprepare() on error in bcm_qspi_probe()
ARM: 9136/1: ARMv7-M uses BE-8, not BE-32
* gre/sit: Don't generate link-local addr if addr_gen_mode is IN6_ADDR_GEN_MODE_NONE
net/ipv6/addrconf.c
ARM: clang: Do not rely on lr register for stacktrace
smackfs: use __GFP_NOFAIL for smk_cipso_doi()
iwlwifi: mvm: disable RX-diversity in powersave
PM: hibernate: Get block device exclusively in swsusp_check()
mwl8k: Fix use-after-free in mwl8k_fw_state_machine()
tracing/cfi: Fix cmp_entries_* functions signature mismatch
* workqueue: make sysfs of unbound kworker cpumask more clever
kernel/workqueue.c
* lib/xz: Validate the value before assigning it to an enum variable
lib/xz/xz_dec_stream.c
* lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression
lib/xz/xz_dec_lzma2.c
memstick: r592: Fix a UAF bug when removing the driver
leaking_addresses: Always print a trailing newline
ACPI: battery: Accept charges over the design capacity as full
ath: dfs_pattern_detector: Fix possible null-pointer dereference in channel_detector_create()
* tracefs: Have tracefs directories not set OTH permission bits by default
fs/tracefs/inode.c
media: usb: dvd-usb: fix uninit-value bug in dibusb_read_eeprom_byte()
ACPICA: Avoid evaluating methods too early during system resume
media: rcar-csi2: Add checking to rcsi2_start_receiver()
ia64: don't do IA64_CMPXCHG_DEBUG without CONFIG_PRINTK
media: mceusb: return without resubmitting URB in case of -EPROTO error.
media: s5p-mfc: Add checking to s5p_mfc_probe().
media: s5p-mfc: fix possible null-pointer dereference in s5p_mfc_probe()
media: uvcvideo: Return -EIO for control errors
media: uvcvideo: Set capability in s_param
media: netup_unidvb: handle interrupt properly according to the firmware
media: mt9p031: Fix corrupted frame after restarting stream
mwifiex: Properly initialize private structure on interface type changes
mwifiex: Run SET_BSS_MODE when changing from P2P to STATION vif-type
x86: Increase exception stack sizes
smackfs: Fix use-after-free in netlbl_catmap_walk()
* net: sched: update default qdisc visibility after Tx queue cnt changes
include/net/sch_generic.h
net/core/dev.c
net/sched/sch_generic.c
net/sched/sch_mq.c
locking/lockdep: Avoid RCU-induced noinstr fail
MIPS: lantiq: dma: reset correct number of channel
MIPS: lantiq: dma: add small delay after reset
platform/x86: wmi: do not fail if disabling fails
* Bluetooth: fix use-after-free error in lock_sock_nested()
net/bluetooth/l2cap_sock.c
* Bluetooth: sco: Fix lock_sock() blockage by memcpy_from_msg()
net/bluetooth/sco.c
* drm: panel-orientation-quirks: Add quirk for KD Kurio Smart C15200 2-in-1
drivers/gpu/drm/drm_panel_orientation_quirks.c
USB: iowarrior: fix control-message timeouts
USB: serial: keyspan: fix memleak on probe errors
iio: dac: ad5446: Fix ad5622_write() return value
* pinctrl: core: fix possible memory leak in pinctrl_enable()
drivers/pinctrl/core.c
* quota: correct error number in free_dqentry()
fs/quota/quota_tree.c
* quota: check block number when reading the block in quota file
fs/quota/quota_tree.c
PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG
PCI: aardvark: Fix return value of MSI domain .alloc() method
PCI: aardvark: Do not unmask unused interrupts
PCI: aardvark: Do not clear status bits of masked interrupts
xen/balloon: add late_initcall_sync() for initial ballooning done
ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
ALSA: mixer: oss: Fix racy access to slots
* serial: core: Fix initializing and restoring termios speed
drivers/tty/serial/serial_core.c
include/linux/console.h
powerpc/85xx: Fix oops when mpc85xx_smp_guts_ids node cannot be found
power: supply: max17042_battery: use VFSOC for capacity when no rsns
power: supply: max17042_battery: Prevent int underflow in set_soc_threshold
signal/mips: Update (_save|_restore)_fp_context to fail with -EFAULT
* signal: Remove the bogus sigkill_pending in ptrace_stop
kernel/signal.c
RDMA/qedr: Fix NULL deref for query_qp on the GSI QP
rsi: Fix module dev_oper_mode parameter description
rsi: fix rate mask set leading to P2P failure
rsi: fix key enabled check causing unwanted encryption for vap_id > 0
rsi: fix occasional initialisation failure with BT coex
wcn36xx: handle connection loss indication
libata: fix checking of DMA state
mwifiex: Read a PCI register after writing the TX ring write pointer
wcn36xx: Fix HT40 capability for 2Ghz band
evm: mark evm_fixmode as __ro_after_init
rtl8187: fix control-message timeouts
* PCI: Mark Atheros QCA6174 to avoid bus reset
drivers/pci/quirks.c
ath10k: fix division by zero in send path
ath10k: fix control-message timeout
ath6kl: fix control-message timeout
ath6kl: fix division by zero in send path
mwifiex: fix division by zero in fw download path
EDAC/sb_edac: Fix top-of-high-memory value for Broadwell/Haswell
regulator: dt-bindings: samsung,s5m8767: correct s5m8767,pmic-buck-default-dvs-idx property
regulator: s5m8767: do not use reset value as DVS voltage if GPIO DVS is disabled
hwmon: (pmbus/lm25066) Add offset coefficients
ia64: kprobes: Fix to pass correct trampoline address to the handler
btrfs: call btrfs_check_rw_degradable only if there is a missing device
btrfs: fix lost error handling when replaying directory deletes
btrfs: clear MISSING device status bit in btrfs_close_one_device
vmxnet3: do not stop tx queues after netif_device_detach()
watchdog: Fix OMAP watchdog early handling
spi: spl022: fix Microwire full duplex mode
xen/netfront: stop tx queues during live migration
* bpf: Prevent increasing bpf_jit_limit above max
include/linux/filter.h
kernel/bpf/core.c
net/core/sysctl_net_core.c
* drm: panel-orientation-quirks: Add quirk for Aya Neo 2021
drivers/gpu/drm/drm_panel_orientation_quirks.c
* mmc: winbond: don't build on M68K
drivers/mmc/host/Kconfig
hyperv/vmbus: include linux/bitops.h
sfc: Don't use netif_info before net_device setup
cavium: Fix return values of the probe function
scsi: qla2xxx: Fix unmap of already freed sgl
cavium: Return negative value when pci_alloc_irq_vectors() fails
x86/irq: Ensure PI wakeup handler is unregistered before module unload
x86/sme: Use #define USE_EARLY_PGTABLE_L5 in mem_encrypt_identity.c
* ALSA: timer: Unconditionally unlink slave instances, too
sound/core/timer.c
* ALSA: timer: Fix use-after-free problem
sound/core/timer.c
ALSA: synth: missing check for possible NULL after the call to kstrdup
* ALSA: usb-audio: Add registration quirk for JBL Quantum 400
sound/usb/quirks.c
ALSA: line6: fix control and interrupt message timeouts
ALSA: 6fire: fix control and bulk message timeouts
ALSA: ua101: fix division by zero at probe
ALSA: hda/realtek: Add quirk for Clevo PC70HS
media: ir-kbd-i2c: improve responsiveness of hauppauge zilog receivers
media: ite-cir: IR receiver stop working after receive overflow
crypto: s5p-sss - Add error handling in s5p_aes_probe()
firmware/psci: fix application of sizeof to pointer
tpm: Check for integer overflow in tpm2_map_response_body()
parisc: Fix ptrace check on syscall return
mmc: dw_mmc: Dont wait for DRTO on Write RSP error
ocfs2: fix data corruption on truncate
* libata: fix read log timeout value
include/linux/libata.h
Input: i8042 - Add quirk for Fujitsu Lifebook T725
Input: elantench - fix misreporting trackpoint coordinates
* binder: use cred instead of task for selinux checks
drivers/android/binder.c
include/linux/lsm_hooks.h
include/linux/security.h
security/security.c
security/selinux/hooks.c
* binder: use euid from cred instead of using task
drivers/android/binder.c
* xhci: Fix USB 3.1 enumeration issues by increasing roothub power-on-good delay
drivers/usb/host/xhci-hub.c
* ANDROID: usb: gadget: f_accessory: Mitgate handling of non-existent USB request
drivers/usb/gadget/function/f_accessory.c
* UPSTREAM: binder: use cred instead of task for getsecid
drivers/android/binder.c
include/linux/security.h
* FROMGIT: binder: fix test regression due to sender_euid change
drivers/android/binder.c
* BACKPORT: binder: use cred instead of task for selinux checks
drivers/android/binder.c
include/linux/lsm_hooks.h
include/linux/security.h
security/security.c
security/selinux/hooks.c
* UPSTREAM: binder: use euid from cred instead of using task
drivers/android/binder.c
* ANDROID: setlocalversion: make KMI_GENERATION optional
scripts/setlocalversion
Merge 4.19.217 into android-4.19-stable
Linux 4.19.217
rsi: fix control-message timeout
staging: rtl8192u: fix control-message timeouts
staging: r8712u: fix control-message timeout
comedi: vmk80xx: fix bulk and interrupt message timeouts
comedi: vmk80xx: fix bulk-buffer overflow
comedi: vmk80xx: fix transfer-buffer overflows
comedi: ni_usb6501: fix NULL-deref in command paths
comedi: dt9812: fix DMA buffers on stack
isofs: Fix out of bound access for corrupted isofs image
* printk/console: Allow to disable console output by using console="" or console=null
kernel/printk/printk.c
* usb-storage: Add compatibility quirk flags for iODD 2531/2541
drivers/usb/storage/unusual_devs.h
usb: musb: Balance list entry in musb_gadget_queue
* usb: gadget: Mark USB_FSL_QE broken on 64-bit
drivers/usb/gadget/udc/Kconfig
* usb: ehci: handshake CMD_RUN instead of STS_HALT
drivers/usb/host/ehci-hcd.c
drivers/usb/host/ehci-platform.c
drivers/usb/host/ehci.h
Revert "x86/kvm: fix vcpu-id indexed array sizes"
Merge 4.19.216 into android-4.19-stable
Linux 4.19.216
* ARM: 9120/1: Revert "amba: make use of -1 IRQs warn"
drivers/amba/bus.c
* arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed
include/asm-generic/pgtable.h
sfc: Fix reading non-legacy supported link modes
IB/qib: Protect from buffer overflow in struct qib_user_sdma_pkt fields
IB/qib: Use struct_size() helper
media: firewire: firedtv-avc: fix a buffer overflow in avc_ca_pmt()
* scsi: core: Put LLD module refcnt after SCSI device is released
drivers/scsi/scsi.c
drivers/scsi/scsi_sysfs.c
* UPSTREAM: security: selinux: allow per-file labeling for bpffs
security/selinux/hooks.c
Bug: 210364486
Change-Id: I6232c6c7fde1bf54c16a32dd632456dc41e01e6e
Signed-off-by: JohnnLee <johnnlee@google.com>
1168 lines
31 KiB
C
1168 lines
31 KiB
C
/*
|
|
* linux/mm/oom_kill.c
|
|
*
|
|
* Copyright (C) 1998,2000 Rik van Riel
|
|
* Thanks go out to Claus Fischer for some serious inspiration and
|
|
* for goading me into coding this file...
|
|
* Copyright (C) 2010 Google, Inc.
|
|
* Rewritten by David Rientjes
|
|
*
|
|
* The routines in this file are used to kill a process when
|
|
* we're seriously out of memory. This gets called from __alloc_pages()
|
|
* in mm/page_alloc.c when we really run out of memory.
|
|
*
|
|
* Since we won't call these routines often (on a well-configured
|
|
* machine) this file will double as a 'coding guide' and a signpost
|
|
* for newbie kernel hackers. It features several pointers to major
|
|
* kernel subsystems and hints as to where to find out what things do.
|
|
*/
|
|
|
|
#include <linux/oom.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/err.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/sched/coredump.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/timex.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/cpuset.h>
|
|
#include <linux/export.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/security.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/init.h>
|
|
#include <linux/mmu_notifier.h>
|
|
#include <linux/memory_hotplug.h>
|
|
#include <linux/show_mem_notifier.h>
|
|
|
|
#include <asm/tlb.h>
|
|
#include "internal.h"
|
|
#include "slab.h"
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/oom.h>
|
|
|
|
/*
 * Tunables read by the OOM killer in this file.
 *
 * sysctl_panic_on_oom: 0 never panics; 2 panics regardless of the allocation
 * constraint; any other non-zero value panics only for unconstrained
 * (CONSTRAINT_NONE) ooms.  See check_panic_on_oom().
 */
int sysctl_panic_on_oom;
/* Non-zero: out_of_memory() kills the allocating task when it is eligible. */
int sysctl_oom_kill_allocating_task;
/* Non-zero (the default): dump_header() also prints the task list. */
int sysctl_oom_dump_tasks = 1;
|
|
|
|
/*
 * oom_lock serializes every oom killer invocation (out_of_memory()),
 * whatever context it comes from, so that concurrent callers (e.g. from
 * different domains) do not kill more eagerly than necessary.
 *
 * oom_killer_disable() relies on this lock as well, to stabilize
 * oom_killer_disabled and mark_oom_victim.
 */
DEFINE_MUTEX(oom_lock);

/* Serializes updates of oom_score_adj and oom_score_adj_min. */
DEFINE_MUTEX(oom_adj_mutex);
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/**
|
|
* has_intersects_mems_allowed() - check task eligiblity for kill
|
|
* @start: task struct of which task to consider
|
|
* @mask: nodemask passed to page allocator for mempolicy ooms
|
|
*
|
|
* Task eligibility is determined by whether or not a candidate task, @tsk,
|
|
* shares the same mempolicy nodes as current if it is bound by such a policy
|
|
* and whether or not it has the same set of allowed cpuset nodes.
|
|
*/
|
|
static bool has_intersects_mems_allowed(struct task_struct *start,
|
|
const nodemask_t *mask)
|
|
{
|
|
struct task_struct *tsk;
|
|
bool ret = false;
|
|
|
|
rcu_read_lock();
|
|
for_each_thread(start, tsk) {
|
|
if (mask) {
|
|
/*
|
|
* If this is a mempolicy constrained oom, tsk's
|
|
* cpuset is irrelevant. Only return true if its
|
|
* mempolicy intersects current, otherwise it may be
|
|
* needlessly killed.
|
|
*/
|
|
ret = mempolicy_nodemask_intersects(tsk, mask);
|
|
} else {
|
|
/*
|
|
* This is not a mempolicy constrained oom, so only
|
|
* check the mems of tsk's cpuset.
|
|
*/
|
|
ret = cpuset_mems_allowed_intersects(current, tsk);
|
|
}
|
|
if (ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
#else
|
|
static bool has_intersects_mems_allowed(struct task_struct *tsk,
|
|
const nodemask_t *mask)
|
|
{
|
|
return true;
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
/*
|
|
* The process p may have detached its own ->mm while exiting or through
|
|
* use_mm(), but one or more of its subthreads may still have a valid
|
|
* pointer. Return p, or any of its subthreads with a valid ->mm, with
|
|
* task_lock() held.
|
|
*/
|
|
struct task_struct *find_lock_task_mm(struct task_struct *p)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
rcu_read_lock();
|
|
|
|
for_each_thread(p, t) {
|
|
task_lock(t);
|
|
if (likely(t->mm))
|
|
goto found;
|
|
task_unlock(t);
|
|
}
|
|
t = NULL;
|
|
found:
|
|
rcu_read_unlock();
|
|
|
|
return t;
|
|
}
|
|
|
|
/*
|
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
|
* for display purposes.
|
|
*/
|
|
static inline bool is_sysrq_oom(struct oom_control *oc)
|
|
{
|
|
return oc->order == -1;
|
|
}
|
|
|
|
static inline bool is_memcg_oom(struct oom_control *oc)
|
|
{
|
|
return oc->memcg != NULL;
|
|
}
|
|
|
|
/* return true if the task is not adequate as candidate victim task. */
|
|
static bool oom_unkillable_task(struct task_struct *p,
|
|
struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
|
{
|
|
if (is_global_init(p))
|
|
return true;
|
|
if (p->flags & PF_KTHREAD)
|
|
return true;
|
|
|
|
/* When mem_cgroup_out_of_memory() and p is not member of the group */
|
|
if (memcg && !task_in_mem_cgroup(p, memcg))
|
|
return true;
|
|
|
|
/* p may not have freeable memory in nodemask */
|
|
if (!has_intersects_mems_allowed(p, nodemask))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Print out unreclaimble slabs info when unreclaimable slabs amount is greater
|
|
* than all user memory (LRU pages)
|
|
*/
|
|
static bool is_dump_unreclaim_slabs(void)
|
|
{
|
|
unsigned long nr_lru;
|
|
|
|
nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
|
|
global_node_page_state(NR_INACTIVE_ANON) +
|
|
global_node_page_state(NR_ACTIVE_FILE) +
|
|
global_node_page_state(NR_INACTIVE_FILE) +
|
|
global_node_page_state(NR_ISOLATED_ANON) +
|
|
global_node_page_state(NR_ISOLATED_FILE) +
|
|
global_node_page_state(NR_UNEVICTABLE);
|
|
|
|
return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
|
|
}
|
|
|
|
/**
|
|
* oom_badness - heuristic function to determine which candidate task to kill
|
|
* @p: task struct of which task we should calculate
|
|
* @totalpages: total present RAM allowed for page allocation
|
|
* @memcg: task's memory controller, if constrained
|
|
* @nodemask: nodemask passed to page allocator for mempolicy ooms
|
|
*
|
|
* The heuristic for determining which task to kill is made to be as simple and
|
|
* predictable as possible. The goal is to return the highest value for the
|
|
* task consuming the most memory to avoid subsequent oom failures.
|
|
*/
|
|
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
|
const nodemask_t *nodemask, unsigned long totalpages)
|
|
{
|
|
long points;
|
|
long adj;
|
|
|
|
if (oom_unkillable_task(p, memcg, nodemask))
|
|
return 0;
|
|
|
|
p = find_lock_task_mm(p);
|
|
if (!p)
|
|
return 0;
|
|
|
|
/*
|
|
* Do not even consider tasks which are explicitly marked oom
|
|
* unkillable or have been already oom reaped or the are in
|
|
* the middle of vfork
|
|
*/
|
|
adj = (long)p->signal->oom_score_adj;
|
|
if (adj == OOM_SCORE_ADJ_MIN ||
|
|
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
|
|
in_vfork(p)) {
|
|
task_unlock(p);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* The baseline for the badness score is the proportion of RAM that each
|
|
* task's rss, pagetable and swap space use.
|
|
*/
|
|
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
|
|
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
|
|
task_unlock(p);
|
|
|
|
/* Normalize to oom_score_adj units */
|
|
adj *= totalpages / 1000;
|
|
points += adj;
|
|
|
|
/*
|
|
* Never return 0 for an eligible task regardless of the root bonus and
|
|
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
|
|
*/
|
|
return points > 0 ? points : 1;
|
|
}
|
|
|
|
/* Kind of limitation that applied to the failing allocation. */
enum oom_constraint {
	CONSTRAINT_NONE,		/* no restriction detected */
	CONSTRAINT_CPUSET,		/* limited by the current cpuset */
	CONSTRAINT_MEMORY_POLICY,	/* limited by a mempolicy nodemask */
	CONSTRAINT_MEMCG,		/* memory cgroup oom */
};
|
|
|
|
/*
|
|
* Determine the type of allocation constraint.
|
|
*/
|
|
static enum oom_constraint constrained_alloc(struct oom_control *oc)
|
|
{
|
|
struct zone *zone;
|
|
struct zoneref *z;
|
|
enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
|
|
bool cpuset_limited = false;
|
|
int nid;
|
|
|
|
if (is_memcg_oom(oc)) {
|
|
oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
|
|
return CONSTRAINT_MEMCG;
|
|
}
|
|
|
|
/* Default to all available memory */
|
|
oc->totalpages = totalram_pages + total_swap_pages;
|
|
|
|
if (!IS_ENABLED(CONFIG_NUMA))
|
|
return CONSTRAINT_NONE;
|
|
|
|
if (!oc->zonelist)
|
|
return CONSTRAINT_NONE;
|
|
/*
|
|
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
|
* to kill current.We have to random task kill in this case.
|
|
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
|
*/
|
|
if (oc->gfp_mask & __GFP_THISNODE)
|
|
return CONSTRAINT_NONE;
|
|
|
|
/*
|
|
* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
|
|
* the page allocator means a mempolicy is in effect. Cpuset policy
|
|
* is enforced in get_page_from_freelist().
|
|
*/
|
|
if (oc->nodemask &&
|
|
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, *oc->nodemask)
|
|
oc->totalpages += node_spanned_pages(nid);
|
|
return CONSTRAINT_MEMORY_POLICY;
|
|
}
|
|
|
|
/* Check this allocation failure is caused by cpuset's wall function */
|
|
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
|
|
high_zoneidx, oc->nodemask)
|
|
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
|
|
cpuset_limited = true;
|
|
|
|
if (cpuset_limited) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, cpuset_current_mems_allowed)
|
|
oc->totalpages += node_spanned_pages(nid);
|
|
return CONSTRAINT_CPUSET;
|
|
}
|
|
return CONSTRAINT_NONE;
|
|
}
|
|
|
|
static int oom_evaluate_task(struct task_struct *task, void *arg)
|
|
{
|
|
struct oom_control *oc = arg;
|
|
unsigned long points;
|
|
|
|
if (oom_unkillable_task(task, NULL, oc->nodemask))
|
|
goto next;
|
|
|
|
/*
|
|
* This task already has access to memory reserves and is being killed.
|
|
* Don't allow any other task to have access to the reserves unless
|
|
* the task has MMF_OOM_SKIP because chances that it would release
|
|
* any memory is quite low.
|
|
*/
|
|
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
|
|
if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
|
|
goto next;
|
|
goto abort;
|
|
}
|
|
|
|
/*
|
|
* If task is allocating a lot of memory and has been marked to be
|
|
* killed first if it triggers an oom, then select it.
|
|
*/
|
|
if (oom_task_origin(task)) {
|
|
points = ULONG_MAX;
|
|
goto select;
|
|
}
|
|
|
|
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
|
|
if (!points || points < oc->chosen_points)
|
|
goto next;
|
|
|
|
/* Prefer thread group leaders for display purposes */
|
|
if (points == oc->chosen_points && thread_group_leader(oc->chosen))
|
|
goto next;
|
|
select:
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
get_task_struct(task);
|
|
oc->chosen = task;
|
|
oc->chosen_points = points;
|
|
next:
|
|
return 0;
|
|
abort:
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
oc->chosen = (void *)-1UL;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Simple selection loop. We choose the process with the highest number of
|
|
* 'points'. In case scan was aborted, oc->chosen is set to -1.
|
|
*/
|
|
static void select_bad_process(struct oom_control *oc)
|
|
{
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
|
|
else {
|
|
struct task_struct *p;
|
|
|
|
rcu_read_lock();
|
|
for_each_process(p)
|
|
if (oom_evaluate_task(p, oc))
|
|
break;
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
|
|
}
|
|
|
|
/**
|
|
* dump_tasks - dump current memory state of all system tasks
|
|
* @memcg: current's memory controller, if constrained
|
|
* @nodemask: nodemask passed to page allocator for mempolicy ooms
|
|
*
|
|
* Dumps the current memory state of all eligible tasks. Tasks not in the same
|
|
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
|
|
* are not shown.
|
|
* State information includes task's pid, uid, tgid, vm size, rss,
|
|
* pgtables_bytes, swapents, oom_score_adj value, and name.
|
|
*/
|
|
void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
|
{
|
|
struct task_struct *p;
|
|
struct task_struct *task;
|
|
|
|
pr_info("Tasks state (memory values in pages):\n");
|
|
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (oom_unkillable_task(p, memcg, nodemask))
|
|
continue;
|
|
|
|
task = find_lock_task_mm(p);
|
|
if (!task) {
|
|
/*
|
|
* This is a kthread or all of p's threads have already
|
|
* detached their mm's. There's no need to report
|
|
* them; they can't be oom killed anyway.
|
|
*/
|
|
continue;
|
|
}
|
|
|
|
pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
|
|
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
|
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
|
mm_pgtables_bytes(task->mm),
|
|
get_mm_counter(task->mm, MM_SWAPENTS),
|
|
task->signal->oom_score_adj, task->comm);
|
|
task_unlock(task);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
static void dump_header(struct oom_control *oc, struct task_struct *p)
|
|
{
|
|
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
|
|
current->comm, oc->gfp_mask, &oc->gfp_mask,
|
|
nodemask_pr_args(oc->nodemask), oc->order,
|
|
current->signal->oom_score_adj);
|
|
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
|
|
pr_warn("COMPACTION is disabled!!!\n");
|
|
|
|
cpuset_print_current_mems_allowed();
|
|
dump_stack();
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_print_oom_info(oc->memcg, p);
|
|
else {
|
|
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
|
|
if (is_dump_unreclaim_slabs())
|
|
dump_unreclaimable_slab();
|
|
|
|
show_mem_call_notifiers();
|
|
}
|
|
|
|
if (sysctl_oom_dump_tasks)
|
|
dump_tasks(oc->memcg, oc->nodemask);
|
|
}
|
|
|
|
/*
|
|
* Number of OOM victims in flight
|
|
*/
|
|
static atomic_t oom_victims = ATOMIC_INIT(0);
|
|
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
|
|
|
|
static bool oom_killer_disabled __read_mostly;
|
|
|
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
|
|
|
/*
|
|
* task->mm can be NULL if the task is the exited group leader. So to
|
|
* determine whether the task is using a particular mm, we examine all the
|
|
* task's threads: if one of those is using this mm then this task was also
|
|
* using it.
|
|
*/
|
|
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
for_each_thread(p, t) {
|
|
struct mm_struct *t_mm = READ_ONCE(t->mm);
|
|
if (t_mm)
|
|
return t_mm == mm;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#ifdef CONFIG_MMU
|
|
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
/* The reaper thread sleeps here until oom_reaper_list is non-NULL. */
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* Singly linked list of queued victims, chained via ->oom_reaper_list. */
static struct task_struct *oom_reaper_list;
/* Held while oom_reaper_list is pushed to or popped from. */
static DEFINE_SPINLOCK(oom_reaper_lock);
|
|
|
|
bool __oom_reap_task_mm(struct mm_struct *mm)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
bool ret = true;
|
|
|
|
/*
|
|
* Tell all users of get_user/copy_from_user etc... that the content
|
|
* is no longer stable. No barriers really needed because unmapping
|
|
* should imply barriers already and the reader would hit a page fault
|
|
* if it stumbled over a reaped memory.
|
|
*/
|
|
set_bit(MMF_UNSTABLE, &mm->flags);
|
|
|
|
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
|
|
if (!can_madv_dontneed_vma(vma))
|
|
continue;
|
|
|
|
/*
|
|
* Only anonymous pages have a good chance to be dropped
|
|
* without additional steps which we cannot afford as we
|
|
* are OOM already.
|
|
*
|
|
* We do not even care about fs backed pages because all
|
|
* which are reclaimable have already been reclaimed and
|
|
* we do not want to block exit_mmap by keeping mm ref
|
|
* count elevated without a good reason.
|
|
*/
|
|
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
|
|
const unsigned long start = vma->vm_start;
|
|
const unsigned long end = vma->vm_end;
|
|
struct mmu_gather tlb;
|
|
|
|
tlb_gather_mmu(&tlb, mm, start, end);
|
|
if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
|
|
tlb_finish_mmu(&tlb, start, end);
|
|
ret = false;
|
|
continue;
|
|
}
|
|
unmap_page_range(&tlb, vma, start, end, NULL);
|
|
mmu_notifier_invalidate_range_end(mm, start, end);
|
|
tlb_finish_mmu(&tlb, start, end);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Reaps the address space of the give task.
|
|
*
|
|
* Returns true on success and false if none or part of the address space
|
|
* has been reclaimed and the caller should retry later.
|
|
*/
|
|
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (!down_read_trylock(&mm->mmap_sem)) {
|
|
trace_skip_task_reaping(tsk->pid);
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
|
|
* work on the mm anymore. The check for MMF_OOM_SKIP must run
|
|
* under mmap_sem for reading because it serializes against the
|
|
* down_write();up_write() cycle in exit_mmap().
|
|
*/
|
|
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
|
|
trace_skip_task_reaping(tsk->pid);
|
|
goto out_unlock;
|
|
}
|
|
|
|
trace_start_task_reaping(tsk->pid);
|
|
|
|
/* failed to reap part of the address space. Try again later */
|
|
ret = __oom_reap_task_mm(mm);
|
|
if (!ret)
|
|
goto out_finish;
|
|
|
|
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
|
task_pid_nr(tsk), tsk->comm,
|
|
K(get_mm_counter(mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(mm, MM_SHMEMPAGES)));
|
|
out_finish:
|
|
trace_finish_task_reaping(tsk->pid);
|
|
out_unlock:
|
|
up_read(&mm->mmap_sem);
|
|
|
|
return ret;
|
|
}
|
|
|
|
#define MAX_OOM_REAP_RETRIES 10
|
|
static void oom_reap_task(struct task_struct *tsk)
|
|
{
|
|
int attempts = 0;
|
|
struct mm_struct *mm = tsk->signal->oom_mm;
|
|
|
|
/* Retry the down_read_trylock(mmap_sem) a few times */
|
|
while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
|
|
schedule_timeout_idle(HZ/10);
|
|
|
|
if (attempts <= MAX_OOM_REAP_RETRIES ||
|
|
test_bit(MMF_OOM_SKIP, &mm->flags))
|
|
goto done;
|
|
|
|
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
|
|
task_pid_nr(tsk), tsk->comm);
|
|
debug_show_all_locks();
|
|
|
|
done:
|
|
tsk->oom_reaper_list = NULL;
|
|
|
|
/*
|
|
* Hide this mm from OOM killer because it has been either reaped or
|
|
* somebody can't call up_write(mmap_sem).
|
|
*/
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
|
|
/* Drop a reference taken by wake_oom_reaper */
|
|
put_task_struct(tsk);
|
|
}
|
|
|
|
static int oom_reaper(void *unused)
|
|
{
|
|
while (true) {
|
|
struct task_struct *tsk = NULL;
|
|
|
|
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
|
|
spin_lock(&oom_reaper_lock);
|
|
if (oom_reaper_list != NULL) {
|
|
tsk = oom_reaper_list;
|
|
oom_reaper_list = tsk->oom_reaper_list;
|
|
}
|
|
spin_unlock(&oom_reaper_lock);
|
|
|
|
if (tsk)
|
|
oom_reap_task(tsk);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void wake_oom_reaper(struct task_struct *tsk)
|
|
{
|
|
/* mm is already queued? */
|
|
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
|
|
return;
|
|
|
|
get_task_struct(tsk);
|
|
|
|
spin_lock(&oom_reaper_lock);
|
|
tsk->oom_reaper_list = oom_reaper_list;
|
|
oom_reaper_list = tsk;
|
|
spin_unlock(&oom_reaper_lock);
|
|
trace_wake_reaper(tsk->pid);
|
|
wake_up(&oom_reaper_wait);
|
|
}
|
|
|
|
static int __init oom_init(void)
|
|
{
|
|
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
|
|
return 0;
|
|
}
|
|
subsys_initcall(oom_init)
|
|
#else
|
|
/* !CONFIG_MMU: there is no oom reaper, so waking it is a no-op. */
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
|
|
#endif /* CONFIG_MMU */
|
|
|
|
/**
|
|
* mark_oom_victim - mark the given task as OOM victim
|
|
* @tsk: task to mark
|
|
*
|
|
* Has to be called with oom_lock held and never after
|
|
* oom has been disabled already.
|
|
*
|
|
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
|
|
* under task_lock or operate on the current).
|
|
*/
|
|
static void mark_oom_victim(struct task_struct *tsk)
|
|
{
|
|
struct mm_struct *mm = tsk->mm;
|
|
|
|
WARN_ON(oom_killer_disabled);
|
|
/* OOM killer might race with memcg OOM */
|
|
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
|
|
return;
|
|
|
|
/* oom_mm is bound to the signal struct life time. */
|
|
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
|
|
mmgrab(tsk->signal->oom_mm);
|
|
set_bit(MMF_OOM_VICTIM, &mm->flags);
|
|
}
|
|
|
|
/*
|
|
* Make sure that the task is woken up from uninterruptible sleep
|
|
* if it is frozen because OOM killer wouldn't be able to free
|
|
* any memory and livelock. freezing_slow_path will tell the freezer
|
|
* that TIF_MEMDIE tasks should be ignored.
|
|
*/
|
|
__thaw_task(tsk);
|
|
atomic_inc(&oom_victims);
|
|
trace_mark_victim(tsk->pid);
|
|
}
|
|
|
|
/**
|
|
* exit_oom_victim - note the exit of an OOM victim
|
|
*/
|
|
void exit_oom_victim(void)
|
|
{
|
|
clear_thread_flag(TIF_MEMDIE);
|
|
|
|
if (!atomic_dec_return(&oom_victims))
|
|
wake_up_all(&oom_victims_wait);
|
|
}
|
|
|
|
/**
|
|
* oom_killer_enable - enable OOM killer
|
|
*/
|
|
void oom_killer_enable(void)
|
|
{
|
|
oom_killer_disabled = false;
|
|
pr_info("OOM killer enabled.\n");
|
|
}
|
|
|
|
/**
|
|
* oom_killer_disable - disable OOM killer
|
|
* @timeout: maximum timeout to wait for oom victims in jiffies
|
|
*
|
|
* Forces all page allocations to fail rather than trigger OOM killer.
|
|
* Will block and wait until all OOM victims are killed or the given
|
|
* timeout expires.
|
|
*
|
|
* The function cannot be called when there are runnable user tasks because
|
|
* the userspace would see unexpected allocation failures as a result. Any
|
|
* new usage of this function should be consulted with MM people.
|
|
*
|
|
* Returns true if successful and false if the OOM killer cannot be
|
|
* disabled.
|
|
*/
|
|
bool oom_killer_disable(signed long timeout)
|
|
{
|
|
signed long ret;
|
|
|
|
/*
|
|
* Make sure to not race with an ongoing OOM killer. Check that the
|
|
* current is not killed (possibly due to sharing the victim's memory).
|
|
*/
|
|
if (mutex_lock_killable(&oom_lock))
|
|
return false;
|
|
oom_killer_disabled = true;
|
|
mutex_unlock(&oom_lock);
|
|
|
|
ret = wait_event_interruptible_timeout(oom_victims_wait,
|
|
!atomic_read(&oom_victims), timeout);
|
|
if (ret <= 0) {
|
|
oom_killer_enable();
|
|
return false;
|
|
}
|
|
pr_info("OOM killer disabled.\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline bool __task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct signal_struct *sig = task->signal;
|
|
|
|
/*
|
|
* A coredumping process may sleep for an extended period in exit_mm(),
|
|
* so the oom killer cannot assume that the process will promptly exit
|
|
* and release memory.
|
|
*/
|
|
if (sig->flags & SIGNAL_GROUP_COREDUMP)
|
|
return false;
|
|
|
|
if (sig->flags & SIGNAL_GROUP_EXIT)
|
|
return true;
|
|
|
|
if (thread_group_empty(task) && (task->flags & PF_EXITING))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Checks whether the given task is dying or exiting and likely to
|
|
* release its address space. This means that all threads and processes
|
|
* sharing the same mm have to be killed or exiting.
|
|
* Caller has to make sure that task->mm is stable (hold task_lock or
|
|
* it operates on the current).
|
|
*/
|
|
static bool task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct mm_struct *mm = task->mm;
|
|
struct task_struct *p;
|
|
bool ret = true;
|
|
|
|
/*
|
|
* Skip tasks without mm because it might have passed its exit_mm and
|
|
* exit_oom_victim. oom_reaper could have rescued that but do not rely
|
|
* on that for now. We can consider find_lock_task_mm in future.
|
|
*/
|
|
if (!mm)
|
|
return false;
|
|
|
|
if (!__task_will_free_mem(task))
|
|
return false;
|
|
|
|
/*
|
|
* This task has already been drained by the oom reaper so there are
|
|
* only small chances it will free some more
|
|
*/
|
|
if (test_bit(MMF_OOM_SKIP, &mm->flags))
|
|
return false;
|
|
|
|
if (atomic_read(&mm->mm_users) <= 1)
|
|
return true;
|
|
|
|
/*
|
|
* Make sure that all tasks which share the mm with the given tasks
|
|
* are dying as well to make sure that a) nobody pins its mm and
|
|
* b) the task is also reapable by the oom reaper.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(task, p))
|
|
continue;
|
|
ret = __task_will_free_mem(p);
|
|
if (!ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void __oom_kill_process(struct task_struct *victim)
|
|
{
|
|
struct task_struct *p;
|
|
struct mm_struct *mm;
|
|
bool can_oom_reap = true;
|
|
|
|
p = find_lock_task_mm(victim);
|
|
if (!p) {
|
|
put_task_struct(victim);
|
|
return;
|
|
} else if (victim != p) {
|
|
get_task_struct(p);
|
|
put_task_struct(victim);
|
|
victim = p;
|
|
}
|
|
|
|
/* Get a reference to safely compare mm after task_unlock(victim) */
|
|
mm = victim->mm;
|
|
mmgrab(mm);
|
|
|
|
/* Raise event before sending signal: task reaper must see this */
|
|
count_vm_event(OOM_KILL);
|
|
memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
|
|
|
|
/*
|
|
* We should send SIGKILL before granting access to memory reserves
|
|
* in order to prevent the OOM victim from depleting the memory
|
|
* reserves from the user space under its control.
|
|
*/
|
|
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
|
|
mark_oom_victim(victim);
|
|
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
|
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
|
|
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(victim->mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
|
|
task_unlock(victim);
|
|
|
|
/*
|
|
* Kill all user processes sharing victim->mm in other thread groups, if
|
|
* any. They don't get access to memory reserves, though, to avoid
|
|
* depletion of all memory. This prevents mm->mmap_sem livelock when an
|
|
* oom killed thread cannot exit because it requires the semaphore and
|
|
* its contended by another thread trying to allocate memory itself.
|
|
* That thread will now get access to memory reserves since it has a
|
|
* pending fatal signal.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(p, victim))
|
|
continue;
|
|
if (is_global_init(p)) {
|
|
can_oom_reap = false;
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
|
|
task_pid_nr(victim), victim->comm,
|
|
task_pid_nr(p), p->comm);
|
|
continue;
|
|
}
|
|
/*
|
|
* No use_mm() user needs to read from the userspace so we are
|
|
* ok to reap it.
|
|
*/
|
|
if (unlikely(p->flags & PF_KTHREAD))
|
|
continue;
|
|
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
if (can_oom_reap)
|
|
wake_oom_reaper(victim);
|
|
|
|
mmdrop(mm);
|
|
put_task_struct(victim);
|
|
}
|
|
#undef K
|
|
|
|
/*
|
|
* Kill provided task unless it's secured by setting
|
|
* oom_score_adj to OOM_SCORE_ADJ_MIN.
|
|
*/
|
|
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
|
|
{
|
|
if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
|
|
!is_global_init(task)) {
|
|
get_task_struct(task);
|
|
__oom_kill_process(task);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void oom_kill_process(struct oom_control *oc, const char *message)
|
|
{
|
|
struct task_struct *p = oc->chosen;
|
|
unsigned int points = oc->chosen_points;
|
|
struct task_struct *victim = p;
|
|
struct task_struct *child;
|
|
struct task_struct *t;
|
|
struct mem_cgroup *oom_group;
|
|
unsigned int victim_points = 0;
|
|
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
|
|
/*
|
|
* If the task is already exiting, don't alarm the sysadmin or kill
|
|
* its children or threads, just give it access to memory reserves
|
|
* so it can die quickly
|
|
*/
|
|
task_lock(p);
|
|
if (task_will_free_mem(p)) {
|
|
mark_oom_victim(p);
|
|
wake_oom_reaper(p);
|
|
task_unlock(p);
|
|
put_task_struct(p);
|
|
return;
|
|
}
|
|
task_unlock(p);
|
|
|
|
if (__ratelimit(&oom_rs))
|
|
dump_header(oc, p);
|
|
|
|
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
|
message, task_pid_nr(p), p->comm, points);
|
|
|
|
/*
|
|
* If any of p's children has a different mm and is eligible for kill,
|
|
* the one with the highest oom_badness() score is sacrificed for its
|
|
* parent. This attempts to lose the minimal amount of work done while
|
|
* still freeing memory.
|
|
*/
|
|
read_lock(&tasklist_lock);
|
|
|
|
/*
|
|
* The task 'p' might have already exited before reaching here. The
|
|
* put_task_struct() will free task_struct 'p' while the loop still try
|
|
* to access the field of 'p', so, get an extra reference.
|
|
*/
|
|
get_task_struct(p);
|
|
for_each_thread(p, t) {
|
|
list_for_each_entry(child, &t->children, sibling) {
|
|
unsigned int child_points;
|
|
|
|
if (process_shares_mm(child, p->mm))
|
|
continue;
|
|
/*
|
|
* oom_badness() returns 0 if the thread is unkillable
|
|
*/
|
|
child_points = oom_badness(child,
|
|
oc->memcg, oc->nodemask, oc->totalpages);
|
|
if (child_points > victim_points) {
|
|
put_task_struct(victim);
|
|
victim = child;
|
|
victim_points = child_points;
|
|
get_task_struct(victim);
|
|
}
|
|
}
|
|
}
|
|
put_task_struct(p);
|
|
read_unlock(&tasklist_lock);
|
|
|
|
/*
|
|
* Do we need to kill the entire memory cgroup?
|
|
* Or even one of the ancestor memory cgroups?
|
|
* Check this out before killing the victim task.
|
|
*/
|
|
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
|
|
|
|
__oom_kill_process(victim);
|
|
|
|
/*
|
|
* If necessary, kill all tasks in the selected memory cgroup.
|
|
*/
|
|
if (oom_group) {
|
|
mem_cgroup_print_oom_group(oom_group);
|
|
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
|
|
mem_cgroup_put(oom_group);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
|
*/
|
|
static void check_panic_on_oom(struct oom_control *oc,
|
|
enum oom_constraint constraint)
|
|
{
|
|
if (likely(!sysctl_panic_on_oom))
|
|
return;
|
|
if (sysctl_panic_on_oom != 2) {
|
|
/*
|
|
* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
|
|
* does not panic for cpuset, mempolicy, or memcg allocation
|
|
* failures.
|
|
*/
|
|
if (constraint != CONSTRAINT_NONE)
|
|
return;
|
|
}
|
|
/* Do not panic for oom kills triggered by sysrq */
|
|
if (is_sysrq_oom(oc))
|
|
return;
|
|
dump_header(oc, NULL);
|
|
panic("Out of memory: %s panic_on_oom is enabled\n",
|
|
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
|
}
|
|
|
|
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
|
|
|
|
int register_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(register_oom_notifier);
|
|
|
|
int unregister_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
|
|
|
/**
|
|
* out_of_memory - kill the "best" process when we run out of memory
|
|
* @oc: pointer to struct oom_control
|
|
*
|
|
* If we run out of memory, we have the choice between either
|
|
* killing a random task (bad), letting the system crash (worse)
|
|
* OR try to be smart about which process to kill. Note that we
|
|
* don't have to be perfect here, we just have to be good.
|
|
*/
|
|
bool out_of_memory(struct oom_control *oc)
|
|
{
|
|
unsigned long freed = 0;
|
|
enum oom_constraint constraint = CONSTRAINT_NONE;
|
|
|
|
if (oom_killer_disabled)
|
|
return false;
|
|
|
|
if (try_online_one_block(numa_node_id())) {
|
|
/* Got some memory back */
|
|
WARN(1, "OOM killer had to online a memory block\n");
|
|
return true;
|
|
}
|
|
|
|
if (!is_memcg_oom(oc)) {
|
|
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
|
if (freed > 0)
|
|
/* Got some memory back in the last second. */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* If current has a pending SIGKILL or is exiting, then automatically
|
|
* select it. The goal is to allow it to allocate so that it may
|
|
* quickly exit and free its memory.
|
|
*/
|
|
if (task_will_free_mem(current)) {
|
|
mark_oom_victim(current);
|
|
wake_oom_reaper(current);
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* The OOM killer does not compensate for IO-less reclaim.
|
|
* pagefault_out_of_memory lost its gfp context so we have to
|
|
* make sure exclude 0 mask - all other users should have at least
|
|
* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
|
|
* invoke the OOM killer even if it is a GFP_NOFS allocation.
|
|
*/
|
|
if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
|
|
return true;
|
|
|
|
/*
|
|
* Check if there were limitations on the allocation (only relevant for
|
|
* NUMA and memcg) that may require different handling.
|
|
*/
|
|
constraint = constrained_alloc(oc);
|
|
if (constraint != CONSTRAINT_MEMORY_POLICY)
|
|
oc->nodemask = NULL;
|
|
check_panic_on_oom(oc, constraint);
|
|
|
|
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
|
|
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
|
|
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
|
get_task_struct(current);
|
|
oc->chosen = current;
|
|
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
|
|
return true;
|
|
}
|
|
|
|
select_bad_process(oc);
|
|
/* Found nothing?!?! */
|
|
if (!oc->chosen) {
|
|
dump_header(oc, NULL);
|
|
pr_warn("Out of memory and no killable processes...\n");
|
|
/*
|
|
* If we got here due to an actual allocation at the
|
|
* system level, we cannot survive this and will enter
|
|
* an endless loop in the allocator. Bail out now.
|
|
*/
|
|
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
|
|
panic("System is deadlocked on memory\n");
|
|
}
|
|
if (oc->chosen && oc->chosen != (void *)-1UL)
|
|
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
|
|
"Memory cgroup out of memory");
|
|
return !!oc->chosen;
|
|
}
|
|
|
|
/*
|
|
* The pagefault handler calls here because some allocation has failed. We have
|
|
* to take care of the memcg OOM here because this is the only safe context without
|
|
* any locks held but let the oom killer triggered from the allocation context care
|
|
* about the global OOM.
|
|
*/
|
|
void pagefault_out_of_memory(void)
|
|
{
|
|
static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
|
|
if (mem_cgroup_oom_synchronize(true))
|
|
return;
|
|
|
|
if (fatal_signal_pending(current))
|
|
return;
|
|
|
|
if (__ratelimit(&pfoom_rs))
|
|
pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
|
|
}
|