Files: msm-5.15/kernel/sched/fair.c
Greg Kroah-Hartman 5155624549 Merge tag 'android13-5.15.94_r00' into android13-5.15
This is the merge of the upstream LTS release of 5.15.94 into the
android13-5.15 branch.

It contains the following commits:

*   5448b2fda8 Merge 5.15.94 into android13-5.15-lts
|\
| * e2c1a934fd Linux 5.15.94
| * 17170acdc7 Documentation/hw-vuln: Add documentation for Cross-Thread Return Predictions
| * 5122e0e443 KVM: x86: Mitigate the cross-thread return address predictions bug
| * 8f12dcab90 x86/speculation: Identify processors vulnerable to SMT RSB predictions
| * e63c434de8 drm/i915: Fix VBT DSI DVO port handling
| * fc88c68381 drm/i915: Initialize the obj flags for shmem objects
| * 2e557c8ca2 drm/amdgpu/fence: Fix oops due to non-matching drm_sched init/fini
| * 3af734f3ea Fix page corruption caused by racy check in __free_pages
| * c94ce5ea68 arm64: dts: meson-axg: Make mmc host controller interrupts level-sensitive
| * b796c02df3 arm64: dts: meson-g12-common: Make mmc host controller interrupts level-sensitive
| * 5d9b771f53 arm64: dts: meson-gx: Make mmc host controller interrupts level-sensitive
| * ac39dce119 rtmutex: Ensure that the top waiter is always woken up
| * 86f7e42393 powerpc/64s/interrupt: Fix interrupt exit race with security mitigation switch
| * 2907cf3f2e riscv: Fixup race condition on PG_dcache_clean in flush_icache_pte
| * beb1cefa3c ceph: flush cap releases when the session is flushed
| * 86733ab239 clk: ingenic: jz4760: Update M/N/OD calculation algorithm
| * 239e927eb2 usb: typec: altmodes/displayport: Fix probe pin assign check
| * 48aecce116 usb: core: add quirk for Alcor Link AK9563 smartcard reader
| * a8178bb1c7 btrfs: free device in btrfs_close_devices for a single device filesystem
| * 8d13f2c3e2 mptcp: be careful on subflow status propagation on errors
| * 25141fb411 net: USB: Fix wrong-direction WARNING in plusb.c
| * d1fba1e096 cifs: Fix use-after-free in rdata->read_into_pages()
| * 1b83e7e174 pinctrl: intel: Restore the pins that used to be in Direct IRQ mode
| * f5f025b703 spi: dw: Fix wrong FIFO level setting for long xfers
| * 71668706fb pinctrl: single: fix potential NULL dereference
| * a2a1065739 pinctrl: aspeed: Fix confusing types in return value
| * 99450163bc pinctrl: mediatek: Fix the drive register definition of some Pins
| * 9f0d2c2684 ASoC: topology: Return -ENOMEM on memory allocation failure
| * 1a52ef89e3 riscv: stacktrace: Fix missing the first frame
| * 5fb8154334 ALSA: pci: lx6464es: fix a debug loop
| * 105ea562f6 selftests: forwarding: lib: quote the sysctl values
| * 528e3f3a4b rds: rds_rm_zerocopy_callback() use list_first_entry()
| * 48d6d8f2f6 igc: Add ndo_tx_timeout support
| * 62ff7dd961 net/mlx5: Serialize module cleanup with reload and remove
| * 95d2394f84 net/mlx5: fw_tracer, Zero consumer index when reloading the tracer
| * ab7f3f6a9d net/mlx5: fw_tracer, Clear load bit when freeing string DBs buffers
| * 193528646e net/mlx5e: IPoIB, Show unknown speed instead of error
| * 7c6e8eb617 net/mlx5: Bridge, fix ageing of peer FDB entries
| * 49ece61a07 net/mlx5e: Update rx ring hw mtu upon each rx-fcs flag change
| * 31172267ba net/mlx5e: Introduce the mlx5e_flush_rq function
| * e4e4e93d31 net/mlx5e: Move repeating clear_bit in mlx5e_rx_reporter_err_rq_cqe_recover
| * 3f18b9ed8c net: mscc: ocelot: fix VCAP filters not matching on MAC with "protocol 802.1Q"
| * 6acb5d853b net: dsa: mt7530: don't change PVC_EG_TAG when CPU port becomes VLAN-aware
| * ca834a0178 ice: Do not use WQ_MEM_RECLAIM flag for workqueue
| * 70d48c7992 uapi: add missing ip/ipv6 header dependencies for linux/stddef.h
| * 3cec44036f ionic: clean interrupt before enabling queue to avoid credit race
| * fad12afe87 net: phy: meson-gxl: use MMD access dummy stubs for GXL, internal PHY
| * d23385a200 bonding: fix error checking in bond_debug_reregister()
| * 11006d9d08 net: phylink: move phy_device_free() to correctly release phy device
| * fb022d7b1c xfrm: fix bug with DSCP copy to v6 from v4 tunnel
| * 6fe1ad42af RDMA/usnic: use iommu_map_atomic() under spin_lock()
| * 8f5fe1cd8e RDMA/irdma: Fix potential NULL-ptr-dereference
| * 1b4ef90cbc IB/IPoIB: Fix legacy IPoIB due to wrong number of queues
| * 5dc688fae6 xfrm/compat: prevent potential spectre v1 gadget in xfrm_xlate32_attr()
| * 9bae58d58b IB/hfi1: Restore allocated resources on failed copyout
| * 558b1fa01c xfrm: compat: change expression for switch in xfrm_xlate64
| * 238b38e89f can: j1939: do not wait 250 ms if the same addr was already claimed
| * d859184b60 of/address: Return an error when no valid dma-ranges are found
| * 70f37b3118 tracing: Fix poll() and select() do not work on per_cpu trace_pipe and trace_pipe_raw
| * df01749503 ALSA: hda/realtek: Enable mute/micmute LEDs on HP Elitebook, 645 G9
| * ca9d542203 ALSA: hda/realtek: Fix the speaker output on Samsung Galaxy Book2 Pro 360
| * 706b6d86a6 ALSA: emux: Avoid potential array out-of-bound in snd_emux_xg_control()
| * 731fc29de6 ALSA: hda/realtek: Add Positivo N14KP6-TG
| * b938059807 btrfs: zlib: zero-initialize zlib workspace
| * e65faa7e39 btrfs: limit device extents to the device size
| * 2e4dd07fda migrate: hugetlb: check for hugetlb shared PMD in node migration
| * 072e7412e8 mm/migration: return errno when isolate_huge_page failed
* | f977f92131 Revert "nvmem: core: remove nvmem_config wp_gpio"
* | 787413edad Merge 5.15.93 into android13-5.15-lts
|\|
| * 85d7786c66 Linux 5.15.93
| * 6e2fac197d bpf: Skip invalid kfunc call in backtrack_insn
| * 46c9088cab gfs2: Always check inode size of inline inodes
| * 8eb2e58a92 gfs2: Cosmetic gfs2_dinode_{in,out} cleanup
| * e4991910f1 wifi: brcmfmac: Check the count value of channel spec to prevent out-of-bounds reads
| * 97ccfffcc0 f2fs: fix to do sanity check on i_extra_isize in is_alive()
| * 64fa364ad3 fbdev: smscufx: fix error handling code in ufx_usb_probe
| * a77141a063 ovl: Use "buf" flexible array for memcpy() destination
| * 1692fedd0f fs/ntfs3: Validate attribute data and valid sizes
| * a5b9cb7276 powerpc/imc-pmu: Revert nest_init_lock to being a mutex
| * 3691f43a09 iio:adc:twl6030: Enable measurement of VAC
| * 8c84f50390 bpf: Do not reject when the stack read size is different from the tracked scalar size
| * 14b6198abb bpf: Fix incorrect state pruning for <8B spill/fill
| * 575a9f6fef phy: qcom-qmp-combo: fix runtime suspend
| * e58df87394 phy: qcom-qmp-combo: fix broken power on
| * 368ea32e0a phy: qcom-qmp-usb: fix memleak on probe deferral
| * 2f27d3811a phy: qcom-qmp-combo: fix memleak on probe deferral
| * 0cb10ddab7 phy: qcom-qmp-combo: disable runtime PM on unbind
| * 0ef5ffe116 serial: 8250_dma: Fix DMA Rx rearm race
| * e30328f599 serial: 8250_dma: Fix DMA Rx completion race
| * a5a171f61a nvmem: core: fix cell removal on error
| * 6d9fa3ff65 nvmem: core: remove nvmem_config wp_gpio
| * adf80e072c nvmem: core: initialise nvmem->id early
| * e3ebc3e23b drm/i915: Fix potential bit_17 double-free
| * 997bed0f3c Squashfs: fix handling and sanity checking of xattr_ids count
| * 7a0cfaf9d4 highmem: round down the address passed to kunmap_flush_on_unmap()
| * 5dbe1ebd56 mm/swapfile: add cond_resched() in get_swap_pages()
| * daf8241804 fpga: stratix10-soc: Fix return value check in s10_ops_write_init()
| * afd32b6831 x86/debug: Fix stack recursion caused by wrongly ordered DR7 accesses
| * 066ecbf1a5 kernel/irq/irqdomain.c: fix memory leak with using debugfs_lookup()
| * 481bf49f58 usb: gadget: f_uac2: Fix incorrect increment of bNumEndpoints
| * fdf40e5824 mm: hugetlb: proc: check for hugetlb shared PMD in /proc/PID/smaps
| * 6c300351c5 riscv: disable generation of unwind tables
| * a5c275add9 parisc: Wire up PTRACE_GETREGS/PTRACE_SETREGS for compat case
| * a964decd13 parisc: Fix return code of pdc_iodc_print()
| * 488eaf0625 nvmem: qcom-spmi-sdam: fix module autoloading
| * 8569beb66f iio: imu: fxos8700: fix MAGN sensor scale and unit
| * 8aa5cdcfaf iio: imu: fxos8700: remove definition FXOS8700_CTRL_ODR_MIN
| * 4112ba1ad5 iio: imu: fxos8700: fix failed initialization ODR mode assignment
| * abf7b2ba51 iio: imu: fxos8700: fix incorrect ODR mode readback
| * 412757741c iio: imu: fxos8700: fix swapped ACCEL and MAGN channels readback
| * 34909532b1 iio: imu: fxos8700: fix map label of channel type to MAGN sensor
| * 8346eb4987 iio: imu: fxos8700: fix IMU data bits returned to user space
| * 7567cdf3ce iio: imu: fxos8700: fix incomplete ACCEL and MAGN channels readback
| * 6969852220 iio: imu: fxos8700: fix ACCEL measurement range selection
| * cdacfb2205 iio:adc:twl6030: Enable measurements of VUSB, VBAT and others
| * 9988063dce iio: adc: berlin2-adc: Add missing of_node_put() in error path
| * c691a5c0fd iio: hid: fix the retval in gyro_3d_capture_sample
| * ef80a34699 iio: hid: fix the retval in accel_3d_capture_sample
| * c4eae85c73 efi: Accept version 2 of memory attributes table
| * 710db82063 ALSA: hda/realtek: Add Acer Predator PH315-54
| * 3fbddf86d9 watchdog: diag288_wdt: fix __diag288() inline assembly
| * 700dd5bc72 watchdog: diag288_wdt: do not use stack buffers for hardware data
| * 21bc51e29e net: qrtr: free memory on error path in radix_tree_insert()
| * dccbd062d7 fbcon: Check font dimension limits
| * 5d7500d991 Input: i8042 - add Clevo PCX0DX to i8042 quirk table
| * fc9e27f3ba vc_screen: move load of struct vc_data pointer in vcs_read() to avoid UAF
| * 9ba1188a71 usb: gadget: f_fs: Fix unbalanced spinlock in __ffs_ep0_queue_wait
| * fe86480e90 usb: dwc3: qcom: enable vbus override when in OTG dr-mode
| * a412fe7baf iio: adc: stm32-dfsdm: fill module aliases
| * 9944659398 drm/amd/display: Fix timing not changning when freesync video is enabled
| * a3967128bc net/x25: Fix to not accept on connected socket
| * 396ea318e7 platform/x86: gigabyte-wmi: add support for B450M DS3H WIFI-CF
| * 1577524633 platform/x86: dell-wmi: Add a keymap for KEY_MUTE in type 0x0010 table
| * 540cea9f9b i2c: rk3x: fix a bunch of kernel-doc warnings
| * 0aaabdb900 scsi: iscsi_tcp: Fix UAF during login when accessing the shost ipaddress
| * 17b738590b scsi: iscsi_tcp: Fix UAF during logout when accessing the shost ipaddress
| * 8cd0499f9c perf/x86/intel: Add Emerald Rapids
| * 7093515370 scsi: target: core: Fix warning on RT kernels
| * b7960f5436 i2c: mxs: suppress probe-deferral error message
| * b9b87fc34b i2c: designware-pci: Add new PCI IDs for AMD NAVI GPU
| * d8fc0b5fb3 efi: fix potential NULL deref in efi_mem_reserve_persistent
| * f423c2efd5 net: openvswitch: fix flow memory leak in ovs_flow_cmd_new
| * 7985028647 virtio-net: Keep stop() to follow mirror sequence of open()
| * 5d884f9e80 selftests: net: udpgso_bench_tx: Cater for pending datagrams zerocopy benchmarking
| * 63aa63af3a selftests: net: udpgso_bench: Fix racing bug between the rx/tx programs
| * d41a3f9cc2 selftests: net: udpgso_bench_rx/tx: Stop when wrong CLI args are provided
| * 5af98283e5 selftests: net: udpgso_bench_rx: Fix 'used uninitialized' compiler warning
| * 89e0701e03 ata: libata: Fix sata_down_spd_limit() when no link speed is reported
| * 9ab896775f can: j1939: fix errant WARN_ON_ONCE in j1939_session_deactivate
| * 02d77d98e0 igc: return an error if the mac type is unknown in igc_ptp_systim_to_hwtstamp()
| * 04a7355820 riscv: kprobe: Fixup kernel panic when probing an illegal position
| * 206c367b6a ip/ip6_gre: Fix non-point-to-point tunnel not generating IPv6 link local address
| * 90178bc0f2 ip/ip6_gre: Fix changing addr gen mode not generating IPv6 link local address
| * dfe2f0ea38 net: phy: meson-gxl: Add generic dummy stubs for MMD register access
| * b7398efe24 squashfs: harden sanity check in squashfs_read_xattr_id_table
| * 89a69216f1 netfilter: br_netfilter: disable sabotage_in hook after first suppression
| * cdb444e73f drm/i915/adlp: Fix typo for reference clock
| * 960f20d858 drm/i915/guc: Fix locking when searching for a hung request
| * c27e0eac56 netrom: Fix use-after-free caused by accept on already connected socket
| * 511c922c5b block, bfq: fix uaf for bfqq in bic_set_bfqq()
| * a62c129dcb block, bfq: replace 0/1 with false/true in bic apis
| * 37a744a068 block/bfq-iosched.c: use "false" rather than "BLK_RW_ASYNC"
| * 2cd1e9c013 net: phy: dp83822: Fix null pointer access on DP83825/DP83826 devices
| * 18c18c2110 sfc: correctly advertise tunneled IPv6 segmentation
| * 878b06f60a dpaa2-eth: execute xdp_do_flush() before napi_complete_done()
| * 3b5774cd6b dpaa_eth: execute xdp_do_flush() before napi_complete_done()
| * 5a7040a649 virtio-net: execute xdp_do_flush() before napi_complete_done()
| * 94add5b272 qede: execute xdp_do_flush() before napi_complete_done()
| * a273f8e3ab ice: Prevent set_channel from changing queues while RDMA active
| * b432e183c2 fix "direction" argument of iov_iter_kvec()
| * d8b8306e96 fix iov_iter_bvec() "direction" argument
| * 389c7c0ef9 READ is "data destination", not source...
| * 7a3649bf5b WRITE is "data source", not destination...
| * 83cc6a7bb7 vhost/net: Clear the pending messages when the backend is removed
| * 7c7d344bc3 scsi: Revert "scsi: core: map PQ=1, PDT=other values to SCSI_SCAN_TARGET_PRESENT"
| * 4b199dc094 drm/vc4: hdmi: make CEC adapter name unique
| * dc1f8ab25a arm64: dts: imx8mm: Fix pad control for UART1_DTE_RX
| * c681d7a4ed bpf, sockmap: Check for any of tcp_bpf_prots when cloning a listener
| * 34ad5d8885 bpf: Fix to preserve reg parent/live fields when copying range info
| * 7b86f9ab56 bpf: Support <8-byte scalar spill and refill
| * 1b9256c962 ALSA: hda/via: Avoid potential array out-of-bound in add_secret_dac_path()
| * b7abeb6916 bpf: Fix a possible task gone issue with bpf_send_signal[_thread]() helpers
| * cfcc2390db ASoC: Intel: bytcr_wm5102: Drop reference count of ACPI device after use
| * b4b204565a ASoC: Intel: bytcr_rt5640: Drop reference count of ACPI device after use
| * 1f1e7635c5 ASoC: Intel: bytcr_rt5651: Drop reference count of ACPI device after use
| * 41d323c352 ASoC: Intel: bytcht_es8316: Drop reference count of ACPI device after use
| * 6a9990e1d9 ASoC: Intel: bytcht_es8316: move comment to the right place
| * ffcdf35455 ASoC: Intel: boards: fix spelling in comments
| * bd0b17ab1b bus: sunxi-rsb: Fix error handling in sunxi_rsb_init()
| * 5f4543c938 firewire: fix memory leak for payload of request subaction to IEC 61883-1 FCP region
* | 5020746bff Merge 5.15.92 into android13-5.15-lts
|\|
| * e515b9902f Linux 5.15.92
| * c7caf669b8 net: mctp: purge receive queues on sk destruction
| * 046de74f9a net: fix NULL pointer in skb_segment_list
| * 7ab3376703 selftests: Provide local define of __cpuid_count()
| * e92e311ced selftests/vm: remove ARRAY_SIZE define from individual tests
| * c9e52db900 tools: fix ARRAY_SIZE defines in tools and selftests hdrs
| * c1aa0dd52d Bluetooth: fix null ptr deref on hci_sync_conn_complete_evt
| * 02e61196c5 ACPI: processor idle: Practically limit "Dummy wait" workaround to old Intel systems
| * 79dd676b44 extcon: usbc-tusb320: fix kernel-doc warning
| * c2bd60ef20 ext4: fix bad checksum after online resize
| * 4cd1e18bc0 cifs: fix return of uninitialized rc in dfs_cache_update_tgthint()
| * 43acd767bd dmaengine: imx-sdma: Fix a possible memory leak in sdma_transfer_init
| * a54c5ad007 HID: playstation: sanity check DualSense calibration data.
| * 6d7686cc11 blk-cgroup: fix missing pd_online_fn() while activating policy
| * 2144859229 erofs/zmap.c: Fix incorrect offset calculation
| * 0dfef50313 bpf: Skip task with pid=1 in send_signal_common()
| * e8bb772f74 firmware: arm_scmi: Clear stale xfer->hdr.status
| * 80cb9f1a76 arm64: dts: imx8mq-thor96: fix no-mmc property for SDHCI
| * 162fad24d2 arm64: dts: freescale: Fix pca954x i2c-mux node names
| * 82ad105e1a ARM: dts: vf610: Fix pca9548 i2c-mux node names
| * 5aee5f33e0 ARM: dts: imx: Fix pca9547 i2c-mux node name
* | 7e0097918f Revert "scsi: ufs: core: Fix devfreq deadlocks"
* | 6ce0fcdcc2 Revert "thermal/core: Rename 'trips' to 'num_trips'"
* | 49a5232dfb Revert "thermal: Validate new state in cur_state_store()"
* | be0ca2fc43 Revert "thermal/core: fix error code in __thermal_cooling_device_register()"
* | 9617a003cc Revert "thermal: core: call put_device() only after device_register() fails"
* | ccb2c48531 Revert "cpufreq: governor: Use kobject release() method to free dbs_data"
* | 0108f014a5 Revert "gpio: use raw spinlock for gpio chip shadowed data"
* | 1d2449f6be Revert "gpio: mxc: Protect GPIO irqchip RMW with bgpio spinlock"
* | 5f51aedcba Revert "gpio: mxc: Unlock on error path in mxc_flip_edge()"
* | 7622c50ba6 Merge 5.15.91 into android13-5.15-lts
|\|
| * 9cf4111cdf Linux 5.15.91
| * 14cc13e433 perf/x86/amd: fix potential integer overflow on shift of a int
| * 033636b322 netfilter: conntrack: unify established states for SCTP paths
| * 0b08201158 x86/i8259: Mark legacy PIC interrupts with IRQ_LEVEL
| * b577400367 block: fix and cleanup bio_check_ro
| * 1d152437e4 kbuild: Allow kernel installation packaging to override pkg-config
| * a196468858 cpufreq: governor: Use kobject release() method to free dbs_data
| * 7c513ced0d cpufreq: Move to_gov_attr_set() to cpufreq.h
| * cf7a08622d Revert "Input: synaptics - switch touchpad on HP Laptop 15-da3001TU to RMI mode"
| * 53c5d61198 tools: gpio: fix -c option of gpio-event-mon
| * a7d1a303ff treewide: fix up files incorrectly marked executable
| * 046fe53907 net: mdio-mux-meson-g12a: force internal PHY off on mux switch
| * 86bdccde78 net/tg3: resolve deadlock in tg3_reset_task() during EEH
| * 4364bf79d8 thermal: intel: int340x: Add locking to int340x_thermal_get_trip_type()
| * e69c3a0d9d net: mctp: mark socks as dead on unhash, prevent re-add
| * 954cc215cd net: ravb: Fix possible hang if RIS2_QFF1 happen
| * 0f7218bf0a net: ravb: Fix lack of register setting after system resumed for Gen3
| * 3db4ca2938 ravb: Rename "no_ptp_cfg_active" and "ptp_cfg_active" variables
| * 621f296f11 gpio: mxc: Unlock on error path in mxc_flip_edge()
| * 071a839286 nvme: fix passthrough csi check
| * 614471b7f7 riscv/kprobe: Fix instruction simulation of JALR
| * 3391bd4235 sctp: fail if no bound addresses can be used for a given scope
| * b0784860e1 net/sched: sch_taprio: do not schedule in taprio_reset()
| * d2d3ab1b1d netrom: Fix use-after-free of a listening socket.
| * 9df5ab02c6 netfilter: conntrack: fix vtag checks for ABORT/SHUTDOWN_COMPLETE
| * ca3cf94776 ipv4: prevent potential spectre v1 gadget in fib_metrics_match()
| * d50e7348b4 ipv4: prevent potential spectre v1 gadget in ip_metrics_convert()
| * ead06e3449 netlink: annotate data races around sk_state
| * c4eb423c6b netlink: annotate data races around dst_portid and dst_group
| * fac9b69a93 netlink: annotate data races around nlk->portid
| * 8a13595600 netfilter: nft_set_rbtree: skip elements in transaction from garbage collection
| * 2bf1435fa1 netfilter: nft_set_rbtree: Switch to node list walk for overlap detection
| * e481654426 drm/i915/selftest: fix intel_selftest_modify_policy argument types
| * 66689a72ba net: fix UaF in netns ops registration error path
| * 41b74e95f2 netlink: prevent potential spectre v1 gadgets
| * 2f29d780bd i2c: designware: use casting of u64 in clock multiplication to avoid overflow
| * b03f7ed9af scsi: ufs: core: Fix devfreq deadlocks
| * 858d7e9218 net: mana: Fix IRQ name - add PCI and queue number
| * bff5243bd3 EDAC/qcom: Do not pass llcc_driv_data as edac_device_ctl_info's pvt_info
| * 5eedf4568d EDAC/device: Respect any driver-supplied workqueue polling value
| * 4b7dfd0a68 ARM: 9280/1: mm: fix warning on phys_addr_t to void pointer assignment
| * 7807871f28 ipv6: fix reachability confirmation with proxy_ndp
| * f9a22f6fa1 thermal: intel: int340x: Protect trip temperature from concurrent updates
| * 036093c08d KVM: arm64: GICv4.1: Fix race with doorbell on VPE activation/deactivation
| * c56683c062 KVM: x86/vmx: Do not skip segment attributes if unusable bit is set
| * e91308e637 ovl: fail on invalid uid/gid mapping at copy up
| * 33a9657d67 ksmbd: limit pdu length size according to connection status
| * 8d83a758ee ksmbd: downgrade ndr version error message to debug
| * 87a7f38a90 ksmbd: do not sign response to session request for guest login
| * 4210c3555d ksmbd: add max connections parameter
| * cc6742b160 ksmbd: add smbd max io size parameter
| * 3c8a5648a5 i2c: mv64xxx: Add atomic_xfer method to driver
| * e619ab4fb3 i2c: mv64xxx: Remove shutdown method from driver
| * 4b83bc6f87 cifs: Fix oops due to uncleared server->smbd_conn in reconnect
| * 89042d3d85 ftrace/scripts: Update the instructions for ftrace-bisect.sh
| * 592ba7116f trace_events_hist: add check for return value of 'create_hist_field'
| * b0af180514 tracing: Make sure trace_printk() can output as soon as it can be used
| * 91135d7233 module: Don't wait for GOING modules
| * 85ee9919ad KVM: SVM: fix tsc scaling cache logic
| * f0227eca97 scsi: hpsa: Fix allocation size for scsi_host_alloc()
| * e5af9a458a drm/amdgpu: complete gfxoff allow signal during suspend without delay
| * 62b9e9f921 Bluetooth: hci_sync: cancel cmd_timer if hci_open failed
| * 21998acd31 exit: Use READ_ONCE() for all oops/warn limit reads
| * e82b1598eb docs: Fix path paste-o for /sys/kernel/warn_count
| * 1c51698ad6 panic: Expose "warn_count" to sysfs
| * 0691ddae56 panic: Introduce warn_limit
| * 7b98914a6c panic: Consolidate open-coded panic_on_warn checks
| * fc636b1362 exit: Allow oops_limit to be disabled
| * 339f8a8e52 exit: Expose "oops_count" to sysfs
| * f80fb0001f exit: Put an upper limit on how often we can oops
| * 2857ce7f47 panic: Separate sysctl logic from CONFIG_SMP
| * e156d4dcb0 ia64: make IA64_MCA_RECOVERY bool instead of tristate
| * 9024f77224 csky: Fix function name in csky_alignment() and die()
| * 2ea497d153 h8300: Fix build errors from do_exit() to make_task_dead() transition
| * a452ca0228 hexagon: Fix function name in die()
| * 3b39f47474 objtool: Add a missing comma to avoid string concatenation
| * 39a26d8721 exit: Add and use make_task_dead.
| * b5c1acaa43 kasan: no need to unset panic_on_warn in end_report()
| * b5c967dc68 ubsan: no need to unset panic_on_warn in ubsan_epilogue()
| * e4cd210032 panic: unset panic_on_warn inside panic()
| * 191f1f1f6a kernel/panic: move panic sysctls to its own file
| * 654f6e8512 sysctl: add a new register_sysctl_init() interface
| * 3aa991cde9 fs: reiserfs: remove useless new_opts in reiserfs_remount
| * d830531f8f x86: ACPI: cstate: Optimize C3 entry on AMD CPUs
| * 1f54762231 drm/i915: Remove unused variable
| * 6e10127093 Revert "selftests/bpf: check null propagation only neither reg is PTR_TO_BTF_ID"
| * 619ee31b96 drm/i915: Allow switching away via vga-switcheroo if uninitialized
| * ea435ba9eb firmware: coreboot: Check size of table entry and use flex-array
| * a4e70bcf2e lockref: stop doing cpu_relax in the cmpxchg loop
| * b0ee61f5ee platform/x86: asus-nb-wmi: Add alternate mapping for KEY_SCREENLOCK
| * e8d2f7f566 platform/x86: touchscreen_dmi: Add info for the CSL Panther Tab HD
| * 2e0a8bacbe r8152: add vendor/device ID pair for Microsoft Devkit
| * d4b717e34d scsi: hisi_sas: Set a port invalid only if there are no devices attached when refreshing port id
| * e15750aa28 KVM: s390: interrupt: use READ_ONCE() before cmpxchg()
| * 9300c65207 spi: spidev: remove debug messages that access spidev->spi without locking
| * 48ff5d3812 ASoC: fsl-asoc-card: Fix naming of AC'97 CODEC widgets
| * 5001ffb31d ASoC: fsl_ssi: Rename AC'97 streams to avoid collisions with AC'97 CODEC
| * b76120e206 cpufreq: armada-37xx: stop using 0 as NULL pointer
| * eda26fa856 perf/x86/intel/uncore: Add Emerald Rapids
| * 544f9d4e9d perf/x86/msr: Add Emerald Rapids
| * b1eb964d78 s390: expicitly align _edata and _end symbols on page boundary
| * fb45ec279b s390/debug: add _ASM_S390_ prefix to header guard
| * cd488abed9 drm: Add orientation quirk for Lenovo ideapad D330-10IGL
| * ff7ab370b8 net: usb: cdc_ether: add support for Thales Cinterion PLS62-W modem
| * d6935084e4 ASoC: fsl_micfil: Correct the number of steps on SX controls
| * ac07316b2d cpufreq: Add SM6375 to cpufreq-dt-platdev blocklist
| * f0e6dcae14 kcsan: test: don't put the expect array on the stack
| * c51c0b3754 cpufreq: Add Tegra234 to cpufreq-dt-platdev blocklist
| * 28e4e8ca9e scsi: iscsi: Fix multiple iSCSI session unbind events sent to userspace
| * 14b1df2004 tcp: fix rate_app_limited to default to 1
| * 120b8e527e net: stmmac: enable all safety features by default
| * a7d736cc3c thermal: core: call put_device() only after device_register() fails
| * ed08f958e4 thermal/core: fix error code in __thermal_cooling_device_register()
| * 108a6f91e2 thermal: Validate new state in cur_state_store()
| * bd0ea77edf thermal/core: Rename 'trips' to 'num_trips'
| * 521c6ebd4f thermal/core: Remove duplicate information when an error occurs
| * 6504afa263 net: dsa: microchip: ksz9477: port map correction in ALU table entry register
| * 18346db185 selftests/net: toeplitz: fix race on tpacket_v3 block close
| * caa28c7c83 driver core: Fix test_async_probe_init saves device in wrong array
| * 89c62cee5d w1: fix WARNING after calling w1_process()
| * 3d0eafe413 w1: fix deadloop in __w1_remove_master_device()
| * 7701a4bd45 device property: fix of node refcount leak in fwnode_graph_get_next_endpoint()
| * ed0d8f731e ptdma: pt_core_execute_cmd() should use spinlock
| * 29e9c67bf3 octeontx2-pf: Fix the use of GFP_KERNEL in atomic context on rt
| * 03bff5819a tcp: avoid the lookup process failing to get sk in ehash table
| * 5bd69d2ea8 nvme-pci: fix timeout request state check
| * 39178dfe86 drm/amd/display: fix issues with driver unload
| * 9a5a537e14 phy: phy-can-transceiver: Skip warning if no "max-bitrate"
| * 4095065b59 dmaengine: xilinx_dma: call of_node_put() when breaking out of for_each_child_of_node()
| * 5bd3c1c1bc cifs: fix potential deadlock in cache_refresh_path()
| * 1a2a47b85c HID: betop: check shape of output reports
| * b2a7309743 l2tp: prevent lockdep issue in l2tp_tunnel_register()
| * edf0e509ce virtio-net: correctly enable callback during start_xmit
| * d3401c7624 net: macb: fix PTP TX timestamp failure due to packet padding
| * 71c6019655 dmaengine: Fix double increment of client_count in dma_chan_get()
| * 1e7919f0b1 drm/panfrost: fix GENERIC_ATOMIC64 dependency
| * a1b3e50e21 net: mlx5: eliminate anonymous module_init & module_exit
| * 09e3fb6f53 net/mlx5: E-switch, Fix setting of reserved fields on MODIFY_SCHEDULING_ELEMENT
| * 01a6e10810 net: ipa: disable ipa interrupt during suspend
| * 98aec50ff7 Bluetooth: Fix possible deadlock in rfcomm_sk_state_change
| * 0e59f60b74 usb: gadget: f_fs: Ensure ep0req is dequeued before free_request
| * ae8e136bca usb: gadget: f_fs: Prevent race during ffs_ep0_queue_wait
| * f25cd2b731 HID: revert CHERRY_MOUSE_000C quirk
| * 39483511fd pinctrl: rockchip: fix mux route data for rk3568
| * 1dae88a0b4 net: stmmac: fix invalid call to mdiobus_get_phy()
| * 6716838bf8 HID: check empty report_list in bigben_probe()
| * 2b49568254 HID: check empty report_list in hid_validate_values()
| * ad67de330d net: mdio: validate parameter addr in mdiobus_get_phy()
| * 4869129379 net: usb: sr9700: Handle negative len
| * 2827c4eb42 octeontx2-pf: Avoid use of GFP_KERNEL in atomic context
| * 77e8ed776c l2tp: close all race conditions in l2tp_tunnel_register()
| * af22d2c0b4 l2tp: convert l2tp_tunnel_list to idr
| * 22c7d45ca3 l2tp: Don't sleep and disable BH under writer-side sk_callback_lock
| * 87d9205d9a l2tp: Serialize access to sk_user_data with sk_callback_lock
| * c53acbf2fa net/sched: sch_taprio: fix possible use-after-free
| * 40516d042b net: stmmac: Fix queue statistics reading
| * 620aa67f80 pinctrl: rockchip: fix reading pull type on rk3568
| * ddca674af1 pinctrl/rockchip: add error handling for pull/drive register getters
| * 259ab8fb8c pinctrl/rockchip: Use temporary variable for struct device
| * 8cbf932c5c wifi: rndis_wlan: Prevent buffer overflow in rndis_query_oid
| * f792d26e5c gpio: mxc: Always set GPIOs used as interrupt source to INPUT mode
| * 8335f877ef gpio: mxc: Protect GPIO irqchip RMW with bgpio spinlock
| * fb4fb3d267 gpio: use raw spinlock for gpio chip shadowed data
| * 52e3eebfe6 sch_htb: Avoid grafting on htb_destroy_class_offload when destroying htb
| * 8232e5a84d net: enetc: avoid deadlock in enetc_tx_onestep_tstamp()
| * 95347e41ca net: wan: Add checks for NULL for utdm in undo_uhdlc_init and unmap_si_regs
| * 7f129927fe net: nfc: Fix use-after-free in local_cleanup()
| * 397aaac884 phy: rockchip-inno-usb2: Fix missing clk_disable_unprepare() in rockchip_usb2phy_power_on()
| * 01bdcc73db bpf: Fix pointer-leak due to insufficient speculative store bypass mitigation
| * 261e2f12b6 amd-xgbe: Delay AN timeout during KR training
| * a8cf4af544 amd-xgbe: TX Flow Ctrl Registers are h/w ver dependent
| * 8e897cb674 ARM: dts: at91: sam9x60: fix the ddr clock for sam9x60
| * 0a27dcd534 NFSD: fix use-after-free in nfsd4_ssc_setup_dul()
| * 24af570c99 phy: ti: fix Kconfig warning and operator precedence
| * 631fc36685 arm64: dts: qcom: msm8992-libra: Fix the memory map
| * dda20ffec8 arm64: dts: qcom: msm8992-libra: Add CPU regulators
| * 37ba5e9293 arm64: dts: qcom: msm8992: Don't use sfpb mutex
| * bab87524f6 PM: AVS: qcom-cpr: Fix an error handling path in cpr_probe()
| * b7a479c764 affs: initialize fsdata in affs_truncate()
| * 623d111689 IB/hfi1: Remove user expected buffer invalidate race
| * 47d5fc0dcd IB/hfi1: Immediately remove invalid memory from hardware
| * 85caef2cfd IB/hfi1: Fix expected receive setup error exit issues
| * cb193984d4 IB/hfi1: Reserve user expected TIDs
| * 891ddfae39 IB/hfi1: Reject a zero-length user expected buffer
| * 362c948972 RDMA/core: Fix ib block iterator counter overflow
| * e26c571c3b tomoyo: fix broken dependency on *.conf.default
| * 7dfe83ecc3 firmware: arm_scmi: Harden shared memory access in fetch_notification
| * a653dbb70c firmware: arm_scmi: Harden shared memory access in fetch_response
| * caffa7fed1 EDAC/highbank: Fix memory leak in highbank_mc_probe()
| * 95de286200 reset: uniphier-glue: Fix possible null-ptr-deref
| * 4773a8cf9a reset: uniphier-glue: Use reset_control_bulk API
| * 7b33accc8f soc: imx8m: Fix incorrect check for of_clk_get_by_name()
| * f07427f8d9 arm64: dts: imx8mm-venice-gw7901: fix USB2 controller OC polarity
| * c4cb73febe HID: intel_ish-hid: Add check for ishtp_dma_tx_map
| * 25f97c9883 ARM: imx: add missing of_node_put()
| * 3e9d79ded9 arm64: dts: imx8mm-beacon: Fix ecspi2 pinmux
| * 5381350761 ARM: dts: imx6qdl-gw560x: Remove incorrect 'uart-has-rtscts'
| * 0e4bba1656 ARM: dts: imx7d-pico: Use 'clock-frequency'
| * 108cf4c6d5 ARM: dts: imx6ul-pico-dwarf: Use 'clock-frequency'
| * 207c9e64ed arm64: dts: imx8mp-phycore-som: Remove invalid PMIC property
| * 7ce380fe75 dmaengine: ti: k3-udma: Do conditional decrement of UDMA_CHAN_RT_PEER_BCNT_REG
| * edba9b7a70 memory: mvebu-devbus: Fix missing clk_disable_unprepare in mvebu_devbus_probe()
| * e66f6949da memory: atmel-sdramc: Fix missing clk_disable_unprepare in atmel_ramc_probe()
| * eda11ab556 memory: tegra: Remove clients SID override programming
* | cab35cbd71 Revert "xhci: Add update_hub_device override for PCI xHCI hosts"
* | 29e8f224d8 Revert "xhci: Detect lpm incapable xHC USB3 roothub ports from ACPI tables"
* | 5739b27e8f Revert "xhci: Add a flag to disable USB3 lpm on a xhci root port level."
* | 5b60fdf2e0 Merge 5.15.90 into android13-5.15-lts
|\|
| * aabd5ba7e9 Linux 5.15.90
| * 4b6f8263e9 io_uring/rw: remove leftover debug statement
| * b10acfcd61 io_uring/rw: ensure kiocb_end_write() is always called
| * 124fb13cc7 io_uring: fix double poll leak on repolling
| * e944f1e37b io_uring: Clean up a false-positive warning from GCC 9.3.0
| * 940e8922c1 mm/khugepaged: fix collapse_pte_mapped_thp() to allow anon_vma
| * e83cc8a780 soc: qcom: apr: Make qcom,protection-domain optional again
| * 982c8b1e95 Revert "wifi: mac80211: fix memory leak in ieee80211_if_add()"
| * 40a4797e08 block: mq-deadline: Rename deadline_is_seq_writes()
| * 3abf10b4c4 net/mlx5: fix missing mutex_unlock in mlx5_fw_fatal_reporter_err_work()
| * 1aab00aa41 net/ulp: use consistent error code when blocking ULP
| * 2e4c95a404 io_uring/net: fix fast_iov assignment in io_setup_async_msg()
| * 311b298a33 io_uring: io_kiocb_update_pos() should not touch file for non -1 offset
| * 487a086595 tracing: Use alignof__(struct {type b;}) instead of offsetof()
| * 430443f856 x86/fpu: Use _Alignof to avoid undefined behavior in TYPE_ALIGN
| * f114717dfa Revert "drm/amdgpu: make display pinning more flexible (v2)"
| * 7a993c1be5 efi: rt-wrapper: Add missing include
| * de2af657ca arm64: efi: Execute runtime services from a dedicated stack
| * 9cca110cf8 fs/ntfs3: Fix attr_punch_hole() null pointer derenference
| * d4d112e5c4 drm/amdgpu: drop experimental flag on aldebaran
| * c82fa690da drm/amd/display: Fix COLOR_SPACE_YCBCR2020_TYPE matrix
| * 88c3375224 drm/amd/display: Calculate output_color_space after pixel encoding adjustment
| * 87e605b161 drm/amd/display: Fix set scaling doesn's work
| * 8687b8cdc3 drm/i915/display: Check source height is > 0
| * 5d96179166 drm/i915: re-disable RC6p on Sandy Bridge
| * e9a7ec188b mei: me: add meteor lake point M DID
| * eb0421d90f gsmi: fix null-deref in gsmi_get_variable
| * b8d99cda52 serial: atmel: fix incorrect baudrate setup
| * b85498385a serial: amba-pl011: fix high priority character transmission in rs486 mode
| * 0f150134dd dmaengine: idxd: Let probe fail when workqueue cannot be enabled
| * 1e8c127c2e dmaengine: tegra210-adma: fix global intr clear
| * 473e2281f7 dmaengine: lgm: Move DT parsing after initialization
| * 73337724cb serial: pch_uart: Pass correct sg to dma_unmap_sg()
| * 4307a41cbc dt-bindings: phy: g12a-usb3-pcie-phy: fix compatible string documentation
| * c9d55f564a dt-bindings: phy: g12a-usb2-phy: fix compatible string documentation
| * 78aa45bb7a usb-storage: apply IGNORE_UAS only for HIKSEMI MD202 on RTL9210
| * a69c8dfb85 usb: gadget: f_ncm: fix potential NULL ptr deref in ncm_bitrate()
| * 1ab67e87b1 usb: gadget: g_webcam: Send color matching descriptor per frame
| * b08167d8f0 usb: typec: altmodes/displayport: Fix pin assignment calculation
| * 7fb1322e7a usb: typec: altmodes/displayport: Add pin assignment helper
| * 59f9ee3796 usb: typec: tcpm: Fix altmode re-registration causes sysfs create fail
| * a1c8a5c2f8 usb: host: ehci-fsl: Fix module alias
| * f073d10cd5 usb: cdns3: remove fetched trb from cache before dequeuing
| * 73f4bde973 USB: serial: cp210x: add SCALANCE LPE-9000 device id
| * a2e075f401 USB: gadgetfs: Fix race between mounting and unmounting
| * 2da67bff29 tty: fix possible null-ptr-defer in spk_ttyio_release
| * cb53a3366e tty: serial: qcom-geni-serial: fix slab-out-of-bounds on RX FIFO buffer
| * f322dd2e4a staging: mt7621-dts: change some node hex addresses to lower case
| * 6508788b2c bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD
| * 7b122c33bd riscv: dts: sifive: fu740: fix size of pcie 32bit memory
| * 701f9c3da6 thunderbolt: Use correct function to calculate maximum USB3 link rate
| * 5b1b03a3d3 cifs: do not include page data when checking signature
| * 64287cd456 btrfs: fix race between quota rescan and disable leading to NULL pointer deref
| * f2e0e1615d btrfs: do not abort transaction on failure to write log tree when syncing log
| * f653abe619 mmc: sdhci-esdhc-imx: correct the tuning start tap and step setting
| * 9881436f01 mmc: sunxi-mmc: Fix clock refcount imbalance during unbind
| * 33bd0db750 ACPI: PRM: Check whether EFI runtime is available
| * 87e1ee6058 comedi: adv_pci1760: Fix PWM instruction handling
| * b5d24a8e4a usb: core: hub: disable autosuspend for TI TUSB8041
| * 61a0890cb9 misc: fastrpc: Fix use-after-free race condition for maps
| * 1b7b7bb400 misc: fastrpc: Don't remove map on creater_process and device_release
| * e7e41fcf90 USB: misc: iowarrior: fix up header size for USB_DEVICE_ID_CODEMERCS_IOW100
| * f3de34d90d staging: vchiq_arm: fix enum vchiq_status return types
| * 16d09c4bc9 USB: serial: option: add Quectel EM05CN modem
| * 34d769f0c6 USB: serial: option: add Quectel EM05CN (SG) modem
| * 768d56ed24 USB: serial: option: add Quectel EC200U modem
| * 829916f069 USB: serial: option: add Quectel EM05-G (RS) modem
| * eb8808f769 USB: serial: option: add Quectel EM05-G (CS) modem
| * 6e0430db19 USB: serial: option: add Quectel EM05-G (GR) modem
| * f01aefe374 prlimit: do_prlimit needs to have a speculation check
| * 418e2c756d xhci: Detect lpm incapable xHC USB3 roothub ports from ACPI tables
| * 10cb7d53be usb: acpi: add helper to check port lpm capability using acpi _DSM
| * 1818e2a97d xhci: Add a flag to disable USB3 lpm on a xhci root port level.
| * 8911ff7963 xhci: Add update_hub_device override for PCI xHCI hosts
| * c462ac871f xhci: Fix null pointer dereference when host dies
| * f39c813af0 usb: xhci: Check endpoint is valid before dereferencing it
| * 0f175cebc4 xhci-pci: set the dma max_seg_size
| * 89a410dbd0 io_uring/rw: defer fsnotify calls to task context
| * 05d69b372b io_uring: do not recalculate ppos unnecessarily
| * ff8a070253 io_uring: update kiocb->ki_pos at execution time
| * b7958caf41 io_uring: remove duplicated calls to io_kiocb_ppos
| * 86e2d6901a io_uring: ensure that cached task references are always put on exit
| * 30b9068934 io_uring: fix async accept on O_NONBLOCK sockets
| * a79b13f249 io_uring: allow re-poll if we made progress
| * 3c1a3d0269 io_uring: support MSG_WAITALL for IORING_OP_SEND(MSG)
| * 390b881631 io_uring: add flag for disabling provided buffer recycling
| * 9b7b0f2116 io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly
| * cdc68e714d io_uring: improve send/recv error handling
| * ccf06b5a98 io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups
| * 77baf39227 eventfd: provide a eventfd_signal_mask() helper
| * a2d8ff00a7 eventpoll: add EPOLL_URING_WAKE poll wakeup flag
| * a9aa4aa7a5 io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL
| * bd9a23a4bb hugetlb: unshare some PMDs when splitting VMAs
| * 393d9e3ed1 drm/amd: Delay removal of the firmware framebuffer
| * 865e244e06 drm/amdgpu: disable runtime pm on several sienna cichlid cards(v2)
| * 560373fb1e ALSA: hda/realtek: fix mute/micmute LEDs don't work for a HP platform
| * 26264260a8 ALSA: hda/realtek: fix mute/micmute LEDs for a HP ProBook
| * 1026756321 efi: fix userspace infinite retry read efivars after EFI runtime services page fault
| * 45627a1a64 nilfs2: fix general protection fault in nilfs_btree_insert()
| * 350d66d9e7 zonefs: Detect append writes at invalid locations
| * 5054d001ff Add exception protection processing for vd in axi_chan_handle_err function
| * a12fd43bd1 wifi: mac80211: sdata can be NULL during AMPDU start
| * f96a6c009e wifi: brcmfmac: fix regression for Broadcom PCIe wifi devices
| * 908d1742b6 Bluetooth: hci_qca: Fix driver shutdown on closed serdev
| * 7530fbc05f fbdev: omapfb: avoid stack overflow warning
| * e1df7f0b27 perf/x86/rapl: Treat Tigerlake like Icelake
| * 2c129e8689 f2fs: let's avoid panic if extent_tree is not created
| * 58bac74402 x86/asm: Fix an assembler warning with current binutils
| * fdb4a70bb7 btrfs: always report error in run_one_delayed_ref()
| * f641067ea2 RDMA/srp: Move large values to a new enum for gcc13
| * 793f8ac218 r8169: move rtl_wol_enable_rx() and rtl_prepare_power_down()
| * dc072762f9 net/ethtool/ioctl: return -EOPNOTSUPP if we have no phy stats
| * 308d24d875 vduse: Validate vq_num in vduse_validate_config()
| * 8e1eb926a0 virtio_pci: modify ENOENT to EINVAL
| * 64a6f3689d tools/virtio: initialize spinlocks in vring_test.c
| * 95fc28a8e9 selftests/bpf: check null propagation only neither reg is PTR_TO_BTF_ID
| * d4a9d2944f pNFS/filelayout: Fix coalescing test for single DS
| * 6a3319af6b btrfs: fix trace event name typo for FLUSH_DELAYED_REFS
* |   52cea9ba91 Merge "Merge 5.15.89 into android13-5.15-lts" into android13-5.15-lts
|\ \
| * | de550d72f1 Merge 5.15.89 into android13-5.15-lts
| |\|
| | * 3bcc86eb3e Linux 5.15.89
| | * 37c18ef49e pinctrl: amd: Add dynamic debugging for active GPIOs
| | * a5841b81ad Revert "usb: ulpi: defer ulpi_register on ulpi_read_id timeout"
| | * 7ec9a45fc4 block: handle bio_split_to_limits() NULL return
| | * ba86db02d4 io_uring/io-wq: only free worker if it was allocated for creation
| | * bb135bcc94 io_uring/io-wq: free worker if task_work creation is canceled
| | * 63c2fa09b8 scsi: mpt3sas: Remove scsi_dma_map() error messages
| | * e2ea555642 efi: fix NULL-deref in init error path
| | * 94b6cf84db arm64: cmpxchg_double*: hazard against entire exchange variable
| | * 3891fa4982 arm64: atomics: remove LL/SC trampolines
| | * 61e86339af arm64: atomics: format whitespace consistently
| | * ed4629d1e9 io_uring: lock overflowing for IOPOLL
| | * fbf5015141 KVM: x86: Do not return host topology information from KVM_GET_SUPPORTED_CPUID
| | * ee16841134 Documentation: KVM: add API issues section
| | * b8f3b3cffb mm: Always release pages to the buddy allocator in memblock_free_late().
| | * d2dc110dea platform/surface: aggregator: Add missing call to ssam_request_sync_free()
| | * cfd5978411 igc: Fix PPS delta between two synchronized end-points
| | * 0bf52601ce perf build: Properly guard libbpf includes
| | * 205f35eee7 net/mlx5e: Don't support encap rules with gbp option
| | * 0526fc9330 net/mlx5: Fix ptp max frequency adjustment range
| | * 9e2c38827c net/sched: act_mpls: Fix warning during failed attribute validation
| | * e3bb44beaf tools/nolibc: fix the O_* fcntl/open macro definitions for riscv
| | * 1e6ec75bb3 tools/nolibc: restore mips branch ordering in the _start block
| | * bd0431a66c tools/nolibc: Remove .global _start from the entry point code
| | * a77c54f5b5 tools/nolibc/arch: mark the _start symbol as weak
| | * da51e086d1 tools/nolibc/arch: split arch-specific code into individual files
| | * 8591e788be tools/nolibc/types: split syscall-specific definitions into their own files
| | * 4fceecdeaa tools/nolibc/std: move the standard type definitions to std.h
| | * 1792136f22 tools/nolibc: use pselect6 on RISCV
| | * 487386a49e tools/nolibc: x86-64: Use `mov $60,%eax` instead of `mov $60,%rax`
| | * 27af4f2260 tools/nolibc: x86: Remove `r8`, `r9` and `r10` from the clobber list
| | * a60b24192b af_unix: selftest: Fix the size of the parameter to connect()
| | * 39ae73e581 nfc: pn533: Wait for out_urb's completion in pn533_usb_send_frame()
| | * f6003784b1 hvc/xen: lock console list traversal
| | * 79c58b7424 octeontx2-af: Fix LMAC config in cgx_lmac_rx_tx_enable
| | * 303d062881 tipc: fix unexpected link reset due to discovery messages
| | * e79d0f97cc ALSA: usb-audio: Relax hw constraints for implicit fb sync
| | * c9557906bd ALSA: usb-audio: Make sure to stop endpoints before closing EPs
| | * 83e758105b ASoC: wm8904: fix wrong outputs volume after power reactivation
| | * 7c26d21872 scsi: ufs: core: WLUN suspend SSU/enter hibern8 fail recovery
| | * 513fdf0b8e scsi: ufs: Stop using the clock scaling lock in the error handler
| | * 13259b60b7 scsi: mpi3mr: Refer CONFIG_SCSI_MPI3MR in Makefile
| | * 470f6a9175 regulator: da9211: Use irq handler when ready
| | * 24107ad469 x86/resctrl: Fix task CLOSID/RMID update race
| | * cd3da505fb EDAC/device: Fix period calculation in edac_device_reset_delay_period()
| | * ab0d02c53a x86/boot: Avoid using Intel mnemonics in AT&T syntax asm
| | * a90d339f1f powerpc/imc-pmu: Fix use of mutex in IRQs disabled section
| | * 511cf17b24 netfilter: ipset: Fix overflow before widen in the bitmap_ip_create() function.
| | * b22faa21b6 sched/core: Fix use-after-free bug in dup_user_cpus_ptr()
| | * d766ccadbe iommu/mediatek-v1: Fix an error handling path in mtk_iommu_v1_probe()
| | * c929a230c8 iommu/iova: Fix alloc iova overflows issue
| | * 4b51aa263a usb: ulpi: defer ulpi_register on ulpi_read_id timeout
| | * 9a8bf443f6 bus: mhi: host: Fix race between channel preparation and M0 event
| | * 456e3794e0 ipv6: raw: Deduct extension header length in rawv6_push_pending_frames
| | * 4c93422a54 ixgbe: fix pci device refcount leak
| | * e97da5d97a platform/x86: sony-laptop: Don't turn off 0x153 keyboard backlight during probe
| | * f3b1e04daf dt-bindings: msm/dsi: Don't require vcca-supply on 14nm PHY
| | * 52a5f596c6 dt-bindings: msm/dsi: Don't require vdds-supply on 10nm PHY
| | * 984ad875db drm/msm/dp: do not complete dp_aux_cmd_fifo_tx() if irq is not for aux transfer
| | * 92ae83665e platform/x86: ideapad-laptop: Add Legion 5 15ARH05 DMI id to set_fn_lock_led_list[]
| | * e38b5f81df dt-bindings: msm: dsi-phy-28nm: Add missing qcom, dsi-phy-regulator-ldo-mode
| | * bb32ab40cb dt-bindings: msm: dsi-controller-main: Fix description of core clock
| | * 3fb8d10bee dt-bindings: msm: dsi-controller-main: Fix power-domain constraint
| | * dc5b651cad drm/msm/adreno: Make adreno quirks not overwrite each other
| | * 757d665ee1 dt-bindings: msm: dsi-controller-main: Fix operating-points-v2 constraint
| | * c90cf47d30 platform/x86: dell-privacy: Fix SW_CAMERA_LENS_COVER reporting
| | * 25b5f693bc platform/surface: aggregator: Ignore command messages not intended for us
| | * ee7b8ce2cc platform/x86: dell-privacy: Only register SW_CAMERA_LENS_COVER if present
| | * e0072068ad cifs: Fix uninitialized memory read for smb311 posix symlink create
| | * f3495b5e9e net/mlx5e: Set action fwd flag when parsing tc action goto
| | * 1a8431cc20 drm/i915/gt: Reset twice
| | * 011ecdbcd5 drm/virtio: Fix GEM handle creation UAF
| | * 798dfeeae3 s390/percpu: add READ_ONCE() to arch_this_cpu_to_op_simple()
| | * a400593eb3 s390/cpum_sf: add READ_ONCE() semantics to compare and swap loops
| | * d4fa65960a ASoC: qcom: lpass-cpu: Fix fallback SD line index handling
| | * 8400b91c11 s390/kexec: fix ipl report address for kdump
| | * c07e0babd1 perf auxtrace: Fix address filter duplicate symbol selection
| | * e81d82da61 net: stmmac: add aux timestamps fifo clearance wait
| | * 44167b74a8 docs: Fix the docs build with Sphinx 6.0
| | * 24176bf2a1 efi: tpm: Avoid READ_ONCE() for accessing the event log
| | * 01b966b14c selftests: kvm: Fix a compile error in selftests/kvm/rseq_test.c
| | * c773ebe11c KVM: arm64: nvhe: Fix build with profile optimization
| | * c1d6a72fc8 KVM: arm64: Fix S1PTW handling on RO memslots
| | * e04e6cd883 ALSA: hda/realtek: Enable mute/micmute LEDs on HP Spectre x360 13-aw0xxx
| | * b983c9a971 ALSA: hda/realtek - Turn on power early
| | * 9ab3696881 ALSA: control-led: use strscpy in set_led_id()
| | * a8acfe2c6f netfilter: nft_payload: incorrect arithmetics when fetching VLAN header bits
* | | 2c4f6d72f1 Merge "Merge 5.15.88 into android13-5.15-lts" into android13-5.15-lts
|\| |
| * | 773ec50a8a Merge 5.15.88 into android13-5.15-lts
| |\|
| | * 90bb4f8f39 Linux 5.15.88
| | * cbd3e6d5e5 ALSA: hda - Enable headset mic on another Dell laptop with ALC3254
| | * b98dee4746 ALSA: hda/hdmi: Add a HP device 0x8715 to force connect list
| | * 26350c21bc ALSA: pcm: Move rwsem lock inside snd_ctl_elem_read to prevent UAF
| | * dadd0dcaa6 net/ulp: prevent ULP without clone op from entering the LISTEN status
| | * 04941c1d5b net: sched: disallow noqueue for qdisc classes
| | * 068b512193 serial: fixup backport of "serial: Deassert Transmit Enable on probe in driver-specific way"
| | * 46aa155758 selftests/vm/pkeys: Add a regression test for setting PKRU through ptrace
| | * 3c1940c549 x86/fpu: Emulate XRSTOR's behavior if the xfeatures PKRU bit is not set
| | * 3f1c81426a x86/fpu: Allow PKRU to be (once again) written by ptrace.
| | * b29773d6b0 x86/fpu: Add a pkru argument to copy_uabi_to_xstate()
| | * 9813c5fc22 x86/fpu: Add a pkru argument to copy_uabi_from_kernel_to_xstate().
| | * fea26e83a1 x86/fpu: Take task_struct* in copy_sigframe_from_user_to_xstate()
| | * d4d152017e parisc: Align parisc MADV_XXX constants with all other architectures
| * | 1867565896 Revert "ASoC/SoundWire: dai: expand 'stream' concept beyond SoundWire"
| * | 43064ed394 Revert "ASoC: Intel/SOF: use set_stream() instead of set_tdm_slots() for HDAudio"
| * | 959d50edd2 Revert "PM/devfreq: governor: Add a private governor_data for governor"
* | | c34c76a947 Revert "ASoC/SoundWire: dai: expand 'stream' concept beyond SoundWire"
* | | 33ef84070b Revert "ASoC: Intel/SOF: use set_stream() instead of set_tdm_slots() for HDAudio"
* | | e60641bdca Revert "PM/devfreq: governor: Add a private governor_data for governor"
* | | 793ec0a9cc Merge "Merge 5.15.87 into android13-5.15-lts" into android13-5.15-lts
|\| |
| * | fc4de343bd Merge 5.15.87 into android13-5.15-lts
| |\|
| | * d57287729e Linux 5.15.87
| | * 24186c6822 drm/mgag200: Fix PLL setup for G200_SE_A rev >=4
| | * e326ee018a io_uring: Fix unsigned 'res' comparison with zero in io_fixup_rw_res()
| | * b2b6eefab4 efi: random: combine bootloader provided RNG seed with RNG protocol output
| | * 99c0759495 mbcache: Avoid nesting of cache->c_list_lock under bit locks
| | * d50d6c193a net: hns3: fix return value check bug of rx copybreak
| | * d4e6a13eb9 btrfs: make thaw time super block check to also verify checksum
| | * 70a1dccd0e selftests: set the BUILD variable to absolute path
| | * 58fef3ebc8 ext4: don't allow journal inode to have encrypt flag
| | * bd5dc96fea mptcp: use proper req destructor for IPv6
| | * 78bd6ab52c mptcp: dedicated request sock for subflow in v6
| | * 6e9c1aef3e Revert "ACPI: PM: Add support for upcoming AMD uPEP HID AMDI007"
| | * e32f867b37 ksmbd: check nt_len to be at least CIFS_ENCPWD_SIZE in ksmbd_decode_ntlmssp_auth_blob
| | * 4136f1ac1e ksmbd: fix infinite loop in ksmbd_conn_handler_loop()
| | * f10defb0be hfs/hfsplus: avoid WARN_ON() for sanity check, use proper error handling
| | * 48d9e2e6de hfs/hfsplus: use WARN_ON for sanity check
| | * f5a9bbf962 drm/i915/gvt: fix vgpu debugfs clean in remove
| | * ae9a615117 drm/i915/gvt: fix gvt debugfs destroy
| | * eb3e943a32 riscv, kprobes: Stricter c.jr/c.jalr decoding
| | * 620a229f57 riscv: uaccess: fix type of 0 variable on error in get_user()
| | * 8e05a993f8 thermal: int340x: Add missing attribute for data rate base
| | * c3222fd282 io_uring: fix CQ waiting timeout handling
| | * b7b9bc9305 block: don't allow splitting of a REQ_NOWAIT bio
| | * e1358c8787 fbdev: matroxfb: G200eW: Increase max memory from 1 MB to 16 MB
| | * 682a7d064f nfsd: fix handling of readdir in v4root vs. mount upcall timeout
| | * cb42aa7b5f x86/bugs: Flush IBP in ib_prctl_set()
| | * 554a880a1f x86/kexec: Fix double-free of elf header buffer
| | * 264241a610 btrfs: check superblock to ensure the fs was not modified at thaw time
| | * 69f4bda5f4 nvme: also return I/O command effects from nvme_command_effects
| | * a6a4b057cd nvmet: use NVME_CMD_EFFECTS_CSUPP instead of open coding it
| | * f9309dcaa9 io_uring: check for valid register opcode earlier
| | * 4df413d469 nvme: fix multipath crash caused by flush request when blktrace is enabled
| | * 03ce792128 ASoC: Intel: bytcr_rt5640: Add quirk for the Advantech MICA-071 tablet
| | * 0dca7375e2 udf: Fix extension of the last extent in the file
| | * dc1bc90397 caif: fix memory leak in cfctrl_linkup_request()
| | * bce3680b48 drm/i915: unpin on error in intel_vgpu_shadow_mm_pin()
| | * da6a3653b8 perf stat: Fix handling of --for-each-cgroup with --bpf-counters to match non BPF mode
| | * 11cd4ec635 usb: rndis_host: Secure rndis_query check against int overflow
| | * 6ea5273c71 octeontx2-pf: Fix lmtst ID used in aura free
| | * 4e5f2c74cb drivers/net/bonding/bond_3ad: return when there's no aggregator
| | * 8414983c2e fs/ntfs3: don't hold ni_lock when calling truncate_setsize()
| | * a23e8376e6 drm/imx: ipuv3-plane: Fix overlay plane width
| | * a8f7fd322f perf tools: Fix resources leak in perf_data__open_dir()
| | * a1e1521b46 netfilter: ipset: Rework long task execution when adding/deleting entries
| | * 6f19a38483 netfilter: ipset: fix hash:net,port,net hang with /0 subnet
| | * 774d259749 net: sparx5: Fix reading of the MAC address
| | * 04dc4003e5 net: sched: cbq: dont intepret cls results when asked to drop
| | * f02327a487 net: sched: atm: dont intepret cls results when asked to drop
| | * 95da1882ce gpio: sifive: Fix refcount leak in sifive_gpio_probe
| | * da9c9883ec ceph: switch to vfs_inode_has_locks() to fix file lock bug
| | * 54e72ce5f1 filelock: new helper: vfs_inode_has_locks
| | * f34b03ce3a drm/meson: Reduce the FIFO lines held when AFBC is not used
| | * 05a8410b0f RDMA/mlx5: Fix validation of max_rd_atomic caps for DC
| | * 8d89870d63 RDMA/mlx5: Fix mlx5_ib_get_hw_stats when used for device
| | * 4d112f0016 net: phy: xgmiitorgmii: Fix refcount leak in xgmiitorgmii_probe
| | * e5fbeb3d16 net: ena: Update NUMA TPH hint register upon NUMA node update
| | * 7840b93cfd net: ena: Set default value for RX interrupt moderation
| | * d09b7a9d2f net: ena: Fix rx_copybreak value update
| | * 0e7ad9b006 net: ena: Use bitmask to indicate packet redirection
| | * 5d4964984b net: ena: Account for the number of processed bytes in XDP
| | * f17d9aec07 net: ena: Don't register memory info on XDP exchange
| | * a4aa727ad0 net: ena: Fix toeplitz initial hash value
| | * 0bec17f1ce net: amd-xgbe: add missed tasklet_kill
| | * cb2f74685f net/mlx5e: Fix hw mtu initializing at XDP SQ allocation
| | * 6c72abb78b net/mlx5e: Always clear dest encap in neigh-update-del
| | * b36783bc11 net/mlx5e: TC, Refactor mlx5e_tc_add_flow_mod_hdr() to get flow attr
| | * f8c10eeba3 net/mlx5e: IPoIB, Don't allow CQE compression to be turned on by default
| | * 7227bbb7c1 net/mlx5: Avoid recovery in probe flows
| | * 9369b9afa8 net/mlx5: Add forgotten cleanup calls into mlx5_init_once() error path
| | * d966f2ee4b net/mlx5: E-Switch, properly handle ingress tagged packets on VST
| | * 6a37a01aba vdpa_sim: fix vringh initialization in vdpasim_queue_ready()
| | * e3462410c3 vhost: fix range used in translate_desc()
| | * 13871f60ec vringh: fix range used in iotlb_translate()
| | * e05d4c8c28 vhost/vsock: Fix error handling in vhost_vsock_init()
| | * 586e6fd7d5 vdpa_sim: fix possible memory leak in vdpasim_net_init() and vdpasim_blk_init()
| | * b63bc2db24 nfc: Fix potential resource leaks
| | * 945e58bdaf net: dsa: mv88e6xxx: depend on PTP conditionally
| | * 95df720e64 qlcnic: prevent ->dcb use-after-free on qlcnic_dcb_enable() failure
| | * 6c55953e23 net: sched: fix memory leak in tcindex_set_parms
| | * d14a4b24d5 net: hns3: fix VF promisc mode not update when mac table full
| | * 7ed205b947 net: hns3: fix miss L3E checking for rx packet
| | * 47868cb77f net: hns3: extract macro to simplify ring stats update code
| | * 7457c5a776 net: hns3: refactor hns3_nic_reuse_page()
| | * 4a6e9fb534 net: hns3: add interrupts re-initialization while doing VF FLR
| | * 5e48ed805c nfsd: shut down the NFSv4 state objects before the filecache
| | * 7e2825f5fb veth: Fix race with AF_XDP exposing old or uninitialized descriptors
| | * ac95cdafac netfilter: nf_tables: honor set timeout and garbage collection updates
| | * 49677ea151 vmxnet3: correctly report csum_level for encapsulated packet
| | * 9d30cb4421 netfilter: nf_tables: perform type checking for existing sets
| | * c3bfb7784a netfilter: nf_tables: add function to create set stateful expressions
| | * 996cd779c2 netfilter: nf_tables: consolidate set description
| | * 4f1105ee72 drm/panfrost: Fix GEM handle creation ref-counting
| | * df493f676f bpf: pull before calling skb_postpull_rcsum()
| | * d7e817e689 btrfs: fix an error handling path in btrfs_defrag_leaves()
| | * 4d69cdba2c SUNRPC: ensure the matching upcall is in-flight upon downcall
| | * af0265dfef drm/i915/migrate: fix length calculation
| | * 8b25a526a5 drm/i915/migrate: fix offset calculation
| | * a3d1e6f9b6 drm/i915/migrate: don't check the scratch page
| | * 5bc0b2fda4 ext4: fix deadlock due to mbcache entry corruption
| | * a6e4094faf mbcache: automatically delete entries from cache on freeing
| | * 1872549129 ext4: correct inconsistent error msg in nojournal mode
| | * 761f88f82e ext4: goto right label 'failed_mount3a'
| | * eb16602140 ravb: Fix "failed to switch device to config mode" message during unbind
| | * 4216995dbd perf probe: Fix to get the DW_AT_decl_file and DW_AT_call_file as unsinged data
| | * d8bbbf2b52 perf probe: Use dwarf_attr_integrate as generic DWARF attr accessor
| | * b131b5f136 media: s5p-mfc: Fix in register read and write for H264
| | * ff27800c0a media: s5p-mfc: Clear workbit to handle error condition
| | * 4653ba32ad media: s5p-mfc: Fix to handle reference queue during finishing
| | * 1bd7283dc0 x86/MCE/AMD: Clear DFR errors found in THR handler
| | * 5ddcd349d9 x86/mce: Get rid of msr_ops
| | * b8e7ed42bc btrfs: fix extent map use-after-free when handling missing device in read_one_chunk
| | * 9c3beebd21 btrfs: move missing device handling in a dedicate function
| | * 7528b21ceb btrfs: replace strncpy() with strscpy()
| | * 4cef44525f phy: qcom-qmp-combo: fix out-of-bounds clock access
| | * 855edc4ec6 ARM: renumber bits related to _TIF_WORK_MASK
| | * 18f28f1330 ext4: fix off-by-one errors in fast-commit block filling
| | * b205332b6b ext4: fix unaligned memory access in ext4_fc_reserve_space()
| | * 9c197dcbac ext4: add missing validation of fast-commit record lengths
| | * 6220ec4055 ext4: don't set up encryption key during jbd2 transaction
| | * 6482d42baf ext4: disable fast-commit of encrypted dir operations
| | * 6969367c15 ext4: fix potential out of bound read in ext4_fc_replay_scan()
| | * 818175ae3b ext4: factor out ext4_fc_get_tl()
| | * ffd84d0bc5 ext4: introduce EXT4_FC_TAG_BASE_LEN helper
| | * 37914e029b ext4: use ext4_debug() instead of jbd_debug()
| | * b0ed9a032e ext4: remove unused enum EXT4_FC_COMMIT_FAILED
| | * 394514ddf9 tracing: Fix issue of missing one synthetic field
| | * 5234dd5d20 block: mq-deadline: Fix dd_finish_request() for zoned devices
| | * 78623b10fc drm/amdgpu: make display pinning more flexible (v2)
| | * 6363da2c85 drm/amdgpu: handle polaris10/11 overlap asics (v2)
| | * 2771c7a0ee ext4: allocate extended attribute value in vmalloc area
| | * e995ff918e ext4: avoid unaccounted block allocation when expanding inode
| | * 877247222a ext4: initialize quota before expanding inode in setproject ioctl
| | * 322cf639b0 ext4: fix inode leak in ext4_xattr_inode_create() on an error path
| | * 6380a93b57 ext4: fix kernel BUG in 'ext4_write_inline_data_end()'
| | * dc3bbc9753 ext4: avoid BUG_ON when creating xattrs
| | * 844c405552 ext4: fix error code return to user-space in ext4_get_branch()
| | * b870b28e29 ext4: fix corruption when online resizing a 1K bigalloc fs
| | * d440d6427a ext4: fix delayed allocation bug in ext4_clu_mapped for bigalloc + inline
| | * def7a39091 ext4: init quota for 'old.inode' in 'ext4_rename'
| | * 3c31d8d3ad ext4: fix uninitialized value in 'ext4_evict_inode'
| | * 871800770d ext4: fix leaking uninitialized memory in fast-commit journal
| | * d480a49c15 ext4: fix bug_on in __es_tree_search caused by bad boot loader inode
| | * 91009e361e ext4: check and assert if marking a no_delete evicting inode dirty
| | * 820eacbc4e ext4: fix reserved cluster accounting in __es_remove_extent()
| | * 0dcbf4dc3d ext4: fix bug_on in __es_tree_search caused by bad quota inode
| | * 06a20a68bb ext4: add helper to check quota inums
| | * f7e6b5548f ext4: add EXT4_IGET_BAD flag to prevent unexpected bad inode
| | * 205ac16628 ext4: fix undefined behavior in bit shift for ext4_check_flag_values
| | * cf0e0817b0 ext4: fix use-after-free in ext4_orphan_cleanup
| | * 970bfd7a41 fs: ext4: initialize fsdata in pagecache_write()
| | * 744bbde378 ext4: remove trailing newline from ext4_msg() message
| | * 7192afa5e4 ext4: add inode table check in __ext4_get_inode_loc to avoid possible infinite loop
| | * 0d041b7251 ext4: silence the warning when evicting inode with dioread_nolock
| | * af4ceb00eb drm/ingenic: Fix missing platform_driver_unregister() call in ingenic_drm_init()
| | * c919e1154b drm/i915/dsi: fix VBT send packet port selection for dual link DSI
| | * 6948e570f5 drm/vmwgfx: Validate the box size for the snooped cursor
| | * 5594fde1ef drm/connector: send hotplug uevent on connector cleanup
| | * 317ebe61a6 device_cgroup: Roll back to original exceptions after copy failure
| | * ac838c663b parisc: led: Fix potential null-ptr-deref in start_task()
| | * 2c1881f081 remoteproc: core: Do pm_relax when in RPROC_OFFLINE state
| | * 9b615f957c iommu/amd: Fix ivrs_acpihid cmdline parsing code
| | * 35b792179b phy: qcom-qmp-combo: fix sc8180x reset
| | * dfd05a1335 driver core: Fix bus_type.match() error handling in __driver_attach()
| | * 44618a3397 crypto: ccp - Add support for TEE for PCI ID 0x14CA
| | * c55507a94b crypto: n2 - add missing hash statesize
| | * 4830750696 riscv: mm: notify remote harts about mmu cache updates
| | * 16b6d9525d riscv: stacktrace: Fixup ftrace_graph_ret_addr retp argument
| | * 657b440a27 PCI/sysfs: Fix double free in error path
| | * 67fd41bbb0 PCI: Fix pci_device_is_present() for VFs by checking PF
| | * bfce073089 ipmi: fix use after free in _ipmi_destroy_user()
| | * 3b4984035c ima: Fix a potential NULL pointer access in ima_restore_measurement_list
| | * a843699f16 mtd: spi-nor: Check for zero erase size in spi_nor_find_best_erase_type()
| | * 24f4649cd8 ipmi: fix long wait in unload when IPMI disconnect
| | * fa6bbb4894 ipu3-imgu: Fix NULL pointer dereference in imgu_subdev_set_selection()
| | * cdb208b090 ASoC: jz4740-i2s: Handle independent FIFO flush bits
| | * 2d0d083d8a wifi: wilc1000: sdio: fix module autoloading
| | * 2e4a088804 efi: Add iMac Pro 2017 to uefi skip cert quirk
| | * c49fb9b760 md/bitmap: Fix bitmap chunk size overflow issues
| | * 94fe975d54 block: mq-deadline: Do not break sequential write streams to zoned HDDs
| | * 8e91679f7b rtc: ds1347: fix value written to century register
| | * 5eb8296d73 cifs: fix missing display of three mount options
| | * cfa9f66f91 cifs: fix confusing debug message
| | * 8b45a3b19a media: dvb-core: Fix UAF due to refcount races at releasing
| | * acf984a371 media: dvb-core: Fix double free in dvb_register_device()
| | * 5fac317bee ARM: 9256/1: NWFPE: avoid compiler-generated __aeabi_uldivmod
| | * ce50c61245 staging: media: tegra-video: fix device_node use after free
| | * 6b16758215 staging: media: tegra-video: fix chan->mipi value on error
| | * 4f5de49d8c tracing: Fix infinite loop in tracing_read_pipe on overflowed print_trace_line
| | * 17becbc4dd tracing/probes: Handle system names with hyphens
| | * 2442e655a6 tracing/hist: Fix wrong return value in parse_action_params()
| | * 2a81ff5ce8 tracing: Fix complicated dependency of CONFIG_TRACER_MAX_TRACE
| | * fe8c35c6ff tracing: Fix race where eprobes can be called before the event
| | * eb20f6ed37 x86/kprobes: Fix optprobe optimization check with CONFIG_RETHUNK
| | * 3e0fbc06db x86/kprobes: Fix kprobes instruction boundary check with CONFIG_RETHUNK
| | * 6268a0704b ftrace/x86: Add back ftrace_expected for ftrace bug reports
| | * c95cf30dd4 x86/microcode/intel: Do not retry microcode reloading on the APs
| | * f8fe2f4178 KVM: nVMX: Properly expose ENABLE_USR_WAIT_PAUSE control to L1
| | * ca3483d71b KVM: nVMX: Inject #GP, not #UD, if "generic" VMXON CR0/CR4 check fails
| | * 2c73b349fd KVM: VMX: Resume guest immediately when injecting #GP on ECREATE
| | * 4a19f48bee of/kexec: Fix reading 32-bit "linux,initrd-{start,end}" values
| | * 7eddcdb09f perf/core: Call LSM hook after copying perf_event_attr
| | * 15697f6533 tracing/hist: Fix out-of-bound write on 'action_data.var_ref_idx'
| | * fd52b86a72 dm cache: set needs_check flag after aborting metadata
| | * d2a0b298eb dm cache: Fix UAF in destroy()
| | * 856edd0e92 dm clone: Fix UAF in clone_dtr()
| | * 9215b25f2e dm integrity: Fix UAF in dm_integrity_dtr()
| | * 34cd15d83b dm thin: Fix UAF in run_timer_softirq()
| | * ac362c40e3 dm thin: resume even if in FAIL mode
| | * 4b710e8481 dm thin: Use last transaction's pmd->root when commit failed
| | * f8c26c33fe dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata
| | * 28d307f380 dm cache: Fix ABBA deadlock between shrink_slab and dm_cache_metadata_abort
| | * a9e89a567f mptcp: remove MPTCP 'ifdef' in TCP SYN cookies
| | * 13b9fd0dee mptcp: mark ops structures as ro_after_init
| | * b2120ed7fd fs: dlm: retry accept() until -EAGAIN or error returns
| | * 5b4478615f fs: dlm: fix sock release if listen fails
| | * b7ede8a63d ALSA: hda/realtek: Apply dual codec fixup for Dell Latitude laptops
| | * dbd1f30191 ALSA: patch_realtek: Fix Dell Inspiron Plus 16
| | * 8fb4c98f20 cpufreq: Init completion before kobject_init_and_add()
| | * 876c6ab967 PM/devfreq: governor: Add a private governor_data for governor
| | * 0e945ea733 selftests: Use optional USERCFLAGS and USERLDFLAGS
| | * 31697c5953 arm64: dts: qcom: sdm850-lenovo-yoga-c630: correct I2C12 pins drive strength
| | * 1630498660 ARM: ux500: do not directly dereference __iomem
| | * 99590f29b2 btrfs: fix resolving backrefs for inline extent followed by prealloc
| | * 1f9cf4daf2 mmc: sdhci-sprd: Disable CLK_AUTO when the clock is less than 400K
| | * 58d53ff30a arm64: dts: qcom: sdm845-db845c: correct SPI2 pins drive strength
| | * a777b90a05 perf/x86/intel/uncore: Clear attr_update properly
| | * ca77ac238c perf/x86/intel/uncore: Disable I/O stacks to PMU mapping on ICX-D
| | * df06e7777c jbd2: use the correct print format
| | * 8e75b1dd4b ktest.pl minconfig: Unset configs instead of just removing them
| | * 55e5e8b445 ktest.pl: Fix grub2 menu handling for rebooting
| | * 823fed7c40 soc: qcom: Select REMAP_MMIO for LLCC driver
| | * 8dabeeb1ff media: stv0288: use explicitly signed char
| | * d167ebea90 net/af_packet: make sure to pull mac header
| | * 9ff46c36df net/af_packet: add VLAN support for AF_PACKET SOCK_RAW GSO
| | * cd0f597c8a rcu-tasks: Simplify trc_read_check_handler() atomic operations
| | * 593ca69668 ASoC/SoundWire: dai: expand 'stream' concept beyond SoundWire
| | * a7874dac6b ASoC: Intel/SOF: use set_stream() instead of set_tdm_slots() for HDAudio
| | * ae4f70b2fe kcsan: Instrument memcpy/memset/memmove with newer Clang
| | * d01fa993eb SUNRPC: Don't leak netobj memory when gss_read_proxy_verf() fails
| | * 43135fb098 tpm: tpm_tis: Add the missed acpi_put_table() to fix memory leak
| | * 986cd9a9b9 tpm: tpm_crb: Add the missed acpi_put_table() to fix memory leak
| | * 638cd298df tpm: acpi: Call acpi_put_table() to fix memory leak
| | * d58289fc77 mmc: vub300: fix warning - do not call blocking ops when !TASK_RUNNING
| | * 7eb57bc92f f2fs: allow to read node block after shutdown
| | * acc13987fd f2fs: should put a page when checking the summary info
| | * 35d8a89862 mm, compaction: fix fast_isolate_around() to stay within boundaries
| | * 91bd504128 md: fix a crash in mempool_free
| | * 29328fbce5 mfd: mt6360: Add bounds checking in Regmap read/write call-backs
| | * c24cc476ac pnode: terminate at peers of source
| | * 0c9118e381 ALSA: line6: fix stack overflow in line6_midi_transmit
| | * ac4b4fdf32 ALSA: line6: correct midi status byte when receiving data from podxt
| | * 83c44f0ebf ovl: Use ovl mounter's fsuid and fsgid in ovl_link()
| | * fcb94283e0 binfmt: Fix error return code in load_elf_fdpic_binary()
| | * ed9947277b hfsplus: fix bug causing custom uid and gid being unable to be assigned with mount
| | * 76d52b5412 pstore/zone: Use GFP_ATOMIC to allocate zone buffer
| | * 74b0a2fcc3 pstore: Properly assign mem_type property
| | * d25aac3489 HID: plantronics: Additional PIDs for double volume key presses quirk
| | * 9d4294545c HID: multitouch: fix Asus ExpertBook P2 P2451FA trackpoint
| | * 7280fdb80b powerpc/rtas: avoid scheduling in rtas_os_term()
| | * d8939315b7 powerpc/rtas: avoid device tree lookups in rtas_os_term()
| | * 23a249b118 objtool: Fix SEGFAULT
| | * ed686e7a26 fs/ntfs3: Fix slab-out-of-bounds in r_page
| | * dd34665cb0 fs/ntfs3: Delete duplicate condition in ntfs_read_mft()
| | * a9847a11b6 fs/ntfs3: Use __GFP_NOWARN allocation at ntfs_fill_super()
| | * abd2ee2cf4 fs/ntfs3: Use __GFP_NOWARN allocation at wnd_init()
| | * d7ce7bb688 fs/ntfs3: Validate index root when initialize NTFS security
| | * f29676cc3a soundwire: dmi-quirks: add quirk variant for LAPBC710 NUC15
| | * 9c8471a17f fs/ntfs3: Fix slab-out-of-bounds read in run_unpack
| | * 3a52f17867 fs/ntfs3: Validate resident attribute name
| | * 3cd9e5b41b fs/ntfs3: Validate buffer length while parsing index
| | * c878a915bc fs/ntfs3: Validate attribute name offset
| | * f62506f5e4 fs/ntfs3: Add null pointer check for inode operations
| | * 2dd9ccfb06 fs/ntfs3: Fix memory leak on ntfs_fill_super() error path
| | * ea6b359840 fs/ntfs3: Add null pointer check to attr_load_runs_vcn
| | * de5e095524 fs/ntfs3: Validate data run offset
| | * d4489ba8fb fs/ntfs3: Add overflow check for attribute size
| | * af7a195dea fs/ntfs3: Validate BOOT record_size
| | * 8e228ac90c nvmet: don't defer passthrough commands with trivial effects to the workqueue
| | * f068a7315a nvme: fix the NVME_CMD_EFFECTS_CSE_MASK definition
| | * 576502f25f ata: ahci: Fix PCS quirk application for suspend
| | * 7949b0df3d block, bfq: fix uaf for bfqq in bfq_exit_icq_bfqq
| | * ff3d9ab51c ACPI: resource: do IRQ override on Lenovo 14ALC7
| | * 698a0813ce ACPI: resource: do IRQ override on XMG Core 15
| | * a9ac7633bb ACPI: resource: do IRQ override on LENOVO IdeaPad
| | * 5fe31f2950 ACPI: resource: Skip IRQ override on Asus Vivobook K3402ZA/K3502ZA
| | * 4c5fee0d88 nvme-pci: fix page size checks
| | * 9141144b37 nvme-pci: fix mempool alloc size
| | * f17cf8fa2c nvme-pci: fix doorbell buffer value endianness
| | * ead99ec669 Revert "selftests/bpf: Add test for unstable CT lookup API"
| | * bf0543b937 cifs: fix oops during encryption
| | * 56f6de394f usb: dwc3: qcom: Fix memory leak in dwc3_qcom_interconnect_init
* | | 2ce8e6e296 ANDROID: add __dev_kfree_skb_irq to virtual_device abi list
|/ /
* | 24bc28221f Revert "net: add atomic_long_t to net_device_stats fields"
* | 34d878c5b3 Revert "ipv6/sit: use DEV_STATS_INC() to avoid data-races"
* | 956e2924f3 Revert "arm64: Treat ESR_ELx as a 64-bit register"
* | 8a3baaa85e Revert "arm64: mm: kfence: only handle translation faults"
* | 8b3730f922 Revert "gpiolib: protect the GPIO device against being dropped while in use by user-space"
* | b0e87c106d Revert "soreuseport: Fix socket selection for SO_INCOMING_CPU."
* | 8a8a0cb6c6 Revert "bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes"
* | 2d4c48bff9 Revert "xhci: Prevent infinite loop in transaction errors recovery for streams"
* | 20ec745823 Merge 5.15.86 into android13-5.15-lts
|\|
| * 90ffbb727c Linux 5.15.86
| * 3082f8705e pwm: tegra: Fix 32 bit build
| * caa40d1f85 mfd: qcom_rpm: Use devm_of_platform_populate() to simplify code
| * 408dbaa065 extcon: usbc-tusb320: Call the Type-C IRQ handler only if a port is registered
| * 2471a44769 media: dvbdev: fix refcnt bug
| * 579fb0a332 media: dvbdev: fix build warning due to comments
| * 1115e77c4f net: stmmac: fix errno when create_singlethread_workqueue() fails
| * d3871af13a scsi: qla2xxx: Fix crash when I/O abort times out
| * 50f993da94 btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range
| * 1c65d50315 ovl: fix use inode directly in rcu-walk mode
| * 88ec6d1105 fbdev: fbcon: release buffer when fbcon_do_set_font() failed
| * ca8bcb348a gcov: add support for checksum field
| * f36d8c8651 floppy: Fix memory leak in do_floppy_init()
| * 4193a6745b regulator: core: fix deadlock on regulator enable
| * ce5d0ef1cf iio: adc128s052: add proper .data members in adc128_of_match table
| * aec1058f2a iio: adc: ad_sigma_delta: do not use internal iio_dev lock
| * dc6afd6070 iio: fix memory leak in iio_device_register_eventset()
| * 38c257ee6a reiserfs: Add missing calls to reiserfs_security_free()
| * 8a4236456a security: Restrict CONFIG_ZERO_CALL_USED_REGS to gcc or clang > 15.0.6
| * 1cabce5662 9p: set req refcount to zero to avoid uninitialized usage
| * dd2157a98f loop: Fix the max_loop commandline argument treatment when it is set to 0
| * fd03bd4c7b HID: mcp2221: don't connect hidraw
| * 6c886be1ff HID: wacom: Ensure bootloader PID is usable in hidraw mode
| * 4d640eb112 xhci: Prevent infinite loop in transaction errors recovery for streams
| * 936c5f96c8 usb: dwc3: core: defer probe on ulpi_read_id timeout
| * e6bf6c4022 usb: dwc3: Fix race between dwc3_set_mode and __dwc3_set_mode
| * 0e883f3bc8 arm64: dts: qcom: sm8250: fix USB-DP PHY registers
| * ffb14aac26 usb: xhci-mtk: fix leakage of shared hcd when fail to set wakeup irq
| * fcacd970e0 usb: cdnsp: fix lack of ZLP for ep0
| * bcac79df08 ALSA: hda/hdmi: Add HP Device 0x8711 to force connect list
| * 50c23a1107 ALSA: hda/realtek: Add quirk for Lenovo TianYi510Pro-14IOB
| * 76574b3465 ALSA: usb-audio: add the quirk for KT0206 device
| * 9e787dab98 ima: Simplify ima_lsm_copy_rule
| * 2cd365029c pstore: Make sure CONFIG_PSTORE_PMSG selects CONFIG_RT_MUTEXES
| * 2068d41a3d afs: Fix lost servers_outstanding count
| * 0def8af038 perf debug: Set debug_peo_args and redirect_to_stderr variable to correct values in perf_quiet_option()
| * 41cccae10e pstore: Switch pmsg_lock to an rt_mutex to avoid priority inversion
| * 8877df8135 LoadPin: Ignore the "contents" argument of the LSM hooks
| * 584202b0f1 drm/i915/display: Don't disable DDI/Transcoder when setting phy test pattern
| * b253e075b1 ASoC: rt5670: Remove unbalanced pm_runtime_put()
| * 59f797a913 ASoC: rockchip: spdif: Add missing clk_disable_unprepare() in rk_spdif_runtime_resume()
| * 132844d92f ASoC: wm8994: Fix potential deadlock
| * 82f7c814ed ASoC: mediatek: mt8183: fix refcount leak in mt8183_mt6358_ts3a227_max98357_dev_probe()
| * e5d6bf3e5a ASoC: rockchip: pdm: Add missing clk_disable_unprepare() in rockchip_pdm_runtime_resume()
| * 85eb5c952b ASoC: audio-graph-card: fix refcount leak of cpu_ep in __graph_for_each_link()
| * 9ff07316ca ASoC: mediatek: mt8173-rt5650-rt5514: fix refcount leak in mt8173_rt5650_rt5514_dev_probe()
| * 7643909cf0 ASoC: Intel: Skylake: Fix driver hang during shutdown
| * 33ff0f9f9c ALSA: hda: add snd_hdac_stop_streams() helper
| * 78649a624d ALSA/ASoC: hda: move/rename snd_hdac_ext_stop_streams to hdac_stream.c
| * 98b0f50fec hwmon: (jc42) Fix missing unlock on error in jc42_write()
| * 5e69233508 KVM: selftests: Fix build regression by using accessor function
| * 6215904fe2 tools/include: Add _RET_IP_ and math definitions to kernel.h
| * c885326728 orangefs: Fix kmemleak in orangefs_{kernel,client}_debug_init()
| * 39529b79b0 orangefs: Fix kmemleak in orangefs_prepare_debugfs_help_string()
| * a075c21ee0 drm/sti: Fix return type of sti_{dvo,hda,hdmi}_connector_mode_valid()
| * f3c14b99f3 drm/fsl-dcu: Fix return type of fsl_dcu_drm_connector_mode_valid()
| * 9a8862820c hugetlbfs: fix null-ptr-deref in hugetlbfs_parse_param()
| * 4f6b206998 scsi: elx: libefc: Fix second parameter type in state callbacks
| * 23f0e9f863 scsi: ufs: Reduce the START STOP UNIT timeout
| * 2cf66428a2 scsi: lpfc: Fix hard lockup when reading the rx_monitor from debugfs
| * 2b3e3ecdb4 crypto: hisilicon/hpre - fix resource leak in remove process
| * adf6a00859 clk: st: Fix memory leak in st_of_quadfs_setup()
| * 6c8aee0c8f media: si470x: Fix use-after-free in si470x_int_in_callback()
| * 58b6496a74 mmc: renesas_sdhi: better reset from HS400 mode
| * c33c904124 mmc: f-sdh30: Add quirks for broken timeout clock capability
| * 69346de0eb wifi: mt76: do not run mt76u_status_worker if the device is not running
| * feb847e659 regulator: core: fix use_count leakage when handling boot-on
| * 474e70bd90 libbpf: Avoid enum forward-declarations in public API in C++ mode
| * 6209542869 drm/amd/display: Use the largest vready_offset in pipe group
| * eff45bfbc2 blk-mq: fix possible memleak when register 'hctx' failed
| * d0af6220bb media: dvb-usb: fix memory leak in dvb_usb_adapter_init()
| * 88a6f8a72d media: dvbdev: adopts refcnt to avoid UAF
| * 438cd29fec media: dvb-frontends: fix leak of memory fw
| * a96841f5aa ethtool: avoiding integer overflow in ethtool_phys_id()
| * b327c68ace bpf: Prevent decl_tag from being referenced in func_proto arg
| * 4b8f3b9392 ppp: associate skb with a device at tx
| * 5d5a481a7f mrp: introduce active flags to prevent UAF when applicant uninit
| * 222cc04356 ipv6/sit: use DEV_STATS_INC() to avoid data-races
| * 8a3b023710 net: add atomic_long_t to net_device_stats fields
| * 58dd11f624 drm/amd/display: fix array index out of bound error in bios parser
| * a3cc41e05e md/raid1: stop mdx_raid1 thread when raid1 array run failed
| * b621d17fe8 drivers/md/md-bitmap: check the return value of md_bitmap_get_counter()
| * 5afac74f15 drm/mediatek: Fix return type of mtk_hdmi_bridge_mode_valid()
| * 072508e99d drm/sti: Use drm_mode_copy()
| * 673a3e0199 drm/rockchip: Use drm_mode_copy()
| * b9b07900d2 drm/msm: Use drm_mode_copy()
| * 5ad774fb82 s390/lcs: Fix return type of lcs_start_xmit()
| * dfbf0122ea s390/netiucv: Fix return type of netiucv_tx()
| * 8131d1880c s390/ctcm: Fix return type of ctc{mp,}m_tx()
| * f9084e9930 drm/amdgpu: Fix type of second parameter in odn_edit_dpm_table() callback
| * b74580d618 drm/amdgpu: Fix type of second parameter in trans_msg() callback
| * 314f7092b2 igb: Do not free q_vector unless new one was allocated
| * 0b12d2aa26 wifi: brcmfmac: Fix potential shift-out-of-bounds in brcmf_fw_alloc_request()
| * 19bb9e98e1 hamradio: baycom_epp: Fix return type of baycom_send_packet()
| * a413ebb604 net: ethernet: ti: Fix return type of netcp_ndo_start_xmit()
| * 5d3f4478d2 bpf: make sure skb->len != 0 when redirecting to a tunneling device
| * be2803dd29 qed (gcc13): use u16 for fid to be big enough
| * a8bc0ac438 Revert "drm/amd/display: Limit max DSC target bpp for specific monitors"
| * cc8deb82cc drm/amd/display: prevent memory leak
| * 49dd0e8029 ipmi: fix memleak when unload ipmi driver
| * 68871c005f ASoC: codecs: rt298: Add quirk for KBL-R RVP platform
| * 3eca9697c2 wifi: ar5523: Fix use-after-free on ar5523_cmd() timed out
| * c319196a0e wifi: ath9k: verify the expected usb_endpoints are present
| * 10c4b63d09 brcmfmac: return error when getting invalid max_flowrings from dongle
| * ad31bc146f media: imx-jpeg: Disable useless interrupt to avoid kernel panic
| * 6e1a6880e1 drm/etnaviv: add missing quirks for GC300
| * 367296925c hfs: fix OOB Read in __hfs_brec_find
| * ebe16676e1 acct: fix potential integer overflow in encode_comp_t()
| * 8b6ef451b5 nilfs2: fix shift-out-of-bounds due to too large exponent of block size
| * b47f5c579c nilfs2: fix shift-out-of-bounds/overflow in nilfs_sb2_bad_offset()
| * 5777432eba ACPICA: Fix error code path in acpi_ds_call_control_method()
| * 10b87da8fa fs: jfs: fix shift-out-of-bounds in dbDiscardAG
| * 5059ea84a8 jfs: Fix fortify moan in symlink
| * e7a6a53c87 udf: Avoid double brelse() in udf_rename()
| * 0536f76a2b fs: jfs: fix shift-out-of-bounds in dbAllocAG
| * 88cea1676a binfmt_misc: fix shift-out-of-bounds in check_special_flags
| * cadb938a5e x86/hyperv: Remove unregister syscore call from Hyper-V cleanup
| * 659747f6f6 video: hyperv_fb: Avoid taking busy spinlock on panic path
| * 9d05c20b0a arm64: make is_ttbrX_addr() noinstr-safe
| * 98a5b1265a rcu: Fix __this_cpu_read() lockdep warning in rcu_force_quiescent_state()
| * d238f94b2b HID: amd_sfh: Add missing check for dma_alloc_coherent
| * 9da204cd67 net: stream: purge sk_error_queue in sk_stream_kill_queues()
| * f47426250f myri10ge: Fix an error handling path in myri10ge_probe()
| * 1ec0a7d5b0 rxrpc: Fix missing unlock in rxrpc_do_sendmsg()
| * 5478eb7adc net_sched: reject TCF_EM_SIMPLE case for complex ematch module
| * 4f05d8e2fb mailbox: zynq-ipi: fix error handling while device_register() fails
| * 550f403e46 mailbox: arm_mhuv2: Fix return value check in mhuv2_probe()
| * 28604a960c mailbox: mpfs: read the system controller's status
| * 8fb773eed4 skbuff: Account for tail adjustment during pull operations
| * dc0f38957a arm64: dts: mt8183: Fix Mali GPU clock
| * 790b396f6b soc: mediatek: pm-domains: Fix the power glitch issue
| * 0133615a06 openvswitch: Fix flow lookup to use unmasked key
| * 04e454bd97 selftests: devlink: fix the fd redirect in dummy_reporter_test
| * d52646a46c rtc: mxc_v2: Add missing clk_disable_unprepare()
| * ac95c4e35f igc: Set Qbv start_time and end_time to end_time if not being configured in GCL
| * af59985138 igc: Lift TAPRIO schedule restriction
| * 4d50d640ed igc: recalculate Qbv end_time by considering cycle time
| * 1ef9416957 igc: allow BaseTime 0 enrollment for Qbv
| * c0df8e7ba6 igc: Add checking for basetime less than zero
| * 5b46b53f45 igc: Use strict cycles for Qbv scheduling
| * fd7d029436 igc: Enhance Qbv scheduling by using first flag bit
| * 9b5b50329e r6040: Fix kmemleak in probe and remove
| * 1b428ba31b unix: Fix race in SOCK_SEQPACKET's unix_dgram_sendmsg()
| * aae9c24ebd nfc: pn533: Clear nfc_target before being used
| * bcf2c1dc53 net: enetc: avoid buffer leaks on xdp_do_redirect() failure
| * f463a1295c selftests/bpf: Add test for unstable CT lookup API
| * 094f3d9314 block, bfq: fix possible uaf for 'bfqq->bic'
| * cf48cb8deb mISDN: hfcmulti: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave()
| * 5607353751 mISDN: hfcpci: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave()
| * ada4022f48 mISDN: hfcsusb: don't call dev_kfree_skb/kfree_skb() under spin_lock_irqsave()
| * 0578f9929f net: macsec: fix net device access prior to holding a lock
| * a472f069ce nfsd: under NFSv4.1, fix double svc_xprt_put on rpc_create failure
| * f8f1d037d6 rtc: pcf85063: fix pcf85063_clkout_control
| * 35a174552b rtc: pic32: Move devm_rtc_allocate_device earlier in pic32_rtc_probe()
| * eea105c4e4 rtc: st-lpc: Add missing clk_disable_unprepare in st_rtc_probe()
| * 74248b5560 netfilter: flowtable: really fix NAT IPv6 offload
| * 5c940632ca mfd: pm8008: Fix return value check in pm8008_probe()
| * ec10848e26 mfd: pm8008: Remove driver data structure pm8008_data
| * 38959417d3 mfd: qcom_rpm: Fix an error handling path in qcom_rpm_probe()
| * b95ae3543e mfd: bd957x: Fix Kconfig dependency on REGMAP_IRQ
| * 615d3c8a46 powerpc/pseries/eeh: use correct API for error log size
| * 68de42e008 powerpc/eeh: Drop redundant spinlock initialization
| * 2b157b4b13 remoteproc: qcom: q6v5: Fix missing clk_disable_unprepare() in q6v5_wcss_qcs404_power_on()
| * 4b191533f5 remoteproc: qcom_q6v5_pas: Fix missing of_node_put() in adsp_alloc_memory_region()
| * d7628ebca8 remoteproc: qcom_q6v5_pas: detach power domains on remove
| * fdf47f462a remoteproc: qcom_q6v5_pas: disable wakeup on probe fail or remove
| * 098ebb9089 remoteproc: qcom: q6v5: Fix potential null-ptr-deref in q6v5_wcss_init_mmio()
| * 131c0a3ead remoteproc: sysmon: fix memory leak in qcom_add_sysmon_subdev()
| * 4507c6a672 pwm: mediatek: always use bus clock for PWM on MT7622
| * 4fbbb14f0e pwm: mtk-disp: Fix the parameters calculated by the enabled flag of disp_pwm
| * eec59807a2 pwm: sifive: Call pwm_sifive_update_clock() while mutex is held
| * 37ea9a6c41 iommu/sun50i: Remove IOMMU_DOMAIN_IDENTITY
| * 8de2c29db6 selftests/powerpc: Fix resource leaks
| * dd49c5031e powerpc/hv-gpci: Fix hv_gpci event list
| * 65d3469f3b powerpc/83xx/mpc832x_rdb: call platform_device_put() in error case in of_fsl_spi_probe()
| * cf03db2896 powerpc/perf: callchain validate kernel stack pointer bounds
| * 5de1902244 powerpc/xive: add missing iounmap() in error path in xive_spapr_populate_irq_data()
| * b31e9647f1 powerpc/xmon: Fix -Wswitch-unreachable warning in bpt_cmds
| * 6a310e8db5 cxl: Fix refcount leak in cxl_calc_capp_routing
| * 0accd460dc powerpc/52xx: Fix a resource leak in an error handling path
| * be2b9b1a60 macintosh/macio-adb: check the return value of ioremap()
| * 19ded60b40 macintosh: fix possible memory leak in macio_add_one_device()
| * e42b543d08 iommu/fsl_pamu: Fix resource leak in fsl_pamu_probe()
| * 6e501b3fd7 iommu/amd: Fix pci device refcount leak in ppr_notifier()
| * 9383921e8b rtc: pcf85063: Fix reading alarm
| * b66aa7b306 rtc: snvs: Allow a time difference on clock register read
| * 7a6cc22eab rtc: cmos: Disable ACPI RTC event on removal
| * 689f757f0a rtc: cmos: Rename ACPI-related functions
| * 1c74bbecda rtc: cmos: Eliminate forward declarations of some functions
| * 3a439a2cab rtc: cmos: Call rtc_wake_setup() from cmos_do_probe()
| * 9febdff75c rtc: cmos: Call cmos_wake_setup() from cmos_do_probe()
| * d9324fb3ee rtc: cmos: fix build on non-ACPI platforms
| * fe46b9303e rtc: cmos: Fix wake alarm breakage
| * 60c6e563a8 rtc: cmos: Fix event handler registration ordering issue
| * d3aa083469 rtc: rtc-cmos: Do not check ACPI_FADT_LOW_POWER_S0
| * 6e98a93c75 dmaengine: idxd: Fix crc_val field for completion record
| * ab53749c32 fs/ntfs3: Fix slab-out-of-bounds read in ntfs_trim_fs
| * 1ba0968b33 pwm: tegra: Improve required rate calculation
| * c160505c9b include/uapi/linux/swab: Fix potentially missing __always_inline
| * 59463193b0 phy: usb: s2 WoL wakeup_count not incremented for USB->Eth devices
| * ae00848e55 iommu/rockchip: fix permission bits in page table entries v2
| * a7f6ad2c42 iommu/sun50i: Fix flush size
| * 38ccb9b469 iommu/sun50i: Fix R/W permission check
| * ae4ab47a0b iommu/sun50i: Consider all fault sources for reset
| * 84fee3ce82 iommu/sun50i: Fix reset release
| * 6f9fe31a48 fs/ntfs3: Harden against integer overflows
| * 30f20ceb87 overflow: Implement size_t saturating arithmetic helpers
| * 4b51f27d44 fs/ntfs3: Avoid UBSAN error on true_sectors_per_clst()
| * 28f345bec7 RDMA/siw: Fix pointer cast warning
| * 01d925e2a5 perf stat: Do not delay the workload with --delay
| * a273f1dd5d perf stat: Refactor __run_perf_stat() common code
| * d21534ab4f power: supply: fix null pointer dereferencing in power_supply_get_battery_info
| * d4898d8de6 power: supply: ab8500: Fix error handling in ab8500_charger_init()
| * 30b191798f HSI: omap_ssi_core: Fix error handling in ssi_init()
| * a72fe8eb55 power: supply: z2_battery: Fix possible memleak in z2_batt_probe()
| * 5ba0e8fa15 perf symbol: correction while adjusting symbol
| * a34027b63d perf trace: Handle failure when trace point folder is missed
| * 60aeacce64 perf trace: Use macro RAW_SYSCALL_ARGS_NUM to replace number
| * e4700f62dc perf trace: Return error if a system call doesn't exist
| * 870ad0917d power: supply: fix residue sysfs file in error handle route of __power_supply_register()
| * 1c2b9c8100 HSI: omap_ssi_core: fix possible memory leak in ssi_probe()
| * c5f729d3d6 HSI: omap_ssi_core: fix unbalanced pm_runtime_disable()
| * ea37831f83 fbdev: uvesafb: Fixes an error handling path in uvesafb_probe()
| * 5bcae36b58 fbdev: uvesafb: don't build on UML
| * 07c1a3c2df fbdev: geode: don't build on UML
| * ace8312b5d fbdev: ep93xx-fb: Add missing clk_disable_unprepare in ep93xxfb_probe()
| * 04946113fb fbdev: vermilion: decrease reference count in error path
| * fc0d5034fa fbdev: via: Fix error in via_core_init()
| * 9827246333 fbdev: pm2fb: fix missing pci_disable_device()
| * 3aa4205134 fbdev: ssd1307fb: Drop optional dependency
| * 4958316a6d thermal/drivers/qcom/lmh: Fix irq handler return value
| * ad72205ac6 thermal/drivers/qcom/temp-alarm: Fix inaccurate warning for gen2
| * 37fb4e13d2 thermal/drivers/imx8mm_thermal: Validate temperature range
| * 95c18f4a3c samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe()
| * 31c1b5d300 ksmbd: Fix resource leak in ksmbd_session_rpc_open()
| * a44102d36a tracing/hist: Fix issue of losing command info in error_log
| * 8308ccfcbd usb: storage: Add check for kcalloc
| * 96c12fd0ec i2c: ismt: Fix an out-of-bounds bug in ismt_access()
| * 8212800943 i2c: mux: reg: check return value after calling platform_get_resource()
| * 46d8f63bb8 gpiolib: protect the GPIO device against being dropped while in use by user-space
| * 333a271dfd gpiolib: make struct comments into real kernel docs
| * 7c755a2d6d gpiolib: cdev: fix NULL-pointer dereferences
| * b0a26e1999 gpiolib: Get rid of redundant 'else'
| * 4bc217b25e vme: Fix error not caught in fake_init()
| * 31bfe024a9 staging: rtl8192e: Fix potential use-after-free in rtllib_rx_Monitor()
| * b0aaec894a staging: rtl8192u: Fix use after free in ieee80211_rx()
| * ed4580c3f8 i2c: pxa-pci: fix missing pci_disable_device() on error in ce4100_i2c_probe
| * 28dc61cc49 chardev: fix error handling in cdev_device_add()
| * 43bfc7c240 mcb: mcb-parse: fix error handling in chameleon_parse_gdd()
| * f3686e5e8d drivers: mcb: fix resource leak in mcb_probe()
| * 9d4a0aca8a usb: gadget: f_hid: fix refcount leak on error path
| * d3136b7970 usb: gadget: f_hid: fix f_hidg lifetime vs cdev
| * a41c2bba7f usb: roles: fix of node refcount leak in usb_role_switch_is_parent()
| * 18b9202188 coresight: trbe: remove cpuhp instance node before remove cpuhp state
| * e854a4ab38 counter: stm32-lptimer-cnt: fix the check on arr and cmp registers update
| * 39a156715f iio: adis: add '__adis_enable_irq()' implementation
| * 3a2dde8e5d iio:imu:adis: Move exports into IIO_ADISLIB namespace
| * 3c2e13025b iio: adis: stylistic changes
| * de3e358912 iio: adis: handle devices that cannot unmask the drdy pin
| * 994243de7a iio: temperature: ltc2983: make bulk write buffer DMA-safe
| * 22511eefa6 cxl: fix possible null-ptr-deref in cxl_pci_init_afu|adapter()
| * e5021bbf11 cxl: fix possible null-ptr-deref in cxl_guest_init_afu|adapter()
| * b308fdedef firmware: raspberrypi: fix possible memory leak in rpi_firmware_probe()
| * d5c8f9003a misc: sgi-gru: fix use-after-free error in gru_set_context_option, gru_fault and gru_handle_user_call_os
| * 848c45964d misc: tifm: fix possible memory leak in tifm_7xx1_switch_media()
| * 37a13b274e ocxl: fix pci device refcount leak when calling get_function_0()
| * 3299983a6b misc: ocxl: fix possible name leak in ocxl_file_register_afu()
| * 357379d504 test_firmware: fix memory leak in test_firmware_init()
| * 07d547d742 serial: sunsab: Fix error handling in sunsab_init()
| * 919e745fdd serial: altera_uart: fix locking in polling mode
| * e1c4f18214 tty: serial: altera_uart_{r,t}x_chars() need only uart_port
| * b133b45ba6 tty: serial: clean up stop-tx part in altera_uart_tx_chars()
| * 6f7d82380f serial: pch: Fix PCI device refcount leak in pch_request_dma()
| * 0dfc7dfe5b serial: stm32: move dma_request_chan() before clk_prepare_enable()
| * 926b0967f7 serial: pl011: Do not clear RX FIFO & RX interrupt in unthrottle.
| * d71a611fca serial: amba-pl011: avoid SBSA UART accessing DMACR register
| * f46f9d2e16 extcon: usbc-tusb320: Update state on probe even if no IRQ pending
| * ac067e75c4 extcon: usbc-tusb320: Add USB TYPE-C support
| * 9280761167 extcon: usbc-tusb320: Factor out extcon into dedicated functions
| * 05aa8ff22d usb: typec: Factor out non-PD fwnode properties
| * 31e9c27510 extcon: usbc-tusb320: Add support for TUSB320L
| * b9c8820d91 extcon: usbc-tusb320: Add support for mode setting and reset
| * 4524599a6a usb: typec: tipd: Fix spurious fwnode_handle_put in error path
| * b0d86eacc8 usb: typec: tipd: Cleanup resources if devm_tps6598_psy_register fails
| * ba75be6f0d usb: typec: tcpci: fix of node refcount leak in tcpci_register_port()
| * 154d5713a2 usb: typec: Check for ops->exit instead of ops->enter in altmode_exit
| * 1f5661388f staging: vme_user: Fix possible UAF in tsi148_dma_list_add
| * a3c4bc2616 usb: fotg210-udc: Fix ages old endianness issues
| * 5e87d41221 uio: uio_dmem_genirq: Fix deadlock between irq config and handling
| * 79a4bdb6b9 uio: uio_dmem_genirq: Fix missing unlock in irq configuration
| * 3f22a273ef vfio: platform: Do not pass return buffer to ACPI _RST method
| * 417ef049e3 class: fix possible memory leak in __class_register()
| * f76824ab2b serial: 8250_bcm7271: Fix error handling in brcmuart_init()
| * 6b4424efcf serial: tegra: Read DMA status before terminating
| * a0ead7e8da drivers: dio: fix possible memory leak in dio_init()
| * e8985caf05 RISC-V: Align the shadow stack
| * ca48174a76 IB/IPoIB: Fix queue count inconsistency for PKEY child interfaces
| * 82bd423ed9 hwrng: geode - Fix PCI device refcount leak
| * 2b79a5e560 hwrng: amd - Fix PCI device refcount leak
| * 42cbff35f4 crypto: img-hash - Fix variable dereferenced before check 'hdev->req'
| * b9634f99b6 RDMA/hns: Fix error code of CMD
| * b06bb747ce RDMA/hns: Fix page size cap from firmware
| * 4c05c7cf25 RDMA/hns: Fix PBL page MTR find
| * fa267c4192 RDMA/hns: Fix AH attr queried by query_qp
| * e27fb26e75 orangefs: Fix sysfs not cleanup when dev init failed
| * 3e9c395ef2 PCI: mt7621: Add sentinel to quirks table
| * bcc65c2e2a PCI: mt7621: Rename mt7621_pci_ to mt7621_pcie_
| * 0a7eab1cc4 RDMA/srp: Fix error return code in srp_parse_options()
| * 6301100179 RDMA/hfi1: Fix error return code in parse_platform_config()
| * 339ca035af riscv/mm: add arch hook arch_clear_hugepage_flags
| * 20d363dcd6 crypto: omap-sham - Use pm_runtime_resume_and_get() in omap_sham_probe()
| * 815b65d714 crypto: amlogic - Remove kcalloc without check
| * af71199291 RDMA/nldev: Fix failure to send large messages
| * bb895786a4 f2fs: avoid victim selection from previous victim section
| * 655e955deb RDMA/nldev: Add checks for nla_nest_start() in fill_stat_counter_qps()
| * 1895e908b3 scsi: snic: Fix possible UAF in snic_tgt_create()
| * 09a60f908d scsi: fcoe: Fix transport not detached when fcoe_if_init() fails
| * e59da17205 scsi: ipr: Fix WARNING in ipr_init()
| * c444f58fda scsi: scsi_debug: Fix possible name leak in sdebug_add_host_helper()
| * 4e4968dfb5 scsi: fcoe: Fix possible name leak when device_register() fails
| * 0f5006d7d1 scsi: scsi_debug: Fix a warning in resp_report_zones()
| * 2432719b1a scsi: scsi_debug: Fix a warning in resp_verify()
| * 038359eecc scsi: efct: Fix possible memleak in efct_device_init()
| * 23053a7926 scsi: hpsa: Fix possible memory leak in hpsa_add_sas_device()
| * 2ab6d5927c scsi: hpsa: Fix error handling in hpsa_add_sas_host()
| * 6a92129c8f scsi: mpt3sas: Fix possible resource leaks in mpt3sas_transport_port_add()
| * 26c0f7e1ac padata: Fix list iterator in padata_do_serial()
| * 17afa98bcc padata: Always leave BHs disabled when running ->parallel()
| * 221afb2a1b crypto: tcrypt - Fix multibuffer skcipher speed test mem leak
| * bfe10a1d9f scsi: hpsa: Fix possible memory leak in hpsa_init_one()
| * 38ef0c0b09 dt-bindings: visconti-pcie: Fix interrupts array max constraints
| * 83aad8111b dt-bindings: imx6q-pcie: Fix clock names for imx6sx and imx8mq
| * f64f08b9e6 RDMA/rxe: Fix NULL-ptr-deref in rxe_qp_do_cleanup() when socket create failed
| * 35f9cd060e RDMA/hns: fix memory leak in hns_roce_alloc_mr()
| * 6d5220a553 crypto: ccree - Make cc_debugfs_global_fini() available for module init function
| * 2e9cf3e783 RDMA/hfi: Decrease PCI device reference count in error path
| * 7f476d639c PCI: Check for alloc failure in pci_request_irq()
| * 49bc2be897 RDMA/hns: Fix ext_sge num error when post send
| * 0e6160d79d RDMA/hns: Replacing 'dseg_len' by macros in fill_ext_sge_inl_data()
| * e5ea48788e crypto: hisilicon/qm - add missing pci_dev_put() in q_num_set()
| * 442caec12f crypto: cryptd - Use request context instead of stack for sub-request
| * ab677729fc crypto: ccree - Remove debugfs when platform_driver_register failed
| * 0328ca389a scsi: scsi_debug: Fix a warning in resp_write_scat()
| * 1ba8ecb664 RDMA/siw: Set defined status for work completion with undefined status
| * 6e757005ba RDMA/nldev: Return "-EAGAIN" if the cm_id isn't from expected port
| * f981c697b2 RDMA/core: Make sure "ib_port" is valid when access sysfs node
| * 13586753ae RDMA/restrack: Release MR restrack when delete
| * 6e78ca677f PCI: vmd: Disable MSI remapping after suspend
| * 47e31b86ed IB/mad: Don't call to function that might sleep while in atomic context
| * f8d8fbd3b6 RDMA/siw: Fix immediate work request flush to completion queue
| * 2a26849d79 scsi: qla2xxx: Fix set-but-not-used variable warnings
| * 799ed37559 RDMA/irdma: Report the correct link speed
| * d40d1b1c61 f2fs: fix to destroy sbi->post_read_wq in error path of f2fs_fill_super()
| * 847f725006 f2fs: fix normal discard process
| * 865bb7b5a7 f2fs: fix to invalidate dcc->f2fs_issue_discard in error path
| * 5f509fa740 apparmor: Fix memleak in alloc_ns()
| * 46f3cb83e4 crypto: rockchip - rework by using crypto_engine
| * 3ed0548d39 crypto: rockchip - remove non-aligned handling
| * 5562009f5f crypto: rockchip - better handle cipher key
| * 26f3971356 crypto: rockchip - add fallback for ahash
| * 34fe54af3c crypto: rockchip - add fallback for cipher
| * 314217591e crypto: rockchip - do not store mode globally
| * 853cd97d2b crypto: rockchip - do not do custom power management
| * d5100272e4 f2fs: Fix the race condition of resize flag between resizefs
| * db72c5dffc PCI: pci-epf-test: Register notifier if only core_init_notifier is enabled
| * 26ffeff67b RDMA/core: Fix order of nldev_exit call
| * a00a7ac251 PCI: dwc: Fix n_fts[] array overrun
| * 10ae636115 apparmor: Use pointer to struct aa_label for lbs_cred
| * 8d50ccfbe2 scsi: core: Fix a race between scsi_done() and scsi_timeout()
| * 9bdf3a59b3 crypto: nitrox - avoid double free on error path in nitrox_sriov_init()
| * 7efc0d39ee crypto: sun8i-ss - use dma_addr instead u32
| * aaef0bdd7a crypto: hisilicon/qm - fix missing destroy qp_idr
| * d567776ae2 apparmor: Fix abi check to include v8 abi
| * bc9d2cbbdc apparmor: fix lockdep warning when removing a namespace
| * 775a37ffa9 apparmor: fix a memleak in multi_transaction_new()
| * 09f30f394e net: dsa: tag_8021q: avoid leaking ctx on dsa_tag_8021q_register() error path
| * 86664b8652 i40e: Fix the inability to attach XDP program on downed interface
| * 0abd337acd stmmac: fix potential division by 0
| * 93a4a04558 Bluetooth: RFCOMM: don't call kfree_skb() under spin_lock_irqsave()
| * 8d6bbe5241 Bluetooth: hci_core: don't call kfree_skb() under spin_lock_irqsave()
| * 804de4e24a Bluetooth: hci_bcsp: don't call kfree_skb() under spin_lock_irqsave()
| * 1030c3aeee Bluetooth: hci_h5: don't call kfree_skb() under spin_lock_irqsave()
| * 9fcb5b367e Bluetooth: hci_ll: don't call kfree_skb() under spin_lock_irqsave()
| * 14cc94a598 Bluetooth: hci_qca: don't call kfree_skb() under spin_lock_irqsave()
| * 06467130d5 Bluetooth: btusb: don't call kfree_skb() under spin_lock_irqsave()
| * e52b7d460a Bluetooth: btintel: Fix missing free skb in btintel_setup_combined()
| * f7c9de3bcf Bluetooth: MGMT: Fix error report for ADD_EXT_ADV_PARAMS
| * 2addf3cb63 sctp: sysctl: make extra pointers netns aware
| * 21296a52ca ntb_netdev: Use dev_kfree_skb_any() in interrupt context
| * 0fff763f11 net: lan9303: Fix read error execution path
| * 882bad40a0 can: tcan4x5x: Fix use of register error status mask
| * d50092f662 can: m_can: Call the RAM init directly from m_can_chip_config
| * 55064642aa can: tcan4x5x: Remove invalid write in clear_interrupts
| * 641eef8766 net: amd-xgbe: Check only the minimum speed for active/passive cables
| * 60b35e28dc net: amd-xgbe: Fix logic around active and passive cables
| * d436bf39f4 af_unix: call proto_unregister() in the error path in af_unix_init()
| * ee9d03bf89 net: amd: lance: don't call dev_kfree_skb() under spin_lock_irqsave()
| * 6f1c4c01cc hamradio: don't call dev_kfree_skb() under spin_lock_irqsave()
| * eb2c6a6e8f net: ethernet: dnet: don't call dev_kfree_skb() under spin_lock_irqsave()
| * ef08e1082c net: emaclite: don't call dev_kfree_skb() under spin_lock_irqsave()
| * 2786ef4066 net: apple: bmac: don't call dev_kfree_skb() under spin_lock_irqsave()
| * d81314e2dd net: apple: mace: don't call dev_kfree_skb() under spin_lock_irqsave()
| * 9a6544343b net/tunnel: wait until all sk_user_data reader finish before releasing the sock
| * 998b4e54f5 net: farsync: Fix kmemleak when rmmods farsync
| * 71605c6906 ethernet: s2io: don't call dev_kfree_skb() under spin_lock_irqsave()
| * ce1b3a41e7 of: overlay: fix null pointer dereferencing in find_dup_cset_node_entry() and find_dup_cset_prop()
| * 8399b98935 drivers: net: qlcnic: Fix potential memory leak in qlcnic_sriov_init()
| * 96e5089702 net: stmmac: fix possible memory leak in stmmac_dvr_probe()
| * ecaf934e44 net: stmmac: selftests: fix potential memleak in stmmac_test_arpoffload()
| * e1359bc90a net: defxx: Fix missing err handling in dfx_init()
| * c65603abc3 net: vmw_vsock: vmci: Check memcpy_from_msg()
| * 9de42116fc clk: socfpga: Fix memory leak in socfpga_gate_init()
| * e515881ade bpf: Do not zero-extend kfunc return values
| * ce61a877c7 blktrace: Fix output non-blktrace event when blk_classic option enabled
| * f2ae56fa0b wifi: brcmfmac: Fix error return code in brcmf_sdio_download_firmware()
| * 23060daf37 wifi: rtl8xxxu: Fix the channel width reporting
| * 6d0e00334e wifi: rtl8xxxu: Add __packed to struct rtl8723bu_c2h
| * e69d380650 spi: spi-gpio: Don't set MOSI as an input if not 3WIRE mode
| * 4e501a31af clk: samsung: Fix memory leak in _samsung_clk_register_pll()
| * 441c05485c media: coda: Add check for kmalloc
| * b99872178e media: coda: Add check for dcoda_iram_alloc
| * fbf081ebe2 media: c8sectpfe: Add of_node_put() when breaking out of loop
| * 2a7330d820 regulator: qcom-labibb: Fix missing of_node_put() in qcom_labibb_regulator_probe()
| * ecf1b317a8 mmc: core: Normalize the error handling branch in sd_read_ext_regs()
| * 7fecca429e memstick/ms_block: Add check for alloc_ordered_workqueue
| * b77ced3fce memstick: ms_block: Add error handling support for add_disk()
| * ae00eb6779 mmc: renesas_sdhi: always populate SCC pointer
| * 88fa6a4e39 mmc: mmci: fix return value check of mmc_add_host()
| * 29c3690969 mmc: wbsd: fix return value check of mmc_add_host()
| * 0959cc1685 mmc: via-sdmmc: fix return value check of mmc_add_host()
| * e0cfe7aa41 mmc: meson-gx: fix return value check of mmc_add_host()
| * 62005dfcc3 mmc: omap_hsmmc: fix return value check of mmc_add_host()
| * 1925472dec mmc: atmel-mci: fix return value check of mmc_add_host()
| * 58c3a8d0f1 mmc: wmt-sdmmc: fix return value check of mmc_add_host()
| * afc898019e mmc: vub300: fix return value check of mmc_add_host()
| * 6444079767 mmc: toshsd: fix return value check of mmc_add_host()
| * df683201c7 mmc: rtsx_usb_sdmmc: fix return value check of mmc_add_host()
| * 30dc645461 mmc: rtsx_pci: fix return value check of mmc_add_host()
| * bc7e8744f5 mmc: pxamci: fix return value check of mmc_add_host()
| * 2d496050de mmc: mxcmmc: fix return value check of mmc_add_host()
| * f0502fe86a mmc: moxart: fix return value check of mmc_add_host()
| * 29c5b4da41 mmc: alcor: fix return value check of mmc_add_host()
| * 52e0d8a8dd riscv, bpf: Emit fixed-length instructions for BPF_PSEUDO_FUNC
| * 0de70ed675 NFSv4.x: Fail client initialisation if state manager thread can't run
| * 7055c878a0 SUNRPC: Fix missing release socket in rpc_sockname()
| * 79d4cd40da xprtrdma: Fix regbuf data not freed in rpcrdma_req_create()
| * cba633b24a ALSA: mts64: fix possible null-ptr-deref in snd_mts64_interrupt
| * 9018550d96 media: saa7164: fix missing pci_disable_device()
| * 2df1e2a6ec ALSA: pcm: Set missing stop_operating flag at undoing trigger start
| * a443c55d96 bpf, sockmap: fix race in sock_map_free()
| * 5229b90337 hwmon: (jc42) Restore the min/max/critical temperatures on resume
| * 785f5c732a hwmon: (jc42) Convert register access and caching to regmap/regcache
| * c4c64d8abd regulator: core: fix resource leak in regulator_register()
| * 07f82dca11 configfs: fix possible memory leak in configfs_create_dir()
| * 21a061772b hsr: Synchronize sequence number updates.
| * a82f5b2e08 hsr: Synchronize sending frames to have always incremented outgoing seq nr.
| * bb3b40cd6a hsr: Disable netpoll.
| * 8e148d981b hsr: Avoid double remove of a node.
| * 9387cbf7f7 hsr: Add a rcu-read lock to hsr_forward_skb().
| * a051e10bfc clk: qcom: clk-krait: fix wrong div2 functions
| * 8275c7465d clk: qcom: lpass-sc7180: Fix pm_runtime usage
| * 91657ec4d0 regulator: core: fix module refcount leak in set_supply()
| * 66976a3be9 wifi: mt76: fix coverity overrun-call in mt76_get_txpower()
| * a21e3f6f41 wifi: mt76: mt7921: fix reporting of TX AGGR histogram
| * c8659018b6 mt76: stop the radar detector after leaving dfs channel
| * ae19622e7f wifi: cfg80211: Fix not unregister reg_pdev when load_builtin_regdb_keys() fails
| * 2e32f12998 wifi: mac80211: fix memory leak in ieee80211_if_add()
| * f58888434d spi: spidev: mask SPI_CS_HIGH in SPI_IOC_RD_MODE
| * b6d27d9250 bonding: uninitialized variable in bond_miimon_inspect()
| * 7201e4f4f5 bpf, sockmap: Fix data loss caused by using apply_bytes on ingress redirect
| * 6105ed3598 bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes
| * 8786bde11a bpf, sockmap: Fix repeated calls to sock_put() when msg has more_data
| * a222f992ce Input: wistron_btns - disable on UML
| * d78649c21b netfilter: conntrack: set icmpv6 redirects as RELATED
| * 09fe3b1392 ASoC: pcm512x: Fix PM disable depth imbalance in pcm512x_probe
| * 8876793e56 drm/amdkfd: Fix memory leakage
| * 8f2d2badf8 drm/amdgpu: Fix PCI device refcount leak in amdgpu_atrm_get_bios()
| * 88c6e0995c drm/radeon: Fix PCI device refcount leak in radeon_atrm_get_bios()
| * 0af0ff9fc0 drm/amd/pm/smu11: BACO is supported when it's in BACO state
| * 27e7cf595d ASoC: mediatek: mt8173: Enable IRQ when pdata is ready
| * 905e565375 ASoC: mediatek: mt8173: Fix debugfs registration for components
| * d8e32f1bf1 wifi: iwlwifi: mvm: fix double free on tx path.
| * d0bb44775c ALSA: asihpi: fix missing pci_disable_device()
| * f12377abac NFS: Fix an Oops in nfs_d_automount()
| * 9a96aff53c NFSv4: Fix a deadlock between nfs4_open_recover_helper() and delegreturn
| * c6aca4c7ba NFSv4: Fix a credential leak in _nfs4_discover_trunking()
| * 7f6607c884 NFSv4.2: Fix initialisation of struct nfs4_label
| * 51899eefd1 NFSv4.2: Fix a memory stomp in decode_attr_security_label
| * 34dffc77dd NFSv4.2: Clear FATTR4_WORD2_SECURITY_LABEL when done decoding
| * d926611c89 ASoC: mediatek: mtk-btcvsd: Add checks for write and read of mtk_btcvsd_snd
| * f243ff92d6 ASoC: dt-bindings: wcd9335: fix reset line polarity in example
| * 41d7b8291c drm/tegra: Add missing clk_disable_unprepare() in tegra_dc_probe()
| * 2376d7fa08 media: s5p-mfc: Add variant data for MFC v7 hardware for Exynos 3250 SoC
| * 210fcf64be media: dvb-usb: az6027: fix null-ptr-deref in az6027_i2c_xfer()
| * b223cc15f9 media: dvb-core: Fix ignored return value in dvb_register_frontend()
| * 825a8af31d pinctrl: pinconf-generic: add missing of_node_put()
| * eedc698d66 clk: imx8mn: fix imx8mn_enet_phy_sels clocks list
| * f86a432604 clk: imx8mn: fix imx8mn_sai2_sels clocks list
| * 5e98c3a345 clk: imx: replace osc_hdmi with dummy
| * 9453e097b8 clk: imx8mn: rename vpu_pll to m7_alt_pll
| * bffc80bac8 media: imon: fix a race condition in send_packet()
| * 9c9ff35d68 media: vimc: Fix wrong function called when vimc_init() fails
| * f849c116d3 ASoC: qcom: Add checks for devm_kcalloc
| * 16437645dd drbd: destroy workqueue when drbd device was freed
| * cdaf45415c drbd: remove call to memset before free device/resource/connection
| * f35981083c mtd: maps: pxa2xx-flash: fix memory leak in probe
| * 87c750affd bonding: fix link recovery in mode 2 when updelay is nonzero
| * 02105f0b30 drm/amdgpu: fix pci device refcount leak
| * 5b0a1f1247 clk: rockchip: Fix memory leak in rockchip_clk_register_pll()
| * 27aac5c012 regulator: core: use kfree_const() to free space conditionally
| * a69b1faa9b ALSA: seq: fix undefined behavior in bit shift for SNDRV_SEQ_FILTER_USE_EVENT
| * 9c0f3617ba ALSA: pcm: fix undefined behavior in bit shift for SNDRV_PCM_RATE_KNOT
| * 6159424e2d pinctrl: k210: call of_node_put()
| * 18a973fcb1 HID: hid-sensor-custom: set fixed size for custom attributes
| * 0fc4280dbe bpf: Move skb->len == 0 checks into __bpf_redirect
| * 8dbcb4c284 mtd: spi-nor: Fix the number of bytes for the dummy cycles
| * 58e1a0ef52 mtd: spi-nor: hide jedec_id sysfs attribute if not present
| * 348d95e39f inet: add READ_ONCE(sk->sk_bound_dev_if) in inet_csk_bind_conflict()
| * 4451bef1a3 media: videobuf-dma-contig: use dma_mmap_coherent
| * b2781a8626 media: platform: exynos4-is: Fix error handling in fimc_md_init()
| * 7cf71bbe5d media: solo6x10: fix possible memory leak in solo_sysfs_init()
| * c290aa527f media: vidtv: Fix use-after-free in vidtv_bridge_dvb_init()
| * 648f303102 Input: elants_i2c - properly handle the reset GPIO when power is off
| * e0d3e46ac6 mtd: lpddr2_nvm: Fix possible null-ptr-deref
| * ab4e42f519 drm/msm/a6xx: Fix speed-bin detection vs probe-defer
| * fea795f7c7 wifi: ath10k: Fix return value in ath10k_pci_init()
| * 77482c4dd4 block: clear ->slave_dir when dropping the main slave_dir reference
| * 62251948e2 ima: Fix misuse of dereference of pointer in template_desc_init_fields()
| * 29d6c69ba4 integrity: Fix memory leakage in keyring allocation error path
| * 8e6df95717 drm/fourcc: Fix vsub/hsub for Q410 and Q401
| * ec1727f89e drm/fourcc: Add packed 10bit YUV 4:2:0 format
| * f72608b8dd regulator: qcom-rpmh: Fix PMR735a S3 regulator spec
| * 63d011ad05 nvme: return err on nvme_init_non_mdts_limits fail
| * f289a38df0 amdgpu/pm: prevent array underflow in vega20_odn_edit_dpm_table()
| * cda1895f3b regulator: core: fix unbalanced of node refcount in regulator_dev_lookup()
| * 1a5aaa5736 nvmet: only allocate a single slab for bvecs
| * cb3033a432 libbpf: Fix uninitialized warning in btf_dump_dump_type_data
| * 83baa50939 ASoC: pxa: fix null-pointer dereference in filter()
| * a06ba0f7f8 drm/mediatek: Modify dpi power on/off sequence.
| * 6d25bc6370 drm/radeon: Add the missed acpi_put_table() to fix memory leak
| * 4cf11e9d31 bfq: fix waker_bfqq inconsistency crash
| * 55e822212e rxrpc: Fix ack.bufferSize to be 0 when generating an ack
| * 5ef8bf0df1 net, proc: Provide PROC_FS=n fallback for proc_create_net_single_write()
| * d1c44928bb media: camss: Clean up received buffers on failed start of streaming
| * 3b4b4df3f8 wifi: rsi: Fix handling of 802.3 EAPOL frames sent via control port
| * 9e1440c858 Input: joystick - fix Kconfig warning for JOYSTICK_ADC
| * 71212d7318 mtd: Fix device name leak when register device failed in add_mtd_device()
| * 106311677b clk: qcom: gcc-sm8250: Use retention mode for USB GDSCs
| * 322c7415e7 bpf: propagate precision across all frames, not just the last one
| * 07c286c10a bpf: Check the other end of slot_type for STACK_SPILL
| * fdbc363bc1 bpf: propagate precision in ALU/ALU64 operations
| * b29e46610c media: platform: exynos4-is: fix return value check in fimc_md_probe()
| * ab54081a28 media: vivid: fix compose size exceed boundary
| * 3c58c83c6f bpf: Fix slot type check in check_stack_write_var_off
| * cffa75198c drm/msm/hdmi: use devres helper for runtime PM management
| * 58d002b72e drm/msm/hdmi: drop unused GPIO support
| * 2d4bc60693 ima: Handle -ESTALE returned by ima_filter_rule_match()
| * 13fc167e16 drm/panel/panel-sitronix-st7701: Remove panel on DSI attach failure
| * c20672cfa0 spi: Update reference to struct spi_controller
| * 2858d038c5 clk: renesas: r9a06g032: Repair grave increment error
| * f6ed73db39 drm/rockchip: lvds: fix PM usage counter unbalance in poweron
| * 13fab6322b can: kvaser_usb: Compare requested bittiming parameters with actual parameters in do_set_{,data}_bittiming
| * 4e55d61e87 can: kvaser_usb: Add struct kvaser_usb_busparams
| * fcfd4df200 can: kvaser_usb_leaf: Fix bogus restart events
| * 51f07da38b can: kvaser_usb_leaf: Fix wrong CAN state after stopping
| * 647c26887b can: kvaser_usb_leaf: Fix improved state not being reported
| * 9676d65a4a can: kvaser_usb: make use of units.h in assignment of frequency
| * c761108562 can: kvaser_usb_leaf: Set Warning state even without bus errors
| * a60bf9d814 can: kvaser_usb: kvaser_usb_leaf: Handle CMD_ERROR_EVENT
| * 8aae6bddc1 can: kvaser_usb: kvaser_usb_leaf: Rename {leaf,usbcan}_cmd_error_event to {leaf,usbcan}_cmd_can_error_event
| * 972270be24 can: kvaser_usb: kvaser_usb_leaf: Get capabilities from device
| * e9e0d9945f can: kvaser_usb: do not increase tx statistics when sending error message frames
| * e39bce64e5 libbpf: Btf dedup identical struct test needs check for nested structs/arrays
| * d4419f93e2 media: exynos4-is: don't rely on the v4l2_async_subdev internals
| * 8741792d82 soreuseport: Fix socket selection for SO_INCOMING_CPU.
| * 094f56192c venus: pm_helpers: Fix error check in vcodec_domains_get()
| * 3c793a9ad9 media: i2c: ad5820: Fix error path
| * 07611f9e44 media: adv748x: afe: Select input port when initializing AFE
| * aa81257dbf media: coda: jpeg: Add check for kmalloc
| * 9a402adc9f media: v4l2-ctrls: Fix off-by-one error in integer menu control check
| * 1caed03305 drm/amdgpu/powerplay/psm: Fix memory leak in power state init
| * f66a877083 ipmi: kcs: Poll OBF briefly to reduce OBE latency
| * 983320199e ata: libata: fix NCQ autosense logic
| * a9caf71aeb ata: add/use ata_taskfile::{error|status} fields
| * 3483c3fb48 ata: libata: move ata_{port,link,dev}_dbg to standard pr_XXX() macros
| * 6706135577 libbpf: Fix null-pointer dereference in find_prog_by_sec_insn()
| * a733bf1019 libbpf: Fix use-after-free in btf_dump_name_dups
| * b5ec2a04fe drm/bridge: adv7533: remove dynamic lane switching from adv7533 bridge
| * 6d40a49d05 wifi: rtl8xxxu: Fix reading the vendor of combo chips
| * 355f16f756 wifi: ath9k: hif_usb: Fix use-after-free in ath9k_hif_usb_reg_in_cb()
| * d856f7574b wifi: ath9k: hif_usb: fix memory leak of urbs in ath9k_hif_usb_dealloc_tx_urbs()
| * 12229a2523 platform/mellanox: mlxbf-pmc: Fix event typo
| * a0d93aac54 rapidio: devices: fix missing put_device in mport_cdev_open
| * 7af9cb8cbb hfs: Fix OOB Write in hfs_asc2mac
| * 90962b3b1c relay: fix type mismatch when allocating memory in relay_create_buf()
| * 0d60b11f8f eventfd: change int to __u64 in eventfd_signal() ifndef CONFIG_EVENTFD
| * 2f5cc7fd73 rapidio: fix possible UAF when kfifo_alloc() fails
| * 337b68da68 fs: sysv: Fix sysv_nblocks() returns wrong value
| * 95d42a8d3d lockd: set other missing fields when unlocking files
| * 318229b4d3 MIPS: OCTEON: warn only once if deprecated link status is being used
| * 5e6d37a93a MIPS: BCM63xx: Add check for NULL for clk in clk_enable
| * 50af0ba3e1 platform/x86: intel_scu_ipc: fix possible name leak in __intel_scu_ipc_register()
| * 3cf8150135 platform/x86: mxm-wmi: fix memleak in mxm_wmi_call_mx[ds|mx]()
| * 0ceadb5a3e platform/chrome: cros_ec_typec: zero out stale pointers
| * 49c98b5688 platform/chrome: cros_ec_typec: Cleanup switch handle return paths
| * b55ef8508a PM: runtime: Do not call __rpm_callback() from rpm_idle()
| * 0bf874183b xen/privcmd: Fix a possible warning in privcmd_ioctl_mmap_resource()
| * 70966d6b0f x86/xen: Fix memory leak in xen_init_lock_cpu()
| * 23aef94eea x86/xen: Fix memory leak in xen_smp_intr_init{_pv}()
| * 03ab1c5c2f uprobes/x86: Allow to probe a NOP instruction with 0x66 prefix
| * 6fde666278 ACPICA: Fix use-after-free in acpi_ut_copy_ipackage_to_ipackage()
| * 9cabd5f4f1 clocksource/drivers/timer-ti-dm: Fix missing clk_disable_unprepare in dmtimer_systimer_init_clock()
| * b73c76c3c4 cpu/hotplug: Do not bail-out in DYING/STARTING sections
| * 6eb1802184 cpu/hotplug: Make target_store() a nop when target == state
| * cd130e2676 futex: Resend potentially swallowed owner death notification
| * fd8a10d44c futex: Move to kernel/futex/
| * 156144bd18 mips: ralink: mt7621: do not use kzalloc too early
| * 186d59bb6a mips: ralink: mt7621: soc queries and tests as functions
| * 8348da01e5 mips: ralink: mt7621: define MT7621_SYSC_BASE with __iomem
| * 0f8e6fe09c clocksource/drivers/sh_cmt: Access registers according to spec
| * a47de2fd3f rapidio: rio: fix possible name leak in rio_register_mport()
| * ec3f04f74f rapidio: fix possible name leaks when rio_add_device() fails
| * 4662d8e6ab debugfs: fix error when writing negative value to atomic_t debugfs file
| * 7e8e8cc136 lib/notifier-error-inject: fix error when writing -errno to debugfs file
| * 39b5e6130b libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value
| * 19c202e6e5 cpufreq: amd_freq_sensitivity: Add missing pci_dev_put()
| * 93e3c80338 genirq/irqdesc: Don't try to remove non-existing sysfs files
| * 435cc7d18c nfsd: don't call nfsd_file_put from client states seqfile display
| * 5030d4d2bf NFSD: Finish converting the NFSv2 GETACL result encoder
| * e498675e06 SUNRPC: Return true/false (not 1/0) from bool functions
| * 3e255dc210 EDAC/i10nm: fix refcount leak in pci_get_dev_wrapper()
| * 740efb64ca irqchip/wpcm450: Fix memory leak in wpcm450_aic_of_init()
| * 77b99b483f irqchip: gic-pm: Use pm_runtime_resume_and_get() in gic_probe()
| * 5139cbc0c6 thermal: core: fix some possible name leaks in error paths
| * cab345f9d5 platform/chrome: cros_usbpd_notify: Fix error handling in cros_usbpd_notify_init()
| * 0358bc7cc2 perf/x86/intel/uncore: Fix reference count leak in __uncore_imc_init_box()
| * 433bd587dc perf/x86/intel/uncore: Fix reference count leak in snr_uncore_mmio_map()
| * 3485f19751 perf/x86/intel/uncore: Fix reference count leak in hswep_has_limit_sbox()
| * 0021ef7dc6 perf/x86/intel/uncore: Fix reference count leak in sad_cfg_iio_topology()
| * c12b314bb2 PNP: fix name memory leak in pnp_alloc_dev()
| * f1c7a6af71 selftests/efivarfs: Add checking of the test return value
| * 46be3ee1ca MIPS: vpe-cmp: fix possible memory leak while module exiting
| * e820a8192f MIPS: vpe-mt: fix possible memory leak while module exiting
| * 61d68cf2ba ocfs2: fix memory leak in ocfs2_stack_glue_init()
| * e83b47580a lib/fonts: fix undefined behavior in bit shift for get_default_font
| * 0df7d9ab6b proc: fixup uptime selftest
| * 07b8659b8e timerqueue: Use rb_entry_safe() in timerqueue_getnext()
| * 413b18866b platform/x86: huawei-wmi: fix return value calculation
| * 4b46932283 lib/debugobjects: fix stat count and optimize debug_objects_mem_init
| * f790dfe816 perf: Fix possible memleak in pmu_dev_alloc()
| * 418d21c0df selftests/ftrace: event_triggers: wait longer for test_event_enable
| * 4ea765b106 cpufreq: qcom-hw: Fix memory leak in qcom_cpufreq_hw_read_lut()
| * c52d9c25d9 fs: don't audit the capability check in simple_xattr_list()
| * e4d0d13b46 PM: hibernate: Fix mistake in kerneldoc comment
| * 1f62b8e50d x86/sgx: Reduce delay and interference of enclave release
| * f5b88170f0 alpha: fix syscall entry in !AUDUT_SYSCALL case
| * a819ba80b9 alpha: fix TIF_NOTIFY_SIGNAL handling
| * eb2a732ef4 cpuidle: dt: Return the correct numbers of parsed idle states
| * 3af4f5cb8a sched/uclamp: Make asym_fits_capacity() use util_fits_cpu()
| * 23cb580e0c sched/core: Introduce sched_asym_cpucap_active()
| * 41c2dba388 sched/fair: Removed useless update of p->recent_used_cpu
| * 55ffeab089 sched/uclamp: Make select_idle_capacity() use util_fits_cpu()
| * 4639bfbb83 sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
| * 309e50cbfe sched/uclamp: Fix relationship between uclamp and migration margin
| * 54a766e196 sched/fair: Cleanup task_util and capacity type
| * 26bffaf678 ovl: remove privs in ovl_fallocate()
| * 5dc34f9aaa ovl: remove privs in ovl_copyfile()
| * 9636e70ee2 ovl: use ovl_copy_{real,upper}attr() wrappers
| * a54843833c ovl: store lower path in ovl_inode
| * 163c5bbe7d tpm/tpm_crb: Fix error message in __crb_relinquish_locality()
| * fe880e9df9 tpm/tpm_ftpm_tee: Fix error handling in ftpm_mod_init()
| * ebc73c4f26 pstore: Avoid kcore oops by vmap()ing with VM_IOREMAP
| * d4dcde11bf ARM: mmp: fix timer_read delay
| * 95916147dc pstore/ram: Fix error return code in ramoops_probe()
| * a31a647a3d seccomp: Move copy_seccomp() to no failure path.
| * b8b76b8da6 arm64: dts: armada-3720-turris-mox: Add missing interrupt for RTC
| * 820a5ccca7 ARM: dts: turris-omnia: Add switch port 6 node
| * b311f8e9f5 ARM: dts: turris-omnia: Add ethernet aliases
| * 48ebdd06c9 ARM: dts: armada-39x: Fix assigned-addresses for every PCIe Root Port
| * f27dd04e44 ARM: dts: armada-38x: Fix assigned-addresses for every PCIe Root Port
| * 1e53c63da8 ARM: dts: armada-375: Fix assigned-addresses for every PCIe Root Port
| * 3af1a73e9e ARM: dts: armada-xp: Fix assigned-addresses for every PCIe Root Port
| * e4ed8133c4 ARM: dts: armada-370: Fix assigned-addresses for every PCIe Root Port
| * b335b6344e ARM: dts: dove: Fix assigned-addresses for every PCIe Root Port
| * 5b3415e683 arm64: dts: mediatek: mt6797: Fix 26M oscillator unit name
| * 93f5e66496 arm64: dts: mediatek: pumpkin-common: Fix devicetree warnings
| * debd938e21 arm64: dts: mt2712-evb: Fix usb vbus regulators unit names
| * b2c6397754 arm64: dts: mt2712-evb: Fix vproc fixed regulators unit names
| * 96c972f835 arm64: dts: mt2712e: Fix unit address for pinctrl node
| * 2cd1391c28 arm64: dts: mt2712e: Fix unit_address_vs_reg warning for oscillators
| * 39877a3636 arm64: dts: mt6779: Fix devicetree build warnings
| * af431ce47e ARM: dts: nuvoton: Remove bogus unit addresses from fixed-partition nodes
| * 0a616049ec arm64: dts: ti: k3-j721e-main: Drop dma-coherent in crypto node
| * 22a740824a arm64: dts: ti: k3-am65-main: Drop dma-coherent in crypto node
| * b131304fe7 perf/smmuv3: Fix hotplug callback leak in arm_smmu_pmu_init()
| * b99fbe8d94 perf/arm_dmc620: Fix hotplug callback leak in dmc620_pmu_init()
| * 9285b623bb perf: arm_dsu: Fix hotplug callback leak in dsu_pmu_init()
| * e6318a7e19 arm64: mm: kfence: only handle translation faults
| * 46ddfb9d1e arm64: Treat ESR_ELx as a 64-bit register
| * 681e340128 soc: ti: smartreflex: Fix PM disable depth imbalance in omap_sr_probe
| * 6eca7a2535 soc: ti: knav_qmss_queue: Fix PM disable depth imbalance in knav_queue_probe
| * 972f8fc065 soc: ti: knav_qmss_queue: Use pm_runtime_resume_and_get instead of pm_runtime_get_sync
| * fe53048f2a arm: dts: spear600: Fix clcd interrupt
| * 75baeec464 arm64: dts: qcom: sm6125: fix SDHCI CQE reg names
| * 0f9ac04191 soc: qcom: apr: Add check for idr_alloc and of_property_read_string_index
| * 6855dd02c5 soc: qcom: apr: make code more reuseable
| * c9fb81a835 arm64: dts: qcom: sm8250: drop bogus DP PHY clock
| * 53ffa57464 arm64: dts: qcom: sm8350: fix UFS PHY registers
| * d5a6bbd7a2 arm64: dts: qcom: sm8250: fix UFS PHY registers
| * 3a52ff845f arm64: dts: qcom: sm8150: fix UFS PHY registers
| * 800f8165e0 arm64: dts: qcom: Correct QMP PHY child node name
| * ee136f275b soc: qcom: llcc: make irq truly optional
| * aa7ffd4174 arm64: dts: qcom: sm8250: correct LPASS pin pull down
| * f94bacc616 arm64: dts: qcom: pm660: Use unique ADC5_VCOIN address in node name
| * d5bf119781 drivers: soc: ti: knav_qmss_queue: Mark knav_acc_firmwares as static
| * 4707d5daf8 ARM: dts: stm32: Fix AV96 WLAN regulator gpio property
| * 33647d7a46 ARM: dts: stm32: Drop stm32mp15xc.dtsi from Avenger96
| * 9f271a8660 objtool, kcsan: Add volatile read/write instrumentation to whitelist
| * 51fe2dcba8 arm64: dts: qcom: msm8916: Drop MSS fallback compatible
| * a9fff3524f arm64: dts: qcom: sdm845-cheza: fix AP suspend pin bias
| * 6487f48ea3 arm64: dts: qcom: sdm630: fix UART1 pin bias
| * 6c0c9c5458 ARM: dts: qcom: apq8064: fix coresight compatible
| * 0f9b088d68 arm64: dts: qcom: msm8996: fix GPU OPP table
| * 270683fc7b arm64: dts: qcom: msm8996: fix supported-hw in cpufreq OPP tables
| * 5c5a628914 arm64: dts: qcom: msm8996: Add MSM8996 Pro support
| * 3f14048ee4 arm64: dts: qcom: sm8250-sony-xperia-edo: fix touchscreen bias-disable
| * 89f79f8d7f arm64: dts: qcom: ipq6018-cp01-c1: use BLSPI1 pins
| * 9db5992e72 usb: musb: remove extra check in musb_gadget_vbus_draw
| * adc063a491 drm/amd/display: Manually adjust strobe for DCN303
* | 50e12445ab Merge 5.15.85 into android13-5.15-lts
|\|
| * 5827ddaf45 Linux 5.15.85
| * e22dbadac8 net: loopback: use NET_NAME_PREDICTABLE for name_assign_type
| * 314e7a7836 selftests: net: Use "grep -E" instead of "egrep"
| * 19a7814396 Bluetooth: L2CAP: Fix u8 overflow
| * f692abf139 HID: uclogic: Add HID_QUIRK_HIDINPUT_FORCE quirk
| * 5325a884e2 usb: dwc3: pci: Update PCIe device ID for USB3 controller on CPU sub-system for Raptor Lake
| * 367e1e3399 igb: Initialize mailbox message for VF reset
| * a301742b35 xhci: Apply XHCI_RESET_TO_DEFAULT quirk to ADL-N
| * 5e959f0c4c USB: serial: f81534: fix division by zero on line-speed change
| * 68fbe268d2 USB: serial: f81232: fix division by zero on line-speed change
| * 3ec7f24b8b USB: serial: cp210x: add Kamstrup RF sniffer PIDs
| * 2b092fab23 USB: serial: option: add Quectel EM05-G modem
| * 6b41a35b41 usb: gadget: uvc: Prevent buffer overflow in setup handler
| * 828112571c udf: Fix extending file within last block
| * df1a2596c7 udf: Do not bother looking for prealloc extents if i_lenExtents matches i_size
| * 63dbbd8f14 udf: Fix preallocation discarding at indirect extent boundary
| * 79a97f08ae udf: Discard preallocation before extending file with a hole
* | fb8d543b61 Merge 5.15.84 into android13-5.15-lts
|\|
| * d68f50bfb0 Linux 5.15.84
| * 972707bae3 net: fec: properly guard irq coalesce setup
| * 289721fe09 ASoC: ops: Correct bounds check for second channel on SX controls
| * de0866b94a nvme-pci: clear the prp2 field when not used
| * 8bffa95ac1 perf: Fix perf_pending_task() UaF
| * 825bd2af42 ASoC: cs42l51: Correct PGA Volume minimum value
| * 91582b3a1a net: fec: don't reset irq coalesce settings to defaults on "ip link up"
| * c772dab247 can: mcba_usb: Fix termination command argument
| * aa822de7de can: sja1000: fix size of OCR_MODE_MASK define
| * 09e08740d7 pinctrl: meditatek: Startup with the IRQs disabled
| * 172a95026f libbpf: Use page size as max_entries when probing ring buffer map
| * cf611d7867 ASoC: ops: Check bounds for second channel in snd_soc_put_volsw_sx()
| * a74b88e170 ASoC: fsl_micfil: explicitly clear CHnF flags
| * afac1e7d78 ASoC: fsl_micfil: explicitly clear software reset bit
| * 9d933af8fe nfp: fix use-after-free in area_cache_get()
| * e1a4f5880d vfs: fix copy_file_range() averts filesystem freeze protection
| * 86e28ed25b x86/vdso: Conditionally export __vdso_sgx_enter_enclave()
* | bfbd2237c1 Merge 5.15.83 into android13-5.15-lts
|\|
| * fd6d66840b Linux 5.15.83
| * f895511de9 io_uring: Fix a null-ptr-deref in io_tctx_exit_cb()
| * f435c66d23 io_uring: move to separate directory
| * d9e1e5d8a7 block: move CONFIG_BLOCK guard to top Makefile
| * e5c0bc4ff5 can: esd_usb: Allow REC and TEC to return to zero
| * db6343a5b0 s390/qeth: fix use-after-free in hsci
| * a56c1cebe4 s390/qeth: fix various format strings
| * a6dba316c9 macsec: add missing attribute validation for offload
| * 40500f1f47 net: mvneta: Fix an out of bounds check
| * b9274dbe39 net: thunderbolt: fix memory leak in tbnet_open()
| * 7390c70bd4 ipv6: avoid use-after-free in ip6_fragment()
| * 1beb475892 net: plip: don't call kfree_skb/dev_kfree_skb() under spin_lock_irq()
| * b08412a9cf net: phy: mxl-gpy: fix version reporting
| * dec5abd91a xen/netback: fix build warning
| * 54d830e242 dpaa2-switch: Fix memory leak in dpaa2_switch_acl_entry_add() and dpaa2_switch_acl_entry_remove()
| * c7adcbd0fd ethernet: aeroflex: fix potential skb leak in greth_init_rings()
| * d962d42d63 tipc: call tipc_lxc_xmit without holding node_read_lock
| * f3b5dda26c net: dsa: sja1105: fix memory leak in sja1105_setup_devlink_regions()
| * 5dab6fa068 ipv4: Fix incorrect route flushing when table ID 0 is used
| * ac566bd577 ipv4: Fix incorrect route flushing when source address is deleted
| * af4ccae4b7 tipc: Fix potential OOB in tipc_link_proto_rcv()
| * b8ce0e6f9f net: hisilicon: Fix potential use-after-free in hix5hd2_rx()
| * 1685417774 net: mdio: fix unbalanced fwnode reference count in mdio_device_release()
| * 6f4798ac9c net: hisilicon: Fix potential use-after-free in hisi_femac_rx()
| * 114e65a221 net: thunderx: Fix missing destroy_workqueue of nicvf_rx_mode_wq
| * 51c0494575 net: microchip: sparx5: Fix missing destroy_workqueue of mact_queue
| * 99eec0a766 ip_gre: do not report erspan version on GRE interface
| * 2891957853 net: stmmac: fix "snps,axi-config" node property parsing
| * 5cb8f1a784 gpio/rockchip: fix refcount leak in rockchip_gpiolib_register()
| * b8c2f0392d nvme initialize core quirks before calling nvme_init_subsystem
| * 908b2da426 NFC: nci: Bounds check struct nfc_target arrays
| * d841cc1563 i40e: Disallow ip4 and ip6 l4_4_bytes
| * 625a13850b i40e: Fix for VF MAC address 0
| * 5538794dbd i40e: Fix not setting default xps_cpus after reset
| * a6b30598fe net: mvneta: Prevent out of bounds read in mvneta_config_rss()
| * e6e897d4fe xen-netfront: Fix NULL sring after live migration
| * eefd8953a7 octeontx2-pf: Fix potential memory leak in otx2_init_tc()
| * f88acaed07 net: mdiobus: fix double put fwnode in the error path
| * cc62d76928 net: mdiobus: fwnode_mdiobus_register_phy() rework error handling
| * ea113b570e net: encx24j600: Fix invalid logic in reading of MISTAT register
| * 8aae746d06 net: encx24j600: Add parentheses to fix precedence
| * a110287ef4 mac802154: fix missing INIT_LIST_HEAD in ieee802154_if_add()
| * e046421bed selftests: rtnetlink: correct xfrm policy rule in kci_test_ipsec_offload
| * 4fa8988a36 net: dsa: sja1105: Check return value
| * b35be171df net: dsa: hellcreek: Check return value
| * a4c342e645 net: dsa: ksz: Check return value
| * edf7284a98 Bluetooth: Fix not cleanup led when bt_init fails
| * 3322193949 Bluetooth: 6LoWPAN: add missing hci_dev_put() in get_l2cap_conn()
| * 6c88c764e0 vmxnet3: use correct intrConf reference when using extended queues
| * 5ad0d85757 vmxnet3: correctly report encapsulated LRO packet
| * 5c014eb0ed af_unix: Get user_ns from in_skb in unix_diag_get_exact().
| * 807a01a329 drm: bridge: dw_hdmi: fix preference of RGB modes over YUV420
| * eb96fd3983 net: broadcom: Add PTP_1588_CLOCK_OPTIONAL dependency for BCMGENET under ARCH_BCM2835
| * 16eb678bca igb: Allocate MSI-X vector when testing
| * 34c6367c94 e1000e: Fix TX dispatch condition
| * 4271515f18 gpio: amd8111: Fix PCI device reference count leak
| * d57b60e9b3 drm/bridge: ti-sn65dsi86: Fix output polarity setting bug
| * f8b2965601 netfilter: ctnetlink: fix compilation warning after data race fixes in ct mark
| * 246bcd05ba ca8210: Fix crash by zero initializing data
| * 80dad8df5f ieee802154: cc2520: Fix error return code in cc2520_hw_init()
| * dd9dcfb85c drm/vmwgfx: Fix race issue calling pin_user_pages
| * 7b09ba9036 netfilter: nft_set_pipapo: Actually validate intervals in fields after the first one
| * 6daaa84b62 gpiolib: fix memory leak in gpiochip_setup_dev()
| * 1a1075d371 gpiolib: check the 'ngpios' property in core gpiolib code
| * 70c5515c1c gpiolib: improve coding style for local variables
| * 3b714f25fc clk: Fix pointer casting to prevent oops in devm_clk_release()
| * c142cba37d can: af_can: fix NULL pointer dereference in can_rcv_filter
| * 104bb1f67e HID: ite: Enable QUIRK_TOUCHPAD_ON_OFF_REPORT on Acer Aspire Switch V 10
| * f755d11c55 HID: core: fix shift-out-of-bounds in hid_report_raw_event
| * 2d4b310c32 HID: hid-lg4ff: Add check for empty lbuf
| * 5e8021ae08 HID: usbhid: Add ALWAYS_POLL quirk for some mice
| * 5e88c6f4aa net: dsa: sja1105: avoid out of bounds access in sja1105_init_l2_policing()
| * 1074fefce9 drm/shmem-helper: Avoid vm_open error paths
| * 83e3da8bb9 drm/shmem-helper: Remove errant put in error path
| * 249011f4c3 drm/amdgpu/sdma_v4_0: turn off SDMA ring buffer in the s2idle suspend
| * 1e4fe9a154 drm/vmwgfx: Don't use screen objects when SEV is active
| * f6550976fe KVM: s390: vsie: Fix the initialization of the epoch extension (epdx) field
| * fe50a9bbeb net: mana: Fix race on per-CQ variable napi work_done
| * a49894a5ac Bluetooth: Fix crash when replugging CSR fake controllers
| * 1dee2b5047 Bluetooth: btusb: Add debug message for CSR controllers
| * 3ac29732a2 mm/gup: fix gup_pud_range() for dax
| * aad8bbd17a memcg: fix possible use-after-free in memcg_write_event_control()
| * 6fb8bc29bf media: v4l2-dv-timings.c: fix too strict blanking sanity checks
| * a4c575541e Revert "ARM: dts: imx7: Fix NAND controller size-cells"
| * 28abc11459 soundwire: intel: Initialize clock stop timeout
| * 22d800b378 media: videobuf2-core: take mmap_lock in vb2_get_unmapped_area()
| * 5d0fa6fc88 xen/netback: don't call kfree_skb() with interrupts disabled
| * 4422241cef xen/netback: do some code cleanup
| * 0fe29bd925 xen/netback: Ensure protocol headers don't fall in the non-linear area
| * f01677be31 drm/bridge: anx7625: Fix edid_read break case in sp_tx_edid_read()
| * ee2536830b cifs: fix use-after-free caused by invalid pointer `hostname`
| * dc62f05f66 rtc: cmos: avoid UIP when reading alarm time
| * 48ea4199af rtc: cmos: avoid UIP when writing alarm time
| * 3f52afc6ed rtc: mc146818-lib: extract mc146818_avoid_UIP
| * 1a3f8c6cd2 mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths
| * 79ad784c9d mm/khugepaged: fix GUP-fast interaction by sending IPI
| * d15cd6de01 mm/khugepaged: take the right locks for page table retraction
| * 26f084e554 net: usb: qmi_wwan: add u-blox 0x1342 composition
| * 029a7f1c5d 9p/xen: check logical size for buffer size
| * b398832893 usb: dwc3: gadget: Disable GUSB2PHYCFG.SUSPHY for End Transfer
| * e70a572440 fbcon: Use kzalloc() in fbcon_prepare_logo()
| * fd3768597d regulator: twl6030: fix get status of twl6032 regulators
| * 9f74b9aa8d ASoC: soc-pcm: Add NULL check in BE reparenting
| * dae93f4168 btrfs: send: avoid unaligned encoded writes when attempting to clone range
| * f54e1edf57 selftests/net: Find nettest in current directory
| * fccd454129 ALSA: seq: Fix function prototype mismatch in snd_seq_expand_var_event
| * 542a563bb7 regulator: slg51000: Wait after asserting CS pin
| * 3d1b5fde36 9p/fd: Use P9_HDRSZ for header size
| * fe2d44e86e ASoC: rt711-sdca: fix the latency time of clock stop prepare state machine transitions
| * e945f3d809 ARM: dts: rockchip: disable arm_global_timer on rk3066 and rk3188
| * c3b818c91a spi: mediatek: Fix DEVAPC Violation at KO Remove
| * d9f0107be1 ASoC: wm8962: Wait for updated value of WM8962_CLOCKING1 register
| * 7ae0262748 ARM: 9266/1: mm: fix no-MMU ZERO_PAGE() implementation
| * d81c62e312 ARM: 9251/1: perf: Fix stacktraces for tracepoint events in THUMB2 kernels
| * 66717ad03b fs: use acquire ordering in __fget_light()
| * 1222e2364a ARM: dts: rockchip: rk3188: fix lcdc1-rgb24 node name
| * 996fb29b06 arm64: dts: rockchip: fix ir-receiver node names
| * 752138ef89 ARM: dts: rockchip: fix ir-receiver node names
| * 8045971e40 arm: dts: rockchip: remove clock-frequency from rtc
| * 5e9fb8013a arm: dts: rockchip: fix node name for hym8563 rtc
| * 2ed7137e91 arm64: dts: rockchip: keep I2S1 disabled for GPIO function on ROCK Pi 4 series
| * 5a1122e1a8 mmc: mtk-sd: Fix missing clk_disable_unprepare in msdc_of_clock_parse()
| * 282f52c954 clk: Provide new devm_clk helpers for prepared and enabled clocks
| * eb94a7a20f clk: generalize devm_clk_get() a bit
* | 20de784185 ANDROID: fix up abi change in struct sdhci_host
* | ebd1f8013d ANDROID: gki_defconfig: add CONFIG_FUNCTION_ERROR_INJECTION
* | 112ff45bb5 Merge 5.15.82 into android13-5.15-lts
|\|
| * d979030136 Linux 5.15.82
| * 48642f9431 proc: proc_skip_spaces() shouldn't think it is working on C strings
| * 3eb9213f66 proc: avoid integer type confusion in get_proc_long
| * 4a4073a2e2 ipc/sem: Fix dangling sem_array access in semtimedop race
| * 53b9b1201e Input: raydium_ts_i2c - fix memory leak in raydium_i2c_send()
| * 571b6bbbf5 char: tpm: Protect tpm_pm_suspend with locks
| * f39891cfe7 Revert "clocksource/drivers/riscv: Events are stopped during CPU suspend"
| * a759057af7 ACPI: HMAT: Fix initiator registration for single-initiator systems
| * da8a794d71 ACPI: HMAT: remove unnecessary variable initialization
| * 2d16161a2c i2c: imx: Only DMA messages with I2C_M_DMA_SAFE flag set
| * 950a05cb15 i2c: npcm7xx: Fix error handling in npcm_i2c_init()
| * db3f8da033 serial: stm32: Deassert Transmit Enable on ->rs485_config()
| * 45f628f4fd serial: stm32: Use TC interrupt to deassert GPIO RTS in RS485 mode
| * c60eae5b1d serial: stm32: Factor out GPIO RTS toggling into separate function
| * 041f8dc882 ipv4: Fix route deletion when nexthop info is not specified
| * 25174d91e4 ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference
| * a0ad247e55 selftests: net: fix nexthop warning cleanup double ip typo
| * 532847b69c selftests: net: add delete nexthop route warning test
| * e078355881 Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled
| * 723fa02e0e parisc: Increase FRAME_WARN to 2048 bytes on parisc
| * b951ab4b35 mm: migrate: fix THP's mapcount on isolation
| * c5eda6029c mm: __isolate_lru_page_prepare() in isolate_migratepages_block()
| * bdb613ef17 iommu/vt-d: Fix PCI device refcount leak in dmar_dev_scope_init()
| * b6eea8b2e8 iommu/vt-d: Fix PCI device refcount leak in has_external_pci()
| * 787d81d4eb nvme: fix SRCU protection of nvme_ns_head list
| * 12f237200c riscv: kexec: Fixup irq controller broken in kexec crash path
| * ac00301adb riscv: fix race when vmap stack overflow
| * fa7a7d185e riscv: Sync efi page table's kernel mappings before switching
| * d86d698925 pinctrl: single: Fix potential division by zero
| * 98b15c7066 ASoC: ops: Fix bounds check for _sx controls
| * f88a6977f8 KVM: x86/mmu: Fix race condition in direct_page_fault
| * df4b177b48 io_uring/poll: fix poll_refs race with cancelation
| * 4b702b7d11 io_uring: make poll refs more robust
| * 1d58849ac2 io_uring: cmpxchg for poll arm refs release
| * cd1981a8c3 io_uring: fix tw losing poll events
| * 62321dc7b0 io_uring: update res mask in io_poll_check_events
| * 417d5ea6e7 tracing: Free buffers when a used dynamic event is removed
| * 52fc245d15 tracing: Fix race where histograms can be called before the event
| * cb2b0612cd tracing/osnoise: Fix duration type
| * 615a996ff3 drm/i915: Never return 0 if not all requests retired
| * 01a2b25ef2 drm/i915: Fix negative value passed as remaining time
| * ff1591ba33 drm/amdgpu: enable Vangogh VCN indirect sram mode
| * ac2d7fa908 drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame
| * 57ee7bc4c6 mmc: sdhci: Fix voltage switch delay
| * bb8f809514 mmc: sdhci-sprd: Fix no reset data and command after voltage switch
| * 4c7681c1a5 mmc: sdhci-esdhc-imx: correct CQHCI exit halt state check
| * 01dbe4db59 mmc: core: Fix ambiguous TRIM and DISCARD arg
| * 738946e355 mmc: mmc_test: Fix removal of debugfs file
| * 635d051734 net: stmmac: Set MAC's flow control register to reflect current settings
| * 9132dcdf3b v4l2: don't fall back to follow_pfn() if pin_user_pages_fast() fails
| * 76ad884be0 pinctrl: intel: Save and restore pins in "direct IRQ" mode
| * 41296b85fa x86/bugs: Make sure MSR_SPEC_CTRL is updated properly upon resume from S3
| * 33021419fd nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry()
| * 2e44dd9a8d tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep"
| * b60a8ad771 error-injection: Add prompt for function error injection
| * 757eb00c4c ALSA: dice: fix regression for Lexicon I-ONIX FW810S
| * a1a96a6f30 riscv: mm: Proper page permissions after initmem free
| * 823df3607d riscv: vdso: fix section overlapping under some conditions
| * 6e035d5a2a hwmon: (coretemp) fix pci device refcount leak in nv1a_ram_new()
| * 7692700ac8 hwmon: (coretemp) Check for null before removing sysfs attrs
| * 9b5836b9c4 net: ethernet: renesas: ravb: Fix promiscuous mode after system resumed
| * 0dfb9a5663 sctp: fix memory leak in sctp_stream_outq_migrate()
| * fcb3e02161 packet: do not set TP_STATUS_CSUM_VALID on CHECKSUM_COMPLETE
| * 04b995e963 net: tun: Fix use-after-free in tun_detach()
| * 43ca0adf79 afs: Fix fileserver probe RTT handling
| * 543d917f69 net: mdiobus: fix unbalanced node reference count
| * dca370e575 net: hsr: Fix potential use-after-free
| * 1daec08156 tipc: re-fetch skb cb after tipc_msg_validate
| * 16a64dc265 dsa: lan9303: Correct stat name
| * 766086ea8c net: wwan: iosm: fix dma_alloc_coherent incompatible pointer type
| * c667751a42 net: wwan: iosm: fix kernel test robot reported error
| * 9c584d6d9c net: ethernet: nixge: fix NULL dereference
| * 8782b32ef8 net/9p: Fix a potential socket leak in p9_socket_open
| * 6fc9425bff net: net_netdev: Fix error handling in ntb_netdev_init_module()
| * 3bc893ef36 net: ethernet: ti: am65-cpsw: fix error handling in am65_cpsw_nuss_probe()
| * 7730904f50 net: phy: fix null-ptr-deref while probe() failed
| * 59b54f0563 wifi: mac8021: fix possible oob access in ieee80211_get_rate_duration
| * dc0853f8b5 wifi: cfg80211: don't allow multi-BSSID in S1G
| * 88a6fe3707 wifi: cfg80211: fix buffer overflow in elem comparison
| * 08fff7aaeb aquantia: Do not purge addresses when setting the number of rings
| * 2a7aa52573 qlcnic: fix sleep-in-atomic-context bugs caused by msleep
| * 7b734d26f0 can: m_can: Add check for devm_clk_get
| * ea8dc27bb0 can: m_can: pci: add missing m_can_class_free_dev() in probe/remove methods
| * b1d2a8e02a can: etas_es58x: es58x_init_netdev(): free netdev when register_candev()
| * e53da04e37 can: cc770: cc770_isa_probe(): add missing free_cc770dev()
| * d452a71995 can: sja1000_isa: sja1000_isa_probe(): add missing free_sja1000dev()
| * 372eb550fa net/mlx5e: Fix use-after-free when reverting termination table
| * 839eeab03c net/mlx5: Fix uninitialized variable bug in outlen_write()
| * 34feea3bfb net/mlx5: DR, Fix uninitialized var warning
| * 3485ef2aab net/mlx5: DR, Rename list field in matcher struct to list_node
| * 9fc27d22cd e100: Fix possible use after free in e100_xmit_prepare
| * 0d9f5bd54b iavf: Fix error handling in iavf_init_module()
| * b0b2b9050c iavf: remove redundant ret variable
| * 69501d8205 fm10k: Fix error handling in fm10k_init_module()
| * 5e3657dede i40e: Fix error handling in i40e_init_module()
| * 7109e94109 ixgbevf: Fix resource leak in ixgbevf_init_module()
| * 196ea810e2 of: property: decrement node refcount in of_fwnode_get_reference_args()
| * 36164db278 nvmem: rmem: Fix return value check in rmem_read()
| * e376183167 bpf: Do not copy spin lock field from user in bpf_selem_alloc
| * 45f6e81863 hwmon: (ibmpex) Fix possible UAF when ibmpex_register_bmc() fails
| * a90251376c hwmon: (i5500_temp) fix missing pci_disable_device()
| * eeb31b828d hwmon: (ina3221) Fix shunt sum critical calculation
| * 9514b95cac hwmon: (ltc2947) fix temperature scaling
| * 0140e079a4 libbpf: Handle size overflow for ringbuf mmap
| * 06d5790e7d ARM: at91: rm9200: fix usb device clock id
| * d074f173fb scripts/faddr2line: Fix regression in name resolution on ppc64le
| * ee3d37d796 bpf, perf: Use subprog name when reporting subprog ksymbol
| * ec02fc0a41 iio: light: rpr0521: add missing Kconfig dependencies
| * f7419fc42a iio: health: afe4404: Fix oob read in afe4404_[read|write]_raw
| * e7e76a77aa iio: health: afe4403: Fix oob read in afe4403_read_raw
| * ebdca90efb drm/amdgpu: Partially revert "drm/amdgpu: update drm_display_info correctly when the edid is read"
| * c365d3c3e5 drm/amdgpu: update drm_display_info correctly when the edid is read
| * df5346466e drm/display/dp_mst: Fix drm_dp_mst_add_affected_dsc_crtcs() return code
| * 044da1a371 btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit()
| * da86809ab8 btrfs: move QUOTA_ENABLED check to rescan_should_stop from btrfs_qgroup_rescan_worker
| * 5d66eadc1c spi: spi-imx: Fix spi_bus_clk if requested clock is higher than input clock
| * 6b4544a131 btrfs: free btrfs_path before copying inodes to userspace
| * c7ae3becee btrfs: sink iterator parameter to btrfs_ioctl_logical_to_ino
| * acc2f40b98 erofs: fix order >= MAX_ORDER warning due to crafted negative i_size
| * ca9f27448a drm/i915/gt: Use i915_vm_put on ppgtt_create error paths
| * c2f2972889 drm/i915: Create a dummy object for gen6 ppgtt
| * 918002bdbe arm64: mte: Avoid setting PG_mte_tagged if no tags cleared or restored
* | d753150bdc Revert "serial: Add rs485_supported to uart_port"
* | 8ccd9528be Revert "serial: fsl_lpuart: Fill in rs485_supported"
* | a924bb92c6 Merge 5.15.81 into android13-5.15-lts
|\|
| * e4a7232c91 Linux 5.15.81
| * 5c5c563a08 cifs: fix missed refcounting of ipc tcon
| * ee2d04f23b drm/i915: fix TLB invalidation for Gen12 video and compute engines
| * bef834845d drm/amdgpu: always register an MMU notifier for userptr
| * 7901de7aa8 drm/amdgpu: Enable Aldebaran devices to report CU Occupancy
| * e7bf1fe538 drm/amd/display: No display after resume from WB/CB
| * 5033cba00c drm/amd/dc/dce120: Fix audio register mapping, stop triggering KASAN
| * b8dc245909 btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs()
| * 914baca57a btrfs: use kvcalloc in btrfs_get_dev_zone_info
| * c1e6d4bfde btrfs: zoned: fix missing endianness conversion in sb_write_pointer
| * d88bf6be02 btrfs: free btrfs_path before copying subvol info to userspace
| * f218b404fc btrfs: free btrfs_path before copying fspath to userspace
| * fea9397101 btrfs: free btrfs_path before copying root refs to userspace
| * 7d0c25b5fe genirq: Take the proposed affinity at face value if force==true
| * f17657cce0 irqchip/gic-v3: Always trust the managed affinity provided by the core code
| * 52a93f2dcf genirq: Always limit the affinity to online CPUs
| * 599cf4b845 genirq/msi: Shutdown managed interrupts with unsatifiable affinities
| * 7aed1dd5d2 wifi: wilc1000: validate number of channels
| * e9de501cf7 wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_CHANNEL_LIST attribute
| * 143232cb5a wifi: wilc1000: validate length of IEEE80211_P2P_ATTR_OPER_CHANNEL attribute
| * cd9c486971 wifi: wilc1000: validate pairwise and authentication suite offsets
| * f2fb18d429 fuse: lock inode unconditionally in fuse_fallocate()
| * bb1c33bdf4 dm integrity: clear the journal on suspend
| * 20ad31b09e dm integrity: flush the journal on suspend
| * 5ca2110ba5 gpu: host1x: Avoid trying to use GART on Tegra20
| * 97f47617e8 scsi: iscsi: Fix possible memory leak when device_register() failed
| * 56ab7f237e net: usb: qmi_wwan: add Telit 0x103a composition
| * e2e33f213d tcp: configurable source port perturb table size
| * 269928e5c7 platform/x86: ideapad-laptop: Fix interrupt storm on fn-lock toggle on some Yoga laptops
| * 17d995dc69 platform/x86: hp-wmi: Ignore Smart Experience App event
| * e85bdc7872 zonefs: fix zone report size in __zonefs_io_error()
| * 982fcd83fb drm/amdgpu: disable BACO support on more cards
| * ea11f8197d platform/x86: acer-wmi: Enable SW_TABLET_MODE on Switch V 10 (SW5-017)
| * 09af15e691 platform/x86: asus-wmi: add missing pci_dev_put() in asus_wmi_set_xusb2pr()
| * ba040bea9d xen/platform-pci: add missing free_irq() in error path
| * 6815b2087d xen-pciback: Allow setting PCI_MSIX_FLAGS_MASKALL too
| * 4c13ddb74f ASoC: stm32: dfsdm: manage cb buffers cleanup
| * dd82295a23 Input: i8042 - apply probe defer to more ASUS ZenBook models
| * e12e121feb Input: soc_button_array - add Acer Switch V 10 to dmi_use_low_level_irq[]
| * 9f5c167074 Input: soc_button_array - add use_low_level_irq module parameter
| * aaef86eac9 Input: goodix - try resetting the controller when no config is set
| * e2223f5fbb serial: 8250: 8250_omap: Avoid RS485 RTS glitch on ->set_termios()
| * 4e208294de tools: iio: iio_generic_buffer: Fix read size
| * 0d0e2545fa ASoC: Intel: bytcht_es8316: Add quirk for the Nanote UMPC-01
| * e394cf9d7a Input: synaptics - switch touchpad on HP Laptop 15-da3001TU to RMI mode
| * 96b5d11777 x86/ioremap: Fix page aligned size calculation in __ioremap_caller()
| * d048f74815 x86/pm: Add enumeration check before spec MSRs save/restore setup
| * 070e3560bf x86/tsx: Add a feature bit for TSX control MSR support
| * 1430c98ebb KVM: x86: remove exit_int_info warning in svm_handle_exit
| * 27550a5930 KVM: x86: add kvm_leave_nested
| * 3e87cb0caa KVM: x86: nSVM: harden svm_free_nested against freeing vmcb02 while still in use
| * 6425c590d0 KVM: x86: forcibly leave nested mode on vCPU reset
| * f42ebf972a KVM: x86: nSVM: leave nested mode on vCPU free
| * 7b3c9405b2 mm: vmscan: fix extreme overreclaim and swap floods
| * feb2eda5e1 gcov: clang: fix the buffer overflow issue
| * ea6aa25c9a nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty
| * 9d97a9fbfc usb: dwc3: gadget: Clear ep descriptor last
| * 02632ea4df usb: dwc3: gadget: Return -ESHUTDOWN on ep disable
| * 765ca3e63f usb: dwc3: gadget: conditionally remove requests
| * 7945cbf866 bus: ixp4xx: Don't touch bit 7 on IXP42x
| * 39c039018a iio: core: Fix entry not deleted when iio_register_sw_trigger_type() fails
| * 0791a5ddba iio: light: apds9960: fix wrong register for gesture gain
| * f0158b9bfc arm64: dts: rockchip: lower rk3399-puma-haikou SD controller clock frequency
| * 277d19ec28 ext4: fix use-after-free in ext4_ext_shift_extents
| * c9d133100b usb: cdnsp: fix issue with ZLP - added TD_SIZE = 1
| * c2ad434cd4 usb: cdnsp: Fix issue with Clear Feature Halt Endpoint
| * 1d91c64887 usb: dwc3: exynos: Fix remove() function
| * 0a216625c3 KVM: arm64: pkvm: Fixup boot mode to reflect that the kernel resumes from EL1
| * f0044a4a31 mmc: sdhci-brcmstb: Fix SDHCI_RESET_ALL for CQHCI
| * 8e6940979b mmc: sdhci-brcmstb: Enable Clock Gating to save power
| * 24b46bfa96 mmc: sdhci-brcmstb: Re-organize flags
| * 227543ccac nios2: add FORCE for vmlinuz.gz
| * 6a4ea16a67 init/Kconfig: fix CC_HAS_ASM_GOTO_TIED_OUTPUT test with dash
| * c4a9046c27 lib/vdso: use "grep -E" instead of "egrep"
| * 5fefdceafb s390/crashdump: fix TOD programmable field size
| * 592b6fd74a net: thunderx: Fix the ACPI memory leak
| * 697eb30a35 octeontx2-af: Fix reference count issue in rvu_sdp_init()
| * 6ba1687ea1 octeontx2-pf: Add check for devm_kcalloc
| * 26c31e7c73 net: enetc: preserve TX ring priority across reconfiguration
| * 0e16bbf616 net: enetc: cache accesses to &priv->si->hw
| * 68de40f66a net: enetc: manage ENETC_F_QBV in priv->active_offloads only when enabled
| * 5c0858e142 nfc: st-nci: fix incorrect sizing calculations in EVT_TRANSACTION
| * e09243fb16 nfc: st-nci: fix memory leaks in EVT_TRANSACTION
| * dca20b7a19 nfc: st-nci: fix incorrect validating logic in EVT_TRANSACTION
| * 67d638f8ef s390/dasd: fix no record found for raw_track_access
| * 88277853cf arcnet: fix potential memory leak in com20020_probe()
| * 1d44ec8507 ipv4: Fix error return code in fib_table_insert()
| * 918e83c6bf dccp/tcp: Reset saddr on failure after inet6?_hash_connect().
| * 8ce9b1c97f fs: do not update freeing inode i_io_list
| * 8db9e60cdf netfilter: flowtable_offload: add missing locking
| * c1da3bfca1 netfilter: ipset: restore allowing 64 clashing elements in hash:net,iface
| * 606091b2f6 dma-buf: fix racing conflict of dma_heap_add()
| * 8af9450bef bnx2x: fix pci device refcount leak in bnx2x_vf_is_pcie_pending()
| * 251bcf6cfb regulator: twl6030: re-add TWL6032_SUBCLASS
| * 6258a8f913 NFC: nci: fix memory leak in nci_rx_data_packet()
| * ffe6021154 net: sched: allow act_ct to be built without NF_NAT
| * a05c0f9511 net: sparx5: fix error handling in sparx5_port_open()
| * 182ef20f0f sfc: fix potential memleak in __ef100_hard_start_xmit()
| * 2da022fac9 net: wwan: iosm: use ACPI_FREE() but not kfree() in ipc_pcie_read_bios_cfg()
| * a48b345b87 xfrm: Fix ignored return value in xfrm6_init()
| * 19989e1635 xfrm: Fix oops in __xfrm_state_delete()
| * 46d450067f tipc: check skb_linearize() return value in tipc_disc_rcv()
| * 33fb115a76 tipc: add an extra conn_get in tipc_conn_alloc
| * 4ae907c45f tipc: set con sock in tipc_conn_alloc
| * ef866d9ea9 net/mlx5: Fix handling of entry refcount when command is not issued to FW
| * 3101318939 net/mlx5: Fix FW tracer timestamp calculation
| * 1eaabb5bbb net/mlx5: Do not query pci info while pci disabled
| * 8180099b2a netfilter: ipset: regression in ip_set_hash_ip.c
| * 448b627370 Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register()
| * 082c31cb99 Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work()
| * 7fdd9daa5b macsec: Fix invalid error code set
| * e8fb93a079 nfp: add port from netdev validation for EEPROM access
| * e44e424ed9 nfp: fill splittable of devlink_port_attrs correctly
| * 527046c138 net: pch_gbe: fix pci device refcount leak while module exiting
| * f77c84dd5b octeontx2-af: debugsfs: fix pci device refcount leak
| * cd581ffd8d net/qla3xxx: fix potential memleak in ql3xxx_send()
| * a8976074e2 net: mvpp2: fix possible invalid pointer dereference
| * 3a4cc56cd1 net/mlx4: Check retval of mlx4_bitmap_init
| * c368220e17 net: ethernet: mtk_eth_soc: fix error handling in mtk_open()
| * d9729437b2 ARM: dts: imx6q-prti6q: Fix ref/tcxo-clock-frequency properties
| * 1c0b6a97c4 ARM: mxs: fix memory leak in mxs_machine_init()
| * ecff08f3c4 iavf: Fix race condition between iavf_shutdown and iavf_remove
| * 31147d4e90 iavf: Do not restart Tx queues after reset task failure
| * 232942b26c iavf: Fix a crash during reset task
| * 0600615d01 netfilter: nf_tables: do not set up extensions for end interval
| * 60387731e6 netfilter: conntrack: Fix data-races around ct mark
| * ee3ccd1abb 9p/fd: fix issue of list_del corruption in p9_fd_cancel()
| * 131c2eeabc net: pch_gbe: fix potential memleak in pch_gbe_tx_queue()
| * f58df483ff nfc/nci: fix race with opening and closing
| * da22d7410a net: dsa: sja1105: disallow C45 transactions on the BASE-TX MDIO bus
| * 38fe0988bd rxrpc: Fix race between conn bundle lookup and bundle removal [ZDI-CAN-15975]
| * d92151b465 rxrpc: Use refcount_t rather than atomic_t
| * 3c33e41fa5 rxrpc: Allow list of in-use local UDP endpoints to be viewed in /proc
| * 46cefa2689 net: liquidio: simplify if expression
| * 95500ee0b3 selftests: mptcp: fix mibit vs mbit mix up
| * f8c4da198e selftests: mptcp: more stable simult_flows tests
| * 1c0efab08c ARM: dts: at91: sam9g20ek: enable udc vbus gpio pinctrl
| * ade662f3f2 tee: optee: fix possible memory leak in optee_register_device()
| * d1dd119134 bus: sunxi-rsb: Support atomic transfers
| * b1ed61e706 bus: sunxi-rsb: Remove the shutdown callback
| * 61a41d1abc regulator: core: fix UAF in destroy_regulator()
| * a85c0db3f5 spi: dw-dma: decrease reference count in dw_spi_dma_init_mfld()
| * d9f9b3255b regulator: core: fix kobject release warning and memory leak in regulator_register()
| * bd419c7c68 ASoC: max98373: Add checks for devm_kcalloc
| * f9bc4a18e7 scsi: storvsc: Fix handling of srb_status and capacity change events
| * c2153fe2d0 x86/hyperv: Restore VP assist page after cpu offlining/onlining
| * b2ddd76237 ASoC: soc-pcm: Don't zero TDM masks in __soc_pcm_open()
| * dd62cb7e6f ASoC: sgtl5000: Reset the CHIP_CLK_CTRL reg on remove
| * d80ffd4823 ASoC: hdac_hda: fix hda pcm buffer overflow issue
| * 10bee7eb2a ARM: dts: am335x-pcm-953: Define fixed regulators in root node
| * 8fe533c0f9 af_key: Fix send_acquire race with pfkey_register
| * 0c69a4658e xfrm: replay: Fix ESN wrap around for GSO
| * ecc6ce4fdf xfrm: fix "disable_policy" on ipv4 early demux
| * 5a792c1d4d MIPS: pic32: treat port as signed integer
| * 144452b421 RISC-V: vdso: Do not add missing symbols to version section in linker script
| * 799970b8cc ALSA: usb-audio: add quirk to fix Hamedal C20 disconnect issue
| * 38b09dc14f Revert "drm/amdgpu: Revert "drm/amdgpu: getting fan speed pwm for vega10 properly""
| * 44d50fccf8 nvmet: fix memory leak in nvmet_subsys_attr_model_store_locked
| * 5adc12d9e2 arm64/syscall: Include asm/ptrace.h in syscall_wrapper header.
| * 1340f02773 block, bfq: fix null pointer dereference in bfq_bio_bfqg()
| * 86d4dca4a6 drm: panel-orientation-quirks: Add quirk for Acer Switch V 10 (SW5-017)
| * b90e6234f5 scsi: scsi_debug: Make the READ CAPACITY response compliant with ZBC
| * cdbba6a4de scsi: ibmvfc: Avoid path failures during live migration
| * 6e8124a151 platform/x86/intel/hid: Add some ACPI device IDs
| * 32735e24f4 platform/x86/intel/pmt: Sapphire Rapids PMT errata fix
| * 83a6823016 platform/x86: touchscreen_dmi: Add info for the RCA Cambio W101 v2 2-in-1
| * f707986a14 platform/x86: ideapad-laptop: Disable touchpad_switch
| * 5e38740ae5 Revert "net: macsec: report real_dev features when HW offloading is enabled"
| * 26b72202ee selftests/bpf: Add verifier test for release_reference()
| * 8395e3f98c spi: stm32: fix stm32_spi_prepare_mbr() that halves spi clk for every run
| * d04722f280 wifi: ath11k: Fix QCN9074 firmware boot on x86
| * 9cc96a20a9 wifi: mac80211: Fix ack frame idr leak when mesh has no route
| * 86f90014e7 wifi: airo: do not assign -1 to unsigned char
| * f5558fbda0 audit: fix undefined behavior in bit shift for AUDIT_BIT
| * af5de982ff riscv: dts: sifive unleashed: Add PWM controlled LEDs
| * ee34a19dbe wifi: mac80211_hwsim: fix debugfs attribute ps with rc table support
| * 3513785dc1 wifi: mac80211: fix memory free error when registering wiphy fail
| * 855485d31e ceph: fix NULL pointer dereference for req->r_session
| * 729c9ad294 ceph: Use kcalloc for allocating multiple elements
| * d276fb4a7e binder: validate alloc->mm in ->mmap() handler
| * 5277e3d633 x86/sgx: Add overflow check in sgx_validate_offset_length()
| * b5a838ba47 x86/sgx: Create utility to validate user provided offset and length
| * 2f6e2de3a5 ceph: avoid putting the realm twice when decoding snaps fails
| * 8bef55d793 ceph: do not update snapshot context when there is no new snapshot
| * cdee3136c9 iio: pressure: ms5611: fixed value compensation bug
| * 5d6696e79d iio: ms5611: Simplify IO callback parameters
| * f0ee88e83c nvme-pci: add NVME_QUIRK_BOGUS_NID for Netac NV7000
| * a61716cd24 nvme-pci: disable write zeroes on various Kingston SSD
| * 19b60f3363 nvme-pci: disable namespace identifiers for the MAXIO MAP1001
| * d537e19306 nvme-pci: add NVME_QUIRK_BOGUS_NID for Micron Nitro
| * af03ce894c nvme: add a bogus subsystem NQN quirk for Micron MTFDKBA2T0TFH
| * c6803faa6a drm/display: Don't assume dual mode adaptors support i2c sub-addressing
| * d2284fe43c ata: libata-core: do not issue non-internal commands once EH is pending
| * e09583e83e ata: libata-scsi: simplify __ata_scsi_queuecmd()
| * a9059e338f cifs: Fix connections leak when tlink setup failed
| * 81d583baa5 cifs: support nested dfs links over reconnect
| * dbc0ea91be cifs: split out dfs code from cifs_reconnect()
| * b3ce844d23 cifs: introduce new helper for cifs_reconnect()
| * 2ea600b598 sctp: clear out_curr if all frag chunks of current msg are pruned
| * 1f9f346fbb sctp: remove the unnecessary sinfo_stream check in sctp_prsctp_prune_unsent
| * e8915faa9f tty: serial: fsl_lpuart: don't break the on-going transfer when global reset
| * bd19013935 serial: fsl_lpuart: Fill in rs485_supported
| * 87c81c19cd serial: Add rs485_supported to uart_port
| * c08f4ea79f ASoC: fsl_asrc fsl_esai fsl_sai: allow CONFIG_PM=N
| * d1e4288d2a ASoC: fsl_sai: use local device pointer
* | e66b45d527 Merge branch 'android13-5.15' into android13-5.15-lts
* | 72d681a01d Revert "net: use struct_group to copy ip/ipv6 header addresses"
* | c46ed1b2d7 Merge 5.15.80 into android13-5.15-lts
|\|
| * 71e496bd33 Linux 5.15.80
| * b63ddb3ba6 ntfs: check overflow when iterating ATTR_RECORDs
| * ab6a1bb17e ntfs: fix out-of-bounds read in ntfs_attr_find()
| * 5330c423b8 ntfs: fix use-after-free in ntfs_attr_find()
| * 43bbadb7e4 net/9p: use a dedicated spinlock for trans_fd
| * 9357fca9da mm: fs: initialize fsdata passed to write_begin/write_end interface
| * b334ab4c33 wifi: wext: use flex array destination for memcpy()
| * 0e07032b4b 9p/trans_fd: always use O_NONBLOCK read/write
| * 7c7b7476b5 gfs2: Switch from strlcpy to strscpy
| * 28275a7c84 gfs2: Check sb_bsize_shift after reading superblock
| * a4f1a01b2e 9p: trans_fd/p9_conn_cancel: drop client lock earlier
| * f7b0e95071 kcm: close race conditions on sk_receive_queue
| * 27d706b0d3 kcm: avoid potential race in kcm_tx_work
| * b49026d9c8 tcp: cdg: allow tcp_cdg_release() to be called multiple times
| * e41cbf98df macvlan: enforce a consistent minimal mtu
| * d5f7f6e63f Input: i8042 - fix leaking of platform device on module removal
| * c49cc2c059 kprobes: Skip clearing aggrprobe's post_handler in kprobe-on-ftrace case
| * 71beab7119 scsi: scsi_debug: Fix possible UAF in sdebug_add_host_helper()
| * a636772988 scsi: target: tcm_loop: Fix possible name leak in tcm_loop_setup_hba_bus()
| * cb7893c85e net: use struct_group to copy ip/ipv6 header addresses
| * 9b8c0c88f4 tracing: Fix warning on variable 'struct trace_array'
| * 73cf0ff9a3 ring-buffer: Include dropped pages in counting dirty patches
| * 35c60b4e8c perf: Improve missing SIGTRAP checking
| * 2ac6276864 serial: 8250_lpss: Use 16B DMA burst with Elkhart Lake
| * b1a27b2aad nvme: ensure subsystem reset is single threaded
| * bccece3c33 nvme: restrict management ioctls to admin
| * 8cddb0d96b perf/x86/intel/pt: Fix sampling using single range output
| * 8e2f33c598 misc/vmw_vmci: fix an infoleak in vmci_host_do_receive_datagram()
| * 9a72a46cb0 docs: update mediator contact information in CoC doc
| * a99a547658 mmc: sdhci-pci: Fix possible memory leak caused by missing pci_dev_put()
| * 4a1b6f7839 mmc: sdhci-pci-o2micro: fix card detect fail issue caused by CD# debounce timeout
| * fd285d4215 mmc: core: properly select voltage range without power cycle
| * 8a9bae5f1b firmware: coreboot: Register bus in module init
| * 052d0e79ef iommu/vt-d: Set SRE bit only when hardware has SRS cap
| * c31a792a82 iommu/vt-d: Preset Access bit for IOVA in FL non-leaf paging entries
| * 11edbdee43 scsi: zfcp: Fix double free of FSF request when qdio send fails
| * fdf87b5b30 net: phy: marvell: add sleep time after enabling the loopback bit
| * 9648d760ed maccess: Fix writing offset in case of fault in strncpy_from_kernel_nofault()
| * fdd57c20d4 Input: iforce - invert valid length check when fetching device IDs
| * 0cafb719be serial: 8250_lpss: Configure DMA also w/o DMA filter
| * 59f6596697 serial: 8250: Flush DMA Rx on RLSI
| * 118b52c2ae serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs
| * 6ffce7a92e dm ioctl: fix misbehavior if list_versions races with module loading
| * 2b104973f7 iio: pressure: ms5611: changed hardcoded SPI speed to value limited
| * 1678d4abb2 iio: adc: mp2629: fix potential array out of bound access
| * bd22c232ea iio: adc: mp2629: fix wrong comparison of channel
| * 656f670613 iio: trigger: sysfs: fix possible memory leak in iio_sysfs_trig_init()
| * 1bf8c0aff8 iio: adc: at91_adc: fix possible memory leak in at91_adc_allocate_trigger()
| * afc0aea702 usb: typec: mux: Enter safe mode only when pins need to be reconfigured
| * 8236628a54 usb: cdns3: host: fix endless superspeed hub port reset
| * ead83b0db8 usb: chipidea: fix deadlock in ci_otg_del_timer
| * cc9e6d8c55 usb: add NO_LPM quirk for Realforce 87U Keyboard
| * 70eca1d261 USB: serial: option: add Fibocom FM160 0x0111 composition
| * 1b6a54885c USB: serial: option: add u-blox LARA-L6 modem
| * b0467d0059 USB: serial: option: add u-blox LARA-R6 00B modem
| * 95688a8a57 USB: serial: option: remove old LARA-R6 PID
| * 53dee78ea3 USB: serial: option: add Sierra Wireless EM9191
| * e7764e88e6 USB: bcma: Make GPIO explicitly optional
| * a190a83db2 speakup: fix a segfault caused by switching consoles
| * b3c6edbee4 slimbus: stream: correct presence rate frequencies
| * 6b35ac8315 slimbus: qcom-ngd: Fix build error when CONFIG_SLIM_QCOM_NGD_CTRL=y && CONFIG_QCOM_RPROC_COMMON=m
| * 0f847462fe Revert "usb: dwc3: disable USB core PHY management"
| * 23ad214a86 ALSA: hda/realtek: Fix the speaker output on Samsung Galaxy Book Pro 360
| * a36b505749 ALSA: hda/realtek: fix speakers for Samsung Galaxy Book Pro
| * 02b94885b2 ALSA: usb-audio: Drop snd_BUG_ON() from snd_usbmidi_output_open()
| * 7176d6f3ad drm/amd/display: Add HUBP surface flip interrupt handler
| * e57daa7503 tracing: kprobe: Fix potential null-ptr-deref on trace_array in kprobe_event_gen_test_exit()
| * 3a41c0f2a5 tracing: kprobe: Fix potential null-ptr-deref on trace_event_file in kprobe_event_gen_test_exit()
| * 7291dec4f2 tracing: Fix race where eprobes can be called before the event
| * 6517b97134 tracing: Fix wild-memory-access in register_synth_event()
| * 07ba4f0603 tracing: Fix memory leak in test_gen_synth_cmd() and test_empty_synth_event()
| * 8b318f3032 tracing/ring-buffer: Have polling block on watermark
| * 2c21ee020c tracing: Fix memory leak in tracing_read_pipe()
| * 00f74b1a98 ring_buffer: Do not deactivate non-existant pages
| * 1bea037a1a ftrace: Fix null pointer dereference in ftrace_add_mod()
| * fadfcf39fb ftrace: Optimize the allocation for mcount entries
| * 5c5f264289 ftrace: Fix the possible incorrect kernel message
| * 2ab2494162 cifs: add check for returning value of SMB2_set_info_init
| * 5783abda58 net: thunderbolt: Fix error handling in tbnet_init()
| * 80e590aeb1 net: microchip: sparx5: Fix potential null-ptr-deref in sparx_stats_init() and sparx5_start()
| * 4a55aec142 cifs: Fix wrong return value checking when GETFLAGS
| * c8baf1fc24 net/x25: Fix skb leak in x25_lapb_receive_frame()
| * af4b57fa6b net: ag71xx: call phylink_disconnect_phy if ag71xx_hw_enable() fail in ag71xx_open()
| * 61404a182e cifs: add check for returning value of SMB2_close_init
| * d3233f4bf3 platform/surface: aggregator: Do not check for repeated unsequenced packets
| * 6969171403 platform/x86/intel: pmc: Don't unconditionally attach Intel PMC when virtualized
| * 7d93417d59 drbd: use after free in drbd_create_device()
| * fc16a2c81a bridge: switchdev: Fix memory leaks when changing VLAN protocol
| * 3d90a668c4 net: hns3: fix setting incorrect phy link ksettings for firmware in resetting process
| * 3f7b2ef8fe net: ena: Fix error handling in ena_init()
| * 2540eea1bd net: ionic: Fix error handling in ionic_init_module()
| * c08c13cb13 xen/pcpu: fix possible memory leak in register_pcpu()
| * 97009f07f2 net: dsa: make dsa_master_ioctl() see through port_hwtstamp_get() shims
| * 88da008e5e net: mhi: Fix memory leak in mhi_net_dellink()
| * 8f839715d0 bnxt_en: Remove debugfs when pci_register_driver failed
| * b88713d92b net: caif: fix double disconnect client in chnl_net_open()
| * 6d24034160 net: macvlan: Use built-in RCU list checking
| * 596230471d mISDN: fix misuse of put_device() in mISDN_register_device()
| * 07a6a8cf17 net: liquidio: release resources when liquidio driver open failed
| * 19feb6cf41 soc: imx8m: Enable OCOTP clock before reading the register
| * 8c54d706d8 net: stmmac: ensure tx function is not running in stmmac_xdp_release()
| * 6219f46c2b net: hinic: Fix error handling in hinic_module_init()
| * 7a05e39296 mISDN: fix possible memory leak in mISDN_dsp_element_register()
| * 0ee6455c9c net: bgmac: Drop free_netdev() from bgmac_enet_remove()
| * 7ff4fa179e bpf: Initialize same number of free nodes for each pcpu_freelist
| * 12f178cf05 MIPS: Loongson64: Add WARN_ON on kexec related kmalloc failed
| * a4d6e024be MIPS: fix duplicate definitions for exported symbols
| * 44142b652a nfp: change eeprom length to max length enumerators
| * f23058dc23 ata: libata-transport: fix error handling in ata_tdev_add()
| * 67b2193146 ata: libata-transport: fix error handling in ata_tlink_add()
| * e7bb1b7a7b ata: libata-transport: fix error handling in ata_tport_add()
| * 377ff82c33 ata: libata-transport: fix double ata_host_put() in ata_tport_add()
| * 494df0b0ef arm64: dts: imx8mn: Fix NAND controller size-cells
| * 7178d568f7 arm64: dts: imx8mm: Fix NAND controller size-cells
| * 8ccf18c82a ARM: dts: imx7: Fix NAND controller size-cells
| * e884a6c2d4 drm: Fix potential null-ptr-deref in drm_vblank_destroy_worker()
| * 07e56de876 drm/drv: Fix potential memory leak in drm_dev_init()
| * 45c300613b drm/panel: simple: set bpc field for logic technologies displays
| * 779f3f9e0c drm/vc4: kms: Fix IS_ERR() vs NULL check for vc4_kms
| * 97e5b508e9 pinctrl: devicetree: fix null pointer dereferencing in pinctrl_dt_to_map
| * 9a77b8557f parport_pc: Avoid FIFO port location truncation
| * 5d03c2911c siox: fix possible memory leak in siox_device_add()
| * 530e987a02 arm64: Fix bit-shifting UB in the MIDR_CPU_MODEL() macro
| * d494449782 bpf: Fix memory leaks in __check_func_call
| * 25521fd2e2 block: sed-opal: kmalloc the cmd/resp buffers
| * 2f21d653c6 scsi: scsi_transport_sas: Fix error handling in sas_phy_add()
| * 7cd28bc410 pinctrl: rockchip: list all pins in a possible mux route for PX30
| * ab79b8dbe2 ASoC: soc-utils: Remove __exit for snd_soc_util_exit()
| * eaa8edd865 bpf, test_run: Fix alignment problem in bpf_prog_test_run_skb()
| * 33cabe04d2 tty: n_gsm: fix sleep-in-atomic-context bug in gsm_control_send
| * ae22294e21 serial: imx: Add missing .thaw_noirq hook
| * 26db1cd519 serial: 8250: omap: Flush PM QOS work on remove
| * e0db709a58 serial: 8250: omap: Fix unpaired pm_runtime_put_sync() in omap8250_remove()
| * 83b6d4d6da serial: 8250_omap: remove wait loop from Errata i202 workaround
| * 76db05ab70 serial: 8250: omap: Fix missing PM runtime calls for omap8250_set_mctrl()
| * 2aee616a6b ARM: at91: pm: avoid soft resetting AC DLL
| * 188546c780 ASoC: tas2764: Fix set_tdm_slot in case of single slot
| * 5782896daf ASoC: tas2770: Fix set_tdm_slot in case of single slot
| * 34eee4189b ASoC: core: Fix use-after-free in snd_soc_exit()
| * aa6f8aecbb ARM: dts: at91: sama7g5: fix signal name of pin PB2
| * 487fff700f spi: stm32: Print summary 'callbacks suppressed' message
| * 2cec2f65c1 arm64: dts: qcom: sm8350-hdk: Specify which LDO modes are allowed
| * 44dbe66bb3 arm64: dts: qcom: sm8250-xperia-edo: Specify which LDO modes are allowed
| * 8b2eae7def arm64: dts: qcom: sm8150-xperia-kumano: Specify which LDO modes are allowed
| * c8e76eeea7 arm64: dts: qcom: sa8155p-adp: Specify which LDO modes are allowed
| * 30571f28bb hugetlbfs: don't delete error page from pagecache
| * 14ddbb83c3 KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet
| * a9b964ed7c spi: intel: Use correct mask for flash and protected regions
| * f4eb68642e mtd: spi-nor: intel-spi: Disable write protection only if asked
| * 156d0c823c ASoC: codecs: jz4725b: Fix spelling mistake "Sourc" -> "Source", "Routee" -> "Route"
| * 5907ff9f2c x86/cpu: Add several Intel server CPU model numbers
| * 41e37d04e3 Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm
| * b02a025dd1 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
| * 1c366c206f drm/imx: imx-tve: Fix return type of imx_tve_connector_mode_valid
| * 1c8ded1b38 i2c: i801: add lis3lv02d's I2C address for Vostro 5568
| * b432581f19 i2c: tegra: Allocate DMA memory for DMA engine
| * 7b0ae4c7b9 firmware: arm_scmi: Cleanup the core driver removal callback
| * 1a8a2fef27 ACPI: x86: Add another system to quirk list for forcing StorageD3Enable
| * 8a03a4a5cf NFSv4: Retry LOCK on OLD_STATEID during delegation return
| * 49ca2227c4 btrfs: raid56: properly handle the error when unable to find the missing stripe
| * 0f7bd3a2df RDMA/efa: Add EFA 0xefa2 PCI ID
| * a42d4363e7 ACPI: scan: Add LATT2021 to acpi_ignore_dep_ids[]
| * 004decd41b drm/amd/display: Remove wrong pipe control lock
| * 7779efbb99 ASoC: rt1308-sdw: add the default value of some registers
| * ef1e4ed858 selftests/intel_pstate: fix build for ARCH=x86_64
| * dfd3cc1ef3 selftests/futex: fix build for clang
| * 648467236c ASoC: Intel: sof_sdw: add quirk variant for LAPBC710 NUC15
| * 64ee750c29 ASoC: codecs: jz4725b: fix capture selector naming
| * 150b74cd06 ASoC: codecs: jz4725b: use right control for Capture Volume
| * 5352d8b315 ASoC: codecs: jz4725b: fix reported volume for Master ctl
| * 85134577a7 ASoC: codecs: jz4725b: add missed Line In power control bit
| * 5e61dffb16 spi: intel: Fix the offset to get the 64K erase opcode
| * c697cb2e66 ASoC: wm8962: Add an event handler for TEMP_HP and TEMP_SPK
| * 569085124d ASoC: rt1019: Fix the TDM settings
| * 4160a515c7 ASoC: mt6660: Keep the pm_runtime enables before component stuff in mt6660_i2c_probe
| * 2963ec4535 ASoC: wm8997: Revert "ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe"
| * 30a2f9479c ASoC: wm5110: Revert "ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe"
| * 3bf6da38a2 ASoC: wm5102: Revert "ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe"
| * 94fa250ea5 mm: shmem: don't truncate page if memory failure happens
| * 003fa19591 mm: hwpoison: handle non-anonymous THP correctly
| * a62b1bc603 mm: hwpoison: refactor refcount check handling
* | 49ca4a5978 Revert "bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues"
* | 6fa2a43acd Revert "ALSA: usb-audio: Yet more regression for for the delayed card registration"
* | ac2a7a141f Merge 5.15.79 into android13-5.15-lts
|/
* 3df0eeae4d Linux 5.15.79
* 599b24eedf x86/cpu: Restore AMD's DE_CFG MSR after resume
* 9132fa043f net: tun: call napi_schedule_prep() to ensure we own a napi
* 1dea25e25a drm/amdkfd: Migrate in CPU page fault use current mm
* a1c303fbd4 marvell: octeontx2: build error: unknown type name 'u64'
* d948b22834 dmaengine: at_hdmac: Check return code of dma_async_device_register
* c556ecf32a dmaengine: at_hdmac: Fix impossible condition
* 8a941ff34e dmaengine: at_hdmac: Don't allow CPU to reorder channel enable
* 53831f7a13 dmaengine: at_hdmac: Fix completion of unissued descriptor in case of errors
* 14f5462e4a dmaengine: at_hdmac: Fix descriptor handling when issuing it to hardware
* 5482403228 dmaengine: at_hdmac: Fix concurrency over the active list
* 82ca19414f dmaengine: at_hdmac: Free the memset buf without holding the chan lock
* 8fd36e069d dmaengine: at_hdmac: Fix concurrency over descriptor
* 1ee012d452 dmaengine: at_hdmac: Fix concurrency problems by removing atc_complete_all()
* 90c1b07406 dmaengine: at_hdmac: Protect atchan->status with the channel lock
* b5ee1fe06a dmaengine: at_hdmac: Do not call the complete callback on device_terminate_all
* 9bbf5df0fc dmaengine: at_hdmac: Fix premature completion of desc in issue_pending
* f7d1aaa903 dmaengine: at_hdmac: Start transfer for cyclic channels in issue_pending
* e9777b4efc dmaengine: at_hdmac: Don't start transactions at tx_submit level
* 4e28674a0e dmaengine: at_hdmac: Fix at_lli struct definition
* 49eba53137 cert host tools: Stop complaining about deprecated OpenSSL functions
* 69e86c6268 can: j1939: j1939_send_one(): fix missing CAN header initialization
* 81fc8f90b8 mm/shmem: use page_mapping() to detect page cache for uffd continue
* e91451af11 mm/memremap.c: map FS_DAX device memory as decrypted
* 48998c1773 mm/damon/dbgfs: check if rm_contexts input is for a real context
* c736ed8541 udf: Fix a slab-out-of-bounds write bug in udf_find_entry()
* 2e87eddf57 mms: sdhci-esdhc-imx: Fix SDHCI_RESET_ALL for CQHCI
* 91c38504e5 btrfs: zoned: initialize device's zone info for seeding
* 432c30ba3f btrfs: selftests: fix wrong error check in btrfs_free_dummy_root()
* c9fe4719c6 btrfs: fix match incorrectly in dev_args_match_device
* f96fd36936 wifi: ath11k: avoid deadlock during regulatory update in ath11k_regd_update()
* 8e2b576caf platform/x86: hp_wmi: Fix rfkill causing soft blocked wifi
* cb3ab0e1e0 drm/amdgpu: disable BACO on special BEIGE_GOBY card
* dc066a7850 drm/i915/dmabuf: fix sg_table handling in map_dma_buf
* afbd118838 nilfs2: fix use-after-free bug of ns_writer on remount
* abc082aac0 nilfs2: fix deadlock in nilfs_count_free_blocks()
* 589da22881 ata: libata-scsi: fix SYNCHRONIZE CACHE (16) command failure
* 51ae4579a5 vmlinux.lds.h: Fix placement of '.data..decrypted' section
* 1f8e08ab32 ALSA: usb-audio: Add DSD support for Accuphase DAC-60
* c2451f62b2 ALSA: usb-audio: Add quirk entry for M-Audio Micro
* 031d1480a0 ALSA: usb-audio: Yet more regression for for the delayed card registration
* 574f51e4aa ALSA: hda/realtek: Add Positivo C6300 model quirk
* 7140d7aaf9 ALSA: hda: fix potential memleak in 'add_widget_node'
* f6d7a487aa ALSA: hda/ca0132: add quirk for EVGA Z390 DARK
* 1ccd55b390 ALSA: hda/hdmi - enable runtime pm for more AMD display audio
* 29100c6742 mmc: sdhci-esdhc-imx: use the correct host caps for MMC_CAP_8_BIT_DATA
* 3dce99e2eb mmc: sdhci-tegra: Fix SDHCI_RESET_ALL for CQHCI
* 9d6bd33e6a mmc: sdhci_am654: Fix SDHCI_RESET_ALL for CQHCI
* ad01f16ca9 mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI
* 1aa78c1d01 mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI
* c198524a99 MIPS: jump_label: Fix compat branch range check
* 9713ceffa4 arm64: efi: Fix handling of misaligned runtime regions and drop warning
* 518e49f059 riscv: fix reserved memory setup
* d07c3d7491 riscv: vdso: fix build with llvm
* cc36c7fa5d riscv: process: fix kernel info leakage
* a8d67367ab net: macvlan: fix memory leaks of macvlan_common_newlink
* 7b194dd32b ethernet: tundra: free irq when alloc ring failed in tsi108_open()
* 7de10342fe net: mv643xx_eth: disable napi when init rxq or txq failed in mv643xx_eth_open()
* 88e1dd2d92 ethernet: s2io: disable napi when start nic failed in s2io_card_up()
* 3652f1f8d3 net: atlantic: macsec: clear encryption keys from the stack
* fca3b0a1fd net: phy: mscc: macsec: clear encryption keys when freeing a flow
* 60a0af8813 stmmac: dwmac-loongson: fix missing of_node_put() while module exiting
* ee4a9bd2c7 stmmac: dwmac-loongson: fix missing pci_disable_device() in loongson_dwmac_probe()
* 4a8770eebc stmmac: dwmac-loongson: fix missing pci_disable_msi() while module exiting
* 83196d8dc5 cxgb4vf: shut down the adapter when t4vf_update_port_info() failed in cxgb4vf_open()
* 49d8a6e24a mctp: Fix an error handling path in mctp_init()
* 29961d2332 stmmac: intel: Update PCH PTP clock rate from 200MHz to 204.8MHz
* 8604bebc5c stmmac: intel: Enable 2.5Gbps for Intel AlderLake-S
* 7dec6dae2b net: cxgb3_main: disable napi when bind qsets failed in cxgb_up()
* 960f9d30de net: cpsw: disable napi in cpsw_ndo_open()
* 1360778fdb net/mlx5e: E-Switch, Fix comparing termination table instance
* f13e9ebd29 net/mlx5: Allow async trigger completion execution on single CPU systems
* 48b73b46a5 net/mlx5: Bridge, verify LAG state when adding bond to bridge
* 13b1ea861e net: wwan: iosm: fix memory leak in ipc_pcie_read_bios_cfg
* 7e4dcacb4d net: nixge: disable napi when enable interrupts failed in nixge_open()
* 409731df63 net: marvell: prestera: fix memory leak in prestera_rxtx_switch_init()
* 77ff31cba9 netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()
* e62cb1c093 netfilter: nfnetlink: fix potential dead lock in nfnetlink_rcv_msg()
* 0bd20318da perf tools: Add the include/perf/ directory to .gitignore
* a733671e38 perf stat: Fix printing os->prefix in CSV metrics output
* c36e9e2c4a drivers: net: xgene: disable napi when register irq failed in xgene_enet_open()
* 4689bd3a1b net: lapbether: fix issue of invalid opcode in lapbeth_open()
* 1dd27541aa dmaengine: ti: k3-udma-glue: fix memory leak when register device fail
* 992e966caf dmaengine: mv_xor_v2: Fix a resource leak in mv_xor_v2_remove()
* 9766af75ba dmaengine: pxa_dma: use platform_get_irq_optional
* 301caa0609 tipc: fix the msg->req tlv len check in tipc_nl_compat_name_table_dump_header
* 6a264203db net: broadcom: Fix BCMGENET Kconfig
* e7871b9a21 net: stmmac: dwmac-meson8b: fix meson8b_devm_clk_prepare_enable()
* 261178a1c2 can: af_can: fix NULL pointer dereference in can_rx_register()
* 2acb2779b1 ipv6: addrlabel: fix infoleak when sending struct ifaddrlblmsg to network
* 13ecaa6832 tcp: prohibit TCP_REPAIR_OPTIONS if data was already sent
* bc79cb9fb0 drm/vc4: Fix missing platform_unregister_drivers() call in vc4_drm_register()
* 2845bc9070 net: wwan: mhi: fix memory leak in mhi_mbim_dellink
* 2ce2348c28 net: wwan: iosm: fix memory leak in ipc_wwan_dellink
* 7b6bc50f65 hamradio: fix issue of dev reference count leakage in bpq_device_event()
* f59adebb8c net: lapbether: fix issue of dev reference count leakage in lapbeth_device_event()
* 119407dc32 KVM: s390: pv: don't allow userspace to set the clock under PV
* 500bcd3a99 phy: ralink: mt7621-pci: add sentinel to quirks table
* 151dc8087b capabilities: fix undefined behavior in bit shift for CAP_TO_MASK
* 435c7ddfd5 net: fman: Unregister ethernet device on removal
* 3a504d6d96 bnxt_en: fix potentially incorrect return value for ndo_rx_flow_steer
* ac257c43fa bnxt_en: Fix possible crash in bnxt_hwrm_set_coal()
* d7569302a7 net: tun: Fix memory leaks of napi_get_frags
* 430d1f4964 octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]
* ec0db81883 octeontx2-pf: Use hardware register for CQE count
* b89a0d8859 macsec: clear encryption keys from the stack after setting up offload
* eeba7f07a0 macsec: fix detection of RXSCs when toggling offloading
* 3070a880eb macsec: fix secy->n_rx_sc accounting
* e957555a36 macsec: delete new rxsc when offload fails
* ad25a115f5 net: gso: fix panic on frag_list with mixed head alloc types
* 466ce46f25 bpf: Fix wrong reg type conversion in release_reference()
* 35d8130f2a bpf: Add helper macro bpf_for_each_reg_in_vstate
* 61274498fb bpf, sock_map: Move cancel_work_sync() out of sock lock
* 32b5dd03be bpf: Fix sockmap calling sleepable function in teardown path
* e991558189 bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues
* 5ad95d7134 HID: hyperv: fix possible memory leak in mousevsc_probe()
* 6dcdd1b68b bpftool: Fix NULL pointer dereference when pin {PROG, MAP, LINK} without FILE
* 2fc902245c wifi: mac80211: Set TWT Information Frame Disabled bit as 1
* 95adbd2ac8 bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues
* 06615967d4 bpf, verifier: Fix memory leak in array reallocation for stack state
* 4335a82c4f soundwire: qcom: check for outanding writes before doing a read
* ae4dad2e53 soundwire: qcom: reinit broadcast completion
* 38c9fa2cc6 wifi: cfg80211: fix memory leak in query_regdb_file()
* 2c6ba0a787 wifi: cfg80211: silence a sparse RCU warning
* 921738c280 phy: stm32: fix an error code in probe
* fa722006f7 hwspinlock: qcom: correct MMIO max register for newer SoCs
* 3c1bb6187e drm/amdkfd: Fix NULL pointer dereference in svm_migrate_to_ram()
* b1f8522771 drm/amdkfd: handle CPU fault on COW mapping
* 36770c045a drm/amdkfd: avoid recursive lock in migrations back to RAM
* 93a5de7e88 fuse: fix readdir cache race
* 1920cf9454 thunderbolt: Add DP OUT resource when DP tunnel is discovered
* 47dbf24969 thunderbolt: Tear down existing tunnels when resuming from hibernate

And update the .xml file with the new symbol that we are tracking and
the abi preservation fix:

1 function symbol(s) added
  'void __dev_kfree_skb_irq(struct sk_buff *, enum skb_free_reason)'

type 'struct sdhci_host' changed
  member 'union { struct { u8 reinit_uhs; u8 reserve01; u8 drv_type; u16 reserve02; u32 reserve03; }; struct { u64 android_kabi_reserved1; }; union { }; }' was added
  member 'u64 android_kabi_reserved1' was removed
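For reference, this is the usual Android KABI preservation pattern: the new sdhci fields are folded into the spare padding slot through an anonymous union, so the struct keeps its size and the offsets of every other member. A rough before/after sketch, built only from the abidiff output above (all other sdhci_host members elided; in the tree this layout is normally produced by the ANDROID_KABI_USE() helper rather than written out by hand):

/* Before: one spare u64 reserved when the ABI was frozen. */
struct sdhci_host {
	/* ... existing members ... */
	u64 android_kabi_reserved1;
};

/* After: the reserved u64 is overlaid by the new fields via an anonymous
 * union, so sizeof(struct sdhci_host) and all other offsets are unchanged. */
struct sdhci_host {
	/* ... existing members ... */
	union {
		struct {
			u8  reinit_uhs;
			u8  reserve01;
			u8  drv_type;
			u16 reserve02;
			u32 reserve03;
		};
		struct {
			u64 android_kabi_reserved1;
		};
	};
};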

Change-Id: If4a059230a137dee54298fff61ec87306bf96b0f
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2023-03-24 08:44:06 +00:00


// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Interactivity improvements by Mike Galbraith
* (C) 2007 Mike Galbraith <efault@gmx.de>
*
* Various enhancements by Dmitry Adamushko.
* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation, 2007
* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
*
* Scaled math optimizations by Thomas Gleixner
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include "sched.h"
#include <trace/hooks/sched.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
/*
* Targeted preemption latency for CPU-bound tasks:
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
EXPORT_SYMBOL_GPL(sysctl_sched_latency);
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
*
* Options are:
*
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 8;
/*
* After fork, child runs first. If set to 0 (default) then
* parent will (try to) run first.
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
* SCHED_OTHER wake-up granularity.
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
int _shift = 0;
if (kstrtoint(str, 0, &_shift))
pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
sched_thermal_decay_shift = clamp(_shift, 0, 10);
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
int __weak arch_asym_cpu_priority(int cpu)
{
return -cpu;
}
/*
* The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
/*
* The margin used when comparing CPU capacities.
* is 'cap1' noticeably greater than 'cap2'?
*
* (default: ~5%)
*/
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
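/*
 * Worked examples: fits_capacity(800, 1024) is true (800 * 1280 = 1024000 <
 * 1024 * 1024 = 1048576, i.e. 800 is under ~80% of 1024), while
 * fits_capacity(900, 1024) is false. Likewise capacity_greater(1080, 1024)
 * is true, but capacity_greater(1050, 1024) is not, since 1050 is only
 * ~2.5% larger than 1024.
 */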
#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
* each time a cfs_rq requests quota.
*
* Note: in the case that the slice exceeds the runtime remaining (either due
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
* (default: 5 msec, units: microseconds)
*/
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
lw->inv_weight = 0;
}
static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
lw->weight = w;
lw->inv_weight = 0;
}
/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
* so pick a second-best guess by going with the log2 of the
* number of CPUs.
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
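/*
 * Worked example: with eight or more online CPUs (the value is clamped to 8
 * below), SCHED_TUNABLESCALING_LOG yields factor = 1 + ilog2(8) = 4, so the
 * effective sysctl_sched_latency becomes 6ms * 4 = 24ms and
 * sysctl_sched_min_granularity 0.75ms * 4 = 3ms.
 */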
static unsigned int get_update_sysctl_factor(void)
{
unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
case SCHED_TUNABLESCALING_NONE:
factor = 1;
break;
case SCHED_TUNABLESCALING_LINEAR:
factor = cpus;
break;
case SCHED_TUNABLESCALING_LOG:
default:
factor = 1 + ilog2(cpus);
break;
}
return factor;
}
static void update_sysctl(void)
{
unsigned int factor = get_update_sysctl_factor();
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}
void __init sched_init_granularity(void)
{
update_sysctl();
}
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32
static void __update_inv_weight(struct load_weight *lw)
{
unsigned long w;
if (likely(lw->inv_weight))
return;
w = scale_load_down(lw->weight);
if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
else if (unlikely(!w))
lw->inv_weight = WMULT_CONST;
else
lw->inv_weight = WMULT_CONST / w;
}
/*
* delta_exec * weight / lw.weight
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
* Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
* weight/lw.weight <= 1, and therefore our shift will also be positive.
*/
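/*
 * Worked example: with weight = 1024 and lw->weight = 2048 the result is
 * delta_exec / 2. sched_slice() uses this to hand an entity its proportional
 * share of the period, while calc_delta_fair() uses it (with NICE_0_LOAD as
 * the weight) to scale real runtime into vruntime, so heavier entities
 * accrue vruntime more slowly.
 */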
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
int fs;
__update_inv_weight(lw);
if (unlikely(fact_hi)) {
fs = fls(fact_hi);
shift -= fs;
fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
fact_hi = (u32)(fact >> 32);
if (fact_hi) {
fs = fls(fact_hi);
shift -= fs;
fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (!path)
return;
if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
autogroup_path(cfs_rq->tg, path, len);
else if (cfs_rq && cfs_rq->tg->css.cgroup)
cgroup_path(cfs_rq->tg->css.cgroup, path, len);
else
strlcpy(path, "(null)", len);
}
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
int cpu = cpu_of(rq);
if (cfs_rq->on_list)
return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
cfs_rq->on_list = 1;
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
* enqueued. The fact that we always enqueue bottom-up
* reduces this to two cases and a special case for the root
* cfs_rq. Furthermore, it also means that we will always reset
* tmp_alone_branch either when the branch is connected
* to a tree or when we reach the top of the tree
*/
if (cfs_rq->tg->parent &&
cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
* If parent is already on the list, we add the child
* just before. Thanks to circular linked property of
* the list, this means to put the child at the tail
* of the list that starts by parent.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
/*
* The branch is now connected to its tree so we can
* reset tmp_alone_branch to the beginning of the
* list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
return true;
}
if (!cfs_rq->tg->parent) {
/*
* cfs rq without parent should be put
* at the tail of the list.
*/
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
&rq->leaf_cfs_rq_list);
/*
* We have reached the top of a tree so we can reset
* tmp_alone_branch to the beginning of the list.
*/
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
return true;
}
/*
* The parent has not already been added so we want to
* make sure that it will be put after us.
* tmp_alone_branch points to the beginning of the branch
* where we will add parent.
*/
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
/*
* update tmp_alone_branch to point to the new beginning
* of the branch
*/
rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
struct rq *rq = rq_of(cfs_rq);
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
* it can happen that tmp_alone_branch points to a leaf that
* we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
return se->cfs_rq;
return NULL;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return se->parent;
}
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
int se_depth, pse_depth;
/*
* A preemption test can only be made between sibling entities that are in
* the same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
* both tasks until we find ancestors that are siblings under a common
* parent.
*/
/* First walk up until both entities are at same depth */
se_depth = (*se)->depth;
pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
*se = parent_entity(*se);
}
while (pse_depth > se_depth) {
pse_depth--;
*pse = parent_entity(*pse);
}
while (!is_same_group(*se, *pse)) {
*se = parent_entity(*se);
*pse = parent_entity(*pse);
}
}
static int tg_is_idle(struct task_group *tg)
{
return tg->idle > 0;
}
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
return cfs_rq->idle > 0;
}
static int se_is_idle(struct sched_entity *se)
{
if (entity_is_task(se))
return task_has_idle_policy(task_of(se));
return cfs_rq_is_idle(group_cfs_rq(se));
}
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (path)
strlcpy(path, "(null)", len);
}
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
static inline int tg_is_idle(struct task_group *tg)
{
return 0;
}
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
return 0;
}
static int se_is_idle(struct sched_entity *se)
{
return 0;
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
*/
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
max_vruntime = vruntime;
return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
min_vruntime = vruntime;
return min_vruntime;
}
static inline bool entity_before(struct sched_entity *a,
struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
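/*
 * The (s64) casts in the three helpers above make the comparisons safe
 * against u64 wraparound: the sign of the signed difference still orders the
 * two values correctly once vruntime overflows, as long as they are less
 * than 2^63 apart.
 */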
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
if (curr) {
if (curr->on_rq)
vruntime = curr->vruntime;
else
curr = NULL;
}
if (leftmost) { /* non-empty tree */
struct sched_entity *se = __node_2_se(leftmost);
if (!curr)
vruntime = se->vruntime;
else
vruntime = min_vruntime(vruntime, se->vruntime);
}
/* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
}
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
trace_android_rvh_enqueue_entity(cfs_rq, se);
rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
trace_android_rvh_dequeue_entity(cfs_rq, se);
rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
return __node_2_se(left);
}
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
struct rb_node *next = rb_next(&se->run_node);
if (!next)
return NULL;
return __node_2_se(next);
}
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
return __node_2_se(last);
}
/**************************************************************
* Scheduling class statistics methods:
*/
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL
return 0;
}
#endif
/*
* delta /= w
*/
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
if (unlikely(se->load.weight != NICE_0_LOAD))
delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
return delta;
}
/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
*/
static u64 __sched_period(unsigned long nr_running)
{
if (unlikely(nr_running > sched_nr_latency))
return nr_running * sysctl_sched_min_granularity;
else
return sysctl_sched_latency;
}
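/*
 * Worked example with the unscaled defaults above (latency 6ms,
 * min_granularity 0.75ms, sched_nr_latency 8): 6 runnable tasks share the
 * 6ms period, while 12 tasks stretch it to 12 * 0.75ms = 9ms so each can
 * still receive at least the minimum granularity.
 */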
/*
* We calculate the wall-time slice from the period by taking a part
* proportional to the weight.
*
* s = p*P[w/rw]
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned int nr_running = cfs_rq->nr_running;
u64 slice;
if (sched_feat(ALT_PERIOD))
nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
slice = __sched_period(nr_running + !se->on_rq);
for_each_sched_entity(se) {
struct load_weight *load;
struct load_weight lw;
cfs_rq = cfs_rq_of(se);
load = &cfs_rq->load;
if (unlikely(!se->on_rq)) {
lw = cfs_rq->load;
update_load_add(&lw, se->load.weight);
load = &lw;
}
slice = __calc_delta(slice, se->load.weight, load);
}
if (sched_feat(BASE_SLICE))
slice = max(slice, (u64)sysctl_sched_min_granularity);
return slice;
}
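/*
 * Worked example, ignoring the group hierarchy and feature flags: two
 * runnable nice-0 tasks split the default 6ms period into 3ms slices, while
 * a nice-0 task (weight 1024) sharing the cfs_rq with a weight-2048 task
 * gets 6ms * 1024 / 3072 = 2ms.
 */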
/*
* We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);
/* Give a new sched_entity initial runnable values so its load looks heavy while the task is young */
void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
memset(sa, 0, sizeof(*sa));
/*
* Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
* Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
static void attach_entity_cfs_rq(struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
*
* util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
*
* However, in many cases, the above util_avg does not give a desired
* value. Moreover, the sum of the util_avgs may be divergent, such
* as when the series is a harmonic series.
*
* To solve this problem, we also cap the util_avg of successive tasks to
* only 1/2 of the left utilization budget:
*
* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
*
* where n denotes the nth task and cpu_scale the CPU capacity.
*
* For example, for a CPU with a capacity of 1024, the simplest series from
* the beginning would look like:
*
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
*
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
void post_init_entity_util_avg(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
sa->util_avg /= (cfs_rq->avg.load_avg + 1);
if (sa->util_avg > cap)
sa->util_avg = cap;
} else {
sa->util_avg = cap;
}
}
sa->runnable_avg = sa->util_avg;
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
* expected state.
*/
se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
return;
}
/* Hook before this se's util is attached to cfs_rq's util */
trace_android_rvh_post_init_entity_util_avg(se);
attach_entity_cfs_rq(se);
}
#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
if (unlikely(!curr))
return;
delta_exec = now - curr->exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
curr->exec_start = now;
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static void update_curr_fair(struct rq *rq)
{
update_curr(cfs_rq_of(&rq->curr->se));
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 wait_start, prev_wait_start;
if (!schedstat_enabled())
return;
wait_start = rq_clock(rq_of(cfs_rq));
prev_wait_start = schedstat_val(se->statistics.wait_start);
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
__schedstat_set(se->statistics.wait_start, wait_start);
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;
if (!schedstat_enabled())
return;
/*
* When sched_schedstat changes from 0 to 1, some sched entities may
* already be in the runqueue with se->statistics.wait_start still 0,
* which would make the computed delta wrong. We need to avoid this
* scenario.
*/
if (unlikely(!schedstat_val(se->statistics.wait_start)))
return;
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
p = task_of(se);
if (task_on_rq_migrating(p)) {
/*
* Preserve migrating task's wait time so wait_start
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
__schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
__schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
__schedstat_inc(se->statistics.wait_count);
__schedstat_add(se->statistics.wait_sum, delta);
__schedstat_set(se->statistics.wait_start, 0);
}
static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *tsk = NULL;
u64 sleep_start, block_start;
if (!schedstat_enabled())
return;
sleep_start = schedstat_val(se->statistics.sleep_start);
block_start = schedstat_val(se->statistics.block_start);
if (entity_is_task(se))
tsk = task_of(se);
if (sleep_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
__schedstat_set(se->statistics.sleep_max, delta);
__schedstat_set(se->statistics.sleep_start, 0);
__schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
if (block_start) {
u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
if ((s64)delta < 0)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
__schedstat_set(se->statistics.block_max, delta);
__schedstat_set(se->statistics.block_start, 0);
__schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
__schedstat_add(se->statistics.iowait_sum, delta);
__schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
trace_sched_stat_blocked(tsk, delta);
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
* amount of time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
profile_hits(SLEEP_PROFILING,
(void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
}
}
/*
* Task is being enqueued - update stats:
*/
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
update_stats_wait_start(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
update_stats_enqueue_sleeper(cfs_rq, se);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
unsigned int state;
/* XXX racy against TTWU */
state = READ_ONCE(tsk->__state);
if (state & TASK_INTERRUPTIBLE)
__schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (state & TASK_UNINTERRUPTIBLE)
__schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
* Scheduling class queueing methods:
*/
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task's address space, in ms. The task
* scan period is calculated based on the task's virtual memory size and
* numa_balancing_scan_size.
*/
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
int active_nodes;
struct rcu_head rcu;
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
unsigned long *faults_cpu;
unsigned long faults[];
};
/*
* For functions that can be called in multiple contexts that permit reading
* ->numa_group (see struct task_struct for locking rules).
*/
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
{
return rcu_dereference_protected(p->numa_group, p == current);
}
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
unsigned long nr_scan_pages;
/*
* Calculations based on RSS as non-present and empty pages are skipped
* by the PTE scanner and NUMA hinting faults should be trapped based
* on resident pages
*/
nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
rss = get_mm_rss(p->mm);
if (!rss)
rss = nr_scan_pages;
rss = round_up(rss, nr_scan_pages);
return rss / nr_scan_pages;
}
/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
{
unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
if (scan_size < MAX_SCAN_WINDOW)
windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
return max_t(unsigned int, floor, scan);
}
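/*
 * Worked example: with the default scan_size of 256MB, windows = 2560 / 256
 * = 10 and floor = 1000 / 10 = 100ms. A task with 2560MB of resident memory
 * has task_nr_scan_windows() = 10, so scan = 1000ms / 10 = 100ms and
 * task_scan_min() returns 100ms.
 */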
static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
struct numa_group *ng;
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
ng = rcu_dereference(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
rcu_read_unlock();
return max(smin, period);
}
static unsigned int task_scan_max(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long smax;
struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
ng = deref_curr_numa_group(p);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
smax = max(smax, period);
}
return max(smin, smax);
}
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
pid_t task_numa_group_id(struct task_struct *p)
{
struct numa_group *ng;
pid_t gid = 0;
rcu_read_lock();
ng = rcu_dereference(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
return gid;
}
/*
* The averaged statistics, shared & private, memory & CPU,
* occupy the first half of the array. The second half of the
* array is for current counters, which are averaged into the
* first set by task_numa_placement.
*/
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
if (!p->numa_faults)
return 0;
return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
struct numa_group *ng = deref_task_numa_group(p);
if (!ng)
return 0;
return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_priv(struct numa_group *ng)
{
unsigned long faults = 0;
int node;
for_each_online_node(node) {
faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
return faults;
}
static inline unsigned long group_faults_shared(struct numa_group *ng)
{
unsigned long faults = 0;
int node;
for_each_online_node(node) {
faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
}
return faults;
}
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
* between these nodes are slowed down, to allow things to settle down.
*/
#define ACTIVE_NODE_FRACTION 3
static bool numa_is_active_node(int nid, struct numa_group *ng)
{
return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
{
unsigned long score = 0;
int node;
/*
* All nodes are directly connected, and the same distance
* from each other. No need for fancy placement algorithms.
*/
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
int dist = node_distance(nid, node);
/*
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
if (dist == sched_max_numa_distance || node == nid)
continue;
/*
* On systems with a backplane NUMA topology, compare groups
* of nodes, and move tasks towards the group with the most
* memory accesses. When comparing two nodes at distance
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
if (sched_numa_topology_type == NUMA_BACKPLANE &&
dist >= maxdist)
continue;
/* Add up the faults from nearby nodes. */
if (task)
faults = task_faults(p, node);
else
faults = group_faults(p, node);
/*
* On systems with a glueless mesh NUMA topology, there are
* no fixed "groups of nodes". Instead, nodes that are not
* directly connected bounce traffic through intermediate
* nodes; a numa_group can occupy any set of nodes.
* The further away a node is, the less the faults count.
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
faults *= (sched_max_numa_distance - dist);
faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
}
score += faults;
}
return score;
}
/*
* These return the fraction of accesses done by a particular task, or
* task group, on a particular numa node. The group weight is given a
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
static inline unsigned long task_weight(struct task_struct *p, int nid,
int dist)
{
unsigned long faults, total_faults;
if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
if (!total_faults)
return 0;
faults = task_faults(p, nid);
faults += score_nearby_nodes(p, nid, dist, true);
return 1000 * faults / total_faults;
}
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
if (!ng)
return 0;
total_faults = ng->total_faults;
if (!total_faults)
return 0;
faults = group_faults(p, nid);
faults += score_nearby_nodes(p, nid, dist, false);
return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
/*
* Multi-stage node selection is used in conjunction with a periodic
* migration fault to build a temporal task<->page relation. By using
* a two-stage filter we remove short/unlikely relations.
*
* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
* a task's usage of a particular page (n_p) per total usage of this
* page (n_t) (in a given time-span) to a probability.
*
* Our periodic faults will sample this probability and getting the
* same result twice in a row, given these samples are fully
* independent, is then given by P(n)^2, provided our sample period
* is sufficiently short compared to the usage pattern.
*
* This quadratic squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
/* Always allow migrate on private faults */
if (cpupid_match_pid(p, last_cpupid))
return true;
/* A shared fault, but p->numa_group has not been set up yet. */
if (!ng)
return true;
/*
* Destination node is much more heavily used than the source
* node? Allow migration.
*/
if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
ACTIVE_NODE_FRACTION)
return true;
/*
* Distribute memory according to CPU & memory use on each node,
* with 3/4 hysteresis to avoid unnecessary memory migrations:
*
* faults_cpu(dst) 3 faults_cpu(src)
* --------------- * - > ---------------
* faults_mem(dst) 4 faults_mem(src)
*/
return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
/*
* 'numa_type' describes the node at the moment of load balancing.
*/
enum numa_type {
/* The node has spare capacity that can be used to run more tasks. */
node_has_spare = 0,
/*
* The node is fully used and the tasks don't compete for more CPU
* cycles. Nevertheless, some tasks might wait before running.
*/
node_fully_busy,
/*
* The node is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
node_overloaded
};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
unsigned int nr_running;
unsigned int weight;
enum numa_type node_type;
int idle_cpu;
};
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
int sibling;
for_each_cpu(sibling, cpu_smt_mask(cpu)) {
if (cpu == sibling)
continue;
if (!idle_cpu(sibling))
return false;
}
#endif
return true;
}
struct task_numa_env {
struct task_struct *p;
int src_cpu, src_nid;
int dst_cpu, dst_nid;
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
int dist;
struct task_struct *best_task;
long best_imp;
int best_cpu;
};
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
(((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
(((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
}
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu, bool def);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
idle_core >= 0 || !test_idle_cores(cpu, false))
return idle_core;
/*
* Prefer cores instead of packing HT siblings
* and triggering future load balancing.
*/
if (is_core_idle(cpu))
idle_core = cpu;
return idle_core;
}
#else
static inline int numa_idle_core(int idle_core, int cpu)
{
return idle_core;
}
#endif
/*
* Gather all necessary information to make NUMA balancing placement
* decisions that are compatible with standard load balancer. This
* borrows code and logic from update_sg_lb_stats but sharing a
* common implementation is impractical.
*/
static void update_numa_stats(struct task_numa_env *env,
struct numa_stats *ns, int nid,
bool find_idle)
{
int cpu, idle_core = -1;
memset(ns, 0, sizeof(*ns));
ns->idle_cpu = -1;
rcu_read_lock();
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
if (ns->idle_cpu == -1)
ns->idle_cpu = cpu;
idle_core = numa_idle_core(idle_core, cpu);
}
}
rcu_read_unlock();
ns->weight = cpumask_weight(cpumask_of_node(nid));
ns->node_type = numa_classify(env->imbalance_pct, ns);
if (idle_core >= 0)
ns->idle_cpu = idle_core;
}
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
struct rq *rq = cpu_rq(env->dst_cpu);
/* Check if run-queue part of active NUMA balance. */
if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
int cpu;
int start = env->dst_cpu;
/* Find alternative idle CPU. */
for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
}
env->dst_cpu = cpu;
rq = cpu_rq(env->dst_cpu);
if (!xchg(&rq->numa_migrate_on, 1))
goto assign;
}
/* Failed to find an alternative idle CPU */
return;
}
assign:
/*
* Clear previous best_cpu/rq numa-migrate flag, since task now
* found a better CPU to move/swap.
*/
if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}
if (env->best_task)
put_task_struct(env->best_task);
if (p)
get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
env->best_cpu = env->dst_cpu;
}
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
long imb, old_imb;
long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
/*
* The load is corrected for the CPU capacity available on each node.
*
* src_load dst_load
* ------------ vs ---------
* src_capacity dst_capacity
*/
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
imb = abs(dst_load * src_capacity - src_load * dst_capacity);
orig_src_load = env->src_stats.load;
orig_dst_load = env->dst_stats.load;
old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
/* Would this change make things worse? */
return (imb > old_imb);
}
/*
* Maximum NUMA importance can be 1998 (2*999);
* SMALLIMP @ 30 would be close to 1998/64.
* Used to deter task migration.
*/
#define SMALLIMP 30
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source task was migrated to the target dst_cpu, taking
* into account that it might be best if the task running on the dst_cpu is
* exchanged with the source task.
*/
static bool task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
int dist = env->dist;
long moveimp = imp;
long load;
bool stopsearch = false;
if (READ_ONCE(dst_rq->numa_migrate_on))
return false;
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
/*
* Because we have preemption enabled we can get migrated around and
* end up trying to select ourselves (current == env->p) as a swap candidate.
*/
if (cur == env->p) {
stopsearch = true;
goto unlock;
}
if (!cur) {
if (maymove && moveimp >= env->best_imp)
goto assign;
else
goto unlock;
}
/* Skip this swap candidate if cannot move to the source cpu. */
if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
* Skip this swap candidate if it is not moving to its preferred
* node and the best task is.
*/
if (env->best_task &&
env->best_task->numa_preferred_nid == env->src_nid &&
cur->numa_preferred_nid != env->src_nid) {
goto unlock;
}
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
* the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
*
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
*/
if (cur_ng)
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
imp += task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
}
/* Discourage picking a task already on its preferred node */
if (cur->numa_preferred_nid == env->dst_nid)
imp -= imp / 16;
/*
* Encourage picking a task that moves to its preferred node.
* This potentially makes imp larger than its maximum of
* 1998 (see SMALLIMP and task_weight for why) but in this
* case, it does not matter.
*/
if (cur->numa_preferred_nid == env->src_nid)
imp += imp / 8;
if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp;
cur = NULL;
goto assign;
}
/*
* Prefer swapping with a task moving to its preferred node over a
* task that is not.
*/
if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
env->best_task->numa_preferred_nid != env->src_nid) {
goto assign;
}
/*
* If the NUMA importance is less than SMALLIMP,
* task migration might only result in ping pong
* of tasks and also hurt performance due to cache
* misses.
*/
if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
goto unlock;
/*
* In the overloaded case, try and keep the load balanced.
*/
load = task_h_load(env->p) - task_h_load(cur);
if (!load)
goto assign;
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
assign:
/* Evaluate an idle CPU for a task numa move. */
if (!cur) {
int cpu = env->dst_stats.idle_cpu;
/* Nothing cached so current CPU went idle since the search. */
if (cpu < 0)
cpu = env->dst_cpu;
/*
* If the CPU is no longer truly idle and the previous best CPU
* is, keep using it.
*/
if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
idle_cpu(env->best_cpu)) {
cpu = env->best_cpu;
}
env->dst_cpu = cpu;
}
task_numa_assign(env, cur, imp);
/*
* If a move to idle is allowed because there is capacity or load
* balance improves then stop the search. While a better swap
* candidate may exist, a search is not free.
*/
if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
stopsearch = true;
/*
* If a swap candidate must be identified and the current best task
* moves to its preferred node then stop the search.
*/
if (!maymove && env->best_task &&
env->best_task->numa_preferred_nid == env->src_nid) {
stopsearch = true;
}
unlock:
rcu_read_unlock();
return stopsearch;
}
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
bool maymove = false;
int cpu;
/*
* If dst node has spare capacity, then check if there is an
* imbalance that would be overruled by the load balancer.
*/
if (env->dst_stats.node_type == node_has_spare) {
unsigned int imbalance;
int src_running, dst_running;
/*
* Would movement cause an imbalance? Note that if src has
* more running tasks then the imbalance is ignored as the
* move improves the imbalance from the perspective of the
* CPU load balancer.
*/
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
env->dst_stats.weight);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
maymove = true;
if (env->dst_stats.idle_cpu >= 0) {
env->dst_cpu = env->dst_stats.idle_cpu;
task_numa_assign(env, NULL, 0);
return;
}
}
} else {
long src_load, dst_load, load;
/*
* If the improvement from just moving env->p is better
* than swapping tasks around, check if a move is possible.
*/
load = task_h_load(env->p);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
maymove = !load_too_imbalanced(src_load, dst_load, env);
}
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
}
}
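/*
 * Worked example for the spare-capacity path above (arbitrary numbers,
 * added for illustration): with 4 runnable tasks on the source node and
 * 2 on the destination, src_running = 3 and dst_running = 3 after the
 * hypothetical move, so the raw imbalance is max(0, 3 - 3) = 0; unless
 * adjust_numa_imbalance() says otherwise, maymove is set and an idle
 * destination CPU, if one was cached, is taken directly.
 */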
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
.p = p,
.src_cpu = task_cpu(p),
.src_nid = task_node(p),
.imbalance_pct = 112,
.best_task = NULL,
.best_imp = 0,
.best_cpu = -1,
};
unsigned long taskweight, groupweight;
struct sched_domain *sd;
long taskimp, groupimp;
struct numa_group *ng;
struct rq *best_rq;
int nid, ret, dist;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
* imbalance and would be the first to start moving tasks about.
*
* And we want to avoid any moving of tasks about, as that would create
* random movement of tasks -- countering the numa conditions we're trying
* to satisfy here.
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
if (sd)
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
rcu_read_unlock();
/*
* Cpusets can break the scheduler domain tree into smaller
* balance domains, some of which do not cross NUMA boundaries.
* Tasks that are "trapped" in such domains cannot be migrated
* elsewhere, so there is no point in (re)trying.
*/
if (unlikely(!sd)) {
sched_setnuma(p, task_node(p));
return -EINVAL;
}
env.dst_nid = p->numa_preferred_nid;
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
update_numa_stats(&env, &env.src_stats, env.src_nid, false);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
* - there is no space available on the preferred_nid
* - the task is part of a numa_group that is interleaved across
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
dist = node_distance(env.src_nid, env.dst_nid);
if (sched_numa_topology_type == NUMA_BACKPLANE &&
dist != env.dist) {
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
}
/* Only consider nodes where both task and groups benefit */
taskimp = task_weight(p, nid, dist) - taskweight;
groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
/*
* If the task is part of a workload that spans multiple NUMA nodes,
* and is migrating into one of the workload's active nodes, remember
* this node as the task's preferred numa node, so the workload can
* settle down.
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = cpu_to_node(env.best_cpu);
if (nid != p->numa_preferred_nid)
sched_setnuma(p, nid);
}
/* No better CPU than the current one was found. */
if (env.best_cpu == -1) {
trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
return -EAGAIN;
}
best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
return ret;
}
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
put_task_struct(env.best_task);
return ret;
}
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
return;
/* Otherwise, try migrate to a CPU on the preferred node */
task_numa_migrate(p);
}
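/*
 * For example (added note): with numa_scan_period = 1000ms and HZ = 250,
 * msecs_to_jiffies(1000) / 16 is about 15 jiffies (~60ms), so a failed
 * migration is retried roughly every 60ms rather than once per second.
 */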
/*
* Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
*/
static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
}
numa_group->max_faults_cpu = max_faults;
numa_group->active_nodes = active_nodes;
}
/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
* period will be for the next scan window. If local/(local+remote) ratio is
* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
* the scan period will decrease. Aim for 70% local accesses.
*/
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7
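/*
 * In other words (added note): with NUMA_PERIOD_SLOTS = 10 and
 * NUMA_PERIOD_THRESHOLD = 7, a local/(local+remote) or
 * private/(private+shared) ratio of 7/10 or more (>= 70%) slows scanning
 * down, while a lower ratio speeds it up.
 */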
/*
* Increase the scan period (slow down scanning) if the majority of
* our memory is already on our local node, or if the majority of
* the page accesses are shared with other processes.
* Otherwise, decrease the scan period.
*/
static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
unsigned long local = p->numa_faults_locality[1];
/*
* If there were no record hinting faults then either the task is
* completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Related to that, if there were failed
* migrations then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower.
*/
if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
p->mm->numa_next_scan = jiffies +
msecs_to_jiffies(p->numa_scan_period);
return;
}
/*
* Prepare to scale scan period relative to the current period.
* == NUMA_PERIOD_THRESHOLD scan period stays the same
* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
/*
* Most memory accesses are local. There is no need to
* do fast NUMA scanning, since memory is already local.
*/
int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
/*
* Most memory accesses are shared with other tasks.
* There is no point in continuing fast NUMA scanning,
* since other tasks may just move the memory elsewhere.
*/
int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
/*
* Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
* yet they are not on the local NUMA node. Speed up
* NUMA scanning to get the memory moved over.
*/
int ratio = max(lr_ratio, ps_ratio);
diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
task_scan_min(p), task_scan_max(p));
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
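/*
 * Worked example for the above (arbitrary numbers, added for illustration):
 * with numa_scan_period = 1000ms, period_slot = DIV_ROUND_UP(1000, 10) = 100.
 * A ps_ratio of 9 gives slot = 9 - 7 = 2 and diff = +200ms (scan slower),
 * while lr_ratio = ps_ratio = 5 gives diff = -(7 - 5) * 100 = -200ms
 * (scan faster), subject to the task_scan_min()/task_scan_max() clamp.
 */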
/*
* Get the fraction of time the task has been running since the last
* NUMA placement cycle. The scheduler keeps similar statistics, but
* decays those on a 32ms period, which is orders of magnitude off
* from the dozens-of-seconds NUMA balancing period. Use the scheduler
* stats only if the task is so new there are no NUMA statistics yet.
*/
static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
{
u64 runtime, delta, now;
/* Use the start of this time slice to avoid calculations. */
now = p->se.exec_start;
runtime = p->se.sum_exec_runtime;
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
/* Avoid time going backwards, prevent potential divide error: */
if (unlikely((s64)*period < 0))
*period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
p->last_task_numa_placement = now;
return delta;
}
/*
* Determine the preferred nid for a task in a numa_group. This needs to
* be done in a way that produces consistent results with group_weight,
* otherwise workloads might not converge.
*/
static int preferred_group_nid(struct task_struct *p, int nid)
{
nodemask_t nodes;
int dist;
/* Direct connections between all NUMA nodes. */
if (sched_numa_topology_type == NUMA_DIRECT)
return nid;
/*
* On a system with glueless mesh NUMA topology, group_weight
* scores nodes according to the number of NUMA hinting faults on
* both the node itself, and on nearby nodes.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
unsigned long score, max_score = 0;
int node, max_node = nid;
dist = sched_max_numa_distance;
for_each_online_node(node) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
max_node = node;
}
}
return max_node;
}
/*
* Finding the preferred nid in a system with NUMA backplane
* interconnect topology is more involved. The goal is to locate
* tasks from numa_groups near each other in the system, and
* untangle workloads from different sides of the system. This requires
* searching down the hierarchy of node groups, recursively searching
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
nodes = node_online_map;
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
int a, b;
/* Are there nodes at this distance from each other? */
if (!find_numa_distance(dist))
continue;
for_each_node_mask(a, nodes) {
unsigned long faults = 0;
nodemask_t this_group;
nodes_clear(this_group);
/* Sum group's NUMA faults; includes a==b case. */
for_each_node_mask(b, nodes) {
if (node_distance(a, b) < dist) {
faults += group_faults(p, b);
node_set(b, this_group);
node_clear(b, nodes);
}
}
/* Remember the top group. */
if (faults > max_faults) {
max_faults = faults;
max_group = this_group;
/*
* subtle: at the smallest distance there is
* just one node left in each "group", the
* winner is the preferred nid.
*/
nid = a;
}
}
/* Next round, evaluate the nodes within max_group. */
if (!max_faults)
break;
nodes = max_group;
}
return nid;
}
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
struct numa_group *ng;
/*
* The p->mm->numa_scan_seq field gets updated without
* exclusive access. Use READ_ONCE() here to ensure
* that the field is read in a single access:
*/
seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);
total_faults = p->numa_faults_locality[0] +
p->numa_faults_locality[1];
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
ng = deref_curr_numa_group(p);
if (ng) {
group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
/* Keep track of the offsets in numa_faults array */
int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
fault_types[priv] += p->numa_faults[membuf_idx];
p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
* count according to CPU use, instead of by the raw
* number of faults. Tasks with little runtime have
* little over-all impact on throughput, and thus their
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
p->numa_faults[cpubuf_idx] = 0;
p->numa_faults[mem_idx] += diff;
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (ng) {
/*
* safe because we can only change our own group
*
* mem_idx represents the offset for a given
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
ng->faults[mem_idx] += diff;
ng->faults_cpu[mem_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
}
if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
}
} else if (group_faults > max_faults) {
max_faults = group_faults;
max_nid = nid;
}
}
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
if (max_faults) {
/* Set the new preferred node */
if (max_nid != p->numa_preferred_nid)
sched_setnuma(p, max_nid);
}
update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
{
return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int *priv)
{
struct numa_group *grp, *my_grp;
struct task_struct *tsk;
bool join = false;
int cpu = cpupid_to_cpu(cpupid);
int i;
if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
4*nr_node_ids*sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
return;
refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
rcu_read_lock();
tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
grp = rcu_dereference(tsk->numa_group);
if (!grp)
goto no_join;
my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
/*
* Only join the other group if it's bigger; if we're the bigger group,
* the other task will join us.
*/
if (my_grp->nr_tasks > grp->nr_tasks)
goto no_join;
/*
* Tie-break on the grp address.
*/
if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
goto no_join;
/* Always join threads in the same process. */
if (tsk->mm == current->mm)
join = true;
/* Simple filter to avoid false positives due to PID collisions */
if (flags & TNF_SHARED)
join = true;
/* Update priv based on whether false sharing was detected */
*priv = !join;
if (join && !get_numa_group(grp))
goto no_join;
rcu_read_unlock();
if (!join)
return;
BUG_ON(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
my_grp->faults[i] -= p->numa_faults[i];
grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
my_grp->nr_tasks--;
grp->nr_tasks++;
spin_unlock(&my_grp->lock);
spin_unlock_irq(&grp->lock);
rcu_assign_pointer(p->numa_group, grp);
put_numa_group(my_grp);
return;
no_join:
rcu_read_unlock();
return;
}
/*
* Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
* reset the data back to default state without freeing ->numa_faults.
*/
void task_numa_free(struct task_struct *p, bool final)
{
/* safe: p either is current or is being freed by current */
struct numa_group *grp = rcu_dereference_raw(p->numa_group);
unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (!numa_faults)
return;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
if (final) {
p->numa_faults = NULL;
kfree(numa_faults);
} else {
p->total_numa_faults = 0;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
numa_faults[i] = 0;
}
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
return;
/* for example, ksmd faulting in a user's mm */
if (!p->mm)
return;
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
if (!p->numa_faults)
return;
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
/*
* First accesses are treated as private, otherwise consider accesses
* to be private if the accessing pid has not changed
*/
if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
priv = 1;
} else {
priv = cpupid_match_pid(p, last_cpupid);
if (!priv && !(flags & TNF_NO_GROUP))
task_numa_group(p, last_cpupid, flags, &priv);
}
/*
* If a workload spans multiple NUMA nodes, a shared fault that
* occurs wholly within the set of nodes that the workload is
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
local = 1;
/*
* Retry to migrate task to preferred node periodically, in case it
* previously failed, or the scheduler moved us.
*/
if (time_after(jiffies, p->numa_migrate_retry)) {
task_numa_placement(p);
numa_migrate_preferred(p);
}
if (migrated)
p->numa_pages_migrated += pages;
if (flags & TNF_MIGRATE_FAIL)
p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
{
/*
* We only did a read acquisition of the mmap sem, so
* p->mm->numa_scan_seq is written to without exclusive access
* and the update is not guaranteed to be atomic. That's not
* much of an issue though, since this is just used for
* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
* expensive, to avoid any form of compiler optimizations:
*/
WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
* NOTE: make sure not to dereference p->mm before this check,
* exit_task_work() happens _after_ exit_mm() so we could be called
* without p->mm even though we still had it when we enqueued this
* work.
*/
if (p->flags & PF_EXITING)
return;
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
}
/*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
* Delay this task enough that another task of this mm will likely win
* the next time around.
*/
p->node_stamp += 2 * TICK_NSEC;
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
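/*
 * For example (added note): with 4KB pages, 20 - PAGE_SHIFT = 8, so each
 * MB of sysctl_numa_balancing_scan_size becomes 256 pages; a 256MB window
 * is 65536 pages, and virtpages allows scanning up to 8 times that much
 * virtual address space.
 */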
if (!pages)
return;
if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
}
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
* hinting faults in read-only file-backed mappings or the vdso
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
continue;
/*
* Skip inaccessible VMAs to avoid any confusion between
* PROT_NONE and NUMA hinting ptes
*/
if (!vma_is_accessible(vma))
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
nr_pte_updates = change_prot_numa(vma, start, end);
/*
* Try to scan sysctl_numa_balancing_scan_size worth of
* hpages that have at least one present PTE that
* is not already pte-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
*/
if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
virtpages -= (end - start) >> PAGE_SHIFT;
start = end;
if (pages <= 0 || virtpages <= 0)
goto out;
cond_resched();
} while (end != vma->vm_end);
}
out:
/*
* It is possible to reach the end of the VMA list but the last few
* VMAs are not guaranteed to be migratable. If they are not, we
* would find the !migratable VMA on the next scan but not reset the
* scanner to the start so check it now.
*/
if (vma)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
mmap_read_unlock(mm);
/*
* Make sure tasks use at least 32x as much time to run other code
* than they used here, to limit NUMA PTE scanning overhead to 3% max.
* Usually update_task_scan_period slows down scanning enough; on an
* overloaded system we need to limit overhead on a per task basis.
*/
if (unlikely(p->se.sum_exec_runtime != runtime)) {
u64 diff = p->se.sum_exec_runtime - runtime;
p->node_stamp += 32 * diff;
}
}
void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
int mm_users = 0;
struct mm_struct *mm = p->mm;
if (mm) {
mm_users = atomic_read(&mm->mm_users);
if (mm_users == 1) {
mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
mm->numa_scan_seq = 0;
}
}
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
init_task_work(&p->numa_work, task_numa_work);
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
/*
* New thread, keep existing numa_preferred_nid which should be copied
* already by arch_dup_task_struct but stagger when scans start.
*/
if (mm) {
unsigned int delay;
delay = min_t(unsigned int, task_scan_max(current),
current->numa_scan_period * mm_users * NSEC_PER_MSEC);
delay += 2 * TICK_NSEC;
p->node_stamp = delay;
}
}
/*
* Drive the periodic memory faults..
*/
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
/*
* We don't care about NUMA placement if we don't have memory.
*/
if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
* Using runtime rather than walltime has the dual advantage that
* we (mostly) drive the selection from busy threads and that the
* task needs to have done some actual work before we bother with
* NUMA placement.
*/
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, TWA_RESUME);
}
}
static void update_scan_period(struct task_struct *p, int new_cpu)
{
int src_nid = cpu_to_node(task_cpu(p));
int dst_nid = cpu_to_node(new_cpu);
if (!static_branch_likely(&sched_numa_balancing))
return;
if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
return;
if (src_nid == dst_nid)
return;
/*
* Allow resets if faults have been trapped before one scan
* has completed. This is most likely due to a new task that
* is pulled cross-node due to wakeups or load balancing.
*/
if (p->numa_scan_seq) {
/*
* Avoid scan adjustments if moving to the preferred
* node or if the task was not previously running on
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
(p->numa_preferred_nid != NUMA_NO_NODE &&
src_nid != p->numa_preferred_nid))
return;
}
p->numa_scan_period = task_scan_start(p);
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
#endif /* CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
#endif
cfs_rq->nr_running++;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
#endif
cfs_rq->nr_running--;
}
/*
* Signed add and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(_val) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
\
if (val < 0 && res > var) \
res = 0; \
\
WRITE_ONCE(*ptr, res); \
} while (0)
/*
* Unsigned subtract and clamp on underflow.
*
* Explicitly do a load-store to ensure the intermediate value never hits
* memory. This allows lockless observations without ever seeing the negative
* values.
*/
#define sub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
res = var - val; \
if (res > var) \
res = 0; \
WRITE_ONCE(*ptr, res); \
} while (0)
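/*
 * Behaviour note (added for illustration): for an unsigned *ptr with
 * var = 3 and val = 5, "var - val" wraps around to a huge value; the
 * "res > var" test catches that wrap-around and clamps res to 0, so
 * lockless observers never see an underflowed value.
 */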
/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
* and is thus optimized for local variable updates.
*/
#define lsub_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
#ifdef CONFIG_SMP
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
#else
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
update_load_add(&cfs_rq->load, se->load.weight);
}
void reweight_task(struct task_struct *p, int prio)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
unsigned long weight = scale_load(sched_prio_to_weight[prio]);
reweight_entity(cfs_rq, se, weight);
load->inv_weight = sched_prio_to_wmult[prio];
}
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
*
* That is, the weight of a group entity, is the proportional share of the
* group weight based on the group runqueue weights. That is:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
* \Sum grq->load.weight
*
* Now, because that sum is prohibitively expensive to compute (been
* there, done that), we approximate it with this average stuff. The average
* moves slower and therefore the approximation is cheaper and more stable.
*
* So instead of the above, we substitute:
*
* grq->load.weight -> grq->avg.load_avg (2)
*
* which yields the following:
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
* tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
* That is shares_avg, and it is right (given the approximation (2)).
*
* The problem with it is that because the average is slow -- it was designed
* to be exactly that of course -- this leads to transients in boundary
* conditions. In specific, the case where the group was idle and we start the
* one task. It takes time for our CPU's grq->avg.load_avg to build up,
* yielding bad latency etc..
*
* Now, in that special case (1) reduces to:
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
* grq->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
* So what we do is modify our approximation (3) to approach (4) in the (near)
* UP case, like:
*
* ge->load.weight =
*
* tg->weight * grq->load.weight
* --------------------------------------------------- (5)
* tg->load_avg - grq->avg.load_avg + grq->load.weight
*
* But because grq->load.weight can drop to 0, resulting in a divide by zero,
* we need to use grq->avg.load_avg as its lower bound, which then gives:
*
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
* tg_load_avg'
*
* Where:
*
* tg_load_avg' = tg->load_avg - grq->avg.load_avg +
* max(grq->load.weight, grq->avg.load_avg)
*
* And that is shares_weight and is icky. In the (near) UP case it approaches
* (4) while in the normal case it approaches (3). It consistently
* overestimates the ge->load.weight and therefore:
*
* \Sum ge->load.weight >= tg->weight
*
* hence icky!
*/
static long calc_group_shares(struct cfs_rq *cfs_rq)
{
long tg_weight, tg_shares, load, shares;
struct task_group *tg = cfs_rq->tg;
tg_shares = READ_ONCE(tg->shares);
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
tg_weight = atomic_long_read(&tg->load_avg);
/* Ensure tg_weight >= load */
tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += load;
shares = (tg_shares * load);
if (tg_weight)
shares /= tg_weight;
/*
* MIN_SHARES has to be unscaled here to support per-CPU partitioning
* of a group with small tg->shares value. It is a floor value which is
* assigned as a minimum load.weight to the sched_entity representing
* the group on a CPU.
*
* E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
* on an 8-core system with 8 tasks each runnable on one CPU shares has
* to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
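/*
 * Worked example of (6) above (arbitrary numbers, added for illustration):
 * with tg_shares = 1024, this CPU's load = max(load.weight, load_avg) = 512,
 * tg->load_avg = 2048 and tg_load_avg_contrib = 512, the corrected
 * tg_weight is 2048 - 512 + 512 = 2048, so shares = 1024 * 512 / 2048 = 256,
 * which is then clamped to [MIN_SHARES, tg_shares].
 */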
#endif /* CONFIG_SMP */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
*/
static void update_cfs_group(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
if (!gcfs_rq)
return;
if (throttled_hierarchy(gcfs_rq))
return;
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
if (likely(se->load.weight == shares))
return;
#else
shares = calc_group_shares(gcfs_rq);
#endif
reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_group(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
if (&rq->cfs == cfs_rq) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
* a real problem.
*
* It will not get called when we go idle, because the idle
* thread is a different class (!fair), nor will the utilization
* number include things like RT tasks.
*
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
* immediately before a parent cfs_rq, and cfs_rqs are removed from the list
* bottom-up, we only have to test whether the cfs_rq before us on the list
* is our child.
* If cfs_rq is not on the list, test whether a child needs to be added to
* connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
*/
static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
{
struct cfs_rq *prev_cfs_rq;
struct list_head *prev;
if (cfs_rq->on_list) {
prev = cfs_rq->leaf_cfs_rq_list.prev;
} else {
struct rq *rq = rq_of(cfs_rq);
prev = rq->tmp_alone_branch;
}
prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
return (prev_cfs_rq->tg->parent == cfs_rq->tg);
}
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load.weight)
return false;
if (cfs_rq->avg.load_sum)
return false;
if (cfs_rq->avg.util_sum)
return false;
if (cfs_rq->avg.runnable_sum)
return false;
if (child_cfs_rq_on_list(cfs_rq))
return false;
/*
* _avg must be null when _sum are null because _avg = _sum / divider
* Make sure that rounding and/or propagation of PELT values never
* break this.
*/
SCHED_WARN_ON(cfs_rq->avg.load_avg ||
cfs_rq->avg.util_avg ||
cfs_rq->avg.runnable_avg);
return true;
}
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
* considerations.
*
* In order to avoid having to look at the other cfs_rq's, we use a
* differential update where we store the last value we propagated. This in
* turn allows skipping updates if the differential is 'small'.
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
/*
* No need to update load_avg for root_task_group as it is not used.
*/
if (cfs_rq->tg == &root_task_group)
return;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
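/*
 * Added note: the 1/64 filter above means that with, say,
 * tg_load_avg_contrib = 6400, only a change in cfs_rq->avg.load_avg of
 * more than 100 is written back to the shared tg->load_avg atomic,
 * which keeps cross-CPU traffic on that cache line low.
 */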
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
* including the state of rq->lock, should be made.
*/
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
u64 p_last_update_time;
u64 n_last_update_time;
if (!sched_feat(ATTACH_AGE_LOAD))
return;
/*
* We are supposed to update the task to "current" time, then it's up to
* date and ready to go to a new CPU/cfs_rq. But we have difficulty in
* getting what the current time is, so simply throw away the out-of-date
* time. This will result in the wakee task being less decayed, but giving
* the wakee more load does not sound bad.
*/
if (!(se->avg.last_update_time && prev))
return;
#ifndef CONFIG_64BIT
{
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;
do {
p_last_update_time_copy = prev->load_last_update_time_copy;
n_last_update_time_copy = next->load_last_update_time_copy;
smp_rmb();
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
}
#else
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
* that for each group:
*
* ge->avg == grq->avg (1)
*
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
* Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
* and simply copies the running/runnable sum over (but still wrong, because
* the group entity and group rq do not have their PELT windows aligned).
*
* However, update_tg_cfs_load() is more complex. So we have:
*
* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straightforward approach:
*
* grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
* And per (1) we have:
*
* ge->avg.runnable_avg == grq->avg.runnable_avg
*
* Which gives:
*
* ge->load.weight * grq->avg.load_avg
* ge->avg.load_avg = ----------------------------------- (4)
* grq->load.weight
*
* Except that is wrong!
*
* Because while for entities historical weight is not important and we
* really only care about our future and therefore can consider a pure
* runnable sum, runqueues can NOT do this.
*
* We specifically want runqueues to have a load_avg that includes
* historical weights. Those represent the blocked load, the load we expect
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
* Another reason this doesn't work is that runnable isn't a 0-sum entity.
* Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
* rq itself is runnable anywhere between 2/3 and 1 depending on how the
* runnable section of these tasks overlap (or not). If they were to perfectly
* align the rq as a whole would be runnable 2/3 of the time. If however we
* always have at least 1 runnable task, the rq as a whole is always runnable.
*
* So we'll have to approximate.. :/
*
* Given the constraint:
*
* ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
*
* We can construct a rule that adds runnable to a rq by assuming minimal
* overlap.
*
* On removal, we'll assume each task is equally runnable; which yields:
*
* grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
*
* XXX: only do this for the part of runnable > running ?
*
*/
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
u32 divider;
/* Nothing to update */
if (!delta)
return;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
u32 divider;
/* Nothing to update */
if (!delta)
return;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
/* Update parent cfs_rq runnable */
add_positive(&cfs_rq->avg.runnable_avg, delta);
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
u32 divider;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
* the CPU is saturated running == runnable.
*/
runnable_sum += se->avg.load_sum;
runnable_sum = min_t(long, runnable_sum, divider);
} else {
/*
* Estimate the new unweighted runnable_sum of the gcfs_rq by
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
load_sum = div_s64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
/* But make sure to not inflate se's runnable */
runnable_sum = min(se->avg.load_sum, load_sum);
}
/*
* runnable_sum can't be lower than running_sum
* Rescale running sum to be in the same range as runnable sum
* running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
* runnable_sum is in [0 : LOAD_AVG_MAX]
*/
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);
se->avg.load_sum = runnable_sum;
delta = load_avg - se->avg.load_avg;
if (!delta)
return;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
cfs_rq->propagate = 1;
cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
gcfs_rq = group_cfs_rq(se);
if (!gcfs_rq->propagate)
return 0;
gcfs_rq->propagate = 0;
cfs_rq = cfs_rq_of(se);
add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
update_tg_cfs_load(cfs_rq, se, gcfs_rq);
trace_pelt_cfs_tp(cfs_rq);
trace_pelt_se_tp(se);
return 1;
}
/*
* Check if we need to update the load and the utilization of a blocked
* group_entity:
*/
static inline bool skip_blocked_update(struct sched_entity *se)
{
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
/*
* If the sched_entity still has non-zero load or utilization, we have to
* decay it:
*/
if (se->avg.load_avg || se->avg.util_avg)
return false;
/*
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
if (gcfs_rq->propagate)
return false;
/*
* Otherwise, the load and the utilization of the sched_entity are
* already zero and there is no pending propagation, so it will be a
* waste of time to try to decay it:
*/
return true;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
return 0;
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
* post_init_entity_util_avg().
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
* Returns true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
*/
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;
if (cfs_rq->removed.nr) {
unsigned long r;
u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
swap(cfs_rq->removed.runnable_avg, removed_runnable);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
sub_positive(&sa->load_avg, r);
sa->load_sum = sa->load_avg * divider;
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);
/*
* Because of rounding, se->util_sum might end up being +1 more than
* cfs->util_sum. Although this is not a problem by itself, detaching
* a lot of tasks with the rounding problem between 2 updates of
* util_avg (~1ms) can make cfs->util_sum become null whereas
* cfs->util_avg is not.
* Check that util_sum is still above its lower bound for the new
* util_avg. Given that period_contrib might have moved since the last
* sync, we are only sure that util_sum must be above or equal to
* util_avg * minimum possible divider
*/
sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
sa->runnable_sum = sa->runnable_avg * divider;
/*
* removed_runnable is the unweighted version of removed_load so we
* can use it to estimate removed_load_sum.
*/
add_tg_cfs_propagate(cfs_rq,
-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
decayed = 1;
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
return decayed;
}
/**
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
* window because without that, really weird and wonderful things can
* happen.
*
* XXX illustrate
*/
se->avg.last_update_time = cfs_rq->avg.last_update_time;
se->avg.period_contrib = cfs_rq->avg.period_contrib;
/*
* Hell(o) Nasty stuff.. we need to recompute _sum based on the new
* period_contrib. This isn't strictly correct, but since we're
* entirely outside of the PELT hierarchy, nobody cares if we truncate
* _sum a little.
*/
se->avg.util_sum = se->avg.util_avg * divider;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
se->avg.load_sum = se->avg.load_avg * divider;
if (se_weight(se) < se->avg.load_sum)
se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
else
se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
/**
* detach_entity_load_avg - detach this entity from its cfs_rq load avg
* @cfs_rq: cfs_rq to detach from
* @se: sched_entity to detach
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = get_pelt_divider(&cfs_rq->avg);
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
trace_pelt_cfs_tp(cfs_rq);
}
/*
* Optional action to be done while updating the load average
*/
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
/*
* DO_ATTACH means we're here from enqueue_entity().
* !last_update_time means we've passed through
* migrate_task_rq_fair() indicating we migrated.
*
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq);
}
}
#ifndef CONFIG_64BIT
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
u64 last_update_time_copy;
u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
return last_update_time;
}
#else
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.last_update_time;
}
#endif
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
*/
static void sync_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
__update_load_avg_blocked_se(last_update_time, se);
}
/*
* The task first catches up with the cfs_rq, and then subtracts
* itself from the cfs_rq (the task must be off the queue by now).
*/
static void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
unsigned long flags;
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
*/
sync_entity_load_avg(se);
raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.runnable_avg;
}
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.load_avg;
}
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
}
static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
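/* The UTIL_AVG_UNCHANGED flag bit is folded into ->enqueued; mask it out. */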
return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
}
static inline unsigned long task_util_est(struct task_struct *p)
{
return max(task_util(p), _task_util_est(p));
}
#ifdef CONFIG_UCLAMP_TASK
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return clamp(task_util_est(p),
uclamp_eff_value(p, UCLAMP_MIN),
uclamp_eff_value(p, UCLAMP_MAX));
}
#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return task_util_est(p);
}
#endif
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
/*
* Check if a (signed) value is within a specified (unsigned) margin,
* based on the observation that:
*
* abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
*
* NOTE: this only works when value + margin < INT_MAX.
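*
* E.g. with margin = UTIL_EST_MARGIN = 1024/100 = 10, abs(x) < 10 becomes
* (unsigned)(x + 9) < 19, which accepts x in [-9, 9] and rejects anything
* else, including large negatives that wrap to huge unsigned values.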
*/
static inline bool within_margin(int value, int margin)
{
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
long last_ewma_diff, last_enqueued_diff;
struct util_est ue;
int ret = 0;
trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
if (ret)
return;
if (!sched_feat(UTIL_EST))
return;
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
*/
if (!task_sleep)
return;
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
ue = p->se.avg.util_est;
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
last_enqueued_diff = ue.enqueued;
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
ue.enqueued = task_util(p);
if (sched_feat(UTIL_EST_FASTUP)) {
if (ue.ewma < ue.enqueued) {
ue.ewma = ue.enqueued;
goto done;
}
}
/*
* Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
last_ewma_diff = ue.enqueued - ue.ewma;
last_enqueued_diff -= ue.enqueued;
if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
goto done;
return;
}
/*
* To avoid overestimation of actual task utilization, skip updates if
* we cannot guarantee there is idle time on this CPU.
*/
if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;
/*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
* of the task size. This is done by storing the current PELT value
* as ue.enqueued and by using this value to update the Exponential
* Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
* = w * ( last_ewma_diff ) + ewma(t-1)
* = w * (last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
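*
* For instance, with w = 1/4, an old ewma of 400 and a new sample of 200
* give last_ewma_diff = -200 and a new ewma of (400 * 4 - 200) / 4 = 350.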
*/
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
ue.enqueued |= UTIL_AVG_UNCHANGED;
WRITE_ONCE(p->se.avg.util_est, ue);
trace_sched_util_est_se_tp(&p->se);
}
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
bool fits, uclamp_max_fits;
/*
* Check if the real util fits without any uclamp boost/cap applied.
*/
fits = fits_capacity(util, capacity);
if (!uclamp_is_used())
return fits;
/*
* We must use capacity_orig_of() for comparing against uclamp_min and
* uclamp_max. We only care about capacity pressure (by using
* capacity_of()) for comparing against the real util.
*
* If a task is boosted to 1024 for example, we don't want a tiny
* pressure to skew the check whether it fits a CPU or not.
*
* Similarly if a task is capped to capacity_orig_of(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
* The only exception is thermal pressure, since it has a direct impact
* on the available OPPs of the system.
*
* We honour it for uclamp_min only as a drop in performance level
* could result in not getting the requested minimum performance level.
*
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
*
* In case of capacity inversion, which is not handled yet, we should
* honour the inverted capacity for both uclamp_min and uclamp_max all
* the time.
*/
capacity_orig = capacity_orig_of(cpu);
capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
* But we do have some corner cases to cater for..
*
*
* C=z
* | ___
* | C=y | |
* |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
* | C=x | | | |
* | ___ | | | |
* | | | | | | | (util somewhere in this region)
* | | | | | | |
* | | | | | | |
* +----------------------------------------
* cpu0 cpu1 cpu2
*
* In the above example if a task is capped to a specific performance
* point, y, then when:
*
* * util = 80% of x then it does not fit on cpu0 and should migrate
* to cpu1
* * util = 80% of y then it is forced to fit on cpu1 to honour
* uclamp_max request.
*
* which is what we're enforcing here. A task always fits if
* uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
* the normal upmigration rules should still hold.
*
* The only exception is when we are at max capacity; then we need to be
* careful not to block the overutilized state. This is because:
*
* 1. There's no concept of capping at max_capacity! We can't go
* beyond this performance level anyway.
* 2. The system is already being saturated when we're operating near
* max capacity, so it doesn't make sense to block overutilized.
*/
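/*
* IOW: uclamp_max "fits" whenever uclamp_max <= capacity_orig, except when
* both the CPU and the clamp already sit at SCHED_CAPACITY_SCALE, where
* overutilized must still be allowed to trigger.
*/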
uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
fits = fits || uclamp_max_fits;
/*
*
* C=z
* | ___ (region a, capped, util >= uclamp_max)
* | C=y | |
* |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
* | C=x | | | |
* | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
* |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
* | | | | | | |
* | | | | | | | (region c, boosted, util < uclamp_min)
* +----------------------------------------
* cpu0 cpu1 cpu2
*
* a) If util > uclamp_max, then we're capped and we don't care about the
* actual fitness value here. We only care whether uclamp_max fits
* capacity without taking margin/pressure into account.
* See comment above.
*
* b) If uclamp_min <= util <= uclamp_max, then the normal
* fits_capacity() rules apply, except that we also need to enforce
* that we remain within uclamp_max; see comment above.
*
* c) If util < uclamp_min, then we are boosted. Same as (b), but we
* additionally need to check that the boosted value fits the CPU
* without taking margin/pressure into account.
*
* Cases (a) and (b) are handled in the 'fits' variable already. We
* just need to consider an extra check for case (c) after ensuring we
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
fits = fits && (uclamp_min <= capacity_orig_thermal);
return fits;
}
static inline int task_fits_cpu(struct task_struct *p, int cpu)
{
unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
unsigned long util = task_util_est(p);
return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
bool need_update = true;
trace_android_rvh_update_misfit_status(p, rq, &need_update);
if (!sched_asym_cpucap_active() || !need_update)
return;
if (!p || p->nr_cpus_allowed == 1) {
rq->misfit_task_load = 0;
return;
}
if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
/*
* Make sure that misfit_task_load will not be null even if
* task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
return true;
}
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
s64 d = se->vruntime - cfs_rq->min_vruntime;
if (d < 0)
d = -d;
if (d > 3*sysctl_sched_latency)
schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks;
* however, the extra weight of the new task will slow them down a
* little. Place the new task so that it fits in the slot that
* stays open at the end.
*/
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
vruntime -= thresh;
}
/* ensure we never gain time by being placed backwards. */
se->vruntime = max_vruntime(se->vruntime, vruntime);
trace_android_rvh_place_entity(cfs_rq, se, initial, &vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled())
return;
/* Force schedstat enabled if a dependent tracepoint is active */
if (trace_sched_stat_wait_enabled() ||
trace_sched_stat_sleep_enabled() ||
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled()) {
printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
"kernel parameter schedstats=enable or "
"kernel.sched_schedstats=1\n");
}
#endif
}
static inline bool cfs_bandwidth_used(void);
/*
* MIGRATION
*
* dequeue
* update_curr()
* update_min_vruntime()
* vruntime -= min_vruntime
*
* enqueue
* update_curr()
* update_min_vruntime()
* vruntime += min_vruntime
*
* this way the vruntime transition between RQs is done when both
* min_vruntime are up-to-date.
*
* WAKEUP (remote)
*
* ->migrate_task_rq_fair() (p->state == TASK_WAKING)
* vruntime -= min_vruntime
*
* enqueue
* update_curr()
* update_min_vruntime()
* vruntime += min_vruntime
*
* this way we may use a stale min_vruntime on the originating CPU, but an
* up-to-date min_vruntime on the destination CPU.
*/
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
update_curr(cfs_rq);
/*
* Otherwise, renormalise after, such that we're placed at the current
* moment in time, instead of some random moment in the past. Being
* placed in the past could significantly boost this task to the
* fairness detriment of existing tasks.
*/
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Add its load to cfs_rq->runnable_avg
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
update_cfs_group(se);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
/*
* When bandwidth control is enabled, the cfs_rq might have been removed
* from the list because a parent was throttled while cfs_rq->nr_running > 1.
* Try to add it back unconditionally.
*/
if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
list_add_leaf_cfs_rq(cfs_rq);
if (cfs_rq->nr_running == 1)
check_enqueue_throttle(cfs_rq);
}
static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->last != se)
break;
cfs_rq->last = NULL;
}
}
static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->next != se)
break;
cfs_rq->next = NULL;
}
}
static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (cfs_rq->skip != se)
break;
cfs_rq->skip = NULL;
}
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
__clear_buddies_last(se);
if (cfs_rq->next == se)
__clear_buddies_next(se);
if (cfs_rq->skip == se)
__clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Subtract its load from the cfs_rq->runnable_avg.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
/*
* Normalize after update_curr(), which will also have moved
* min_vruntime if @se is the one holding it back. But before doing
* update_min_vruntime() again, which will discount @se's position and
* can move min_vruntime forward still more.
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_group(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
bool skip_preempt = false;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
if (skip_preempt)
return;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
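/*
* Also preempt if curr's vruntime is more than ideal_runtime ahead of the
* leftmost entity's.
*/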
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime)
resched_curr(rq_of(cfs_rq));
}
void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
clear_buddies(cfs_rq, se);
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
* Any task has to be enqueued before it gets to execute on
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
/*
* Track our maximum slice length, if the CPU's load is at
* least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
EXPORT_SYMBOL_GPL(set_next_entity);
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 2) pick the "next" process, since someone really wants that to run
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
struct sched_entity *left = __pick_first_entity(cfs_rq);
struct sched_entity *se = NULL;
trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
if (se)
goto done;
/*
* If curr is set we have to see if it's left of the leftmost entity
* still in the tree, provided there was anything in the tree at all.
*/
if (!left || (curr && entity_before(curr, left)))
left = curr;
se = left; /* ideally we run the leftmost entity */
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
if (se == curr) {
second = __pick_first_entity(cfs_rq);
} else {
second = __pick_next_entity(se);
if (!second || (curr && entity_before(curr, second)))
second = curr;
}
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
se = cfs_rq->next;
} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
se = cfs_rq->last;
}
done:
return se;
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
*/
if (prev->on_rq)
update_curr(cfs_rq);
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_curr(rq_of(cfs_rq));
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr);
trace_android_rvh_entity_tick(cfs_rq, curr);
}
/**************************************************
* CFS bandwidth control machinery
*/
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
{
return static_key_false(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_inc(void)
{
static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* CONFIG_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
{
return true;
}
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
#endif /* CONFIG_JUMP_LABEL */
/*
* default period for cfs group bandwidth.
* default: 0.1s, units: nanoseconds
*/
static inline u64 default_cfs_period(void)
{
return 100000000ULL;
}
static inline u64 sched_cfs_bandwidth_slice(void)
{
return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
/*
* Replenish runtime according to assigned quota. We use sched_clock_cpu
* directly instead of rq->clock to avoid adding additional synchronization
* around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
if (unlikely(cfs_b->quota == RUNTIME_INF))
return;
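/* Carry leftover runtime forward, but never more than one burst above quota. */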
cfs_b->runtime += cfs_b->quota;
cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
return &tg->cfs_bandwidth;
}
/* returns 0 on failure to allocate runtime */
static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
struct cfs_rq *cfs_rq, u64 target_runtime)
{
u64 min_amount, amount = 0;
lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = target_runtime - cfs_rq->runtime_remaining;
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
start_cfs_bandwidth(cfs_b);
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
cfs_b->runtime -= amount;
cfs_b->idle = 0;
}
}
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
int ret;
raw_spin_lock(&cfs_b->lock);
ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
raw_spin_unlock(&cfs_b->lock);
return ret;
}
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
if (likely(cfs_rq->runtime_remaining > 0))
return;
if (cfs_rq->throttled)
return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
resched_curr(rq_of(cfs_rq));
}
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
* Ensure that neither of the group entities corresponding to src_cpu or
* dest_cpu are members of a throttled hierarchy when performing group
* load-balance operations.
*/
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
src_cfs_rq = tg->cfs_rq[src_cpu];
dest_cfs_rq = tg->cfs_rq[dest_cpu];
return throttled_hierarchy(src_cfs_rq) ||
throttled_hierarchy(dest_cfs_rq);
}
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
list_add_leaf_cfs_rq(cfs_rq);
}
return 0;
}
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
return 0;
}
static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
/*
* We have raced with bandwidth becoming available, and if we
* actually throttled the timer might not unthrottle us for an
* entire period. We additionally needed to make sure that any
* subsequent check_cfs_rq_runtime calls agree not to throttle
* us, as we may commit to do cfs put_prev+pick_next, so we ask
* for 1ns of runtime rather than just check cfs_b.
*/
dequeue = 0;
} else {
list_add_tail_rcu(&cfs_rq->throttled_list,
&cfs_b->throttled_cfs_rq);
}
raw_spin_unlock(&cfs_b->lock);
if (!dequeue)
return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
break;
}
}
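/*
* The remaining ancestors still have other runnable entities and stay
* queued; just update their load and hierarchical nr_running counts.
*/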
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
}
/* At this point se is NULL and we are at root level */
sub_nr_running(rq, task_delta);
done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
/* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
if (cfs_rq->on_list)
goto unthrottle_throttle;
return;
}
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
if (se->on_rq)
break;
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
if (throttled_hierarchy(qcfs_rq))
list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level */
add_nr_running(rq, task_delta);
unthrottle_throttle:
/*
* The cfs_rq_throttled() breaks in the above iteration can result in
* incomplete leaf list maintenance, resulting in triggering the
* assertion below.
*/
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
if (list_add_leaf_cfs_rq(qcfs_rq))
break;
}
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
}
static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
u64 runtime, remaining = 1;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
/* By the above check, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
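/*
* Hand out just enough to bring runtime_remaining up to 1ns; if the
* global pool cannot cover even that, the cfs_rq stays throttled.
*/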
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > cfs_b->runtime)
runtime = cfs_b->runtime;
cfs_b->runtime -= runtime;
remaining = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
unthrottle_cfs_rq(cfs_rq);
next:
rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
}
rcu_read_unlock();
}
/*
* Responsible for refilling a task_group's bandwidth and unthrottling its
* cfs_rqs as appropriate. If there has been no activity within the last
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
int throttled;
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
/* Refill extra burst quota even if cfs_b->idle */
__refill_cfs_bandwidth_runtime(cfs_b);
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
*/
if (cfs_b->idle && !throttled)
goto out_deactivate;
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
return 0;
}
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
/*
* This check is repeated as we release cfs_b->lock while we unthrottle.
*/
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
* insufficient to cover the existing bandwidth deficit. (Forcing the
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
return 0;
out_deactivate:
return 1;
}
/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
/*
* Are we near the end of the current quota period?
*
* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
* hrtimer base being cleared by hrtimer_start. In the case of
* migrate_hrtimers, base is never cleared, so we are fine.
*/
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
return 1;
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
if (remaining < (s64)min_expire)
return 1;
return 0;
}
static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
{
u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
/* if there's a quota refresh soon don't bother with slack */
if (runtime_refresh_within(cfs_b, min_left))
return;
/* don't push forwards an existing deferred unthrottle */
if (cfs_b->slack_started)
return;
cfs_b->slack_started = true;
hrtimer_start(&cfs_b->slack_timer,
ns_to_ktime(cfs_bandwidth_slack_period),
HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
if (slack_runtime <= 0)
return;
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
!list_empty(&cfs_b->throttled_cfs_rq))
start_cfs_slack_bandwidth(cfs_b);
}
raw_spin_unlock(&cfs_b->lock);
/* even if it's not valid for return we don't want to try again */
cfs_rq->runtime_remaining -= slack_runtime;
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return;
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
}
/*
* This is done with a timer (instead of inline with bandwidth return) since
* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
*/
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->slack_started = false;
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
distribute_cfs_runtime(cfs_b);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
* runtime, as update_curr() throttling cannot trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return;
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
if (cfs_rq_throttled(cfs_rq))
return;
/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
if (cfs_rq->runtime_remaining <= 0)
throttle_cfs_rq(cfs_rq);
}
static void sync_throttle(struct task_group *tg, int cpu)
{
struct cfs_rq *pcfs_rq, *cfs_rq;
if (!cfs_bandwidth_used())
return;
if (!tg->parent)
return;
cfs_rq = tg->cfs_rq[cpu];
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
return false;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return false;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task); in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
return true;
return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
do_sched_cfs_slack_timer(cfs_b);
return HRTIMER_NORESTART;
}
extern const u64 max_cfs_quota_period;
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
unsigned long flags;
int overrun;
int idle = 0;
int count = 0;
raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
/*
* Grow period by a factor of 2 to avoid losing precision.
* Precision loss in the quota/period ratio can cause __cfs_schedulable
* to fail.
*/
new = old * 2;
if (new < max_cfs_quota_period) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
smp_processor_id(),
div_u64(new, NSEC_PER_USEC),
div_u64(cfs_b->quota, NSEC_PER_USEC));
} else {
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
smp_processor_id(),
div_u64(old, NSEC_PER_USEC),
div_u64(cfs_b->quota, NSEC_PER_USEC));
}
/* reset count so we don't come right back in here */
count = 0;
}
}
if (idle)
cfs_b->period_active = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
cfs_b->burst = 0;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
lockdep_assert_held(&cfs_b->lock);
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
/*
* Both these CPU hotplug callbacks race against unregister_fair_sched_group()
*
* The race is harmless, since modifying bandwidth settings of unhooked group
* bits doesn't do much.
*/
/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
rcu_read_unlock();
}
/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
if (!cfs_rq->runtime_enabled)
continue;
/*
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
/*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
}
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
{
return false;
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int throttled_lb_pair(struct task_group *tg,
int src_cpu, int dest_cpu)
{
return 0;
}
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
/**************************************************
* CFS operations on tasks:
*/
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
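/* Slice already used up: don't arm the hrtick; resched if p is running. */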
if (delta < 0) {
if (task_current(rq, p))
resched_curr(rq);
return;
}
hrtick_start(rq, delta);
}
}
/*
* called from enqueue/dequeue and updates the hrtick when the
* current task is from our class and nr_running is low enough
* to matter.
*/
static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
static inline void hrtick_update(struct rq *rq)
{
}
#endif
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
int overutilized = -1;
trace_android_rvh_cpu_overutilized(cpu, &overutilized);
if (overutilized != -1)
return overutilized;
return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
{
if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
}
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
rq->nr_running);
}
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
#endif
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
int should_iowait_boost;
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
util_est_enqueue(&rq->cfs, p);
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
*/
should_iowait_boost = p->in_iowait;
trace_android_rvh_set_iowait(p, rq, &should_iowait_boost);
if (should_iowait_boost)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
flags = ENQUEUE_WAKEUP;
}
trace_android_rvh_enqueue_task_fair(rq, p, flags);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_cfs_group(se);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
if (throttled_hierarchy(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level */
add_nr_running(rq, 1);
/*
* Since new tasks are assigned an initial util_avg equal to
* half of the spare capacity of their CPU, tiny tasks have the
* ability to cross the overutilized threshold, which will
* result in the load balancer ruining all the task placement
* done by EAS. As a way to mitigate that effect, do not account
* for the first enqueue operation of new tasks during the
* overutilized flag detection.
*
* A better way of solving this problem would be to wait for
* the PELT signals of tasks to converge before taking them
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
if (!task_new)
update_overutilized_status(rq);
enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled, the cfs_rq_throttled()
* breaks in the above iteration can result in incomplete
* leaf list maintenance, resulting in triggering the assertion
* below.
*/
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
}
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
}
static void set_next_buddy(struct sched_entity *se);
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
util_est_dequeue(&rq->cfs, p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
if (task_sleep && se && !throttled_hierarchy(cfs_rq))
set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
}
trace_android_rvh_dequeue_task_fair(rq, p, flags);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_cfs_group(se);
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
}
/* At this point se is NULL and we are at root level */
sub_nr_running(rq, 1);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
dequeue_throttle:
util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
#ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUs have blocked load */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
#endif /* CONFIG_NO_HZ_COMMON */
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_load_avg(&rq->cfs);
}
/*
* cpu_load_without - compute CPU load without any contributions from *p
* @cpu: the CPU whose load is requested
* @p: the task whose load should be discounted
*
* The load of a CPU is defined by the load of tasks currently enqueued on that
* CPU as well as tasks which are currently sleeping after an execution on that
* CPU.
*
* This method returns the load of the specified CPU by discounting the load of
* the specified task, whenever the task is currently contributing to the CPU
* load.
*/
static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int load;
/* Task has no contribution or is new */
if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_load(rq);
cfs_rq = &rq->cfs;
load = READ_ONCE(cfs_rq->avg.load_avg);
/* Discount task's load from CPU's load */
lsub_positive(&load, task_h_load(p));
return load;
}
static unsigned long cpu_runnable(struct rq *rq)
{
return cfs_rq_runnable_avg(&rq->cfs);
}
static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int runnable;
/* Task has no contribution or is new */
if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_runnable(rq);
cfs_rq = &rq->cfs;
runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
/* Discount task's runnable from CPU's runnable */
lsub_positive(&runnable, p->se.avg.runnable_avg);
return runnable;
}
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
{
/*
* Only decay a single time; tasks that have less than 1 wakeup per
* jiffy will not have built up many flips.
*/
if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
if (current->last_wakee != p) {
current->last_wakee = p;
current->wakee_flips++;
}
}
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
* A waker of many should wake a different task than the one last awakened
* at a frequency roughly N times higher than one of its wakees.
*
* In order to determine whether we should let the load spread vs consolidating
* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
* partner, and a factor of llc_size higher frequency in the other.
*
* With both conditions met, we can be relatively sure that the relationship is
* non-monogamous, with partner count exceeding socket size.
*
* Whether waker/wakee are client/server, worker/dispatcher, interrupt source
* or whatever is irrelevant; the spread criterion is that the apparent
* partner count exceeds the socket size.
*/
static int wake_wide(struct task_struct *p)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
int factor = __this_cpu_read(sd_llc_size);
if (master < slave)
swap(master, slave);
if (slave < factor || master < slave * factor)
return 0;
return 1;
}
/*
* The purpose of wake_affine() is to quickly determine on which CPU we can run
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
* wake_affine_idle() - only considers 'now'; it checks whether the waking CPU is
* cache-affine and is (or will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
static int
wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
/*
* If this_cpu is idle, it implies the wakeup is from interrupt
* context. Only allow the move if cache is shared. Otherwise an
* interrupt intensive workload could force all tasks onto one
* node depending on the IO topology or IRQ affinity settings.
*
* If the prev_cpu is idle and cache affine then avoid a migration.
* There is no guarantee that the cache hot data from an interrupt
* is more important than cache hot data on the prev_cpu and from
* a cpufreq perspective, it's better to have higher utilisation
* on one CPU.
*/
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
if (available_idle_cpu(prev_cpu))
return prev_cpu;
return nr_cpumask_bits;
}
static int
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
if (current_load > this_eff_load)
return this_cpu;
this_eff_load -= current_load;
}
task_load = task_h_load(p);
this_eff_load += task_load;
if (sched_feat(WA_BIAS))
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
/*
* If sync, adjust the weight of prev_eff_load such that if
* prev_eff == this_eff that select_idle_sibling() will consider
* stacking the wakee on top of the waker if no other CPU is
* idle.
*/
if (sync)
prev_eff_load += 1;
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
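/*
 * Note on the comparison above: rather than dividing each load by its CPU's
 * capacity, both sides are cross-multiplied, i.e. the check
 *   this_load / capacity(this_cpu) < prev_load / capacity(prev_cpu)
 * is evaluated as
 *   this_load * capacity(prev_cpu) < prev_load * capacity(this_cpu)
 * which avoids divisions. With WA_BIAS, prev_eff_load is additionally scaled
 * by half of the domain's imbalance_pct margin (e.g. a typical
 * imbalance_pct of 117 scales the prev side by 108 vs 100), which biases the
 * decision towards placing the task on the waking CPU.
 */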
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
int target = nr_cpumask_bits;
if (sched_feat(WA_IDLE))
target = wake_affine_idle(this_cpu, prev_cpu, sync);
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (target == nr_cpumask_bits)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
schedstat_inc(p->se.statistics.nr_wakeups_affine);
return target;
}
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
if (group->group_weight == 1)
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
struct rq *rq = cpu_rq(i);
if (!sched_core_cookie_match(rq, p))
continue;
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
* We give priority to a CPU whose idle state
* has the smallest exit latency irrespective
* of any idle timestamp.
*/
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
} else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
* the most recently idled CPU might have
* a warmer cache.
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
}
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
* We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
int weight;
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
group = find_idlest_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of 'new_cpu': */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
}
return new_cpu;
}
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
sched_cpu_cookie_match(cpu_rq(cpu), p))
return cpu;
return -1;
}
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
return def;
}
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
* information in sd_llc_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/
void __update_idle_core(struct rq *rq)
{
int core = cpu_of(rq);
int cpu;
rcu_read_lock();
if (test_idle_cores(core, true))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (cpu == core)
continue;
if (!available_idle_cpu(cpu))
goto unlock;
}
set_idle_cores(core, 1);
unlock:
rcu_read_unlock();
}
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
bool idle = true;
int cpu;
if (!static_branch_likely(&sched_smt_present))
return __select_idle_cpu(core, p);
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
*idle_cpu = cpu;
break;
}
continue;
}
break;
}
if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
*idle_cpu = cpu;
}
if (idle)
return core;
cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
}
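/*
 * Return convention for select_idle_core(): @core is returned only when every
 * SMT sibling of @core is idle; otherwise -1 is returned and the core's
 * siblings are removed from @cpus so the caller's scan does not revisit them.
 * *idle_cpu may be updated with a single idle (or SCHED_IDLE) sibling allowed
 * by p->cpus_ptr, as a fallback candidate. Without SMT active this
 * degenerates to __select_idle_cpu().
 */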
/*
* Scan the local SMT mask for idle CPUs.
*/
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
}
return -1;
}
#else /* CONFIG_SCHED_SMT */
static inline void set_idle_cores(int cpu, int val)
{
}
static inline bool test_idle_cores(int cpu, bool def)
{
return def;
}
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
return __select_idle_cpu(core, p);
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
#endif /* CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
u64 time = 0;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
return -1;
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_PROP) && !has_idle_core) {
u64 avg_cost, avg_idle, span_avg;
unsigned long now = jiffies;
/*
* If we're busy, the assumption that the last idle period
* predicts the future is flawed; age away the remaining
* predicted idle time.
*/
if (unlikely(this_rq->wake_stamp < now)) {
while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
this_rq->wake_stamp++;
this_rq->wake_avg_idle >>= 1;
}
}
avg_idle = this_rq->wake_avg_idle;
avg_cost = this_sd->avg_scan_cost + 1;
span_avg = sd->span_weight * avg_idle;
if (span_avg > 4*avg_cost)
nr = div_u64(span_avg, avg_cost);
else
nr = 4;
time = cpu_clock(this);
}
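	/*
	 * At this point (SIS_PROP enabled, no idle core) the scan budget nr is
	 * roughly sd->span_weight scaled by the ratio of the recent average
	 * idle time to the average scan cost, with a floor of 4; otherwise nr
	 * stays unbounded. Illustrative (hypothetical) numbers: a 16-CPU LLC
	 * with wake_avg_idle of ~200us and an average scan cost of ~100us
	 * yields a budget of about 32.
	 */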
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
return i;
} else {
if (!--nr)
return -1;
idle_cpu = __select_idle_cpu(cpu, p);
if ((unsigned int)idle_cpu < nr_cpumask_bits)
break;
}
}
if (has_idle_core)
set_idle_cores(target, false);
if (sched_feat(SIS_PROP) && !has_idle_core) {
time = cpu_clock(this) - time;
/*
* Account for the scan cost of wakeups against the average
* idle time.
*/
this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
update_avg(&this_sd->avg_scan_cost, time);
}
return idle_cpu;
}
/*
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
* the task fits. If no CPU is big enough, but there are idle ones, try to
* maximize capacity.
*/
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long task_util, util_min, util_max, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
task_util = task_util_est(p);
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
if (util_fits_cpu(task_util, util_min, util_max, cpu))
return cpu;
if (cpu_cap > best_cap) {
best_cap = cpu_cap;
best_cpu = cpu;
}
}
return best_cpu;
}
static inline bool asym_fits_cpu(unsigned long util,
unsigned long util_min,
unsigned long util_max,
int cpu)
{
if (sched_asym_cpucap_active())
return util_fits_cpu(util, util_min, util_max, cpu);
return true;
}
/*
* Try and locate an idle core/thread in the LLC cache domain.
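*
* The candidates are tried in order: the target CPU itself, the cache-affine
* previous CPU, stacking on top of a per-cpu kthread waker, a recently used
* CPU, then a capacity-aware scan on asymmetric-capacity systems, and
* finally an SMT / LLC-wide scan for an idle CPU.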
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
/*
* On asymmetric systems, update the task utilization because we will check
* that the task fits the CPU's capacity.
*/
if (sched_asym_cpucap_active()) {
sync_entity_load_avg(&p->se);
task_util = task_util_est(p);
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/*
* per-cpu select_idle_mask usage
*/
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
/*
* Allow a per-cpu kthread to stack with the wakee if the
* kworker thread and the task's previous CPU are the same.
* The assumption is that the wakee queued work for the
* per-cpu kthread that is now complete and the wakeup is
* essentially a sync wakeup. An obvious example of this
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
/*
* For asymmetric CPU capacity systems, our domain of interest is
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
* capacity_orig value through the cpuset), the key will be set
* but the CPUs within that cpuset will not have a domain with
* SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
* capacity path.
*/
if (sd) {
i = select_idle_capacity(p, sd, target);
return ((unsigned)i < nr_cpumask_bits) ? i : target;
}
}
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
if (sched_smt_active()) {
has_idle_core = test_idle_cores(target, false);
if (!has_idle_core && cpus_share_cache(prev, target)) {
i = select_idle_smt(p, sd, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
}
i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
return target;
}
/**
* cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the same as that of capacity so we can
* compare the utilization with the capacity of the CPU that is available for
* CFS tasks (i.e. cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows us to properly represent the expected utilization of a CPU which
* has just picked up a big task after a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it is useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
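/*
 * Illustrative (hypothetical) numbers: with cfs_rq->avg.util_avg == 300,
 * util_est.enqueued == 450 and capacity_orig_of(cpu) == 512, cpu_util()
 * returns 450; had the estimate been 600 it would have been clamped to the
 * CPU's original capacity of 512.
 */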
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU whose utilization is requested
* @p: the task whose utilization should be discounted
*
* The utilization of a CPU is defined by the utilization of tasks currently
* enqueued on that CPU as well as tasks which are currently sleeping after an
* execution on that CPU.
*
* This method returns the utilization of the specified CPU by discounting the
* utilization of the specified task, whenever the task is currently
* contributing to the CPU utilization.
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int util;
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
/* Discount task's util from CPU's util */
lsub_positive(&util, task_util(p));
/*
* Covered cases:
*
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
* cpu_util_without = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
* cpu_util_without = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
* then we use util_est since it returns a more restrictive
* estimation of the spare capacity on that CPU, by just
* considering the expected utilization of tasks already
* runnable on that CPU.
*
* Cases a) and b) are covered by the above code, while case c) is
* covered by the following code when estimated utilization is
* enabled.
*/
if (sched_feat(UTIL_EST)) {
unsigned int estimated =
READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* Despite the following checks we still have a small window
* for a possible race, when an execl's select_task_rq_fair()
* races with LB's detach_task():
*
* detach_task()
* p->on_rq = TASK_ON_RQ_MIGRATING;
* ---------------------------------- A
* deactivate_task() \
* dequeue_task() + RaceTime
* util_est_dequeue() /
* ---------------------------------- B
*
* The additional check on "current == p" is required to
* properly fix the execl regression and it helps in further
* reducing the chances of the above race.
*/
if (unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&estimated, _task_util_est(p));
util = max(util, estimated);
}
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* the cpu_util call.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util = max(util, util_est);
}
return min(util, capacity_orig_of(cpu));
}
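/*
 * Illustrative (hypothetical) example: for a task with task_util(p) == 100
 * currently on CPU2, cpu_util_next(2, p, 5) returns CPU2's util_avg with
 * that 100 removed (the task is assumed to migrate away), while
 * cpu_util_next(5, p, 5) returns CPU5's util_avg with 100 added and, with
 * UTIL_EST, _task_util_est(p) added to the enqueued estimate.
 */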
/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
unsigned long max_util = 0, sum_util = 0;
unsigned long _cpu_cap = cpu_cap;
unsigned long energy = 0;
int cpu;
_cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
* utilization of these CPUs too by masking pd with cpu_online_mask
* instead of the rd span.
*
* If an entire pd is outside of the current rd, it will not appear in
* its pd list and will not be accounted by compute_energy().
*/
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
unsigned long cpu_util, util_running = util_freq;
struct task_struct *tsk = NULL;
/*
* When @p is placed on @cpu:
*
* util_running = max(cpu_util, cpu_util_est) +
* max(task_util, _task_util_est)
*
* while cpu_util_next is: max(cpu_util + task_util,
* cpu_util_est + _task_util_est)
*/
if (cpu == dst_cpu) {
tsk = p;
util_running =
cpu_util_next(cpu, p, -1) + task_util_est(p);
}
/*
* Busy time computation: utilization clamping is not
* required since the ratio (sum_util / cpu_capacity)
* is already enough to scale the EM reported power
* consumption at the (eventually clamped) cpu_capacity.
*/
cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
ENERGY_UTIL, NULL);
sum_util += min(cpu_util, _cpu_cap);
/*
* Performance domain frequency: utilization clamping
* must be considered since it affects the selection
* of the performance domain frequency.
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
FREQUENCY_UTIL, tsk);
max_util = max(max_util, min(cpu_util, _cpu_cap));
}
trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
if (!energy)
energy = em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
return energy;
}
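/*
 * In the loop above, sum_util (capped at the thermally adjusted capacity)
 * feeds the energy integration, while max_util determines the performance
 * domain's operating point: roughly speaking, em_cpu_energy() uses the
 * highest clamped utilization in the domain to pick the frequency at which
 * all of the domain's busy time is costed.
 */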
/*
* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
* waking task. find_energy_efficient_cpu() looks for the CPU with maximum
* spare capacity in each performance domain and uses it as a potential
* candidate to execute the task. Then, it uses the Energy Model to figure
* out which of the CPU candidates is the most energy-efficient.
*
* The rationale for this heuristic is as follows. In a performance domain,
* all the most energy efficient CPU candidates (according to the Energy
* Model) are those for which we'll request a low frequency. When there are
* several CPUs for which the frequency request will be the same, we don't
* have enough data to break the tie between them, because the Energy Model
* only includes active power costs. With this model, if we assume that
* frequency requests follow utilization (e.g. using schedutil), the CPU with
* the maximum spare capacity in a performance domain is guaranteed to be among
* the best candidates of the performance domain.
*
* In practice, it could be preferable from an energy standpoint to pack
* small tasks on a CPU in order to let other CPUs go in deeper idle states,
* but that could also hurt our chances to go cluster idle, and we have no
* ways to tell with the current Energy Model if this is actually a good
* idea or not. So, find_energy_efficient_cpu() basically favors
* cluster-packing, and spreading inside a cluster. That should at least be
* a good thing for latency, and this is consistent with the idea that most
* of the energy savings of EAS come from the asymmetry of the system, and
* not so much from breaking the tie between identical CPUs. That's also the
* reason why EAS is enabled in the topology code only for systems where
* SD_ASYM_CPUCAPACITY is set.
*
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
* placed by find_idlest_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
int cpu, best_energy_cpu = prev_cpu, target = -1;
unsigned long max_spare_cap_ls = 0, target_cap;
unsigned long cpu_cap, util, base_energy = 0;
bool boosted, latency_sensitive = false;
unsigned int min_exit_lat = UINT_MAX;
struct cpuidle_state *idle;
struct sched_domain *sd;
struct perf_domain *pd;
int new_cpu = INT_MAX;
sync_entity_load_avg(&p->se);
trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
if (new_cpu != INT_MAX)
return new_cpu;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto unlock;
cpu = smp_processor_id();
if (sync && cpu_rq(cpu)->nr_running == 1 &&
cpumask_test_cpu(cpu, p->cpus_ptr) &&
task_fits_cpu(p, cpu)) {
rcu_read_unlock();
return cpu;
}
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
goto unlock;
target = prev_cpu;
if (!task_util_est(p))
goto unlock;
latency_sensitive = uclamp_latency_sensitive(p);
boosted = uclamp_boosted(p);
target_cap = boosted ? 0 : ULONG_MAX;
for (; pd; pd = pd->next) {
unsigned long cur_delta, spare_cap, max_spare_cap = 0;
bool compute_prev_delta = false;
unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
spare_cap = cpu_cap;
lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
continue;
if (!latency_sensitive && cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
compute_prev_delta = true;
} else if (spare_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
* in the performance domain.
*/
max_spare_cap = spare_cap;
max_spare_cap_cpu = cpu;
}
if (!latency_sensitive)
continue;
if (idle_cpu(cpu)) {
cpu_cap = capacity_orig_of(cpu);
if (boosted && cpu_cap < target_cap)
continue;
if (!boosted && cpu_cap > target_cap)
continue;
idle = idle_get_state(cpu_rq(cpu));
if (idle && idle->exit_latency > min_exit_lat &&
cpu_cap == target_cap)
continue;
if (idle)
min_exit_lat = idle->exit_latency;
target_cap = cpu_cap;
best_idle_cpu = cpu;
} else if (spare_cap > max_spare_cap_ls) {
max_spare_cap_ls = spare_cap;
max_spare_cap_cpu_ls = cpu;
}
}
if (!latency_sensitive && max_spare_cap_cpu < 0 && !compute_prev_delta)
continue;
/* Compute the 'base' energy of the pd, without @p */
base_energy_pd = compute_energy(p, -1, pd);
base_energy += base_energy_pd;
/* Evaluate the energy impact of using prev_cpu. */
if (compute_prev_delta) {
prev_delta = compute_energy(p, prev_cpu, pd);
if (prev_delta < base_energy_pd)
goto unlock;
prev_delta -= base_energy_pd;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0) {
cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
if (cur_delta < base_energy_pd)
goto unlock;
cur_delta -= base_energy_pd;
if (cur_delta < best_delta) {
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
}
rcu_read_unlock();
if (latency_sensitive)
return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
/*
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least ~6% (1/16) of the total estimated energy consumed with the
* task placed on prev_cpu.
*/
if ((prev_delta == ULONG_MAX) ||
(prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
target = best_energy_cpu;
return target;
unlock:
rcu_read_unlock();
return target;
}
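/*
 * Illustrative (hypothetical) numbers for the 1/16 threshold above: with
 * base_energy == 1000, prev_delta == 120 and best_delta == 80 (all in EM
 * units), the saving is 40 while the threshold is (120 + 1000) >> 4 == 70,
 * so the task stays on prev_cpu; a best_delta of 40 or less would clear the
 * threshold and best_energy_cpu would be selected instead.
 */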
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest CPU in the idlest group, or under
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
int target_cpu = -1;
/* SD_flags and WF_flags share the first nibble */
int sd_flag = wake_flags & 0xF;
if (trace_android_rvh_select_task_rq_fair_enabled() &&
!(sd_flag & SD_BALANCE_FORK))
sync_entity_load_avg(&p->se);
trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
wake_flags, &target_cpu);
if (target_cpu >= 0)
return target_cpu;
/*
* required for stable ->cpus_allowed
*/
lockdep_assert_held(&p->pi_lock);
if (wake_flags & WF_TTWU) {
record_wakee(p);
if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
if (new_cpu >= 0)
return new_cpu;
new_cpu = prev_cpu;
}
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
for_each_domain(cpu, tmp) {
/*
* If both 'cpu' and 'prev_cpu' are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
if (cpu != prev_cpu)
new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
sd = NULL; /* Prefer wake_affine over balance flags */
break;
}
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
break;
}
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
}
rcu_read_unlock();
return new_cpu;
}
static void detach_entity_cfs_rq(struct sched_entity *se);
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* As blocked tasks retain absolute vruntime the migration needs to
* deal with this by subtracting the old and adding the new
* min_vruntime -- the latter is done by enqueue_entity() when placing
* the task on the new runqueue.
*/
if (READ_ONCE(p->__state) == TASK_WAKING) {
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 min_vruntime;
#ifndef CONFIG_64BIT
u64 min_vruntime_copy;
do {
min_vruntime_copy = cfs_rq->min_vruntime_copy;
smp_rmb();
min_vruntime = cfs_rq->min_vruntime;
} while (min_vruntime != min_vruntime_copy);
#else
min_vruntime = cfs_rq->min_vruntime;
#endif
se->vruntime -= min_vruntime;
}
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
/*
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
lockdep_assert_rq_held(task_rq(p));
detach_entity_cfs_rq(&p->se);
} else {
/*
* We are supposed to update the task to "current" time, so that
* it is up to date and ready to go to the new CPU/cfs_rq. But we
* have difficulty getting what the current time is, so simply
* throw away the out-of-date time. This results in the wakee
* task being less decayed, but giving the wakee a bit more load
* is not a bad trade-off.
*/
remove_entity_load_avg(&p->se);
}
/* Tell new CPU we are migrated */
p->se.avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (rq->nr_running)
return 1;
return newidle_balance(rq, rf) != 0;
}
#endif /* CONFIG_SMP */
static unsigned long wakeup_gran(struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
* Since it is curr that is running now, convert the granularity
* from real-time to virtual-time in its units.
*
* By using 'se' instead of 'curr' we penalize light tasks, so
* they get preempted more easily. That is, if 'se' < 'curr' then
* the resulting gran will be larger, therefore penalizing the
* lighter task; if OTOH 'se' > 'curr' then the resulting gran
* will be smaller, again penalizing the lighter task.
*
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
return calc_delta_fair(gran, se);
}
/*
* Should 'se' preempt 'curr'.
*
*             |s1
*        |s2
*   |s3
*         g
*      |<--->|c
*
* w(c, s1) = -1
* w(c, s2) = 0
* w(c, s3) = 1
*
*/
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
s64 gran, vdiff = curr->vruntime - se->vruntime;
if (vdiff <= 0)
return -1;
gran = wakeup_gran(se);
if (vdiff > gran)
return 1;
return 0;
}
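/*
 * In other words: -1 means se's vruntime is not smaller than curr's (se is
 * not owed CPU time, no preemption), 0 means se is ahead of curr but by less
 * than the wakeup granularity, and 1 means se leads by more than the
 * granularity converted into se's virtual time, so preempting curr is
 * justified.
 */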
static void set_last_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
if (se_is_idle(se))
return;
cfs_rq_of(se)->last = se;
}
}
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
if (se_is_idle(se))
return;
cfs_rq_of(se)->next = se;
}
}
static void set_skip_buddy(struct sched_entity *se)
{
for_each_sched_entity(se)
cfs_rq_of(se)->skip = se;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
bool ignore = false;
bool preempt = false;
if (unlikely(se == pse))
return;
trace_android_rvh_check_preempt_wakeup_ignore(curr, &ignore);
if (ignore)
return;
/*
* This is possible from callers such as attach_tasks(), in which we
* unconditionally check_preempt_curr() after an enqueue (which may have
* led to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
*
* Note: this also catches the edge-case of curr being in a throttled
* group (e.g. via set_curr_task), since update_curr() (in the
* enqueue of curr) will have resulted in resched being set. This
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
if (test_tsk_need_resched(curr))
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
if (unlikely(task_has_idle_policy(curr)) &&
likely(!task_has_idle_policy(p)))
goto preempt;
/*
* Batch and idle tasks do not preempt non-idle tasks (their preemption
* is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
BUG_ON(!pse);
cse_is_idle = se_is_idle(se);
pse_is_idle = se_is_idle(pse);
/*
* Preempt an idle group in favor of a non-idle group (and don't preempt
* in the inverse case).
*/
if (cse_is_idle && !pse_is_idle)
goto preempt;
if (cse_is_idle != pse_is_idle)
return;
update_curr(cfs_rq_of(se));
trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &ignore,
wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
if (preempt)
goto preempt;
if (ignore)
return;
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
* triggering this preemption.
*/
if (!next_buddy_marked)
set_next_buddy(pse);
goto preempt;
}
return;
preempt:
resched_curr(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
* with schedule on the ->pre_schedule() or idle_balance()
* point, either of which can drop the rq lock.
*
* Also, during early boot the idle thread is in the fair class,
* for obvious reasons its a bad idea to schedule back to it.
*/
if (unlikely(!se->on_rq || curr == rq->idle))
return;
if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
set_last_buddy(se);
}
#ifdef CONFIG_SMP
static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
return NULL;
do {
struct sched_entity *curr = cfs_rq->curr;
/* When we pick for a remote RQ, we'll not have done put_prev_entity() */
if (curr) {
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;
if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto again;
}
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
return task_of(se);
}
#endif
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se = NULL;
struct task_struct *p = NULL;
int new_tasks;
bool repick = false;
again:
if (!sched_fair_runnable(rq))
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
*/
do {
struct sched_entity *curr = cfs_rq->curr;
/*
* Since we got here without doing put_prev_entity() we also
* have to consider cfs_rq->curr. If it is still a runnable
* entity, update_curr() will update its vruntime, otherwise
* forget we've ever seen it.
*/
if (curr) {
if (curr->on_rq)
update_curr(cfs_rq);
else
curr = NULL;
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
* Therefore the nr_running test will indeed
* be correct.
*/
if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
goto idle;
goto simple;
}
}
se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
/*
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
if (se_depth <= pse_depth) {
put_prev_entity(cfs_rq_of(pse), pse);
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
set_next_entity(cfs_rq_of(se), se);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
}
goto done;
simple:
#endif
if (prev)
put_prev_task(rq, prev);
trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
if (repick)
goto done;
do {
se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
* the list, so that our cfs_tasks list becomes
* an MRU one.
*/
list_move(&p->se.group_node, &rq->cfs_tasks);
#endif
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
return p;
idle:
if (!rf)
return NULL;
new_tasks = newidle_balance(rq, rf);
/*
* Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
if (new_tasks < 0)
return RETRY_TASK;
if (new_tasks > 0)
goto again;
/*
* rq is about to be idle, check if we need to update the
* lost_idle_time of clock_pelt
*/
update_idle_rq_clock_pelt(rq);
return NULL;
}
static struct task_struct *__pick_next_task_fair(struct rq *rq)
{
return pick_next_task_fair(rq, NULL, NULL);
}
/*
* Account for a descheduled task:
*/
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
}
/*
* sched_yield() is very simple
*
* The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
/*
* Are we the only task in the tree?
*/
if (unlikely(rq->nr_running == 1))
return;
clear_buddies(cfs_rq, se);
if (curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Tell update_rq_clock() that we've just updated,
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
rq_clock_skip_update(rq);
}
set_skip_buddy(se);
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
/* throttled hierarchies are not runnable */
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
/* Tell the scheduler that we'd really like pse to run next. */
set_next_buddy(se);
yield_task_fair(rq);
return true;
}
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
* BASICS
*
* The purpose of load-balancing is to achieve the same basic fairness the
* per-CPU scheduler provides, namely provide a proportional amount of compute
* time to each task. This is expressed in the following equation:
*
* W_i,n/P_i == W_j,n/P_j for all i,j (1)
*
* Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
* W_i,0 is defined as:
*
* W_i,0 = \Sum_j w_i,j (2)
*
* Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
* is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
* weight:
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
* C_i is the compute capacity of CPU i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We then move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
* a few fun cases generally called infeasible weight scenarios.
*
* [XXX expand on:
* - infeasible weights;
* - local vs global optima in the discrete case. ]
*
*
* SCHED DOMAINS
*
* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
* for all i,j solution, we create a tree of CPUs that follows the hardware
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
* of load-balance at each level inv. proportional to the number of CPUs in
* the groups.
*
* This yields:
*
*   \Sum_{i = 0}^{log_2 n}  (1/2^i) * (n/2^i) * 2^i  =  O(n)          (5)
*
* where the sum runs over all levels; at level i, n/2^i is the number of
* CPUs doing load-balance, 1/2^i is the balancing frequency and 2^i is the
* size of each group.
*
* Coupled with a limit on how many tasks we can migrate every balance pass,
* this makes (5) the runtime complexity of the balancer.
*
* An important property here is that each CPU is still (indirectly) connected
* to every other CPU in at most O(log n) steps:
*
* The adjacency matrix of the resulting graph is given by:
*
*   A_i,j = \Union_{k = 0}^{log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)   (6)
*
* And you'll find that:
*
* A^(log_2 n)_i,j != 0 for all i,j (7)
*
* Showing there's indeed a path between every CPU in at most O(log n) steps.
* The task movement gives a factor of O(m), giving a convergence complexity
* of:
*
* O(nm log n), n := nr_cpus, m := nr_tasks (8)
*
*
* WORK CONSERVING
*
* In order to avoid CPUs going idle while there's still work to do, new idle
* balancing is more aggressive and has the newly idle CPU iterate up the domain
* tree itself instead of relying on other CPUs to bring it work.
*
* This adds some complexity to both (5) and (8) but it reduces the total idle
* time.
*
* [XXX more?]
*
*
* CGROUPS
*
* Cgroups make a horror show out of (2), instead of a simple sum we get:
*
*   W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k)                         (9)
*
* Where
*
* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
*
* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
*
* The big problem is S_k, it's a global sum needed to compute a local (W_i)
* property.
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
*/
unsigned long __read_mostly max_load_balance_interval = HZ/10;
EXPORT_SYMBOL_GPL(max_load_balance_interval);
enum fbq_type { regular, remote, all };
/*
* 'group_type' describes the group of CPUs at the moment of load balancing.
*
* The enum is ordered by pulling priority, with the group with lowest priority
* first so the group_type can simply be compared when selecting the busiest
* group. See update_sd_pick_busiest().
*/
enum group_type {
/* The group has spare capacity that can be used to run more tasks. */
group_has_spare = 0,
/*
* The group is fully used and the tasks don't compete for more CPU
* cycles. Nevertheless, some tasks might wait before running.
*/
group_fully_busy,
/*
* SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
* and must be migrated to a more powerful CPU.
*/
group_misfit_task,
/*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
*/
group_asym_packing,
/*
* The tasks' affinity constraints previously prevented the scheduler
* from balancing the load across the system.
*/
group_imbalanced,
/*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
group_overloaded
};
enum migration_type {
migrate_load = 0,
migrate_util,
migrate_task,
migrate_misfit
};
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
struct rq *src_rq;
int src_cpu;
int dst_cpu;
struct rq *dst_rq;
struct cpumask *dst_grpmask;
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
unsigned int flags;
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
enum fbq_type fbq_type;
enum migration_type migration_type;
struct list_head tasks;
struct rq_flags *src_rq_rf;
};
/*
* Is this task likely cache-hot:
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
if (unlikely(task_has_idle_policy(p)))
return 0;
/* SMT siblings share cache */
if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
/*
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next ||
&p->se == cfs_rq_of(&p->se)->last))
return 1;
if (sysctl_sched_migration_cost == -1)
return 1;
/*
* Don't migrate task if the task's cookie does not match
* with the destination CPU's core cookie.
*/
if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
return 1;
if (sysctl_sched_migration_cost == 0)
return 0;
delta = rq_clock_task(env->src_rq) - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
}
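/*
 * Note the sysctl_sched_migration_cost special cases above: -1 makes every
 * task that reaches that check count as cache hot, 0 makes it count as cold
 * (after the core-cookie check), and any other value compares the time since
 * the task last started executing on src_rq against the configured cost.
 */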
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns 1, if task migration degrades locality
* Returns 0, if task migration improves locality, i.e. migration is preferred.
* Returns -1, if task migration is not affected by locality.
*/
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
return -1;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return -1;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
return -1;
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
return -1;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
return 0;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
return -1;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
src_weight = group_weight(p, src_nid, dist);
dst_weight = group_weight(p, dst_nid, dist);
} else {
src_weight = task_weight(p, src_nid, dist);
dst_weight = task_weight(p, dst_nid, dist);
}
return dst_weight < src_weight;
}
#else
static inline int migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
return -1;
}
#endif
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
int can_migrate = 1;
lockdep_assert_rq_held(env->src_rq);
trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
if (!can_migrate)
return 0;
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
/* Disregard pcpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
/*
* Remember if this task can be migrated to any other CPU in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
* Avoid computing new_dst_cpu
* - for NEWLY_IDLE
* - if we have already computed one in current iteration
* - if it's an active balance
*/
if (env->idle == CPU_NEWLY_IDLE ||
env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
return 0;
}
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
schedstat_inc(p->se.statistics.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
* 1) active balance
* 2) destination numa is preferred
* 3) task is cache cold, or
* 4) too many balance attempts have failed.
*/
if (env->flags & LBF_ACTIVE_LB)
return 1;
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd->lb_hot_gained[env->idle]);
schedstat_inc(p->se.statistics.nr_forced_migrations);
}
return 1;
}
schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
return 0;
}
/*
* detach_task() -- detach the task for the migration specified in env
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
int detached = 0;
lockdep_assert_rq_held(env->src_rq);
/*
* The vendor hook may drop the lock temporarily, so
* pass the rq flags to unpin lock. We expect the
* rq lock to be held after return.
*/
trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
env->dst_cpu, &detached);
if (detached)
return;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
/*
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
*
* Returns a task if successful and NULL otherwise.
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
detach_task(p, env);
/*
* Right now, this is only the second place where
* lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
}
static const unsigned int sched_nr_migrate_break = 32;
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
*/
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
unsigned long util, load;
struct task_struct *p;
int detached = 0;
lockdep_assert_rq_held(env->src_rq);
/*
* Source run queue has been emptied by another CPU, clear
* LBF_ALL_PINNED flag as we will not test any task.
*/
if (env->src_rq->nr_running <= 1) {
env->flags &= ~LBF_ALL_PINNED;
return 0;
}
if (env->imbalance <= 0)
return 0;
while (!list_empty(tasks)) {
/*
* We don't want to steal all of them, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
p = list_last_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
if (env->loop > env->loop_max)
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
if (!can_migrate_task(p, env))
goto next;
switch (env->migration_type) {
case migrate_load:
/*
* Depending of the number of CPUs and tasks and the
* cgroup hierarchy, task_h_load() can return a null
* value. Make sure that env->imbalance decreases
* otherwise detach_tasks() will stop only after
* detaching up to loop_max tasks.
*/
load = max_t(unsigned long, task_h_load(p), 1);
if (sched_feat(LB_MIN) &&
load < 16 && !env->sd->nr_balance_failed)
goto next;
/*
* Make sure that we don't migrate too much load.
* Nevertheless, let's relax the constraint if the
* scheduler fails to find a good waiting task to
* migrate.
*/
if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
break;
case migrate_util:
util = task_util_est(p);
if (util > env->imbalance)
goto next;
env->imbalance -= util;
break;
case migrate_task:
env->imbalance--;
break;
case migrate_misfit:
/* This is not a misfit task */
if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
break;
}
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
break;
#endif
/*
* We only want to steal up to the prescribed amount of
* load/util/tasks.
*/
if (env->imbalance <= 0)
break;
continue;
next:
list_move(&p->se.group_node, tasks);
}
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
/*
* attach_task() -- attach the task detached by detach_task() to its new rq.
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
check_preempt_curr(rq, p, 0);
}
/*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
struct rq_flags rf;
rq_lock(rq, &rf);
update_rq_clock(rq);
attach_task(rq, p);
rq_unlock(rq, &rf);
}
/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
*/
static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
struct rq_flags rf;
rq_lock(env->dst_rq, &rf);
update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
list_del_init(&p->se.group_node);
attach_task(env->dst_rq, p);
}
rq_unlock(env->dst_rq, &rf);
}
#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
if (cfs_rq->avg.util_avg)
return true;
return false;
}
static inline bool others_have_blocked(struct rq *rq)
{
if (READ_ONCE(rq->avg_rt.util_avg))
return true;
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
if (thermal_load_avg(rq))
return true;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
#endif
return false;
}
static inline void update_blocked_load_tick(struct rq *rq)
{
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
{
if (!has_blocked)
rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
unsigned long thermal_pressure;
bool decayed;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
*done = false;
return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq, *pos;
bool decayed = false;
int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
}
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
* decayed cfs_rqs linger on the list.
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
*done = false;
}
return decayed;
}
/*
* Compute the hierarchical load factor for cfs_rq and all its ancestors.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parent's load.
*/
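/*
* Illustrative numbers: if a group entity has a load_avg of 512 and its
* parent cfs_rq has a total load_avg of 1024 with an h_load of 2048,
* the child cfs_rq inherits h_load = 2048 * 512 / (1024 + 1) ~= 1023.
*/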
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
unsigned long now = jiffies;
unsigned long load;
if (cfs_rq->last_h_load_update == now)
return;
WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
if (!se) {
cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
}
}
static unsigned long task_h_load(struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq = &rq->cfs;
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
if (cfs_rq_has_blocked(cfs_rq))
*done = false;
return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
#endif
static void update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
update_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}
/********** Helpers for find_busiest_group ************************/
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long group_capacity;
unsigned long group_util; /* Total utilization over the CPUs of the group */
unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
unsigned int sum_nr_running; /* Nr of tasks running in the group */
unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
};
/*
* sd_lb_stats - Structure to store the statistics of a sched_domain
* during load balancing.
*/
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
* We must however set busiest_stat::group_type and
* busiest_stat::idle_cpus to the worst busiest group because
* update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
.idle_cpus = UINT_MAX,
.group_type = group_has_spare,
},
};
}
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return 1;
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
* avg_thermal.load_avg tracks thermal pressure and the weighted
* average uses the actual delta of the max capacity (load).
*/
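/*
* Rough example with CONFIG_HAVE_SCHED_AVG_IRQ (illustrative numbers):
* max = 1024, rt = 100, dl = 50, thermal = 0 and irq = 64 leaves
* free = 874, scaled down to 874 * (1024 - 64) / 1024 ~= 819.
*/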
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
free = max - used;
return scale_irq_capacity(free, irq, max);
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
trace_android_rvh_update_cpu_capacity(cpu, &capacity);
cpu_rq(cpu)->cpu_capacity = capacity;
trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long capacity, min_capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
sdg->sgc->next_update = jiffies + interval;
if (!child) {
update_cpu_capacity(sd, cpu);
return;
}
capacity = 0;
min_capacity = ULONG_MAX;
max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
* SD_OVERLAP domains cannot assume that child groups
* span the current group.
*/
for_each_cpu(cpu, sched_group_span(sdg)) {
unsigned long cpu_cap = capacity_of(cpu);
capacity += cpu_cap;
min_capacity = min(cpu_cap, min_capacity);
max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
*/
group = child->groups;
do {
struct sched_group_capacity *sgc = group->sgc;
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = min_capacity;
sdg->sgc->max_capacity = max_capacity;
}
/*
* Check whether the capacity of the rq has been noticeably reduced by side
* activity. The imbalance_pct is used for the threshold.
* Return true if the capacity is reduced.
*/
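/*
* For example, with an imbalance_pct of 117 this reports a reduced
* capacity once less than roughly 85% of the original capacity is left
* for CFS tasks (100 / 117 ~= 0.85).
*/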
static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
(rq->cpu_capacity_orig * 100));
}
/*
* Check whether a rq has a misfit task and if it looks like we can actually
* help that task: we can migrate the task to a CPU of higher capacity, or
* the task's current CPU is heavily pressured.
*/
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
check_cpu_capacity(rq, sd));
}
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
* Something like:
*
* { 0 1 2 3 } { 4 5 6 7 }
* * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
* cpu 3 and leave one of the CPUs in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
* by noticing the lower domain failed to reach balance and had difficulty
* moving tasks due to affinity constraints.
*
* When this is detected, this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
* find_busiest_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
* group imbalance and decide the groups need to be balanced again. A most
* subtle and fragile situation.
*/
static inline int sg_imbalanced(struct sched_group *group)
{
return group->sgc->imbalance;
}
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
* We consider that a group has spare capacity if the number of tasks is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
* account the variance of the tasks' load and to return true if the available
* capacity is meaningful for the load balancer.
* As an example, an available capacity of 1% can appear but it doesn't
* bring any benefit to the load balancer.
*/
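/*
* For example, with an imbalance_pct of 117 the utilization check treats
* a group as having spare capacity while its utilization stays below
* roughly 85% of its capacity (capacity * 100 > util * 117).
*/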
static inline bool
group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
if ((sgs->group_capacity * imbalance_pct) <
(sgs->group_runnable * 100))
return false;
if ((sgs->group_capacity * 100) >
(sgs->group_util * imbalance_pct))
return true;
return false;
}
/*
* group_is_overloaded returns true if the group has more tasks than it can
* handle.
* group_is_overloaded is not equal to !group_has_capacity because a group
* with exactly the right number of tasks has no spare capacity left but is not
* overloaded so both group_has_capacity and group_is_overloaded return
* false.
*/
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
(sgs->group_util * imbalance_pct))
return true;
if ((sgs->group_capacity * imbalance_pct) <
(sgs->group_runnable * 100))
return true;
return false;
}
static inline enum
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
struct sg_lb_stats *sgs)
{
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
if (sgs->group_asym_packing)
return group_asym_packing;
if (sgs->group_misfit_task_load)
return group_misfit_task;
if (!group_has_capacity(imbalance_pct, sgs))
return group_fully_busy;
return group_has_spare;
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group,
struct sg_lb_stats *sgs,
int *sg_status)
{
int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
if (cpu_overutilized(i))
*sg_status |= SG_OVERUTILIZED;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
/*
* No need to call idle_cpu() if nr_running is not 0
*/
if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
/* Idle cpu can't have misfit task */
continue;
}
if (local_group)
continue;
/* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
*sg_status |= SG_OVERLOAD;
}
}
/* Check if dst CPU is idle and preferred to this group */
if (env->sd->flags & SD_ASYM_PACKING &&
env->idle != CPU_NOT_IDLE &&
sgs->sum_h_nr_running &&
sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
sgs->group_asym_packing = 1;
}
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
*
* Return: %true if @sg is a busier group than the previously selected
* busiest group. %false otherwise.
*/
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
/* Make sure that there is at least one task to pull */
if (!sgs->sum_h_nr_running)
return false;
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
if (sgs->group_type == group_misfit_task &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
/*
* The candidate and the current busiest group are the same type of
* group. Let's check which one is the busiest according to the type.
*/
switch (sgs->group_type) {
case group_overloaded:
/* Select the overloaded group with highest avg_load. */
if (sgs->avg_load <= busiest->avg_load)
return false;
break;
case group_imbalanced:
/*
* Select the 1st imbalanced group as we don't have any way to
* choose one over another.
*/
return false;
case group_asym_packing:
/* Prefer to move work away from the lowest priority CPUs */
if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
return false;
break;
case group_misfit_task:
/*
* If we have more than one misfit sg go with the biggest
* misfit.
*/
if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
return false;
break;
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
* theory, there is no need to pull task from such kind of
* group because tasks have all compute capacity that they need
* but we can still improve the overall throughput by reducing
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
* select the 1st one.
*/
if (sgs->avg_load <= busiest->avg_load)
return false;
break;
case group_has_spare:
/*
* Select the non-overloaded group with the lowest number of idle
* CPUs and the highest number of running tasks. We could also
* compare the spare capacity, which is more stable, but a group
* can end up with less spare capacity yet more idle CPUs, which
* means fewer opportunities to pull tasks.
*/
if (sgs->idle_cpus > busiest->idle_cpus)
return false;
else if ((sgs->idle_cpus == busiest->idle_cpus) &&
(sgs->sum_nr_running <= busiest->sum_nr_running))
return false;
break;
}
/*
* Candidate sg has no more than one task per CPU and has higher
* per-CPU capacity. Migrating tasks to less capable CPUs may harm
* throughput. Maximize throughput, power/energy consequences are not
* considered.
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
(capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
return false;
return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
if (rq->nr_running > rq->nr_numa_running)
return regular;
if (rq->nr_running > rq->nr_preferred_running)
return remote;
return all;
}
#else
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
}
static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
#endif /* CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
/*
* task_running_on_cpu - return 1 if @p is running on @cpu.
*/
static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return 0;
if (task_on_rq_queued(p))
return 1;
return 0;
}
/**
* idle_cpu_without - would a given CPU be idle without p ?
* @cpu: the processor on which idleness is tested.
* @p: task which should be ignored.
*
* Return: 1 if the CPU would be idle. 0 otherwise.
*/
static int idle_cpu_without(int cpu, struct task_struct *p)
{
struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle && rq->curr != p)
return 0;
/*
* rq->nr_running can't be used but an updated version without the
* impact of p on cpu must be used instead. The updated nr_running
* must be computed and tested before calling idle_cpu_without().
*/
#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
#endif
return 1;
}
/*
* update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
* @sd: The sched_domain level to look for idlest group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @p: The task for which we look for the idlest group/CPU.
*/
static inline void update_sg_wakeup_stats(struct sched_domain *sd,
struct sched_group *group,
struct sg_lb_stats *sgs,
struct task_struct *p)
{
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
/* Assume that the task can't fit on any CPU of the group */
if (sd->flags & SD_ASYM_CPUCAPACITY)
sgs->group_misfit_task_load = 1;
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
sgs->group_load += cpu_load_without(rq, p);
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
/*
* No need to call idle_cpu_without() if nr_running is not 0
*/
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
/* Check if task fits in the CPU */
if (sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load &&
task_fits_cpu(p, i))
sgs->group_misfit_task_load = 0;
}
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
/*
* Computing avg_load makes sense only when group is fully busy or
* overloaded
*/
if (sgs->group_type == group_fully_busy ||
sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
static bool update_pick_idlest(struct sched_group *idlest,
struct sg_lb_stats *idlest_sgs,
struct sched_group *group,
struct sg_lb_stats *sgs)
{
if (sgs->group_type < idlest_sgs->group_type)
return true;
if (sgs->group_type > idlest_sgs->group_type)
return false;
/*
* The candidate and the current idlest group are the same type of
* group. Let's check which one is the idlest according to the type.
*/
switch (sgs->group_type) {
case group_overloaded:
case group_fully_busy:
/* Select the group with lowest avg_load. */
if (idlest_sgs->avg_load <= sgs->avg_load)
return false;
break;
case group_imbalanced:
case group_asym_packing:
/* Those types are not used in the slow wakeup path */
return false;
case group_misfit_task:
/* Select group with the highest max capacity */
if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
return false;
break;
case group_has_spare:
/* Select group with most idle CPUs */
if (idlest_sgs->idle_cpus > sgs->idle_cpus)
return false;
/* Select group with lowest group_util */
if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
idlest_sgs->group_util <= sgs->group_util)
return false;
break;
}
return true;
}
/*
* Allow a NUMA imbalance if the number of busy CPUs is less than 25% of the domain.
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
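/*
* For example, with a group weight of 16 the imbalance is tolerated
* while fewer than 4 CPUs (16 >> 2) are busy.
*/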
static inline bool
allow_numa_imbalance(unsigned int running, unsigned int weight)
{
return (running < (weight >> 2));
}
/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
struct sg_lb_stats *sgs;
unsigned long imbalance;
struct sg_lb_stats idlest_sgs = {
.avg_load = UINT_MAX,
.group_type = group_overloaded,
};
do {
int local_group;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
p->cpus_ptr))
continue;
/* Skip over this group if no cookie matched */
if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
if (local_group) {
sgs = &local_sgs;
local = group;
} else {
sgs = &tmp_sgs;
}
update_sg_wakeup_stats(sd, group, sgs, p);
if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
idlest = group;
idlest_sgs = *sgs;
}
} while (group = group->next, group != sd->groups);
/* There is no idlest group to push tasks to */
if (!idlest)
return NULL;
/* The local group has been skipped because of CPU affinity */
if (!local)
return idlest;
/*
* If the local group is idler than the selected idlest group
* don't try and push the task.
*/
if (local_sgs.group_type < idlest_sgs.group_type)
return NULL;
/*
* If the local group is busier than the selected idlest group
* try and push the task.
*/
if (local_sgs.group_type > idlest_sgs.group_type)
return idlest;
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
/* Calculate allowed imbalance based on load */
imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
/*
* When comparing groups across NUMA domains, it's possible for
* the local domain to be very lightly loaded relative to the
* remote domains but "imbalance" skews the comparison making
* remote CPUs look much more favourable. When considering
* cross-domain, add imbalance to the load on the remote node
* and consider staying local.
*/
if ((sd->flags & SD_NUMA) &&
((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
return NULL;
/*
* If the local group is less loaded than the selected
* idlest group don't try and push any tasks.
*/
if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
return NULL;
if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
return NULL;
break;
case group_imbalanced:
case group_asym_packing:
/* Those types are not used in the slow wakeup path */
return NULL;
case group_misfit_task:
/* Select group with the highest max capacity */
if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
return NULL;
break;
case group_has_spare:
if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
* If there is spare capacity at NUMA, try to select
* the preferred node
*/
if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
return NULL;
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
#endif
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
* would remain below threshold where an imbalance is
* allowed. If there is a real need of migration,
* periodic load balance will take care of it.
*/
if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, local_sgs.group_weight))
return NULL;
}
/*
* Select the group with the highest number of idle CPUs. We could
* also compare the utilization, which is more stable, but a group
* can end up with less spare capacity yet more idle CPUs, which
* means more opportunity to run a task.
*/
if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
return NULL;
break;
}
return idlest;
}
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
if (local_group) {
sds->local = sg;
sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update))
update_group_capacity(env->sd, env->dst_cpu);
}
update_sg_lb_stats(env, sg, sgs, &sg_status);
if (local_group)
goto next_group;
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
struct root_domain *rd = env->dst_rq->rd;
/* update overload indicator if we are at root domain */
WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
struct root_domain *rd = env->dst_rq->rd;
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
}
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight)
{
if (!allow_numa_imbalance(dst_running, dst_weight))
return imbalance;
/*
* Allow a small imbalance based on a simple pair of communicating
* tasks that remain local when the destination is lightly loaded.
*/
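/*
* In other words, an imbalance of up to NUMA_IMBALANCE_MIN tasks is
* ignored so that e.g. a waker/wakee pair can stay on the same node.
*/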
if (imbalance <= NUMA_IMBALANCE_MIN)
return 0;
return imbalance;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
/* Set imbalance to allow misfit tasks to be balanced. */
env->migration_type = migrate_misfit;
env->imbalance = 1;
return;
}
if (busiest->group_type == group_asym_packing) {
/*
* In case of asym capacity, we will try to migrate all load to
* the preferred CPU.
*/
env->migration_type = migrate_task;
env->imbalance = busiest->sum_h_nr_running;
return;
}
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure CPU-load equilibrium, try to move any task to fix
* the imbalance. The next load balance will take care of
* balancing back the system.
*/
env->migration_type = migrate_task;
env->imbalance = 1;
return;
}
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
*/
if (local->group_type == group_has_spare) {
if ((busiest->group_type > group_fully_busy) &&
!(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
* in busiest or busiest still being overloaded but
* there is no simple way to directly compute the
* amount of load to migrate in order to balance the
* system.
*/
env->migration_type = migrate_util;
env->imbalance = max(local->group_capacity, local->group_util) -
local->group_util;
/*
* In some cases, the group's utilization is max or even
* higher than capacity because of migrations but the
* local CPU is (newly) idle. There is at least one
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
return;
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
} else {
/*
* If there is no overload, we just want to even out the number of
* idle CPUs.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
}
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
local->sum_nr_running + 1, local->group_weight);
}
return;
}
/*
* Local is fully busy but has to take more load to relieve the
* busiest group
*/
if (local->group_type < group_overloaded) {
/*
* Local will become overloaded so the avg_load metrics are
* finally needed.
*/
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
*/
if (local->avg_load >= busiest->avg_load) {
env->imbalance = 0;
return;
}
}
/*
* Both groups are or will become overloaded and we're trying to get all
* the CPUs to the average_load, so we don't want to push ourselves
* above the average load, nor do we wish to reduce the max loaded CPU
* below the average load. At the same time, we also don't want to
* reduce the group load below the group capacity. Thus we look for
* the minimum possible imbalance.
*/
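/*
* Illustrative numbers: with avg loads of 1400 (busiest), 1000 (domain)
* and 900 (local) and group capacities of 1024, the imbalance is
* min(400, 100) * 1024 / 1024 = 100, i.e. just enough load to bring
* local up to the domain average.
*/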
env->migration_type = migrate_load;
env->imbalance = min(
(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
}
/******* find_busiest_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
*
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
* misfit_task force N/A N/A N/A force force
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
*
* N/A : Not Applicable because already filtered while updating
* statistics.
* balanced : The system is balanced for these 2 groups.
* force : Calculate the imbalance as load migration is probably needed.
* avg_load : Only if imbalance is significant enough.
* nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
* different in groups.
*/
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
init_sd_lb_stats(&sds);
/*
* Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
if (sched_energy_enabled()) {
struct root_domain *rd = env->dst_rq->rd;
int out_balance = 1;
trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
&out_balance);
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
&& out_balance)
goto out_balanced;
}
local = &sds.local_stat;
busiest = &sds.busiest_stat;
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
* isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->group_type > busiest->group_type)
goto out_balanced;
/*
* When groups are overloaded, use the avg_load to ensure fairness
* between tasks.
*/
if (local->group_type == group_overloaded) {
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
sds.total_capacity;
/*
* Don't pull any tasks if this group is already above the
* domain average load.
*/
if (local->avg_load >= sds.avg_load)
goto out_balanced;
/*
* If the busiest group is more loaded, use imbalance_pct to be
* conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
/* Try to move all excess tasks to child's sibling domain */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
if (env->idle == CPU_NOT_IDLE)
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull a task.
*/
goto out_balanced;
if (busiest->group_weight > 1 &&
local->idle_cpus <= (busiest->idle_cpus + 1))
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
* group wrt idle CPUs, it is balanced. The imbalance
* becomes significant if the diff is greater than 1,
* otherwise we might end up just moving the imbalance
* to another group. Of course this applies only if
* there is more than 1 CPU per group.
*/
goto out_balanced;
if (busiest->sum_h_nr_running == 1)
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the CPUs in the group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
unsigned int busiest_nr = 0;
int i, done = 0;
trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
&busiest, &done);
if (done)
return busiest;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, load, util;
unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
rt = fbq_classify_rq(rq);
/*
* We classify groups/runqueues into three groups:
* - regular: there are !numa tasks
* - remote: there are numa tasks that run on the 'wrong' node
* - all: there is no distinction
*
* In order to avoid migrating ideally placed numa tasks,
* ignore those when there are better options.
*
* If we ignore the actual busiest queue to migrate another
* task, the next balance pass can still reduce the busiest
* queue by moving tasks around inside the node.
*
* If we cannot move enough load due to this classification
* the next pass will adjust the group classification and
* allow migration of more tasks.
*
* Both cases only affect the total convergence complexity.
*/
if (rt > env->fbq_type)
continue;
nr_running = rq->cfs.h_nr_running;
if (!nr_running)
continue;
capacity = capacity_of(i);
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
* eventually lead to active_balancing high->low capacity.
* Higher per-CPU capacity is considered better than balancing
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
!capacity_greater(capacity_of(env->dst_cpu), capacity) &&
nr_running == 1)
continue;
switch (env->migration_type) {
case migrate_load:
/*
* When comparing with load imbalance, use cpu_load()
* which is not scaled with the CPU capacity.
*/
load = cpu_load(rq);
if (nr_running == 1 && load > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
break;
/*
* For the load comparisons with the other CPUs,
* consider the cpu_load() scaled with the CPU
* capacity, so that the load can be moved away
* from the CPU that is potentially running at a
* lower capacity.
*
* Thus we're looking for max(load_i / capacity_i),
* crosswise multiplication to rid ourselves of the
* division works out to:
* load_i * capacity_j > load_j * capacity_i;
* where j is our previous maximum.
*/
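/*
* For example, a load of 600 on a capacity-512 CPU beats a load of
* 800 on a capacity-1024 CPU since 600 * 1024 > 800 * 512.
*/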
if (load * busiest_capacity > busiest_load * capacity) {
busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
}
break;
case migrate_util:
util = cpu_util(cpu_of(rq));
/*
* Don't try to pull utilization from a CPU with one
* running task. Whatever its utilization, we will fail to
* detach the task.
*/
if (nr_running <= 1)
continue;
if (busiest_util < util) {
busiest_util = util;
busiest = rq;
}
break;
case migrate_task:
if (busiest_nr < nr_running) {
busiest_nr = nr_running;
busiest = rq;
}
break;
case migrate_misfit:
/*
* For ASYM_CPUCAPACITY domains with misfit tasks we
* simply seek the "biggest" misfit task.
*/
if (rq->misfit_task_load > busiest_load) {
busiest_load = rq->misfit_task_load;
busiest = rq;
}
break;
}
}
return busiest;
}
/*
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* it works so long as it is large enough.
*/
#define MAX_PINNED_INTERVAL 512
static inline bool
asym_active_balance(struct lb_env *env)
{
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* lower priority CPUs in order to pack all tasks in the
* highest priority CPUs.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
sched_asym_prefer(env->dst_cpu, env->src_cpu);
}
static inline bool
imbalanced_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
/*
* The imbalanced case includes the case of pinned tasks preventing a fair
* distribution of the load on the system, but also the case where threads
* are not evenly distributed on a system with spare capacity.
*/
if ((env->migration_type == migrate_task) &&
(sd->nr_balance_failed > sd->cache_nice_tries+2))
return 1;
return 0;
}
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (asym_active_balance(env))
return 1;
if (imbalanced_active_balance(env))
return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
* It's worth migrating the task if the src_cpu's capacity is reduced
* because of other sched_class activity or IRQs, provided more
* capacity stays available on dst_cpu.
*/
if ((env->idle != CPU_NOT_IDLE) &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
}
if (env->migration_type == migrate_misfit)
return 1;
return 0;
}
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
int cpu;
/*
* Ensure the balancing environment is consistent; this can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
return 0;
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE)
return 1;
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
if (!idle_cpu(cpu))
continue;
/* Are we the first idle CPU? */
return cpu == env->dst_cpu;
}
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_span(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
*continue_balancing = 0;
goto out_balanced;
}
group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
/* Clear this flag as soon as we find a pullable task */
env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
rq_lock_irqsave(busiest, &rf);
env.src_rq_rf = &rf;
update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent re-selecting dst_cpu via env's CPUs */
__cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with the same src_cpu.
*/
goto more_balance;
}
/*
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
}
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
__cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
* active CPUs remaining as possible busiest CPUs to
* pull load from which are not contained within the
* destination group that is receiving any migrated
* load.
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
if (!ld_moved) {
schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, to pollute the failure counter, causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
unsigned long flags;
raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
/* Record that we found at least one task that could run on this_cpu */
env.flags &= ~LBF_ALL_PINNED;
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
}
} else {
sd->nr_balance_failed = 0;
}
if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag only if other tasks got
* a chance to move and fix the imbalance.
*/
if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Leave the imbalance flag set so the parent level
* can try to migrate them.
*/
schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
ld_moved = 0;
/*
* newidle_balance() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
/* tune up the balancing interval */
if ((env.flags & LBF_ALL_PINNED &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
return ld_moved;
}
static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
unsigned long interval = sd->balance_interval;
if (cpu_busy)
interval *= sd->busy_factor;
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
/*
* Reduce likelihood of busy balancing at higher domains racing with
* balancing at lower domains by preventing their balancing periods
* from being multiples of each other.
*/
if (cpu_busy)
interval -= 1;
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
}
static inline void
update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
/* used by idle balance, so cpu_busy = 0 */
interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
*next_balance = next;
}
/*
* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
*/
static int active_load_balance_cpu_stop(void *data)
{
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
struct rq_flags rf;
rq_lock_irq(busiest_rq, &rf);
/*
* Between queueing the stop-work and running it is a hole in which
* CPUs can become inactive. We should not move tasks from or to
* inactive CPUs.
*/
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
/* Make sure the requested CPU hasn't gone down in the meantime: */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
BUG_ON(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
if (likely(sd)) {
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
.flags = LBF_ACTIVE_LB,
.src_rq_rf = &rf,
};
schedstat_inc(sd->alb_count);
update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
schedstat_inc(sd->alb_pushed);
/* Active balancing done, reset the failure counter. */
sd->nr_balance_failed = 0;
} else {
schedstat_inc(sd->alb_failed);
}
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
return 0;
}
static DEFINE_SPINLOCK(balancing);
/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
{
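/*
* For example, with HZ == 250 and 8 CPUs online this works out to
* 250 * 8 / 10 == 200 jiffies, i.e. roughly 800ms.
*/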
max_load_balance_interval = HZ*num_online_cpus()/10;
}
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize, need_decay = 0;
u64 max_cost = 0;
trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
if (!continue_balancing)
return;
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains. Decay ~1% per second.
*/
if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
sd->max_newidle_lb_cost =
(sd->max_newidle_lb_cost * 253) / 256;
sd->next_decay_max_lb_cost = jiffies + HZ;
need_decay = 1;
}
max_cost += sd->max_newidle_lb_cost;
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!continue_balancing) {
if (need_decay)
continue;
break;
}
interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
}
if (need_decay) {
/*
* Ensure the rq-wide value also decays but keep it at a
* reasonable floor to avoid funnies with rq->avg_idle.
*/
rq->max_idle_balance_cost =
max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to a null domain, for example, it will not be
* updated.
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
}
static inline int on_null_domain(struct rq *rq)
{
return unlikely(!rcu_dereference_sched(rq->sd));
}
#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notices that idle rebalancing may be
* needed, it will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
* - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not set
* anywhere yet.
*/
static inline int find_new_ilb(void)
{
int ilb = -1;
const struct cpumask *hk_mask;
trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
if (ilb >= 0)
return ilb;
hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
if (ilb == smp_processor_id())
continue;
if (idle_cpu(ilb))
return ilb;
}
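/* No suitable idle housekeeping CPU found; callers treat >= nr_cpu_ids as "none". */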
return nr_cpu_ids;
}
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
* idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
/*
* Increase nohz.next_balance only when a full ilb is triggered, but
* not if we only update stats.
*/
if (flags & NOHZ_BALANCE_KICK)
nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
/*
* Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; whoever sets
* the first flag owns it; it is cleared by nohz_csd_func().
*/
flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
if (flags & NOHZ_KICK_MASK)
return;
/*
* This way we generate an IPI on the target CPU which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
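/*
 * The resulting kick path is: nohz_balancer_kick() -> kick_ilb() -> IPI to
 * ilb_cpu -> nohz_csd_func() (kernel/sched/core.c), which raises
 * SCHED_SOFTIRQ on the idle CPU; run_rebalance_domains() then performs
 * nohz_idle_balance() there.
 */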
/*
* Current decision point for kicking the idle load balancer in the presence
* of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
unsigned int flags = 0;
int done = 0;
if (unlikely(rq->idle_balance))
return;
/*
* We may recently have been in ticked or tickless idle mode. At the first
* busy tick after returning from idle, we will update the busy stats.
*/
nohz_balance_exit_idle(rq);
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
if (READ_ONCE(nohz.has_blocked) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance))
goto out;
trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
if (done)
goto out;
if (rq->nr_running >= 2) {
flags = NOHZ_KICK_MASK;
goto out;
}
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (sd) {
/*
* If there's a CFS task and the current CPU has reduced
* capacity; kick the ILB to see if there's a better CPU to run
* on.
*/
if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
if (check_misfit_status(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
/*
* For asymmetric systems, we do not want to nicely balance
* cache use, instead we want to embrace asymmetry and only
* ensure tasks have enough CPU capacity.
*
* Skip the LLC logic because it's not relevant in that case.
*/
goto unlock;
}
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
* increase the overall cache use), we need some less-loaded LLC
* domain to pull some load. Likewise, we may need to spread
* load within the current LLC domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
* the others are - so just get a nohz balance going if it looks
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
unlock:
rcu_read_unlock();
out:
if (flags)
kick_ilb(flags);
}
static void set_cpu_sd_state_busy(int cpu)
{
struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
void nohz_balance_exit_idle(struct rq *rq)
{
SCHED_WARN_ON(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
rq->nohz_tick_stopped = 0;
cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
static void set_cpu_sd_state_idle(int cpu)
{
struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
/*
* This routine will record that the CPU is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
SCHED_WARN_ON(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
return;
/*
* This can be set safely without rq->lock held.
* If a clear happens, it will have observed the latest additions, because
* rq->lock is held during both the check and the clear.
*/
rq->has_blocked_load = 1;
/*
* The tick is still stopped but load could have been added in the
* meantime. We set the nohz.has_blocked flag to trigger a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
* of nohz.has_blocked can only happen after checking the new load.
*/
if (rq->nohz_tick_stopped)
goto out;
/* If we're a completely isolated CPU, we don't play: */
if (on_null_domain(rq))
return;
rq->nohz_tick_stopped = 1;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
* store.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
out:
/*
* Each time a CPU enters idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs.
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
static bool update_nohz_stats(struct rq *rq)
{
unsigned int cpu = rq->cpu;
if (!rq->has_blocked_load)
return false;
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
update_blocked_averages(cpu);
return rq->has_blocked_load;
}
/*
* Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* task movement, depending on the flags.
*/
static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
enum cpu_idle_type idle)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
unsigned long next_balance = now + 60*HZ;
bool has_blocked_load = false;
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a CPU enters idle in the meantime, it will
* set the has_blocked flag and trigger another update of idle load.
* Because a CPU that becomes idle is added to idle_cpus_mask before
* setting the flag, we are sure not to clear the state and not to
* check the load of an idle CPU.
*/
WRITE_ONCE(nohz.has_blocked, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
* store from nohz_balance_enter_idle().
*/
smp_mb();
/*
* Start with the CPU after this_cpu so that we end with this_cpu, giving
* the other idle CPUs a chance to pull load first.
*/
for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
if (!idle_cpu(balance_cpu))
continue;
/*
* If this CPU gets work to do, stop the load balancing
* work being done for other CPUs. Next load
* balancing owner will pick it up.
*/
if (need_resched()) {
has_blocked_load = true;
goto abort;
}
rq = cpu_rq(balance_cpu);
has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
rebalance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
next_balance = rq->next_balance;
update_next_balance = 1;
}
}
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to a null domain, for example, it will not be
* updated.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
}
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the CPUs for which scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
unsigned int flags = this_rq->nohz_idle_balance;
if (!flags)
return false;
this_rq->nohz_idle_balance = 0;
if (idle != CPU_IDLE)
return false;
_nohz_idle_balance(this_rq, flags, idle);
return true;
}
/*
* Check if we need to run the ILB for updating blocked load before entering
* idle state.
*/
void nohz_run_idle_balance(int cpu)
{
unsigned int flags;
flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
/*
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
* (i.e. NOHZ_STATS_KICK is not also pending), since that softirq would
* do the same update.
*/
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
}
static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
/*
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
return;
/* Will wake up very soon. No time for doing anything else. */
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
/* Don't need to update blocked load of idle CPUs. */
if (!READ_ONCE(nohz.has_blocked) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
/*
* Set the need to trigger ILB in order to update blocked load
* before entering idle state.
*/
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
return false;
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* newidle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
* < 0 - we released the lock and there are !fair tasks present
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
int done = 0;
trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
if (done)
return pulled_task;
update_misfit_status(NULL, this_rq);
/*
* There is a task waiting to run. No need to search for one.
* Return 0; the task will be enqueued when switching to idle.
*/
if (this_rq->ttwu_pending)
return 0;
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
/*
* Do not pull tasks towards !active CPUs...
*/
if (!cpu_active(this_cpu))
return 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!READ_ONCE(this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
goto out;
}
raw_spin_rq_unlock(this_rq);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
int continue_balancing = 1;
u64 t0, domain_cost;
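/*
 * Stop if the balancing done so far plus this domain's worst-case
 * newidle cost would exceed the expected remaining idle time.
 */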
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
}
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
domain_cost = sched_clock_cpu(this_cpu) - t0;
if (domain_cost > sd->max_newidle_lb_cost)
sd->max_newidle_lb_cost = domain_cost;
curr_cost += domain_cost;
}
update_next_balance(sd, &next_balance);
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
if (pulled_task || this_rq->nr_running > 0 ||
this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
if (pulled_task)
this_rq->idle_stamp = 0;
else
nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
return pulled_task;
}
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
/*
* If this CPU has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle CPUs whose ticks are
* stopped. Do nohz_idle_balance *before* rebalance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
if (nohz_idle_balance(this_rq, idle))
return;
/* normal load balance */
update_blocked_averages(this_rq->cpu);
rebalance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
void trigger_load_balance(struct rq *rq)
{
/*
* Don't need to rebalance while attached to NULL domain or
* runqueue CPU is not active
*/
if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
nohz_balancer_kick(rq);
}
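/*
 * Periodic path: scheduler_tick() -> trigger_load_balance() ->
 * raise_softirq(SCHED_SOFTIRQ) -> run_rebalance_domains(), the handler
 * registered via open_softirq() in init_sched_fair_class().
 */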
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
update_runtime_enabled(rq);
}
static void rq_offline_fair(struct rq *rq)
{
update_sysctl();
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
#ifdef CONFIG_SCHED_CORE
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
return (rtime * min_nr_tasks > slice);
}
#define MIN_NR_TASKS_DURING_FORCEIDLE 2
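/*
 * With MIN_NR_TASKS_DURING_FORCEIDLE == 2, __entity_slice_used() reports
 * true once the entity has run for more than half of its sched_slice(),
 * since rtime * 2 > slice is equivalent to rtime > slice / 2.
 */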
static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
{
if (!sched_core_enabled(rq))
return;
/*
* If runqueue has only one task which used up its slice and
* if the sibling is forced idle, then trigger schedule to
* give forced idle task a chance.
*
* sched_slice() considers only this active rq and it gets the
* whole slice. But during force idle, we have siblings acting
* like a single runqueue and hence we need to consider runnable
* tasks on this CPU and the forced idle CPU. Ideally, we should
* go through the forced idle rq, but that would be a perf hit.
* We can assume that the forced idle CPU has at least
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
/*
* se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (forceidle) {
if (cfs_rq->forceidle_seq == fi_seq)
break;
cfs_rq->forceidle_seq = fi_seq;
}
cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
}
}
void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
{
struct sched_entity *se = &p->se;
if (p->sched_class != &fair_sched_class)
return;
se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
}
bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
{
struct rq *rq = task_rq(a);
struct sched_entity *sea = &a->se;
struct sched_entity *seb = &b->se;
struct cfs_rq *cfs_rqa;
struct cfs_rq *cfs_rqb;
s64 delta;
SCHED_WARN_ON(task_rq(b)->core != rq->core);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Find an se in the hierarchy for tasks a and b, such that the se's
* are immediate siblings.
*/
while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
int sea_depth = sea->depth;
int seb_depth = seb->depth;
if (sea_depth >= seb_depth)
sea = parent_entity(sea);
if (sea_depth <= seb_depth)
seb = parent_entity(seb);
}
se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
#else
cfs_rqa = &task_rq(a)->cfs;
cfs_rqb = &task_rq(b)->cfs;
#endif
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
* min_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
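/* A positive delta means a's normalized vruntime is larger, i.e. a has lower priority than b. */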
return delta > 0;
}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
/*
* scheduler tick hitting a task of our scheduling class.
*
* NOTE: This function can be called remotely by the tick offload that
* goes along full dynticks. Therefore no local assumption can be made
* and everything must be accessed through the @rq and @curr passed in
* parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
task_tick_core(rq, curr);
}
/*
* called on fork with the child task as argument from the parent's context
* - child not yet on the tasklist
* - preemption disabled
*/
static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
struct rq_flags rf;
rq_lock(rq, &rf);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr) {
update_curr(cfs_rq);
se->vruntime = curr->vruntime;
}
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
resched_curr(rq);
}
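/*
 * Make vruntime relative to this cfs_rq's min_vruntime; it is re-normalized
 * in enqueue_entity(), since the child may be enqueued on a different CPU.
 */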
se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}
/*
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
if (!task_on_rq_queued(p))
return;
if (rq->cfs.nr_running == 1)
return;
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (task_current(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
check_preempt_curr(rq, p, 0);
}
static inline bool vruntime_normalized(struct task_struct *p)
{
struct sched_entity *se = &p->se;
/*
* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
* the dequeue_entity(.flags=0) will already have normalized the
* vruntime.
*/
if (p->on_rq)
return true;
/*
* When !on_rq, vruntime of the task has usually NOT been normalized.
* But there are some cases where it has already been normalized:
*
* - A forked child which is waiting for being woken up by
* wake_up_new_task().
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
if (!se->sum_exec_runtime ||
(READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make them
* visible to the root.
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq;
list_add_leaf_cfs_rq(cfs_rq_of(se));
/* Start to propagate at parent */
se = se->parent;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
if (!cfs_rq_throttled(cfs_rq)) {
update_load_avg(cfs_rq, se, UPDATE_TG);
list_add_leaf_cfs_rq(cfs_rq);
continue;
}
if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
}
#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif
static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real depth could have been changed (only the FAIR
* class maintains a depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (!vruntime_normalized(p)) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
*/
place_entity(cfs_rq, se, 0);
se->vruntime -= cfs_rq->min_vruntime;
}
detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
if (task_current(rq, p))
resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
}
}
/*
* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes an MRU list.
*/
list_move(&se->group_node, &rq->cfs_tasks);
}
#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
set_next_entity(cfs_rq, se);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
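/*
 * (u64)(-(1LL << 20)) starts min_vruntime just below the u64 wrap point,
 * presumably so that vruntime wrap-around handling is exercised early
 * rather than only after a very long uptime.
 */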
#ifndef CONFIG_64BIT
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_set_group_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;
set_task_rq(p, task_cpu(p));
se->depth = se->parent ? se->parent->depth + 1 : 0;
}
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Mark that se's cfs_rq has changed -- the task has migrated. */
p->se.avg.last_update_time = 0;
#endif
attach_task_cfs_rq(p);
}
static void task_change_group_fair(struct task_struct *p, int type)
{
switch (type) {
case TASK_SET_GROUP:
task_set_group_fair(p);
break;
case TASK_MOVE_GROUP:
task_move_group_fair(p);
break;
}
}
void free_fair_sched_group(struct task_group *tg)
{
int i;
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
}
kfree(tg->cfs_rq);
kfree(tg->se);
}
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
int i;
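/* tg->cfs_rq and tg->se are arrays of per-CPU pointers, hence the pointer-sized elements below. */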
tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
if (!tg->se)
goto err;
tg->shares = NICE_0_LOAD;
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
goto err;
se = kzalloc_node(sizeof(struct sched_entity),
GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err_free_rq;
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
}
return 1;
err_free_rq:
kfree(cfs_rq);
err:
return 0;
}
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
struct rq_flags rf;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
rq_unlock_irq(rq, &rf);
}
}
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]);
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
if (!tg->cfs_rq[cpu]->on_list)
continue;
rq = cpu_rq(cpu);
raw_spin_rq_lock_irqsave(rq, flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
raw_spin_rq_unlock_irqrestore(rq, flags);
}
}
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
cfs_rq->tg = tg;
cfs_rq->rq = rq;
init_cfs_rq_runtime(cfs_rq);
tg->cfs_rq[cpu] = cfs_rq;
tg->se[cpu] = se;
/* se could be NULL for root_task_group */
if (!se)
return;
if (!parent) {
se->cfs_rq = &rq->cfs;
se->depth = 0;
} else {
se->cfs_rq = parent->my_q;
se->depth = parent->depth + 1;
}
se->my_q = cfs_rq;
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
static DEFINE_MUTEX(shares_mutex);
static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
lockdep_assert_held(&shares_mutex);
/*
* We can't change the weight of the root cgroup.
*/
if (!tg->se[0])
return -EINVAL;
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
if (tg->shares == shares)
return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
struct rq_flags rf;
/* Propagate contribution to hierarchy */
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
update_cfs_group(se);
}
rq_unlock_irqrestore(rq, &rf);
}
return 0;
}
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int ret;
mutex_lock(&shares_mutex);
if (tg_is_idle(tg))
ret = -EINVAL;
else
ret = __sched_group_set_shares(tg, shares);
mutex_unlock(&shares_mutex);
return ret;
}
int sched_group_set_idle(struct task_group *tg, long idle)
{
int i;
if (tg == &root_task_group)
return -EINVAL;
if (idle < 0 || idle > 1)
return -EINVAL;
mutex_lock(&shares_mutex);
if (tg->idle == idle) {
mutex_unlock(&shares_mutex);
return 0;
}
tg->idle = idle;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
grp_cfs_rq->idle = idle;
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
idle_task_delta = grp_cfs_rq->h_nr_running -
grp_cfs_rq->idle_h_nr_running;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
if (!se->on_rq)
break;
cfs_rq->idle_h_nr_running += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
break;
}
next_cpu:
rq_unlock_irqrestore(rq, &rf);
}
/* Idle groups have minimum weight. */
if (tg_is_idle(tg))
__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
else
__sched_group_set_shares(tg, NICE_0_LOAD);
mutex_unlock(&shares_mutex);
return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
void online_fair_sched_group(struct task_group *tg) { }
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
struct sched_entity *se = &task->se;
unsigned int rr_interval = 0;
/*
* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
* idle runqueue:
*/
if (rq->cfs.load.weight)
rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
/*
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.balance = balance_fair,
.pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
};
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
#ifdef CONFIG_NUMA_BALANCING
void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
int node;
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
struct numa_group *ng;
rcu_read_lock();
ng = rcu_dereference(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
if (ng) {
gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
}
rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}
/*
* Helper functions to facilitate extracting info from tracepoints.
*/
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
{
#ifdef CONFIG_SMP
return cfs_rq ? &cfs_rq->avg : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
{
if (!cfs_rq) {
if (str)
strlcpy(str, "(null)", len);
else
return NULL;
}
cfs_rq_tg_path(cfs_rq, str, len);
return str;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
{
return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq ? &rq->avg_rt : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq ? &rq->avg_dl : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
{
#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
return rq ? &rq->avg_irq : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
int sched_trace_rq_cpu(struct rq *rq)
{
return rq ? cpu_of(rq) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
int sched_trace_rq_cpu_capacity(struct rq *rq)
{
return rq ?
#ifdef CONFIG_SMP
rq->cpu_capacity
#else
SCHED_CAPACITY_SCALE
#endif
: -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
return rd ? rd->span : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
int sched_trace_rq_nr_running(struct rq *rq)
{
return rq ? rq->nr_running : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);