* refs/heads/tmp-13855a6: Linux 4.14.157 x86/hyperv: mark hyperv_init as __init function KVM: PPC: Book3S HV: Flush link stack on guest exit to host kernel powerpc/book3s64: Fix link stack flush on context switch powerpc/64s: support nospectre_v2 cmdline option staging: comedi: usbduxfast: usbduxfast_ai_cmdtest rounding error USB: serial: option: add support for Foxconn T77W968 LTE modules USB: serial: option: add support for DW5821e with eSIM support USB: serial: mos7840: fix remote wakeup USB: serial: mos7720: fix remote wakeup USB: serial: mos7840: add USB ID to support Moxa UPort 2210 appledisplay: fix error handling in the scheduled work USB: chaoskey: fix error case of a timeout usb-serial: cp201x: support Mark-10 digital force gauge usbip: Fix uninitialized symbol 'nents' in stub_recv_cmd_submit() usbip: tools: fix fd leakage in the function of read_attr_usbip_status virtio_ring: fix return code on DMA mapping fails media: imon: invalid dereference in imon_touch_event media: cxusb: detect cxusb_ctrl_msg error in query media: b2c2-flexcop-usb: add sanity checking media: uvcvideo: Fix error path in control parsing failure cpufreq: Add NULL checks to show() and store() methods of cpufreq media: usbvision: Fix races among open, close, and disconnect media: vivid: Fix wrong locking that causes race conditions on streaming stop media: vivid: Set vid_cap_streaming and vid_out_streaming to true nfc: port100: handle command failure cleanly nbd: prevent memory leak x86/speculation: Fix redundant MDS mitigation message x86/speculation: Fix incorrect MDS/TAA mitigation status x86/insn: Fix awk regexp warnings ARC: perf: Accommodate big-endian CPU ARM: 8904/1: skip nomap memblocks while finding the lowmem/highmem boundary ocfs2: remove ocfs2_is_o2cb_active() cpufreq: Skip cpufreq resume if it's not suspended arm64: fix for bad_mode() handler to always result in panic net: phy: dp83867: increase SGMII autoneg timer duration net: phy: dp83867: fix speed 10 in sgmii mode 
mm/memory_hotplug: don't access uninitialized memmaps in shrink_zone_span() md/raid10: prevent access of uninitialized resync_pages offset ath9k_hw: fix uninitialized variable data ath10k: Fix a NULL-ptr-deref bug in ath10k_usb_alloc_urb_from_pipe KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved Bluetooth: Fix invalid-free in bcsp_close() cfg80211: call disconnect_wk when AP stops ipv6: Fix handling of LLA with VRF and sockets bound to VRF mm/memory_hotplug: Do not unlock when fails to take the device_hotplug_lock i2c: uniphier-f: fix timeout error after reading 8 bytes spi: omap2-mcspi: Fix DMA and FIFO event trigger size mismatch PCI: keystone: Use quirk to limit MRRS for K2G pinctrl: zynq: Use define directive for PIN_CONFIG_IO_STANDARD pinctrl: lpc18xx: Use define directive for PIN_CONFIG_GPIO_PIN_INT pinctrl: qcom: spmi-gpio: fix gpio-hog related boot issues cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces of: unittest: allow base devicetree to have symbol metadata net: bcmgenet: return correct value 'ret' from bcmgenet_power_down ACPICA: Use %d for signed int print formatting instead of %u vrf: mark skb for multicast or link-local as enslaved to VRF dlm: don't leak kernel pointer to userspace dlm: fix invalid free scsi: lpfc: Correct loss of fc4 type on remote port address change scsi: lpfc: fcoe: Fix link down issue after 1000+ link bounces scsi: megaraid_sas: Fix goto labels in error handling scsi: megaraid_sas: Fix msleep granularity scsi: mpt3sas: Fix driver modifying persistent data in Manufacturing page11 scsi: mpt3sas: Don't modify EEDPTagMode field setting on SAS3.5 HBA devices scsi: mpt3sas: Fix Sync cache command failure during driver unload net: dsa: bcm_sf2: Turn on PHY to allow successful registration rtlwifi: rtl8192de: Fix misleading REG_MCUFWDL information wireless: airo: potential buffer overflow in sprintf() brcmsmac: never log "tid x is not agg'able" by default rtl8xxxu: Fix missing break in 
switch wlcore: Fix the return value in case of error in 'wlcore_vendor_cmd_smart_config_start()' wil6210: fix locking in wmi_call btrfs: avoid link error with CONFIG_NO_AUTO_INLINE audit: print empty EXECVE args clk: sunxi-ng: enable so-said LDOs for A64 SoC's pll-mipi clock openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS sched/fair: Don't increase sd->balance_interval on newidle balance sched/topology: Fix off by one bug net: do not abort bulk send on BQL status ocfs2: fix clusters leak in ocfs2_defrag_extent() ocfs2: don't put and assigning null to bh allocated outside arm64: makefile fix build of .i file in external module case ntb: intel: fix return value for ndev_vec_mask() ntb_netdev: fix sleep time mismatch net: hns3: bugfix for buffer not free problem during resetting igb: shorten maximum PHC timecounter update interval mm/memory_hotplug: make add_memory() take the device_hotplug_lock fs/hfs/extent.c: fix array out of bounds read of array extent hfs: update timestamp on truncate() hfsplus: update timestamps on truncate() hfs: fix return value of hfs_get_block() hfsplus: fix return value of hfsplus_get_block() hfs: prevent btree data loss on ENOSPC hfsplus: prevent btree data loss on ENOSPC hfs: fix BUG on bnode parent update hfsplus: fix BUG on bnode parent update linux/bitmap.h: fix type of nbits in bitmap_shift_right() linux/bitmap.h: handle constant zero-size bitmaps correctly selftests/powerpc/cache_shape: Fix out-of-tree build selftests/powerpc/switch_endian: Fix out-of-tree build selftests/powerpc/signal: Fix out-of-tree build powerpc/xmon: Relax frame size for clang vfs: avoid problematic remapping requests into partial EOF block um: Make line/tty semantics use true write IRQ i2c: uniphier-f: fix race condition when IRQ is cleared i2c: uniphier-f: fix occasional timeout error i2c: uniphier-f: make driver robust against concurrency block: fix the DISCARD request merge macsec: let the administrator set UP state even if lowerdev is down 
macsec: update operstate when lower device changes mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock fs/ocfs2/dlm/dlmdebug.c: fix a sleep-in-atomic-context bug in dlm_print_one_mle() arm64: lib: use C string functions with KASAN enabled sparc64: Rework xchg() definition to avoid warnings. powerpc/process: Fix flush_all_to_thread for SPE bpf: devmap: fix wrong interface selection in notifier_call thermal: rcar_thermal: Prevent hardware access during system suspend selftests: watchdog: Fix error message. selftests: watchdog: fix message when /dev/watchdog open fails selftests/ftrace: Fix to test kprobe $comm arg only if available mfd: max8997: Enale irq-wakeup unconditionally mfd: intel_soc_pmic_bxtwc: Chain power button IRQs as well mfd: mc13xxx-core: Fix PMIC shutdown when reading ADC values mfd: arizona: Correct calling of runtime_put_sync net: ethernet: ti: cpsw: unsync mcast entries while switch promisc mode qlcnic: fix a return in qlcnic_dcb_get_capability() mISDN: Fix type of switch control variable in ctrl_teimanager f2fs: fix to spread clear_cold_data() rtc: s35390a: Change buf's type to u8 in s35390a_init ceph: fix dentry leak in ceph_readdir_prepopulate powerpc/pseries: Export raw per-CPU VPA data via debugfs sparc: Fix parport build warnings. 
spi: omap2-mcspi: Set FIFO DMA trigger level to word length s390/perf: Return error when debug_register fails atm: zatm: Fix empty body Clang warnings sunrpc: safely reallow resvport min/max inversion SUNRPC: Fix a compile warning for cmpxchg64() dm raid: avoid bitmap with raid4/5/6 journal device usbip: tools: fix atoi() on non-null terminated string USB: misc: appledisplay: fix backlight update_status return code PCI: vmd: Detach resources after stopping root bus macintosh/windfarm_smu_sat: Fix debug output ALSA: i2c/cs8427: Fix int to char conversion PM / Domains: Deal with multiple states but no governor in genpd kprobes, x86/ptrace.h: Make regs_get_kernel_stack_nth() not fault on bad stack xfs: fix use-after-free race in xfs_buf_rele net: ena: Fix Kconfig dependency on X86 net: fix warning in af_unix net: dsa: mv88e6xxx: Fix 88E6141/6341 2500mbps SERDES speed scsi: dc395x: fix DMA API usage in sg_update_list scsi: dc395x: fix dma API usage in srb_done ASoC: tegra_sgtl5000: fix device_node refcounting clk: at91: audio-pll: fix audio pmc type clk: mmp2: fix the clock id for sdh2_clk and sdh3_clk nvmet-fcloop: suppress a compiler warning crypto: ccree - avoid implicit enum conversion scsi: iscsi_tcp: Explicitly cast param in iscsi_sw_tcp_host_get_param scsi: isci: Change sci_controller_start_task's return type to sci_status scsi: isci: Use proper enumerated type in atapi_d2h_reg_frame_handler KVM/x86: Fix invvpid and invept register operand size in 64-bit mode KVM: nVMX: reset cache/shadows when switching loaded VMCS scsi: ips: fix missing break in switch qed: Align local and global PTT to propagate through the APIs. amiflop: clean up on errors during setup pwm: lpss: Only set update bit if we are actually changing the settings pinctrl: sunxi: Fix a memory leak in 'sunxi_pinctrl_build_state()' RDMA/bnxt_re: Fix qp async event reporting m68k: fix command-line parsing when passed from u-boot w1: IAD Register is yet readable trough iad sys file. 
Fix snprintf (%u for unsigned, count for max size). misc: mic: fix a DMA pool free failure gsmi: Fix bug in append_to_eventlog sysfs handler btrfs: handle error of get_old_root mmc: mediatek: fix cannot receive new request when msdc_cmd_is_ready fail spi: sh-msiof: fix deferred probing cdrom: don't attempt to fiddle with cdo->capability skd: fixup usage of legacy IO API ath10k: allocate small size dma memory in ath10k_pci_diag_write_mem brcmsmac: AP mode: update beacon when TIM changes EDAC, thunderx: Fix memory leak in thunderx_l2c_threaded_isr() powerpc/eeh: Fix use of EEH_PE_KEEP on wrong field powerpc/boot: Disable vector instructions powerpc: Fix signedness bug in update_flash_db() synclink_gt(): fix compat_ioctl() pty: fix compat ioctls gfs2: Fix marking bitmaps non-full printk: fix integer overflow in setup_log_buf() ALSA: isight: fix leak of reference to firewire unit in error path of .probe callback mwifiex: Fix NL80211_TX_POWER_LIMITED platform/x86: asus-wmi: Only Tell EC the OS will handle display hotkeys from asus_nb_wmi platform/x86: asus-nb-wmi: Support ALS on the Zenbook UX430UQ drm/i915/userptr: Try to acquire the page lock around set_page_dirty() mm/ksm.c: don't WARN if page is still mapped in remove_stable_node() Revert "fs: ocfs2: fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()" virtio_console: allocate inbufs in add_port() only if it is needed nbd:fix memory leak in nbd_get_socket() tools: gpio: Correctly add make dependencies for gpio_utils gpio: max77620: Fixup debounce delays vhost/vsock: split packets to send using multiple buffers net/sched: act_pedit: fix WARN() in the traffic path net/mlxfw: Verify FSM error code translation doesn't exceed array size net/mlx5e: Fix set vf link state error flow sfc: Only cancel the PPS workqueue if it exists net: rtnetlink: prevent underflows in do_setvfinfo() net/mlx4_en: fix mlx4 ethtool -N insertion ANDROID: removed CONFIG_PM_WAKELOCKS Conflicts: block/blk-merge.c 
drivers/pinctrl/qcom/pinctrl-spmi-gpio.c Discarding the commit "block: fix the DISCARD request merge" as it is causing stability issues. Change-Id: I05fea476d3bce65663beac6552d7d5c6cd7445d5 Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
966 lines
23 KiB
C
966 lines
23 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Memory subsystem support
|
|
*
|
|
* Written by Matt Tolentino <matthew.e.tolentino@intel.com>
|
|
* Dave Hansen <haveblue@us.ibm.com>
|
|
*
|
|
* This file provides the necessary infrastructure to represent
|
|
* a SPARSEMEM-memory-model system's physical memory in /sysfs.
|
|
* All arch-independent code that assumes MEMORY_HOTPLUG requires
|
|
* SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/init.h>
|
|
#include <linux/topology.h>
|
|
#include <linux/capability.h>
|
|
#include <linux/device.h>
|
|
#include <linux/memory.h>
|
|
#include <linux/memory_hotplug.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/stat.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/memblock.h>
|
|
|
|
#include <linux/atomic.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
/*
 * NOTE(review): presumably serializes memory-block sysfs updates; the
 * lock/unlock sites are outside this view - confirm what it protects.
 */
static DEFINE_MUTEX(mem_sysfs_mutex);

/* Name given to both the bus and its devices (see memory_subsys). */
#define MEMORY_CLASS_NAME	"memory"

/* Get the struct memory_block that embeds the given struct device. */
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

/* Number of sections per memory block; divisor that turns section numbers into block ids. */
static int sections_per_block;
|
|
|
|
static inline int base_memory_block_id(int section_nr)
|
|
{
|
|
return section_nr / sections_per_block;
|
|
}
|
|
|
|
static int memory_subsys_online(struct device *dev);
|
|
static int memory_subsys_offline(struct device *dev);
|
|
|
|
static struct bus_type memory_subsys = {
|
|
.name = MEMORY_CLASS_NAME,
|
|
.dev_name = MEMORY_CLASS_NAME,
|
|
.online = memory_subsys_online,
|
|
.offline = memory_subsys_offline,
|
|
};
|
|
|
|
static BLOCKING_NOTIFIER_HEAD(memory_chain);
|
|
|
|
int register_memory_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&memory_chain, nb);
|
|
}
|
|
EXPORT_SYMBOL(register_memory_notifier);
|
|
|
|
void unregister_memory_notifier(struct notifier_block *nb)
|
|
{
|
|
blocking_notifier_chain_unregister(&memory_chain, nb);
|
|
}
|
|
EXPORT_SYMBOL(unregister_memory_notifier);
|
|
|
|
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);
|
|
|
|
int register_memory_isolate_notifier(struct notifier_block *nb)
|
|
{
|
|
return atomic_notifier_chain_register(&memory_isolate_chain, nb);
|
|
}
|
|
EXPORT_SYMBOL(register_memory_isolate_notifier);
|
|
|
|
void unregister_memory_isolate_notifier(struct notifier_block *nb)
|
|
{
|
|
atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
|
|
}
|
|
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
|
|
|
|
/*
 * Device release callback (installed by register_memory()): free the
 * memory_block that embeds @dev.
 */
static void memory_block_release(struct device *dev)
{
	kfree(to_memory_block(dev));
}
|
|
|
|
/*
 * Default memory block size in bytes.  Declared __weak so that another
 * definition can replace it at link time.
 */
unsigned long __weak memory_block_size_bytes(void)
{
	unsigned long default_sz = MIN_MEMORY_BLOCK_SIZE;

	return default_sz;
}
|
|
|
|
static unsigned long get_memory_block_size(void)
|
|
{
|
|
unsigned long block_sz;
|
|
|
|
block_sz = memory_block_size_bytes();
|
|
|
|
/* Validate blk_sz is a power of 2 and not less than section size */
|
|
if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
|
|
WARN_ON(1);
|
|
block_sz = MIN_MEMORY_BLOCK_SIZE;
|
|
}
|
|
|
|
return block_sz;
|
|
}
|
|
|
|
/*
|
|
* use this as the physical section index that this memsection
|
|
* uses.
|
|
*/
|
|
|
|
static ssize_t show_mem_start_phys_index(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
unsigned long phys_index;
|
|
|
|
phys_index = mem->start_section_nr / sections_per_block;
|
|
return sprintf(buf, "%08lx\n", phys_index);
|
|
}
|
|
|
|
/*
|
|
* Show whether the section of memory is likely to be hot-removable
|
|
*/
|
|
static ssize_t show_mem_removable(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
unsigned long i, pfn;
|
|
int ret = 1;
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
|
|
if (mem->state != MEM_ONLINE)
|
|
goto out;
|
|
|
|
for (i = 0; i < sections_per_block; i++) {
|
|
if (!present_section_nr(mem->start_section_nr + i))
|
|
continue;
|
|
pfn = section_nr_to_pfn(mem->start_section_nr + i);
|
|
ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
|
|
}
|
|
|
|
out:
|
|
return sprintf(buf, "%d\n", ret);
|
|
}
|
|
|
|
/*
|
|
* online, offline, going offline, etc.
|
|
*/
|
|
static ssize_t show_mem_state(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
ssize_t len = 0;
|
|
|
|
/*
|
|
* We can probably put these states in a nice little array
|
|
* so that they're not open-coded
|
|
*/
|
|
switch (mem->state) {
|
|
case MEM_ONLINE:
|
|
len = sprintf(buf, "online\n");
|
|
break;
|
|
case MEM_OFFLINE:
|
|
len = sprintf(buf, "offline\n");
|
|
break;
|
|
case MEM_GOING_OFFLINE:
|
|
len = sprintf(buf, "going-offline\n");
|
|
break;
|
|
default:
|
|
len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
|
|
mem->state);
|
|
WARN_ON(1);
|
|
break;
|
|
}
|
|
|
|
return len;
|
|
}
|
|
|
|
int memory_notify(unsigned long val, void *v)
|
|
{
|
|
return blocking_notifier_call_chain(&memory_chain, val, v);
|
|
}
|
|
|
|
int memory_isolate_notify(unsigned long val, void *v)
|
|
{
|
|
return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
|
|
}
|
|
|
|
/*
|
|
* The probe routines leave the pages reserved, just as the bootmem code does.
|
|
* Make sure they're still that way.
|
|
*/
|
|
static bool pages_correctly_reserved(unsigned long start_pfn)
|
|
{
|
|
int i, j;
|
|
struct page *page;
|
|
unsigned long pfn = start_pfn;
|
|
|
|
/*
|
|
* memmap between sections is not contiguous except with
|
|
* SPARSEMEM_VMEMMAP. We lookup the page once per section
|
|
* and assume memmap is contiguous within each section
|
|
*/
|
|
for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
|
|
if (WARN_ON_ONCE(!pfn_valid(pfn)))
|
|
return false;
|
|
page = pfn_to_page(pfn);
|
|
|
|
for (j = 0; j < PAGES_PER_SECTION; j++) {
|
|
if (PageReserved(page + j))
|
|
continue;
|
|
|
|
printk(KERN_WARNING "section number %ld page number %d "
|
|
"not reserved, was it already online?\n",
|
|
pfn_to_section_nr(pfn), j);
|
|
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
|
|
* OK to have direct references to sparsemem variables in here.
|
|
* Must already be protected by mem_hotplug_begin().
|
|
*/
|
|
static int
|
|
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
|
|
{
|
|
unsigned long start_pfn;
|
|
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
|
int ret;
|
|
|
|
start_pfn = section_nr_to_pfn(phys_index);
|
|
|
|
switch (action) {
|
|
case MEM_ONLINE:
|
|
if (!pages_correctly_reserved(start_pfn))
|
|
return -EBUSY;
|
|
|
|
ret = online_pages(start_pfn, nr_pages, online_type);
|
|
break;
|
|
case MEM_OFFLINE:
|
|
ret = offline_pages(start_pfn, nr_pages);
|
|
break;
|
|
default:
|
|
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
|
|
"%ld\n", __func__, phys_index, action, action);
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int memory_block_change_state(struct memory_block *mem,
|
|
unsigned long to_state, unsigned long from_state_req)
|
|
{
|
|
int ret = 0;
|
|
|
|
if (mem->state != from_state_req)
|
|
return -EINVAL;
|
|
|
|
if (to_state == MEM_OFFLINE)
|
|
mem->state = MEM_GOING_OFFLINE;
|
|
|
|
ret = memory_block_action(mem->start_section_nr, to_state,
|
|
mem->online_type);
|
|
|
|
mem->state = ret ? from_state_req : to_state;
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* The device lock serializes operations on memory_subsys_[online|offline] */
|
|
static int memory_subsys_online(struct device *dev)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
int ret;
|
|
|
|
if (mem->state == MEM_ONLINE)
|
|
return 0;
|
|
|
|
/*
|
|
* If we are called from store_mem_state(), online_type will be
|
|
* set >= 0 Otherwise we were called from the device online
|
|
* attribute and need to set the online_type.
|
|
*/
|
|
if (mem->online_type < 0)
|
|
mem->online_type = MMOP_ONLINE_KEEP;
|
|
|
|
/* Already under protection of mem_hotplug_begin() */
|
|
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
|
|
|
|
/* clear online_type */
|
|
mem->online_type = -1;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int memory_subsys_offline(struct device *dev)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
|
|
if (mem->state == MEM_OFFLINE)
|
|
return 0;
|
|
|
|
/* Can't offline block with non-present sections */
|
|
if (mem->section_count != sections_per_block)
|
|
return -EINVAL;
|
|
|
|
return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
|
|
}
|
|
|
|
static ssize_t
|
|
store_mem_state(struct device *dev,
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
int ret, online_type;
|
|
|
|
ret = lock_device_hotplug_sysfs();
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (sysfs_streq(buf, "online_kernel"))
|
|
online_type = MMOP_ONLINE_KERNEL;
|
|
else if (sysfs_streq(buf, "online_movable"))
|
|
online_type = MMOP_ONLINE_MOVABLE;
|
|
else if (sysfs_streq(buf, "online"))
|
|
online_type = MMOP_ONLINE_KEEP;
|
|
else if (sysfs_streq(buf, "offline"))
|
|
online_type = MMOP_OFFLINE;
|
|
else {
|
|
ret = -EINVAL;
|
|
goto err;
|
|
}
|
|
|
|
/*
|
|
* Memory hotplug needs to hold mem_hotplug_begin() for probe to find
|
|
* the correct memory block to online before doing device_online(dev),
|
|
* which will take dev->mutex. Take the lock early to prevent an
|
|
* inversion, memory_subsys_online() callbacks will be implemented by
|
|
* assuming it's already protected.
|
|
*/
|
|
mem_hotplug_begin();
|
|
|
|
switch (online_type) {
|
|
case MMOP_ONLINE_KERNEL:
|
|
case MMOP_ONLINE_MOVABLE:
|
|
case MMOP_ONLINE_KEEP:
|
|
mem->online_type = online_type;
|
|
ret = device_online(&mem->dev);
|
|
break;
|
|
case MMOP_OFFLINE:
|
|
ret = device_offline(&mem->dev);
|
|
break;
|
|
default:
|
|
ret = -EINVAL; /* should never happen */
|
|
}
|
|
|
|
mem_hotplug_done();
|
|
err:
|
|
unlock_device_hotplug();
|
|
|
|
if (ret < 0)
|
|
return ret;
|
|
if (ret)
|
|
return -EINVAL;
|
|
|
|
return count;
|
|
}
|
|
|
|
/*
|
|
* phys_device is a bad name for this. What I really want
|
|
* is a way to differentiate between memory ranges that
|
|
* are part of physical devices that constitute
|
|
* a complete removable unit or fru.
|
|
* i.e. do these ranges belong to the same physical device,
|
|
* s.t. if I offline all of these sections I can then
|
|
* remove the physical device?
|
|
*/
|
|
static ssize_t show_phys_device(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
return sprintf(buf, "%d\n", mem->phys_device);
|
|
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
|
|
unsigned long nr_pages, int online_type,
|
|
struct zone *default_zone)
|
|
{
|
|
struct zone *zone;
|
|
|
|
zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
|
|
if (zone != default_zone) {
|
|
strcat(buf, " ");
|
|
strcat(buf, zone->name);
|
|
}
|
|
}
|
|
|
|
static ssize_t show_valid_zones(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
|
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
|
|
unsigned long valid_start_pfn, valid_end_pfn;
|
|
struct zone *default_zone;
|
|
int nid;
|
|
|
|
/*
|
|
* The block contains more than one zone can not be offlined.
|
|
* This can happen e.g. for ZONE_DMA and ZONE_DMA32
|
|
*/
|
|
if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
|
|
return sprintf(buf, "none\n");
|
|
|
|
start_pfn = valid_start_pfn;
|
|
nr_pages = valid_end_pfn - start_pfn;
|
|
|
|
/*
|
|
* Check the existing zone. Make sure that we do that only on the
|
|
* online nodes otherwise the page_zone is not reliable
|
|
*/
|
|
if (mem->state == MEM_ONLINE) {
|
|
strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
|
|
goto out;
|
|
}
|
|
|
|
nid = pfn_to_nid(start_pfn);
|
|
default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
|
|
strcat(buf, default_zone->name);
|
|
|
|
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
|
|
default_zone);
|
|
print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE,
|
|
default_zone);
|
|
out:
|
|
strcat(buf, "\n");
|
|
|
|
return strlen(buf);
|
|
}
|
|
static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
static int count_num_free_block_pages(struct zone *zone, int bid)
|
|
{
|
|
int order, type;
|
|
unsigned long freecount = 0;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&zone->lock, flags);
|
|
for (type = 0; type < MIGRATE_TYPES; type++) {
|
|
for (order = 0; order < MAX_ORDER; ++order) {
|
|
struct free_area *area;
|
|
struct page *page;
|
|
|
|
area = &(zone->free_area[order]);
|
|
list_for_each_entry(page, &area->free_list[type], lru) {
|
|
unsigned long pfn = page_to_pfn(page);
|
|
int section_nr = pfn_to_section_nr(pfn);
|
|
|
|
if (bid == base_memory_block_id(section_nr))
|
|
freecount += (1 << order);
|
|
}
|
|
|
|
}
|
|
}
|
|
spin_unlock_irqrestore(&zone->lock, flags);
|
|
|
|
return freecount;
|
|
}
|
|
|
|
static ssize_t show_allocated_bytes(struct device *dev,
|
|
struct device_attribute *attr, char *buf)
|
|
{
|
|
struct memory_block *mem = to_memory_block(dev);
|
|
int block_id, free_pages;
|
|
struct zone *movable_zone =
|
|
&NODE_DATA(numa_node_id())->node_zones[ZONE_MOVABLE];
|
|
unsigned long used, block_sz = get_memory_block_size();
|
|
|
|
if (!populated_zone(movable_zone) || mem->state != MEM_ONLINE)
|
|
return snprintf(buf, 100, "0\n");
|
|
|
|
block_id = base_memory_block_id(mem->start_section_nr);
|
|
free_pages = count_num_free_block_pages(movable_zone, block_id);
|
|
used = block_sz - (free_pages * PAGE_SIZE);
|
|
|
|
return snprintf(buf, 100, "%lu\n", used);
|
|
}
|
|
|
|
/*
 * sysfs "aligned_blocks_addr": the output is produced entirely by
 * memblock_dump_aligned_blocks_addr(); its return value is passed through.
 */
static ssize_t show_aligned_blocks_addr(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t len = memblock_dump_aligned_blocks_addr(buf);

	return len;
}
|
|
|
|
/*
 * sysfs "aligned_blocks_num": the output is produced entirely by
 * memblock_dump_aligned_blocks_num(); its return value is passed through.
 */
static ssize_t show_aligned_blocks_num(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t len = memblock_dump_aligned_blocks_num(buf);

	return len;
}
|
|
#endif
|
|
|
|
/* Memory block attributes; "state" is the only writable one. */
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
#ifdef CONFIG_MEMORY_HOTPLUG
/* Read-only attributes backed by the CONFIG_MEMORY_HOTPLUG-only handlers. */
static DEVICE_ATTR(allocated_bytes, 0444, show_allocated_bytes, NULL);
static DEVICE_ATTR(aligned_blocks_addr, 0444, show_aligned_blocks_addr, NULL);
static DEVICE_ATTR(aligned_blocks_num, 0444, show_aligned_blocks_num, NULL);
#endif
|
|
|
|
/*
|
|
* Block size attribute stuff
|
|
*/
|
|
static ssize_t
|
|
print_block_size(struct device *dev, struct device_attribute *attr,
|
|
char *buf)
|
|
{
|
|
return sprintf(buf, "%lx\n", get_memory_block_size());
|
|
}
|
|
|
|
static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
|
|
|
|
/*
|
|
* Memory auto online policy.
|
|
*/
|
|
|
|
static ssize_t
|
|
show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
|
|
char *buf)
|
|
{
|
|
if (memhp_auto_online)
|
|
return sprintf(buf, "online\n");
|
|
else
|
|
return sprintf(buf, "offline\n");
|
|
}
|
|
|
|
static ssize_t
|
|
store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
|
|
const char *buf, size_t count)
|
|
{
|
|
if (sysfs_streq(buf, "online"))
|
|
memhp_auto_online = true;
|
|
else if (sysfs_streq(buf, "offline"))
|
|
memhp_auto_online = false;
|
|
else
|
|
return -EINVAL;
|
|
|
|
return count;
|
|
}
|
|
|
|
static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
|
|
store_auto_online_blocks);
|
|
|
|
/*
|
|
* Some architectures will have custom drivers to do this, and
|
|
* will not need to do it from userspace. The fake hot-add code
|
|
* as well as ppc64 will do all of their discovery in userspace
|
|
* and will require this interface.
|
|
*/
|
|
#ifdef CONFIG_ARCH_MEMORY_PROBE
|
|
static ssize_t
|
|
memory_probe_store(struct device *dev, struct device_attribute *attr,
|
|
const char *buf, size_t count)
|
|
{
|
|
u64 phys_addr;
|
|
int nid, ret;
|
|
unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
|
|
|
|
ret = kstrtoull(buf, 0, &phys_addr);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
|
|
return -EINVAL;
|
|
|
|
ret = lock_device_hotplug_sysfs();
|
|
if (ret)
|
|
return ret;
|
|
|
|
nid = memory_add_physaddr_to_nid(phys_addr);
|
|
ret = __add_memory(nid, phys_addr,
|
|
MIN_MEMORY_BLOCK_SIZE * sections_per_block);
|
|
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = count;
|
|
out:
|
|
unlock_device_hotplug();
|
|
return ret;
|
|
}
|
|
|
|
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
static ssize_t
|
|
memory_remove_store(struct device *dev,
|
|
struct device_attribute *attr, const char *buf, size_t count)
|
|
{
|
|
u64 phys_addr;
|
|
int nid, ret;
|
|
unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
|
|
|
|
ret = kstrtoull(buf, 0, &phys_addr);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
|
|
return -EINVAL;
|
|
|
|
nid = memory_add_physaddr_to_nid(phys_addr);
|
|
ret = lock_device_hotplug_sysfs();
|
|
if (ret)
|
|
return ret;
|
|
|
|
remove_memory(nid, phys_addr,
|
|
MIN_MEMORY_BLOCK_SIZE * sections_per_block);
|
|
unlock_device_hotplug();
|
|
return count;
|
|
}
|
|
static DEVICE_ATTR(remove, S_IWUSR, NULL, memory_remove_store);
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
#endif /* CONFIG_ARCH_MEMORY_PROBE */
|
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
/*
|
|
* Support for offlining pages of memory
|
|
*/
|
|
|
|
/* Soft offline a page */
|
|
static ssize_t
|
|
store_soft_offline_page(struct device *dev,
|
|
struct device_attribute *attr,
|
|
const char *buf, size_t count)
|
|
{
|
|
int ret;
|
|
u64 pfn;
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
if (kstrtoull(buf, 0, &pfn) < 0)
|
|
return -EINVAL;
|
|
pfn >>= PAGE_SHIFT;
|
|
if (!pfn_valid(pfn))
|
|
return -ENXIO;
|
|
/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
|
|
if (!pfn_to_online_page(pfn))
|
|
return -EIO;
|
|
ret = soft_offline_page(pfn_to_page(pfn), 0);
|
|
return ret == 0 ? count : ret;
|
|
}
|
|
|
|
/* Forcibly offline a page, including killing processes. */
|
|
static ssize_t
|
|
store_hard_offline_page(struct device *dev,
|
|
struct device_attribute *attr,
|
|
const char *buf, size_t count)
|
|
{
|
|
int ret;
|
|
u64 pfn;
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
if (kstrtoull(buf, 0, &pfn) < 0)
|
|
return -EINVAL;
|
|
pfn >>= PAGE_SHIFT;
|
|
ret = memory_failure(pfn, 0, 0);
|
|
return ret ? ret : count;
|
|
}
|
|
|
|
static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
|
|
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
|
|
#endif
|
|
|
|
/*
|
|
* Note that phys_device is optional. It is here to allow for
|
|
* differentiation between which *physical* devices each
|
|
* section belongs to...
|
|
*/
|
|
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* A reference for the returned object is held and the reference for the
|
|
* hinted object is released.
|
|
*/
|
|
struct memory_block *find_memory_block_hinted(struct mem_section *section,
|
|
struct memory_block *hint)
|
|
{
|
|
int block_id = base_memory_block_id(__section_nr(section));
|
|
struct device *hintdev = hint ? &hint->dev : NULL;
|
|
struct device *dev;
|
|
|
|
dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
|
|
if (hint)
|
|
put_device(&hint->dev);
|
|
if (!dev)
|
|
return NULL;
|
|
return to_memory_block(dev);
|
|
}
|
|
|
|
/*
|
|
* For now, we have a linear search to go find the appropriate
|
|
* memory_block corresponding to a particular phys_index. If
|
|
* this gets to be a real problem, we can always use a radix
|
|
* tree or something here.
|
|
*
|
|
* This could be made generic for all device subsystems.
|
|
*/
|
|
struct memory_block *find_memory_block(struct mem_section *section)
|
|
{
|
|
return find_memory_block_hinted(section, NULL);
|
|
}
|
|
|
|
static struct attribute *memory_memblk_attrs[] = {
|
|
&dev_attr_phys_index.attr,
|
|
&dev_attr_state.attr,
|
|
&dev_attr_phys_device.attr,
|
|
&dev_attr_removable.attr,
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
&dev_attr_valid_zones.attr,
|
|
#endif
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
&dev_attr_allocated_bytes.attr,
|
|
#endif
|
|
NULL
|
|
};
|
|
|
|
static struct attribute_group memory_memblk_attr_group = {
|
|
.attrs = memory_memblk_attrs,
|
|
};
|
|
|
|
static const struct attribute_group *memory_memblk_attr_groups[] = {
|
|
&memory_memblk_attr_group,
|
|
NULL,
|
|
};
|
|
|
|
/*
|
|
* register_memory - Setup a sysfs device for a memory block
|
|
*/
|
|
static
|
|
int register_memory(struct memory_block *memory)
|
|
{
|
|
memory->dev.bus = &memory_subsys;
|
|
memory->dev.id = memory->start_section_nr / sections_per_block;
|
|
memory->dev.release = memory_block_release;
|
|
memory->dev.groups = memory_memblk_attr_groups;
|
|
memory->dev.offline = memory->state == MEM_OFFLINE;
|
|
|
|
return device_register(&memory->dev);
|
|
}
|
|
|
|
static int init_memory_block(struct memory_block **memory,
|
|
struct mem_section *section, unsigned long state)
|
|
{
|
|
struct memory_block *mem;
|
|
unsigned long start_pfn;
|
|
int scn_nr;
|
|
int ret = 0;
|
|
|
|
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
|
|
if (!mem)
|
|
return -ENOMEM;
|
|
|
|
scn_nr = __section_nr(section);
|
|
mem->start_section_nr =
|
|
base_memory_block_id(scn_nr) * sections_per_block;
|
|
mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
|
|
mem->state = state;
|
|
start_pfn = section_nr_to_pfn(mem->start_section_nr);
|
|
mem->phys_device = arch_get_memory_phys_device(start_pfn);
|
|
|
|
ret = register_memory(mem);
|
|
|
|
*memory = mem;
|
|
return ret;
|
|
}
|
|
|
|
static int add_memory_block(int base_section_nr)
|
|
{
|
|
struct memory_block *mem;
|
|
int i, ret, section_count = 0, section_nr;
|
|
|
|
for (i = base_section_nr;
|
|
(i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS;
|
|
i++) {
|
|
if (!present_section_nr(i))
|
|
continue;
|
|
if (section_count == 0)
|
|
section_nr = i;
|
|
section_count++;
|
|
}
|
|
|
|
if (section_count == 0)
|
|
return 0;
|
|
ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE);
|
|
if (ret)
|
|
return ret;
|
|
mem->section_count = section_count;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* need an interface for the VM to add new memory regions,
|
|
* but without onlining it.
|
|
*/
|
|
int register_new_memory(int nid, struct mem_section *section)
|
|
{
|
|
int ret = 0;
|
|
struct memory_block *mem;
|
|
|
|
mutex_lock(&mem_sysfs_mutex);
|
|
|
|
mem = find_memory_block(section);
|
|
if (mem) {
|
|
mem->section_count++;
|
|
put_device(&mem->dev);
|
|
} else {
|
|
ret = init_memory_block(&mem, section, MEM_OFFLINE);
|
|
if (ret)
|
|
goto out;
|
|
mem->section_count++;
|
|
}
|
|
|
|
if (mem->section_count == sections_per_block)
|
|
ret = register_mem_sect_under_node(mem, nid);
|
|
out:
|
|
mutex_unlock(&mem_sysfs_mutex);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
static void
|
|
unregister_memory(struct memory_block *memory)
|
|
{
|
|
BUG_ON(memory->dev.bus != &memory_subsys);
|
|
|
|
/* drop the ref. we got in remove_memory_block() */
|
|
put_device(&memory->dev);
|
|
device_unregister(&memory->dev);
|
|
}
|
|
|
|
static int remove_memory_section(unsigned long node_id,
|
|
struct mem_section *section, int phys_device)
|
|
{
|
|
struct memory_block *mem;
|
|
|
|
mutex_lock(&mem_sysfs_mutex);
|
|
|
|
/*
|
|
* Some users of the memory hotplug do not want/need memblock to
|
|
* track all sections. Skip over those.
|
|
*/
|
|
mem = find_memory_block(section);
|
|
if (!mem)
|
|
goto out_unlock;
|
|
|
|
unregister_mem_sect_under_nodes(mem, __section_nr(section));
|
|
|
|
mem->section_count--;
|
|
if (mem->section_count == 0)
|
|
unregister_memory(mem);
|
|
else
|
|
put_device(&mem->dev);
|
|
|
|
out_unlock:
|
|
mutex_unlock(&mem_sysfs_mutex);
|
|
return 0;
|
|
}
|
|
|
|
int unregister_memory_section(struct mem_section *section)
|
|
{
|
|
if (!present_section(section))
|
|
return -EINVAL;
|
|
|
|
return remove_memory_section(0, section, 0);
|
|
}
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
|
|
/* return true if the memory block is offlined, otherwise, return false */
|
|
bool is_memblock_offlined(struct memory_block *mem)
|
|
{
|
|
return mem->state == MEM_OFFLINE;
|
|
}
|
|
|
|
static struct attribute *memory_root_attrs[] = {
|
|
#ifdef CONFIG_ARCH_MEMORY_PROBE
|
|
&dev_attr_probe.attr,
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
&dev_attr_remove.attr,
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
&dev_attr_soft_offline_page.attr,
|
|
&dev_attr_hard_offline_page.attr,
|
|
#endif
|
|
|
|
&dev_attr_block_size_bytes.attr,
|
|
&dev_attr_auto_online_blocks.attr,
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
&dev_attr_aligned_blocks_addr.attr,
|
|
&dev_attr_aligned_blocks_num.attr,
|
|
#endif
|
|
NULL
|
|
};
|
|
|
|
static struct attribute_group memory_root_attr_group = {
|
|
.attrs = memory_root_attrs,
|
|
};
|
|
|
|
static const struct attribute_group *memory_root_attr_groups[] = {
|
|
&memory_root_attr_group,
|
|
NULL,
|
|
};
|
|
|
|
/*
|
|
* Initialize the sysfs support for memory devices...
|
|
*/
|
|
int __init memory_dev_init(void)
|
|
{
|
|
unsigned int i;
|
|
int ret;
|
|
int err;
|
|
unsigned long block_sz;
|
|
|
|
ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
|
|
if (ret)
|
|
goto out;
|
|
|
|
block_sz = get_memory_block_size();
|
|
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
|
|
|
|
/*
|
|
* Create entries for memory sections that were found
|
|
* during boot and have been initialized
|
|
*/
|
|
mutex_lock(&mem_sysfs_mutex);
|
|
for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) {
|
|
/* Don't iterate over sections we know are !present: */
|
|
if (i > __highest_present_section_nr)
|
|
break;
|
|
|
|
err = add_memory_block(i);
|
|
if (!ret)
|
|
ret = err;
|
|
}
|
|
mutex_unlock(&mem_sysfs_mutex);
|
|
|
|
out:
|
|
if (ret)
|
|
printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
|
|
return ret;
|
|
}
|