Files
Srinivasarao P 5cae863d50 Merge android-4.14.157 (13855a6) into msm-4.14
* refs/heads/tmp-13855a6:
  Linux 4.14.157
  x86/hyperv: mark hyperv_init as __init function
  KVM: PPC: Book3S HV: Flush link stack on guest exit to host kernel
  powerpc/book3s64: Fix link stack flush on context switch
  powerpc/64s: support nospectre_v2 cmdline option
  staging: comedi: usbduxfast: usbduxfast_ai_cmdtest rounding error
  USB: serial: option: add support for Foxconn T77W968 LTE modules
  USB: serial: option: add support for DW5821e with eSIM support
  USB: serial: mos7840: fix remote wakeup
  USB: serial: mos7720: fix remote wakeup
  USB: serial: mos7840: add USB ID to support Moxa UPort 2210
  appledisplay: fix error handling in the scheduled work
  USB: chaoskey: fix error case of a timeout
  usb-serial: cp201x: support Mark-10 digital force gauge
  usbip: Fix uninitialized symbol 'nents' in stub_recv_cmd_submit()
  usbip: tools: fix fd leakage in the function of read_attr_usbip_status
  virtio_ring: fix return code on DMA mapping fails
  media: imon: invalid dereference in imon_touch_event
  media: cxusb: detect cxusb_ctrl_msg error in query
  media: b2c2-flexcop-usb: add sanity checking
  media: uvcvideo: Fix error path in control parsing failure
  cpufreq: Add NULL checks to show() and store() methods of cpufreq
  media: usbvision: Fix races among open, close, and disconnect
  media: vivid: Fix wrong locking that causes race conditions on streaming stop
  media: vivid: Set vid_cap_streaming and vid_out_streaming to true
  nfc: port100: handle command failure cleanly
  nbd: prevent memory leak
  x86/speculation: Fix redundant MDS mitigation message
  x86/speculation: Fix incorrect MDS/TAA mitigation status
  x86/insn: Fix awk regexp warnings
  ARC: perf: Accommodate big-endian CPU
  ARM: 8904/1: skip nomap memblocks while finding the lowmem/highmem boundary
  ocfs2: remove ocfs2_is_o2cb_active()
  cpufreq: Skip cpufreq resume if it's not suspended
  arm64: fix for bad_mode() handler to always result in panic
  net: phy: dp83867: increase SGMII autoneg timer duration
  net: phy: dp83867: fix speed 10 in sgmii mode
  mm/memory_hotplug: don't access uninitialized memmaps in shrink_zone_span()
  md/raid10: prevent access of uninitialized resync_pages offset
  ath9k_hw: fix uninitialized variable data
  ath10k: Fix a NULL-ptr-deref bug in ath10k_usb_alloc_urb_from_pipe
  KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved
  Bluetooth: Fix invalid-free in bcsp_close()
  cfg80211: call disconnect_wk when AP stops
  ipv6: Fix handling of LLA with VRF and sockets bound to VRF
  mm/memory_hotplug: Do not unlock when fails to take the device_hotplug_lock
  i2c: uniphier-f: fix timeout error after reading 8 bytes
  spi: omap2-mcspi: Fix DMA and FIFO event trigger size mismatch
  PCI: keystone: Use quirk to limit MRRS for K2G
  pinctrl: zynq: Use define directive for PIN_CONFIG_IO_STANDARD
  pinctrl: lpc18xx: Use define directive for PIN_CONFIG_GPIO_PIN_INT
  pinctrl: qcom: spmi-gpio: fix gpio-hog related boot issues
  cfg80211: Prevent regulatory restore during STA disconnect in concurrent interfaces
  of: unittest: allow base devicetree to have symbol metadata
  net: bcmgenet: return correct value 'ret' from bcmgenet_power_down
  ACPICA: Use %d for signed int print formatting instead of %u
  vrf: mark skb for multicast or link-local as enslaved to VRF
  dlm: don't leak kernel pointer to userspace
  dlm: fix invalid free
  scsi: lpfc: Correct loss of fc4 type on remote port address change
  scsi: lpfc: fcoe: Fix link down issue after 1000+ link bounces
  scsi: megaraid_sas: Fix goto labels in error handling
  scsi: megaraid_sas: Fix msleep granularity
  scsi: mpt3sas: Fix driver modifying persistent data in Manufacturing page11
  scsi: mpt3sas: Don't modify EEDPTagMode field setting on SAS3.5 HBA devices
  scsi: mpt3sas: Fix Sync cache command failure during driver unload
  net: dsa: bcm_sf2: Turn on PHY to allow successful registration
  rtlwifi: rtl8192de: Fix misleading REG_MCUFWDL information
  wireless: airo: potential buffer overflow in sprintf()
  brcmsmac: never log "tid x is not agg'able" by default
  rtl8xxxu: Fix missing break in switch
  wlcore: Fix the return value in case of error in 'wlcore_vendor_cmd_smart_config_start()'
  wil6210: fix locking in wmi_call
  btrfs: avoid link error with CONFIG_NO_AUTO_INLINE
  audit: print empty EXECVE args
  clk: sunxi-ng: enable so-said LDOs for A64 SoC's pll-mipi clock
  openvswitch: fix linking without CONFIG_NF_CONNTRACK_LABELS
  sched/fair: Don't increase sd->balance_interval on newidle balance
  sched/topology: Fix off by one bug
  net: do not abort bulk send on BQL status
  ocfs2: fix clusters leak in ocfs2_defrag_extent()
  ocfs2: don't put and assigning null to bh allocated outside
  arm64: makefile fix build of .i file in external module case
  ntb: intel: fix return value for ndev_vec_mask()
  ntb_netdev: fix sleep time mismatch
  net: hns3: bugfix for buffer not free problem during resetting
  igb: shorten maximum PHC timecounter update interval
  mm/memory_hotplug: make add_memory() take the device_hotplug_lock
  fs/hfs/extent.c: fix array out of bounds read of array extent
  hfs: update timestamp on truncate()
  hfsplus: update timestamps on truncate()
  hfs: fix return value of hfs_get_block()
  hfsplus: fix return value of hfsplus_get_block()
  hfs: prevent btree data loss on ENOSPC
  hfsplus: prevent btree data loss on ENOSPC
  hfs: fix BUG on bnode parent update
  hfsplus: fix BUG on bnode parent update
  linux/bitmap.h: fix type of nbits in bitmap_shift_right()
  linux/bitmap.h: handle constant zero-size bitmaps correctly
  selftests/powerpc/cache_shape: Fix out-of-tree build
  selftests/powerpc/switch_endian: Fix out-of-tree build
  selftests/powerpc/signal: Fix out-of-tree build
  powerpc/xmon: Relax frame size for clang
  vfs: avoid problematic remapping requests into partial EOF block
  um: Make line/tty semantics use true write IRQ
  i2c: uniphier-f: fix race condition when IRQ is cleared
  i2c: uniphier-f: fix occasional timeout error
  i2c: uniphier-f: make driver robust against concurrency
  block: fix the DISCARD request merge
  macsec: let the administrator set UP state even if lowerdev is down
  macsec: update operstate when lower device changes
  mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock
  fs/ocfs2/dlm/dlmdebug.c: fix a sleep-in-atomic-context bug in dlm_print_one_mle()
  arm64: lib: use C string functions with KASAN enabled
  sparc64: Rework xchg() definition to avoid warnings.
  powerpc/process: Fix flush_all_to_thread for SPE
  bpf: devmap: fix wrong interface selection in notifier_call
  thermal: rcar_thermal: Prevent hardware access during system suspend
  selftests: watchdog: Fix error message.
  selftests: watchdog: fix message when /dev/watchdog open fails
  selftests/ftrace: Fix to test kprobe $comm arg only if available
  mfd: max8997: Enale irq-wakeup unconditionally
  mfd: intel_soc_pmic_bxtwc: Chain power button IRQs as well
  mfd: mc13xxx-core: Fix PMIC shutdown when reading ADC values
  mfd: arizona: Correct calling of runtime_put_sync
  net: ethernet: ti: cpsw: unsync mcast entries while switch promisc mode
  qlcnic: fix a return in qlcnic_dcb_get_capability()
  mISDN: Fix type of switch control variable in ctrl_teimanager
  f2fs: fix to spread clear_cold_data()
  rtc: s35390a: Change buf's type to u8 in s35390a_init
  ceph: fix dentry leak in ceph_readdir_prepopulate
  powerpc/pseries: Export raw per-CPU VPA data via debugfs
  sparc: Fix parport build warnings.
  spi: omap2-mcspi: Set FIFO DMA trigger level to word length
  s390/perf: Return error when debug_register fails
  atm: zatm: Fix empty body Clang warnings
  sunrpc: safely reallow resvport min/max inversion
  SUNRPC: Fix a compile warning for cmpxchg64()
  dm raid: avoid bitmap with raid4/5/6 journal device
  usbip: tools: fix atoi() on non-null terminated string
  USB: misc: appledisplay: fix backlight update_status return code
  PCI: vmd: Detach resources after stopping root bus
  macintosh/windfarm_smu_sat: Fix debug output
  ALSA: i2c/cs8427: Fix int to char conversion
  PM / Domains: Deal with multiple states but no governor in genpd
  kprobes, x86/ptrace.h: Make regs_get_kernel_stack_nth() not fault on bad stack
  xfs: fix use-after-free race in xfs_buf_rele
  net: ena: Fix Kconfig dependency on X86
  net: fix warning in af_unix
  net: dsa: mv88e6xxx: Fix 88E6141/6341 2500mbps SERDES speed
  scsi: dc395x: fix DMA API usage in sg_update_list
  scsi: dc395x: fix dma API usage in srb_done
  ASoC: tegra_sgtl5000: fix device_node refcounting
  clk: at91: audio-pll: fix audio pmc type
  clk: mmp2: fix the clock id for sdh2_clk and sdh3_clk
  nvmet-fcloop: suppress a compiler warning
  crypto: ccree - avoid implicit enum conversion
  scsi: iscsi_tcp: Explicitly cast param in iscsi_sw_tcp_host_get_param
  scsi: isci: Change sci_controller_start_task's return type to sci_status
  scsi: isci: Use proper enumerated type in atapi_d2h_reg_frame_handler
  KVM/x86: Fix invvpid and invept register operand size in 64-bit mode
  KVM: nVMX: reset cache/shadows when switching loaded VMCS
  scsi: ips: fix missing break in switch
  qed: Align local and global PTT to propagate through the APIs.
  amiflop: clean up on errors during setup
  pwm: lpss: Only set update bit if we are actually changing the settings
  pinctrl: sunxi: Fix a memory leak in 'sunxi_pinctrl_build_state()'
  RDMA/bnxt_re: Fix qp async event reporting
  m68k: fix command-line parsing when passed from u-boot
  w1: IAD Register is yet readable trough iad sys file. Fix snprintf (%u for unsigned, count for max size).
  misc: mic: fix a DMA pool free failure
  gsmi: Fix bug in append_to_eventlog sysfs handler
  btrfs: handle error of get_old_root
  mmc: mediatek: fix cannot receive new request when msdc_cmd_is_ready fail
  spi: sh-msiof: fix deferred probing
  cdrom: don't attempt to fiddle with cdo->capability
  skd: fixup usage of legacy IO API
  ath10k: allocate small size dma memory in ath10k_pci_diag_write_mem
  brcmsmac: AP mode: update beacon when TIM changes
  EDAC, thunderx: Fix memory leak in thunderx_l2c_threaded_isr()
  powerpc/eeh: Fix use of EEH_PE_KEEP on wrong field
  powerpc/boot: Disable vector instructions
  powerpc: Fix signedness bug in update_flash_db()
  synclink_gt(): fix compat_ioctl()
  pty: fix compat ioctls
  gfs2: Fix marking bitmaps non-full
  printk: fix integer overflow in setup_log_buf()
  ALSA: isight: fix leak of reference to firewire unit in error path of .probe callback
  mwifiex: Fix NL80211_TX_POWER_LIMITED
  platform/x86: asus-wmi: Only Tell EC the OS will handle display hotkeys from asus_nb_wmi
  platform/x86: asus-nb-wmi: Support ALS on the Zenbook UX430UQ
  drm/i915/userptr: Try to acquire the page lock around set_page_dirty()
  mm/ksm.c: don't WARN if page is still mapped in remove_stable_node()
  Revert "fs: ocfs2: fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()"
  virtio_console: allocate inbufs in add_port() only if it is needed
  nbd:fix memory leak in nbd_get_socket()
  tools: gpio: Correctly add make dependencies for gpio_utils
  gpio: max77620: Fixup debounce delays
  vhost/vsock: split packets to send using multiple buffers
  net/sched: act_pedit: fix WARN() in the traffic path
  net/mlxfw: Verify FSM error code translation doesn't exceed array size
  net/mlx5e: Fix set vf link state error flow
  sfc: Only cancel the PPS workqueue if it exists
  net: rtnetlink: prevent underflows in do_setvfinfo()
  net/mlx4_en: fix mlx4 ethtool -N insertion
  ANDROID: removed CONFIG_PM_WAKELOCKS

Conflicts:
	block/blk-merge.c
	drivers/pinctrl/qcom/pinctrl-spmi-gpio.c

Discarding the commit "block: fix the DISCARD request merge"
as it is causing stability issues.

Change-Id: I05fea476d3bce65663beac6552d7d5c6cd7445d5
Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
2020-04-16 16:45:40 +05:30

966 lines
23 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Memory subsystem support
*
* Written by Matt Tolentino <matthew.e.tolentino@intel.com>
* Dave Hansen <haveblue@us.ibm.com>
*
* This file provides the necessary infrastructure to represent
* a SPARSEMEM-memory-model system's physical memory in /sysfs.
* All arch-independent code that assumes MEMORY_HOTPLUG requires
* SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/atomic.h>
#include <linux/uaccess.h>
/*
 * Taken by register_new_memory(), remove_memory_section() and
 * memory_dev_init() around lookup, creation and removal of memory block
 * devices.
 */
static DEFINE_MUTEX(mem_sysfs_mutex);
/* Name used for both the subsystem and the per-device name prefix. */
#define MEMORY_CLASS_NAME "memory"
/* Map an embedded struct device back to its containing memory_block. */
#define to_memory_block(dev) container_of(dev, struct memory_block, dev)
/*
 * Number of sections that make up one memory block; computed in
 * memory_dev_init() as block size / MIN_MEMORY_BLOCK_SIZE.
 */
static int sections_per_block;
/*
 * Translate a section number into the id of the memory block that
 * contains it (integer division by sections_per_block).
 */
static inline int base_memory_block_id(int section_nr)
{
	int block_id = section_nr / sections_per_block;

	return block_id;
}
/* Online/offline callbacks for memory_subsys; defined later in this file. */
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);
/*
 * Subsystem that memory block devices are registered on (see
 * register_memory()).  Its online/offline hooks point at
 * memory_subsys_online()/memory_subsys_offline().
 */
static struct bus_type memory_subsys = {
.name = MEMORY_CLASS_NAME,
.dev_name = MEMORY_CLASS_NAME,
.online = memory_subsys_online,
.offline = memory_subsys_offline,
};
/* Blocking notifier chain that memory_notify() calls into. */
static BLOCKING_NOTIFIER_HEAD(memory_chain);
/*
 * Add @nb to the blocking memory notifier chain.
 * Returns whatever blocking_notifier_chain_register() returns.
 */
int register_memory_notifier(struct notifier_block *nb)
{
	int rc = blocking_notifier_chain_register(&memory_chain, nb);

	return rc;
}
EXPORT_SYMBOL(register_memory_notifier);
/* Remove @nb from the blocking memory notifier chain. */
void unregister_memory_notifier(struct notifier_block *nb)
{
	struct blocking_notifier_head *chain = &memory_chain;

	blocking_notifier_chain_unregister(chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
/* Atomic notifier chain that memory_isolate_notify() calls into. */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * Add @nb to the memory isolate notifier chain.
 * Returns whatever atomic_notifier_chain_register() returns.
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	int rc = atomic_notifier_chain_register(&memory_isolate_chain, nb);

	return rc;
}
EXPORT_SYMBOL(register_memory_isolate_notifier);
/* Remove @nb from the memory isolate notifier chain. */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	struct atomic_notifier_head *chain = &memory_isolate_chain;

	atomic_notifier_chain_unregister(chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);
/*
 * Device release callback (installed by register_memory()): frees the
 * memory_block that embeds @dev.
 */
static void memory_block_release(struct device *dev)
{
	kfree(to_memory_block(dev));
}
/*
 * Default memory block size in bytes.  Declared __weak, so another
 * definition of this symbol takes precedence when one is linked in.
 */
unsigned long __weak memory_block_size_bytes(void)
{
	const unsigned long default_size = MIN_MEMORY_BLOCK_SIZE;

	return default_size;
}
static unsigned long get_memory_block_size(void)
{
unsigned long block_sz;
block_sz = memory_block_size_bytes();
/* Validate blk_sz is a power of 2 and not less than section size */
if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
WARN_ON(1);
block_sz = MIN_MEMORY_BLOCK_SIZE;
}
return block_sz;
}
/*
 * sysfs "phys_index": print the block's first section number divided by
 * sections_per_block, as eight zero-padded hex digits.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *blk = to_memory_block(dev);
	unsigned long block_index = blk->start_section_nr / sections_per_block;

	return sprintf(buf, "%08lx\n", block_index);
}
/*
 * sysfs "removable": show whether the block is likely to be hot-removable.
 *
 * Prints 1 unless the block is online and at least one of its present
 * sections is reported as not removable by is_mem_section_removable().
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *blk = to_memory_block(dev);
	unsigned long idx;
	int removable = 1;

	if (blk->state == MEM_ONLINE) {
		for (idx = 0; idx < sections_per_block; idx++) {
			unsigned long nr = blk->start_section_nr + idx;

			/* Holes in the block do not affect the answer. */
			if (present_section_nr(nr))
				removable &= is_mem_section_removable(
						section_nr_to_pfn(nr),
						PAGES_PER_SECTION);
		}
	}

	return sprintf(buf, "%d\n", removable);
}
/*
* online, offline, going offline, etc.
*/
static ssize_t show_mem_state(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct memory_block *mem = to_memory_block(dev);
ssize_t len = 0;
/*
* We can probably put these states in a nice little array
* so that they're not open-coded
*/
switch (mem->state) {
case MEM_ONLINE:
len = sprintf(buf, "online\n");
break;
case MEM_OFFLINE:
len = sprintf(buf, "offline\n");
break;
case MEM_GOING_OFFLINE:
len = sprintf(buf, "going-offline\n");
break;
default:
len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
mem->state);
WARN_ON(1);
break;
}
return len;
}
/*
 * Run the blocking memory notifier chain with event @val and payload @v;
 * returns the result of blocking_notifier_call_chain().
 */
int memory_notify(unsigned long val, void *v)
{
	int rc = blocking_notifier_call_chain(&memory_chain, val, v);

	return rc;
}
/*
 * Run the atomic memory isolate notifier chain with event @val and
 * payload @v; returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	int rc = atomic_notifier_call_chain(&memory_isolate_chain, val, v);

	return rc;
}
/*
 * The probe routines leave the pages reserved, just as the bootmem code
 * does.  Check that every page of the block starting at @start_pfn still
 * has PageReserved set.
 *
 * Returns true when all pages are reserved; false (after a warning) when a
 * section's first pfn is invalid or any page is found unreserved.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
	unsigned long pfn = start_pfn;
	int sec;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP, so the struct page is looked up once per
	 * section and assumed contiguous within that section.
	 */
	for (sec = 0; sec < sections_per_block; sec++) {
		struct page *first;
		int off;

		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;

		first = pfn_to_page(pfn);
		for (off = 0; off < PAGES_PER_SECTION; off++) {
			if (!PageReserved(first + off)) {
				printk(KERN_WARNING "section number %ld page number %d "
					"not reserved, was it already online?\n",
					pfn_to_section_nr(pfn), off);
				return false;
			}
		}

		pfn += PAGES_PER_SECTION;
	}

	return true;
}
/*
 * Online or offline the pages of the memory block whose first section is
 * @phys_index.
 *
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is OK to refer
 * to sparsemem variables directly here.
 * Must already be protected by mem_hotplug_begin().
 *
 * Returns the result of online_pages()/offline_pages(), -EBUSY when
 * onlining a block whose pages are not all reserved, or -EINVAL for an
 * unknown @action.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long start_pfn = section_nr_to_pfn(phys_index);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn))
			return -EBUSY;
		return online_pages(start_pfn, nr_pages, online_type);
	case MEM_OFFLINE:
		return offline_pages(start_pfn, nr_pages);
	default:
		break;
	}

	WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
	     "%ld\n", __func__, phys_index, action, action);
	return -EINVAL;
}
/*
 * Move @mem from @from_state_req to @to_state.
 *
 * Fails with -EINVAL if the block is not currently in @from_state_req.
 * While an offline request is in progress the block is marked
 * MEM_GOING_OFFLINE.  On failure the state is restored to @from_state_req;
 * on success it becomes @to_state.  Returns memory_block_action()'s result.
 */
static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int rc;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	rc = memory_block_action(mem->start_section_nr, to_state,
				 mem->online_type);
	if (rc)
		mem->state = from_state_req;
	else
		mem->state = to_state;

	return rc;
}
/*
 * Subsystem ->online callback.
 * The device lock serializes operations on memory_subsys_[online|offline].
 *
 * Returns 0 if the block is already online, otherwise the result of the
 * OFFLINE -> ONLINE state change.
 */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *blk = to_memory_block(dev);
	int rc = 0;

	if (blk->state != MEM_ONLINE) {
		/*
		 * store_mem_state() sets online_type >= 0 before calling in.
		 * A negative value means we came from the device online
		 * attribute and have to pick the type ourselves.
		 */
		if (blk->online_type < 0)
			blk->online_type = MMOP_ONLINE_KEEP;

		/* Already under protection of mem_hotplug_begin() */
		rc = memory_block_change_state(blk, MEM_ONLINE, MEM_OFFLINE);

		/* Reset online_type to "unset". */
		blk->online_type = -1;
	}

	return rc;
}
/*
 * Subsystem ->offline callback.
 *
 * Returns 0 if the block is already offline, -EINVAL if the block has
 * non-present sections, otherwise the result of the ONLINE -> OFFLINE
 * state change.
 */
static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *blk = to_memory_block(dev);
	int rc;

	if (blk->state == MEM_OFFLINE)
		rc = 0;
	else if (blk->section_count != sections_per_block)
		rc = -EINVAL;	/* can't offline a block with holes */
	else
		rc = memory_block_change_state(blk, MEM_OFFLINE, MEM_ONLINE);

	return rc;
}
/*
 * sysfs "state" (write side).
 *
 * Accepts "online_kernel", "online_movable", "online" or "offline" and
 * onlines/offlines the block's device accordingly.  Returns @count on
 * success, a negative errno from the lock or device operation, or -EINVAL
 * for an unrecognised string or a positive device_online()/device_offline()
 * result.
 */
static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *blk = to_memory_block(dev);
	int rc, req;

	rc = lock_device_hotplug_sysfs();
	if (rc)
		return rc;

	if (sysfs_streq(buf, "online_kernel")) {
		req = MMOP_ONLINE_KERNEL;
	} else if (sysfs_streq(buf, "online_movable")) {
		req = MMOP_ONLINE_MOVABLE;
	} else if (sysfs_streq(buf, "online")) {
		req = MMOP_ONLINE_KEEP;
	} else if (sysfs_streq(buf, "offline")) {
		req = MMOP_OFFLINE;
	} else {
		unlock_device_hotplug();
		return -EINVAL;
	}

	/*
	 * Memory hotplug needs to hold mem_hotplug_begin() for probe to find
	 * the correct memory block to online before doing device_online(dev),
	 * which will take dev->mutex.  Take the lock early to prevent an
	 * inversion; the memory_subsys_online() callback assumes it is
	 * already held.
	 */
	mem_hotplug_begin();
	if (req == MMOP_OFFLINE) {
		rc = device_offline(&blk->dev);
	} else {
		/* One of the three online variants parsed above. */
		blk->online_type = req;
		rc = device_online(&blk->dev);
	}
	mem_hotplug_done();

	unlock_device_hotplug();

	if (rc < 0)
		return rc;
	if (rc)
		return -EINVAL;

	return count;
}
/*
* phys_device is a bad name for this. What I really want
* is a way to differentiate between memory ranges that
* are part of physical devices that constitute
* a complete removable unit or fru.
* i.e. do these ranges belong to the same physical device,
* s.t. if I offline all of these sections I can then
* remove the physical device?
*/
static ssize_t show_phys_device(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct memory_block *mem = to_memory_block(dev);
return sprintf(buf, "%d\n", mem->phys_device);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Append " <zone name>" to @buf for the zone that zone_for_pfn_range()
 * selects for @online_type, unless that zone is @default_zone.
 */
static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn,
			       unsigned long nr_pages, int online_type,
			       struct zone *default_zone)
{
	struct zone *candidate;

	candidate = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
	if (candidate == default_zone)
		return;

	strcat(buf, " ");
	strcat(buf, candidate->name);
}
/*
 * sysfs "valid_zones": list the zone(s) this block belongs to or could be
 * onlined into.
 *
 * Prints "none" when the block's pages span more than one zone.  For an
 * online block the current zone name is printed; otherwise the default
 * zone followed by any differing kernel/movable alternatives.
 *
 * NOTE(review): the result is built with strcat(), so @buf is presumably
 * an empty string on entry -- confirm against the sysfs show path.
 */
static ssize_t show_valid_zones(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *blk = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(blk->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long valid_start_pfn, valid_end_pfn;

	/*
	 * A block that contains more than one zone can not be offlined.
	 * This can happen e.g. for ZONE_DMA and ZONE_DMA32
	 */
	if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages,
				  &valid_start_pfn, &valid_end_pfn))
		return sprintf(buf, "none\n");

	start_pfn = valid_start_pfn;
	nr_pages = valid_end_pfn - start_pfn;

	if (blk->state == MEM_ONLINE) {
		/*
		 * Report the existing zone.  Only done for online blocks,
		 * otherwise page_zone() is not reliable.
		 */
		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
	} else {
		int nid = pfn_to_nid(start_pfn);
		struct zone *default_zone;

		default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid,
						  start_pfn, nr_pages);
		strcat(buf, default_zone->name);

		print_allowed_zone(buf, nid, start_pfn, nr_pages,
				   MMOP_ONLINE_KERNEL, default_zone);
		print_allowed_zone(buf, nid, start_pfn, nr_pages,
				   MMOP_ONLINE_MOVABLE, default_zone);
	}

	strcat(buf, "\n");
	return strlen(buf);
}
static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Count the free pages of @zone that fall inside memory block @bid.
 *
 * Walks every free list of the zone (all migrate types and orders) with
 * zone->lock held and interrupts disabled, adding 1 << order for each free
 * entry whose first pfn maps to block @bid.
 */
static int count_num_free_block_pages(struct zone *zone, int bid)
{
	unsigned long nr_free = 0;
	unsigned long irq_flags;
	int mt, ord;

	spin_lock_irqsave(&zone->lock, irq_flags);
	for (mt = 0; mt < MIGRATE_TYPES; mt++) {
		for (ord = 0; ord < MAX_ORDER; ++ord) {
			struct free_area *fa = &(zone->free_area[ord]);
			struct page *pg;

			list_for_each_entry(pg, &fa->free_list[mt], lru) {
				int nr = pfn_to_section_nr(page_to_pfn(pg));

				if (base_memory_block_id(nr) != bid)
					continue;
				nr_free += (1 << ord);
			}
		}
	}
	spin_unlock_irqrestore(&zone->lock, irq_flags);

	return nr_free;
}
/*
 * sysfs "allocated_bytes": print the block size minus the bytes currently
 * free in this block, counting free pages in ZONE_MOVABLE of the current
 * CPU's node.  Prints 0 when that zone is not populated or the block is
 * not online.
 */
static ssize_t show_allocated_bytes(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct memory_block *blk = to_memory_block(dev);
	struct zone *movable =
		&NODE_DATA(numa_node_id())->node_zones[ZONE_MOVABLE];
	unsigned long block_bytes = get_memory_block_size();
	unsigned long in_use;
	int nr_free;

	if (!populated_zone(movable) || blk->state != MEM_ONLINE)
		return snprintf(buf, 100, "0\n");

	nr_free = count_num_free_block_pages(movable,
			base_memory_block_id(blk->start_section_nr));
	in_use = block_bytes - (nr_free * PAGE_SIZE);

	return snprintf(buf, 100, "%lu\n", in_use);
}
/*
 * sysfs "aligned_blocks_addr": output is produced entirely by
 * memblock_dump_aligned_blocks_addr(), whose return value is passed on.
 */
static ssize_t show_aligned_blocks_addr(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t written = memblock_dump_aligned_blocks_addr(buf);

	return written;
}
/*
 * sysfs "aligned_blocks_num": output is produced entirely by
 * memblock_dump_aligned_blocks_num(), whose return value is passed on.
 */
static ssize_t show_aligned_blocks_num(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t written = memblock_dump_aligned_blocks_num(buf);

	return written;
}
#endif
/*
 * sysfs attribute definitions.  "state" is the only one here with a store
 * handler (mode 0644); the rest are read-only (0444).
 * phys_index, state, phys_device, removable and allocated_bytes are listed
 * in memory_memblk_attrs[]; aligned_blocks_addr and aligned_blocks_num are
 * listed in memory_root_attrs[].
 */
static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);
#ifdef CONFIG_MEMORY_HOTPLUG
static DEVICE_ATTR(allocated_bytes, 0444, show_allocated_bytes, NULL);
static DEVICE_ATTR(aligned_blocks_addr, 0444, show_aligned_blocks_addr, NULL);
static DEVICE_ATTR(aligned_blocks_num, 0444, show_aligned_blocks_num, NULL);
#endif
/*
 * sysfs "block_size_bytes": print the validated memory block size in hex
 * (no 0x prefix).
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	unsigned long size = get_memory_block_size();

	return sprintf(buf, "%lx\n", size);
}
static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);
/*
 * Memory auto online policy (read side): print "online" when
 * memhp_auto_online is set, "offline" otherwise.
 */
static ssize_t
show_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	if (!memhp_auto_online)
		return sprintf(buf, "offline\n");

	return sprintf(buf, "online\n");
}
/*
 * Memory auto online policy (write side): "online" sets memhp_auto_online,
 * "offline" clears it.  Any other input is rejected with -EINVAL and leaves
 * the policy unchanged.  Returns @count on success.
 */
static ssize_t
store_auto_online_blocks(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	bool want_online;

	if (sysfs_streq(buf, "online"))
		want_online = true;
	else if (sysfs_streq(buf, "offline"))
		want_online = false;
	else
		return -EINVAL;

	memhp_auto_online = want_online;
	return count;
}
static DEVICE_ATTR(auto_online_blocks, 0644, show_auto_online_blocks,
		   store_auto_online_blocks);
/*
* Some architectures will have custom drivers to do this, and
* will not need to do it from userspace. The fake hot-add code
* as well as ppc64 will do all of their discovery in userspace
* and will require this interface.
*/
#ifdef CONFIG_ARCH_MEMORY_PROBE
/*
 * sysfs "probe": add one memory block at the physical address written by
 * userspace.
 *
 * The address must parse with kstrtoull() and be aligned to the memory
 * block size, else the parse error or -EINVAL is returned.  Returns @count
 * on success, or the error from lock_device_hotplug_sysfs() or
 * __add_memory().
 */
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
	u64 phys_addr;
	int nid, rc;

	rc = kstrtoull(buf, 0, &phys_addr);
	if (rc)
		return rc;

	/* Reject addresses that do not start on a block boundary. */
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	rc = lock_device_hotplug_sysfs();
	if (rc)
		return rc;

	nid = memory_add_physaddr_to_nid(phys_addr);
	rc = __add_memory(nid, phys_addr,
			  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
	if (!rc)
		rc = count;

	unlock_device_hotplug();
	return rc;
}
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * sysfs "remove": remove one memory block at the physical address written
 * by userspace.
 *
 * The address must parse with kstrtoull() and be aligned to the memory
 * block size.  Once the hotplug lock is taken, remove_memory() is called
 * and @count is returned; no result from remove_memory() is propagated.
 */
static ssize_t
memory_remove_store(struct device *dev,
		    struct device_attribute *attr, const char *buf, size_t count)
{
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
	u64 phys_addr;
	int nid, rc;

	rc = kstrtoull(buf, 0, &phys_addr);
	if (rc)
		return rc;

	/* Reject addresses that do not start on a block boundary. */
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	nid = memory_add_physaddr_to_nid(phys_addr);

	rc = lock_device_hotplug_sysfs();
	if (rc)
		return rc;

	remove_memory(nid, phys_addr,
		      MIN_MEMORY_BLOCK_SIZE * sections_per_block);
	unlock_device_hotplug();

	return count;
}
static DEVICE_ATTR(remove, S_IWUSR, NULL, memory_remove_store);
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_ARCH_MEMORY_PROBE */
#ifdef CONFIG_MEMORY_FAILURE
/*
* Support for offlining pages of memory
*/
/*
 * Soft offline a page.
 *
 * Userspace writes a physical address; it is shifted down to a pfn and
 * handed to soft_offline_page().  Requires CAP_SYS_ADMIN (-EPERM).
 * Returns -EINVAL on a parse failure, -ENXIO for an invalid pfn, -EIO when
 * the page is not online, the error from soft_offline_page(), or @count on
 * success.
 */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	u64 pfn;
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;

	/* The value written is an address; convert it to a page frame. */
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;

	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
	if (!pfn_to_online_page(pfn))
		return -EIO;

	rc = soft_offline_page(pfn_to_page(pfn), 0);
	if (rc)
		return rc;

	return count;
}
/*
 * Forcibly offline a page, including killing processes.
 *
 * Userspace writes a physical address; it is shifted down to a pfn and
 * handed to memory_failure().  Requires CAP_SYS_ADMIN (-EPERM).  Returns
 * -EINVAL on a parse failure, memory_failure()'s non-zero result, or
 * @count on success.
 */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	u64 pfn;
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;

	/* The value written is an address; convert it to a page frame. */
	pfn >>= PAGE_SHIFT;

	rc = memory_failure(pfn, 0, 0);
	if (rc)
		return rc;

	return count;
}
static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif
/*
 * Default phys_device lookup: always 0.
 *
 * phys_device is optional; it exists so that memory sections can be told
 * apart by the *physical* device they belong to.  Declared __weak, so
 * another definition of this symbol takes precedence when one is linked in.
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	const int no_phys_device = 0;

	return no_phys_device;
}
/*
 * Look up the memory block containing @section, optionally starting the
 * search from @hint.
 *
 * A reference for the returned object is held and the reference for the
 * hinted object is released.  Returns NULL when no block device with the
 * matching id exists.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = NULL;
	struct device *found;

	if (hint)
		hintdev = &hint->dev;

	found = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);

	/* The hint's reference is consumed whether or not the lookup hit. */
	if (hint)
		put_device(&hint->dev);

	return found ? to_memory_block(found) : NULL;
}
/*
 * Find the memory_block that contains @section (no search hint).
 *
 * For now this is a linear search for the block matching a particular
 * phys_index.  If that ever becomes a real problem, a radix tree or
 * similar could be used instead.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	struct memory_block *blk = find_memory_block_hinted(section, NULL);

	return blk;
}
/*
 * Attributes attached to every memory block device; installed through
 * memory->dev.groups in register_memory().  valid_zones and
 * allocated_bytes are only present under their respective config options.
 */
static struct attribute *memory_memblk_attrs[] = {
&dev_attr_phys_index.attr,
&dev_attr_state.attr,
&dev_attr_phys_device.attr,
&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
&dev_attr_valid_zones.attr,
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
&dev_attr_allocated_bytes.attr,
#endif
NULL
};
/* Unnamed group wrapping memory_memblk_attrs[]. */
static struct attribute_group memory_memblk_attr_group = {
.attrs = memory_memblk_attrs,
};
/* NULL-terminated group list assigned to each memory block device. */
static const struct attribute_group *memory_memblk_attr_groups[] = {
&memory_memblk_attr_group,
NULL,
};
/*
 * register_memory - Setup a sysfs device for a memory block
 *
 * Fills in the embedded struct device (subsystem, id, release callback,
 * attribute groups, initial offline flag) and registers it.
 *
 * Returns 0 on success or the error from device_register().  On failure
 * the device reference is dropped here, which frees @memory through
 * memory_block_release(); the caller must neither dereference nor free
 * @memory after an error return.
 */
static
int register_memory(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	/*
	 * device_register() requires put_device() even when it fails;
	 * returning without it would leak the memory_block.
	 */
	if (ret)
		put_device(&memory->dev);

	return ret;
}
/*
 * Allocate and register the memory block that contains @section.
 *
 * The block's section range is derived by rounding @section's number down
 * to a block boundary.  The new block is stored in *@memory whenever the
 * allocation succeeded, including when registration fails.
 *
 * Returns -ENOMEM if the allocation fails, otherwise the result of
 * register_memory().
 */
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *blk;
	unsigned long start_pfn;
	int section_nr;
	int rc;

	blk = kzalloc(sizeof(*blk), GFP_KERNEL);
	if (!blk)
		return -ENOMEM;

	section_nr = __section_nr(section);
	blk->start_section_nr =
		base_memory_block_id(section_nr) * sections_per_block;
	blk->end_section_nr = blk->start_section_nr + sections_per_block - 1;
	blk->state = state;

	start_pfn = section_nr_to_pfn(blk->start_section_nr);
	blk->phys_device = arch_get_memory_phys_device(start_pfn);

	rc = register_memory(blk);
	*memory = blk;

	return rc;
}
/*
 * Create the memory block device covering the sections starting at
 * @base_section_nr, in state MEM_ONLINE.
 *
 * Counts the present sections in the block's range (clamped to
 * NR_MEM_SECTIONS).  Nothing is created and 0 is returned if none are
 * present.  Otherwise the block is initialised from the first present
 * section and its section_count is set to the number found.
 */
static int add_memory_block(int base_section_nr)
{
	struct memory_block *blk;
	int first_present = -1;
	int nr_present = 0;
	int nr, rc;

	for (nr = base_section_nr;
	     (nr < base_section_nr + sections_per_block) && nr < NR_MEM_SECTIONS;
	     nr++) {
		if (present_section_nr(nr)) {
			if (nr_present == 0)
				first_present = nr;
			nr_present++;
		}
	}

	if (nr_present == 0)
		return 0;

	rc = init_memory_block(&blk, __nr_to_section(first_present), MEM_ONLINE);
	if (rc)
		return rc;

	blk->section_count = nr_present;
	return 0;
}
/*
 * Interface for the VM to add a new memory region without onlining it.
 *
 * Under mem_sysfs_mutex: account @section to its memory block, creating
 * the block (in MEM_OFFLINE state) if it does not exist yet.  Once the
 * block has all of its sections, it is registered under node @nid.
 *
 * Returns 0 on success, or the error from init_memory_block() or
 * register_mem_sect_under_node().
 */
int register_new_memory(int nid, struct mem_section *section)
{
	struct memory_block *blk;
	int rc = 0;

	mutex_lock(&mem_sysfs_mutex);

	blk = find_memory_block(section);
	if (!blk) {
		rc = init_memory_block(&blk, section, MEM_OFFLINE);
		if (rc)
			goto unlock;
		blk->section_count++;
	} else {
		blk->section_count++;
		/* Drop the reference taken by find_memory_block(). */
		put_device(&blk->dev);
	}

	if (blk->section_count == sections_per_block)
		rc = register_mem_sect_under_node(blk, nid);

unlock:
	mutex_unlock(&mem_sysfs_mutex);
	return rc;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
BUG_ON(memory->dev.bus != &memory_subsys);
/* drop the ref. we got in remove_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
}
/*
 * Drop @section from its memory block, unregistering the block device when
 * its last section goes away.  Runs under mem_sysfs_mutex and always
 * returns 0.  @node_id and @phys_device are not used.
 */
static int remove_memory_section(unsigned long node_id,
				 struct mem_section *section, int phys_device)
{
	struct memory_block *blk;

	mutex_lock(&mem_sysfs_mutex);

	/*
	 * Some users of the memory hotplug do not want/need memblock to
	 * track all sections.  Skip over those.
	 */
	blk = find_memory_block(section);
	if (blk) {
		unregister_mem_sect_under_nodes(blk, __section_nr(section));

		blk->section_count--;
		if (blk->section_count != 0)
			put_device(&blk->dev);
		else
			unregister_memory(blk);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}
/*
 * Remove @section from its memory block.  Returns -EINVAL if the section
 * is not present, otherwise the result of remove_memory_section().
 */
int unregister_memory_section(struct mem_section *section)
{
	if (present_section(section))
		return remove_memory_section(0, section, 0);

	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
/* Return true if @mem is in the MEM_OFFLINE state, false otherwise. */
bool is_memblock_offlined(struct memory_block *mem)
{
	bool offlined = (mem->state == MEM_OFFLINE);

	return offlined;
}
/*
 * Attributes of the memory subsystem root; passed to
 * subsys_system_register() in memory_dev_init().  Which entries exist
 * depends on the config options guarding them.
 */
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
&dev_attr_probe.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
&dev_attr_remove.attr,
#endif
#endif
#ifdef CONFIG_MEMORY_FAILURE
&dev_attr_soft_offline_page.attr,
&dev_attr_hard_offline_page.attr,
#endif
&dev_attr_block_size_bytes.attr,
&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_MEMORY_HOTPLUG
&dev_attr_aligned_blocks_addr.attr,
&dev_attr_aligned_blocks_num.attr,
#endif
NULL
};
/* Unnamed group wrapping memory_root_attrs[]. */
static struct attribute_group memory_root_attr_group = {
.attrs = memory_root_attrs,
};
/* NULL-terminated group list handed to subsys_system_register(). */
static const struct attribute_group *memory_root_attr_groups[] = {
&memory_root_attr_group,
NULL,
};
/*
 * Initialize the sysfs support for memory devices.
 *
 * Registers the memory subsystem, computes sections_per_block from the
 * validated block size, then creates a block device for each block up to
 * the highest present section.  Later blocks are still attempted after a
 * failure; the first error seen is returned and logged.
 */
int __init memory_dev_init(void)
{
	unsigned long block_bytes;
	unsigned int nr;
	int rc;

	rc = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (!rc) {
		block_bytes = get_memory_block_size();
		sections_per_block = block_bytes / MIN_MEMORY_BLOCK_SIZE;

		/*
		 * Create entries for memory sections that were found
		 * during boot and have been initialized.  Sections past
		 * __highest_present_section_nr are known to be !present,
		 * so the walk stops there.
		 */
		mutex_lock(&mem_sysfs_mutex);
		for (nr = 0;
		     nr < NR_MEM_SECTIONS && nr <= __highest_present_section_nr;
		     nr += sections_per_block) {
			int err = add_memory_block(nr);

			/* Remember only the first failure. */
			if (!rc)
				rc = err;
		}
		mutex_unlock(&mem_sysfs_mutex);
	}

	if (rc)
		printk(KERN_ERR "%s() failed: %d\n", __func__, rc);
	return rc;
}