Merge pull request #1006 from Pennyzct/kernel_fragment_on_aarch64

AArch64: Enable kernel fragment on aarch64
This commit is contained in:
James O. D. Hunt
2020-04-23 08:42:17 +01:00
committed by GitHub
26 changed files with 1100 additions and 324 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,5 @@
# ACPI on arm64 is dependent on uEFI.
CONFIG_EFI=y
CONFIG_EFI_STUB=y
# ARM64 can run properly in ACPI hardware reduced mode.
CONFIG_ACPI_REDUCED_HARDWARE_ONLY=y

View File

@@ -0,0 +1,42 @@
CONFIG_ARM64=y
CONFIG_ARM64_4K_PAGES=y
# ARM servers are often multi-cores, following configs improve
# the CPU scheduler's decision making.
CONFIG_SCHED_MC=y
CONFIG_SCHED_SMT=y
# Virtual address space size (48-bit)
CONFIG_ARM64_VA_BITS_48=y
CONFIG_ARM64_VA_BITS=48
# Physical address space size (48-bit)
CONFIG_ARM64_PA_BITS_48=y
CONFIG_ARM64_PA_BITS=48
# Use the maximum number of CPUs supported by KVM (255)
CONFIG_NR_CPUS=255
CONFIG_PERF_EVENTS=y
# No architected NMI
CONFIG_ARM64_PSEUDO_NMI=y
CONFIG_ARM64_SVE=y
# Arm64 prefers to use REFCOUNT_FULL by default.
CONFIG_REFCOUNT_FULL=y
#
# ARMv8.1 architectural features
#
CONFIG_ARM64_HW_AFDBM=y
CONFIG_ARM64_PAN=y
# end of ARMv8.1 architectural features
#
# ARMv8.2 architectural features
#
CONFIG_ARM64_CNP=y
CONFIG_ARM64_PMEM=y
CONFIG_ARM64_RAS_EXTN=y
CONFIG_ARM64_UAO=y
# end of ARMv8.2 architectural feature

View File

@@ -0,0 +1,6 @@
# ARMv8 adds cryptographic instructions that could significantly improve
# performance on tasks such as AES encryption and SHA1 and SHA256 hashing.
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_AES_ARM64=y
CONFIG_CRYPTO_AES_ARM64_CE=y
CONFIG_CRYPTO_SHA256_ARM64=y

View File

@@ -0,0 +1,4 @@
# Device Tree and Open Firmware support
CONFIG_DTC=y
CONFIG_OF=y
CONFIG_OF_PMEM=y

View File

@@ -0,0 +1,15 @@
# ARM errata workarounds via the alternatives framework.
# Vendor-specific option will be left to users to decide.
CONFIG_ARM64_ERRATUM_1024718=y
CONFIG_ARM64_ERRATUM_1165522=y
CONFIG_ARM64_ERRATUM_1286807=y
CONFIG_ARM64_ERRATUM_1463225=y
CONFIG_ARM64_ERRATUM_819472=y
CONFIG_ARM64_ERRATUM_824069=y
CONFIG_ARM64_ERRATUM_826319=y
CONFIG_ARM64_ERRATUM_827319=y
CONFIG_ARM64_ERRATUM_832075=y
CONFIG_ARM64_ERRATUM_843419=y
CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
CONFIG_ARM64_WORKAROUND_REPEAT_TLBI=y

View File

@@ -0,0 +1,3 @@
# It brings PCI support to mach-virt based upon an idealised host controller.
CONFIG_PCI_HOST_COMMON=y
CONFIG_PCI_HOST_GENERIC=y

View File

@@ -0,0 +1,7 @@
# PTP clock support
#
# The implementation of ptp_kvm on arm is one experimental feature,
# you need to apply private patches to enable it on your host machine.
# See https://github.com/kata-containers/packaging/pull/998 for detailed info.
CONFIG_PTP_1588_CLOCK=y
CONFIG_PTP_1588_CLOCK_KVM=y

View File

@@ -0,0 +1,10 @@
CONFIG_RTC_LIB=y
CONFIG_RTC_CLASS=y
CONFIG_RTC_HCTOSYS=y
CONFIG_RTC_SYSTOHC=y
# RTC interfaces
CONFIG_RTC_INTF_SYSFS=y
CONFIG_RTC_INTF_PROC=y
CONFIG_RTC_INTF_DEV=y
# QEMU provides an emulated ARM AMBA PrimeCell PL031 RTC.
CONFIG_RTC_DRV_PL031=y

View File

@@ -0,0 +1,3 @@
# This option is used for all 8250 compatible serial ports
# that are probed through device tree.
CONFIG_SERIAL_OF_PLATFORM=y

View File

@@ -3,13 +3,7 @@
# https://github.com/kata-containers/packaging/issues/483
CONFIG_ARCH_SUPPORTS_ACPI=y
CONFIG_ACPI=y
CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y
CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y
CONFIG_ACPI_LPIT=y
CONFIG_ACPI_BUTTON=y
CONFIG_ACPI_CPU_FREQ_PSS=y
CONFIG_ACPI_PROCESSOR_CSTATE=y
CONFIG_ACPI_PROCESSOR_IDLE=y
# Having trouble enabling this - disable for now.
# Would add support for ACPI CPPC power control via firmware - do we need
@@ -22,7 +16,5 @@ CONFIG_ACPI_TABLE_UPGRADE=y
CONFIG_ACPI_PCI_SLOT=y
CONFIG_ACPI_CONTAINER=y
CONFIG_ACPI_HOTPLUG_MEMORY=y
CONFIG_ACPI_HOTPLUG_IOAPIC=y
CONFIG_ACPI_NFIT=y
CONFIG_HAVE_ACPI_APEI=y
CONFIG_HAVE_ACPI_APEI_NMI=y

View File

@@ -1,9 +1,7 @@
# Basic necessary items!
CONFIG_SMP=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_KVM_GUEST=y
# Note, no nested VM support enabled here
# Turn off embedded mode, as it disabled 'too much', and we
@@ -23,7 +21,6 @@ CONFIG_FUTEX=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_MSI_IRQ_DOMAIN=y
CONFIG_GENERIC_MSI_IRQ=y
CONFIG_LEGACY_VSYSCALL_NONE=y
CONFIG_NO_HZ=y
CONFIG_NO_HZ_FULL=y
CONFIG_POSIX_MQUEUE=y
@@ -34,14 +31,11 @@ CONFIG_SHMEM=y
# For security...
CONFIG_RELOCATABLE=y
# FIXME - check if we should be setting this
# https://github.com/kata-containers/packaging/issues/483
#CONFIG_RANDOMIZE_BASE=y
CONFIG_RANDOMIZE_BASE=y
# FIXME - check if we should be setting this
# https://github.com/kata-containers/packaging/issues/483
# I have a feeling it effects our memory hotplug maybe?
# PHYSICAL_ALIGN=0x1000000
CONFIG_RETPOLINE=y
# This would only affect two drivers, neither of which we have enabled.
# The recommendation is to have it on, and you will see if in a diff if you
@@ -54,4 +48,4 @@ CONFIG_RETPOLINE=y
# This can still be dynamically disabled on the kernel command line/kata config if needed.
# Disable for now, as it upsets the entropy test, and we need to improve those: FIXME: see:
# https://github.com/kata-containers/tests/issues/1543
# CONFIG_RANDOM_TRUST_CPU is not set
# RANDOM_TRUST_CPU=y

View File

@@ -4,7 +4,6 @@ CONFIG_MEMORY_HOTPLUG=y
CONFIG_HOTPLUG_CPU=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_PCIE=y
CONFIG_HOTPLUG_PCI_SHPC=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI_ACPI=y
CONFIG_PNPACPI=y

View File

@@ -1,5 +1,10 @@
# Items to enable large/huge mmu pages and tlbs etc.
# Compaction is the only memory management component to form high order
# (larger physically contiguous) memory blocks reliably. The lack of the
# feature can lead to unexpected OOM killer invocations for high order memory requests.
CONFIG_COMPACTION=y
CONFIG_HUGETLBFS=y
# Enable memory page physical migration here, as it can come

View File

@@ -0,0 +1,3 @@
# mmio devices are required for firecracker
CONFIG_VIRTIO_MMIO=y
CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y

View File

@@ -3,4 +3,3 @@
# vmap the kernel stacks - detects stack over-runs better and reduces
# the stack attack window.
CONFIG_VMAP_STACK=y

View File

@@ -3,4 +3,4 @@
# Estimated cost (detailed in the kernel config files)
# is maybe 2.3% for both
CONFIG_STACKPROTECTOR=y
CONFIG_STACKPROTECTOR_STRONG
CONFIG_STACKPROTECTOR_STRONG=y

View File

@@ -3,3 +3,12 @@ CONFIG_X86_INTEL_PSTATE=y
# For old smp systems that do not have proper acpi support.
# Firecracker needs this to support `vcpu_count`
CONFIG_X86_MPPARSE=y
CONFIG_ACPI_CPU_FREQ_PSS=y
CONFIG_ACPI_HOTPLUG_IOAPIC=y
CONFIG_ACPI_LEGACY_TABLES_LOOKUP
CONFIG_ACPI_LPIT=y
CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y
CONFIG_ACPI_PROCESSOR_CSTATE=y
CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y
CONFIG_HAVE_ACPI_APEI_NMI=y

View File

@@ -4,5 +4,13 @@ CONFIG_X86_MSR=y
CONFIG_X86_X2APIC=y
CONFIG_X86_VERBOSE_BOOTUP=y
# Configs around linux guest support and optimizations.
CONFIG_HYPERVISOR_GUEST=y
CONFIG_KVM_GUEST=y
# Use the maximum number of CPUs supported by KVM (240)
CONFIG_NR_CPUS=240
# For security
CONFIG_LEGACY_VSYSCALL_NONE=y
CONFIG_RETPOLINE=y

View File

@@ -0,0 +1,5 @@
# Since we disable pci shpc hotplug for arm64,
# See https://github.com/kata-containers/packaging/pull/498
# for detailed reasons.
# we move this config into x86_64-specific.
CONFIG_HOTPLUG_PCI_SHPC=y

View File

@@ -1,5 +0,0 @@
# x86 specific mmio related items
# Next config are required for firecracker
CONFIG_VIRTIO_MMIO=y
CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y

View File

@@ -1 +1 @@
74
75

View File

@@ -0,0 +1,498 @@
From ba91422b18892bceacf3b4aa60354cf36fcabf9b Mon Sep 17 00:00:00 2001
From: Penny Zheng <penny.zheng@arm.com>
Date: Wed, 8 Apr 2020 10:26:52 +0800
Subject: [PATCH] arm64/mm: Enable memory hot remove
Backport Anshuman Khandual's patch series of Enabling memory hot
remove on aarch64(https://patchwork.kernel.org/cover/11419305/)
to v5.4.x.
This patch series has already been merged, and queued for 5.7.
Signed-off-by: Penny Zheng <penny.zheng@arm.com>
---
arch/arm64/Kconfig | 3 +
arch/arm64/include/asm/memory.h | 1 +
arch/arm64/mm/mmu.c | 379 +++++++++++++++++++++++++++++++-
arch/arm64/mm/ptdump_debugfs.c | 4 +
4 files changed, 378 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6ccd2ed30963..d18b716fa569 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -274,6 +274,9 @@ config ZONE_DMA32
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+
config SMP
def_bool y
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index c23c47360664..dbba06e258f5 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -54,6 +54,7 @@
#define MODULES_VADDR (BPF_JIT_REGION_END)
#define MODULES_VSIZE (SZ_128M)
#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M)
+#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
#define PCI_IO_END (VMEMMAP_START - SZ_2M)
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP (PCI_IO_START - SZ_2M)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d10247fab0fd..99fec235144e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -17,6 +17,7 @@
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -725,6 +726,312 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(pte));
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size)
+{
+ WARN_ON(PageReserved(page));
+ free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+ free_hotplug_page_range(page, PAGE_SIZE);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling,
+ unsigned long mask)
+{
+ start &= mask;
+ if (start < floor)
+ return false;
+
+ if (ceiling) {
+ ceiling &= mask;
+ if (!ceiling)
+ return false;
+ }
+
+ if (end - 1 > ceiling - 1)
+ return false;
+ return true;
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ pte_t *ptep, pte;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+ if (pte_none(pte))
+ continue;
+
+ WARN_ON(!pte_present(pte));
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
+ } while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd));
+ if (pmd_sect(pmd)) {
+ pmd_clear(pmdp);
+
+ /*
+ * One TLBI should be sufficient here as the PMD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pmd_page(pmd),
+ PMD_SIZE);
+ continue;
+ }
+ WARN_ON(!pmd_table(pmd));
+ unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud));
+ if (pud_sect(pud)) {
+ pud_clear(pudp);
+
+ /*
+ * One TLBI should be sufficient here as the PUD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pud_page(pud),
+ PUD_SIZE);
+ continue;
+ }
+ WARN_ON(!pud_table(pud));
+ unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, bool free_mapped)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+ bool free_mapped)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pte_t *ptep, pte;
+ unsigned long i, start = addr;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+
+ /*
+ * This is just a sanity check here which verifies that
+ * pte clearing has been done by earlier unmap loops.
+ */
+ WARN_ON(!pte_none(pte));
+ } while (addr += PAGE_SIZE, addr < end);
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pte page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ ptep = pte_offset_kernel(pmdp, 0UL);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (!pte_none(READ_ONCE(ptep[i])))
+ return;
+ }
+
+ pmd_clear(pmdp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(ptep));
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pmd_t *pmdp, pmd;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ free_empty_pte_table(pmdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 2)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pmd page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pmdp = pmd_offset(pudp, 0UL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(READ_ONCE(pmdp[i])))
+ return;
+ }
+
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pmdp));
+}
+
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pud_t *pudp, pud;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ free_empty_pmd_table(pudp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 3)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pud page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pudp = pud_offset(p4dp, 0UL);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_none(READ_ONCE(pudp[i])))
+ return;
+ }
+
+ p4d_clear(p4dp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pudp));
+}
+
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ free_empty_pud_table(p4dp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#if !ARM64_SWAPPER_USES_SECTION_MAPS
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -772,6 +1079,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
+ unmap_hotplug_range(start, end, true);
+ free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
+#endif
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
@@ -1050,10 +1363,21 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
+{
+ unsigned long end = start + size;
+
+ WARN_ON(pgdir != init_mm.pgd);
+ WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
+
+ unmap_hotplug_range(start, end, false);
+ free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
+}
+
int arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions)
{
- int flags = 0;
+ int ret, flags = 0;
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -1061,22 +1385,59 @@ int arch_add_memory(int nid, u64 start, u64 size,
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
- return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
restrictions);
+ if (ret)
+ __remove_pgd_mapping(swapper_pg_dir,
+ __phys_to_virt(start), size);
+ return ret;
}
+
void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- /*
- * FIXME: Cleanup page tables (also in arch_add_memory() in case
- * adding fails). Until then, this function should only be used
- * during memory hotplug (adding memory), not for memory
- * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
- * unlocked yet.
- */
__remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
+}
+
+/*
+ * This memory hotplug notifier helps prevent boot memory from being
+ * inadvertently removed as it blocks pfn range offlining process in
+ * __offline_pages(). Hence this prevents both offlining as well as
+ * removal process for boot memory which is initially always online.
+ * In future if and when boot memory could be removed, this notifier
+ * should be dropped and free_hotplug_page_range() should handle any
+ * reserved pages allocated during boot.
+ */
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct mem_section *ms;
+ struct memory_notify *arg = data;
+ unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+ unsigned long pfn = arg->start_pfn;
+
+ if (action != MEM_GOING_OFFLINE)
+ return NOTIFY_OK;
+
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ ms = __pfn_to_section(pfn);
+ if (early_section(ms))
+ return NOTIFY_BAD;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block prevent_bootmem_remove_nb = {
+ .notifier_call = prevent_bootmem_remove_notifier,
+};
+
+static int __init prevent_bootmem_remove_init(void)
+{
+ return register_memory_notifier(&prevent_bootmem_remove_nb);
}
+device_initcall(prevent_bootmem_remove_init);
#endif
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 064163f25592..b5eebc8c4924 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
+#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <asm/ptdump.h>
@@ -7,7 +8,10 @@
static int ptdump_show(struct seq_file *m, void *v)
{
struct ptdump_info *info = m->private;
+
+ get_online_mems();
ptdump_walk_pgd(m, info);
+ put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);
--
2.17.1