mirror of
https://github.com/aljazceru/kata-containers.git
synced 2026-02-22 23:14:21 +01:00
Merge pull request #1006 from Pennyzct/kernel_fragment_on_aarch64
AArch64: Enable kernel fragment on aarch64
This commit is contained in:
File diff suppressed because it is too large
Load Diff
5
kernel/configs/fragments/arm64/acpi.conf
Normal file
5
kernel/configs/fragments/arm64/acpi.conf
Normal file
@@ -0,0 +1,5 @@
|
||||
# ACPI on arm64 depends on UEFI.
|
||||
CONFIG_EFI=y
|
||||
CONFIG_EFI_STUB=y
|
||||
# ARM64 can run properly in ACPI hardware reduced mode.
|
||||
CONFIG_ACPI_REDUCED_HARDWARE_ONLY=y
|
||||
42
kernel/configs/fragments/arm64/base.conf
Normal file
42
kernel/configs/fragments/arm64/base.conf
Normal file
@@ -0,0 +1,42 @@
|
||||
CONFIG_ARM64=y
|
||||
CONFIG_ARM64_4K_PAGES=y
|
||||
|
||||
# ARM servers are often multi-core; the following configs improve
|
||||
# the CPU scheduler's decision making.
|
||||
CONFIG_SCHED_MC=y
|
||||
CONFIG_SCHED_SMT=y
|
||||
|
||||
# Virtual address space size (48-bit)
|
||||
CONFIG_ARM64_VA_BITS_48=y
|
||||
CONFIG_ARM64_VA_BITS=48
|
||||
# Physical address space size (48-bit)
|
||||
CONFIG_ARM64_PA_BITS_48=y
|
||||
CONFIG_ARM64_PA_BITS=48
|
||||
|
||||
# Use the maximum number of CPUs supported by KVM (255)
|
||||
CONFIG_NR_CPUS=255
|
||||
|
||||
CONFIG_PERF_EVENTS=y
|
||||
|
||||
# arm64 has no architected NMI, so enable the pseudo-NMI support.
|
||||
CONFIG_ARM64_PSEUDO_NMI=y
|
||||
CONFIG_ARM64_SVE=y
|
||||
|
||||
# Arm64 prefers to use REFCOUNT_FULL by default.
|
||||
CONFIG_REFCOUNT_FULL=y
|
||||
|
||||
#
|
||||
# ARMv8.1 architectural features
|
||||
#
|
||||
CONFIG_ARM64_HW_AFDBM=y
|
||||
CONFIG_ARM64_PAN=y
|
||||
# end of ARMv8.1 architectural features
|
||||
|
||||
#
|
||||
# ARMv8.2 architectural features
|
||||
#
|
||||
CONFIG_ARM64_CNP=y
|
||||
CONFIG_ARM64_PMEM=y
|
||||
CONFIG_ARM64_RAS_EXTN=y
|
||||
CONFIG_ARM64_UAO=y
|
||||
# end of ARMv8.2 architectural features
|
||||
6
kernel/configs/fragments/arm64/crypto.conf
Normal file
6
kernel/configs/fragments/arm64/crypto.conf
Normal file
@@ -0,0 +1,6 @@
|
||||
# ARMv8 adds cryptographic instructions that could significantly improve
|
||||
# performance on tasks such as AES encryption and SHA1 and SHA256 hashing.
|
||||
CONFIG_ARM64_CRYPTO=y
|
||||
CONFIG_CRYPTO_AES_ARM64=y
|
||||
CONFIG_CRYPTO_AES_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA256_ARM64=y
|
||||
4
kernel/configs/fragments/arm64/dt.conf
Normal file
4
kernel/configs/fragments/arm64/dt.conf
Normal file
@@ -0,0 +1,4 @@
|
||||
# Device Tree and Open Firmware support
|
||||
CONFIG_DTC=y
|
||||
CONFIG_OF=y
|
||||
CONFIG_OF_PMEM=y
|
||||
15
kernel/configs/fragments/arm64/erratum.conf
Normal file
15
kernel/configs/fragments/arm64/erratum.conf
Normal file
@@ -0,0 +1,15 @@
|
||||
# ARM errata workarounds via the alternatives framework.
|
||||
# Vendor-specific options are left for users to decide.
|
||||
CONFIG_ARM64_ERRATUM_1024718=y
|
||||
CONFIG_ARM64_ERRATUM_1165522=y
|
||||
CONFIG_ARM64_ERRATUM_1286807=y
|
||||
CONFIG_ARM64_ERRATUM_1463225=y
|
||||
CONFIG_ARM64_ERRATUM_819472=y
|
||||
CONFIG_ARM64_ERRATUM_824069=y
|
||||
CONFIG_ARM64_ERRATUM_826319=y
|
||||
CONFIG_ARM64_ERRATUM_827319=y
|
||||
CONFIG_ARM64_ERRATUM_832075=y
|
||||
CONFIG_ARM64_ERRATUM_843419=y
|
||||
CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
|
||||
CONFIG_ARM64_WORKAROUND_REPEAT_TLBI=y
|
||||
|
||||
3
kernel/configs/fragments/arm64/pci.conf
Normal file
3
kernel/configs/fragments/arm64/pci.conf
Normal file
@@ -0,0 +1,3 @@
|
||||
# It brings PCI support to mach-virt based upon an idealised host controller.
|
||||
CONFIG_PCI_HOST_COMMON=y
|
||||
CONFIG_PCI_HOST_GENERIC=y
|
||||
7
kernel/configs/fragments/arm64/ptp.conf
Normal file
7
kernel/configs/fragments/arm64/ptp.conf
Normal file
@@ -0,0 +1,7 @@
|
||||
# PTP clock support
|
||||
#
|
||||
# The implementation of ptp_kvm on arm is an experimental feature;
|
||||
# you need to apply private patches to enable it on your host machine.
|
||||
# See https://github.com/kata-containers/packaging/pull/998 for detailed info.
|
||||
CONFIG_PTP_1588_CLOCK=y
|
||||
CONFIG_PTP_1588_CLOCK_KVM=y
|
||||
10
kernel/configs/fragments/arm64/rtc.conf
Normal file
10
kernel/configs/fragments/arm64/rtc.conf
Normal file
@@ -0,0 +1,10 @@
|
||||
CONFIG_RTC_LIB=y
|
||||
CONFIG_RTC_CLASS=y
|
||||
CONFIG_RTC_HCTOSYS=y
|
||||
CONFIG_RTC_SYSTOHC=y
|
||||
# RTC interfaces
|
||||
CONFIG_RTC_INTF_SYSFS=y
|
||||
CONFIG_RTC_INTF_PROC=y
|
||||
CONFIG_RTC_INTF_DEV=y
|
||||
# QEMU provides an emulated ARM AMBA PrimeCell PL031 RTC.
|
||||
CONFIG_RTC_DRV_PL031=y
|
||||
3
kernel/configs/fragments/arm64/serial.conf
Normal file
3
kernel/configs/fragments/arm64/serial.conf
Normal file
@@ -0,0 +1,3 @@
|
||||
# This option is used for all 8250 compatible serial ports
|
||||
# that are probed through device tree.
|
||||
CONFIG_SERIAL_OF_PLATFORM=y
|
||||
@@ -3,13 +3,7 @@
|
||||
# https://github.com/kata-containers/packaging/issues/483
|
||||
CONFIG_ARCH_SUPPORTS_ACPI=y
|
||||
CONFIG_ACPI=y
|
||||
CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
|
||||
CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y
|
||||
CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y
|
||||
CONFIG_ACPI_LPIT=y
|
||||
CONFIG_ACPI_BUTTON=y
|
||||
CONFIG_ACPI_CPU_FREQ_PSS=y
|
||||
CONFIG_ACPI_PROCESSOR_CSTATE=y
|
||||
CONFIG_ACPI_PROCESSOR_IDLE=y
|
||||
# Having trouble enabling this - disable for now.
|
||||
# Would add support for ACPI CPPC power control via firmware - do we need
|
||||
@@ -22,7 +16,5 @@ CONFIG_ACPI_TABLE_UPGRADE=y
|
||||
CONFIG_ACPI_PCI_SLOT=y
|
||||
CONFIG_ACPI_CONTAINER=y
|
||||
CONFIG_ACPI_HOTPLUG_MEMORY=y
|
||||
CONFIG_ACPI_HOTPLUG_IOAPIC=y
|
||||
CONFIG_ACPI_NFIT=y
|
||||
CONFIG_HAVE_ACPI_APEI=y
|
||||
CONFIG_HAVE_ACPI_APEI_NMI=y
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
# Basic necessary items!
|
||||
|
||||
CONFIG_SMP=y
|
||||
CONFIG_HYPERVISOR_GUEST=y
|
||||
CONFIG_PARAVIRT=y
|
||||
CONFIG_KVM_GUEST=y
|
||||
# Note, no nested VM support enabled here
|
||||
|
||||
# Turn off embedded mode, as it disabled 'too much', and we
|
||||
@@ -23,7 +21,6 @@ CONFIG_FUTEX=y
|
||||
CONFIG_HIGH_RES_TIMERS=y
|
||||
CONFIG_GENERIC_MSI_IRQ_DOMAIN=y
|
||||
CONFIG_GENERIC_MSI_IRQ=y
|
||||
CONFIG_LEGACY_VSYSCALL_NONE=y
|
||||
CONFIG_NO_HZ=y
|
||||
CONFIG_NO_HZ_FULL=y
|
||||
CONFIG_POSIX_MQUEUE=y
|
||||
@@ -34,14 +31,11 @@ CONFIG_SHMEM=y
|
||||
|
||||
# For security...
|
||||
CONFIG_RELOCATABLE=y
|
||||
# FIXME - check if we should be setting this
|
||||
# https://github.com/kata-containers/packaging/issues/483
|
||||
#CONFIG_RANDOMIZE_BASE=y
|
||||
CONFIG_RANDOMIZE_BASE=y
|
||||
# FIXME - check if we should be setting this
|
||||
# https://github.com/kata-containers/packaging/issues/483
|
||||
# I have a feeling it affects our memory hotplug maybe?
|
||||
# PHYSICAL_ALIGN=0x1000000
|
||||
CONFIG_RETPOLINE=y
|
||||
|
||||
# This would only affect two drivers, neither of which we have enabled.
|
||||
# The recommendation is to have it on, and you will see it in a diff if you
|
||||
@@ -54,4 +48,4 @@ CONFIG_RETPOLINE=y
|
||||
# This can still be dynamically disabled on the kernel command line/kata config if needed.
|
||||
# Disable for now, as it upsets the entropy test, and we need to improve those: FIXME: see:
|
||||
# https://github.com/kata-containers/tests/issues/1543
|
||||
# CONFIG_RANDOM_TRUST_CPU is not set
|
||||
# RANDOM_TRUST_CPU=y
|
||||
|
||||
@@ -4,7 +4,6 @@ CONFIG_MEMORY_HOTPLUG=y
|
||||
CONFIG_HOTPLUG_CPU=y
|
||||
CONFIG_HOTPLUG_PCI=y
|
||||
CONFIG_HOTPLUG_PCI_PCIE=y
|
||||
CONFIG_HOTPLUG_PCI_SHPC=y
|
||||
CONFIG_PCIEPORTBUS=y
|
||||
CONFIG_HOTPLUG_PCI_ACPI=y
|
||||
CONFIG_PNPACPI=y
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
# Items to enable large/huge mmu pages and tlbs etc.
|
||||
|
||||
# Compaction is the only memory management component to form high order
|
||||
# (larger physically contiguous) memory blocks reliably. The lack of the
|
||||
# feature can lead to unexpected OOM killer invocations for high order memory requests.
|
||||
CONFIG_COMPACTION=y
|
||||
|
||||
CONFIG_HUGETLBFS=y
|
||||
|
||||
# Enable memory page physical migration here, as it can come
|
||||
|
||||
3
kernel/configs/fragments/common/mmio.conf
Normal file
3
kernel/configs/fragments/common/mmio.conf
Normal file
@@ -0,0 +1,3 @@
|
||||
# mmio devices are required for firecracker
|
||||
CONFIG_VIRTIO_MMIO=y
|
||||
CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
|
||||
@@ -3,4 +3,3 @@
|
||||
# vmap the kernel stacks - detects stack over-runs better and reduces
|
||||
# the stack attack window.
|
||||
CONFIG_VMAP_STACK=y
|
||||
|
||||
|
||||
@@ -3,4 +3,4 @@
|
||||
# Estimated cost (detailed in the kernel config files)
|
||||
# is maybe 2.3% for both
|
||||
CONFIG_STACKPROTECTOR=y
|
||||
CONFIG_STACKPROTECTOR_STRONG
|
||||
CONFIG_STACKPROTECTOR_STRONG=y
|
||||
|
||||
@@ -3,3 +3,12 @@ CONFIG_X86_INTEL_PSTATE=y
|
||||
# For old smp systems that do not have proper acpi support.
|
||||
# Firecracker needs this to support `vcpu_count`
|
||||
CONFIG_X86_MPPARSE=y
|
||||
|
||||
CONFIG_ACPI_CPU_FREQ_PSS=y
|
||||
CONFIG_ACPI_HOTPLUG_IOAPIC=y
|
||||
CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y
|
||||
CONFIG_ACPI_LPIT=y
|
||||
CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y
|
||||
CONFIG_ACPI_PROCESSOR_CSTATE=y
|
||||
CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y
|
||||
CONFIG_HAVE_ACPI_APEI_NMI=y
|
||||
|
||||
@@ -4,5 +4,13 @@ CONFIG_X86_MSR=y
|
||||
CONFIG_X86_X2APIC=y
|
||||
CONFIG_X86_VERBOSE_BOOTUP=y
|
||||
|
||||
# Configs around linux guest support and optimizations.
|
||||
CONFIG_HYPERVISOR_GUEST=y
|
||||
CONFIG_KVM_GUEST=y
|
||||
|
||||
# Use the maximum number of CPUs supported by KVM (240)
|
||||
CONFIG_NR_CPUS=240
|
||||
|
||||
# For security
|
||||
CONFIG_LEGACY_VSYSCALL_NONE=y
|
||||
CONFIG_RETPOLINE=y
|
||||
|
||||
5
kernel/configs/fragments/x86_64/hotplug.conf
Normal file
5
kernel/configs/fragments/x86_64/hotplug.conf
Normal file
@@ -0,0 +1,5 @@
|
||||
# PCI SHPC hotplug is disabled for arm64; see
|
||||
# https://github.com/kata-containers/packaging/pull/498
|
||||
# for the detailed reasons. The option therefore lives in
|
||||
# this x86_64-specific fragment instead of the common one.
|
||||
CONFIG_HOTPLUG_PCI_SHPC=y
|
||||
@@ -1,5 +0,0 @@
|
||||
# x86 specific mmio related items
|
||||
|
||||
# Next config are required for firecracker
|
||||
CONFIG_VIRTIO_MMIO=y
|
||||
CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y
|
||||
@@ -1 +1 @@
|
||||
74
|
||||
75
|
||||
|
||||
@@ -0,0 +1,498 @@
|
||||
From ba91422b18892bceacf3b4aa60354cf36fcabf9b Mon Sep 17 00:00:00 2001
|
||||
From: Penny Zheng <penny.zheng@arm.com>
|
||||
Date: Wed, 8 Apr 2020 10:26:52 +0800
|
||||
Subject: [PATCH] arm64/mm: Enable memory hot remove
|
||||
|
||||
Backport Anshuman Khandual's patch series of Enabling memory hot
|
||||
remove on aarch64(https://patchwork.kernel.org/cover/11419305/)
|
||||
to v5.4.x.
|
||||
This patch series has already been merged, and queued for 5.7.
|
||||
|
||||
Signed-off-by: Penny Zheng <penny.zheng@arm.com>
|
||||
---
|
||||
arch/arm64/Kconfig | 3 +
|
||||
arch/arm64/include/asm/memory.h | 1 +
|
||||
arch/arm64/mm/mmu.c | 379 +++++++++++++++++++++++++++++++-
|
||||
arch/arm64/mm/ptdump_debugfs.c | 4 +
|
||||
4 files changed, 378 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
|
||||
index 6ccd2ed30963..d18b716fa569 100644
|
||||
--- a/arch/arm64/Kconfig
|
||||
+++ b/arch/arm64/Kconfig
|
||||
@@ -274,6 +274,9 @@ config ZONE_DMA32
|
||||
config ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
def_bool y
|
||||
|
||||
+config ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
+ def_bool y
|
||||
+
|
||||
config SMP
|
||||
def_bool y
|
||||
|
||||
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
|
||||
index c23c47360664..dbba06e258f5 100644
|
||||
--- a/arch/arm64/include/asm/memory.h
|
||||
+++ b/arch/arm64/include/asm/memory.h
|
||||
@@ -54,6 +54,7 @@
|
||||
#define MODULES_VADDR (BPF_JIT_REGION_END)
|
||||
#define MODULES_VSIZE (SZ_128M)
|
||||
#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M)
|
||||
+#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
|
||||
#define PCI_IO_END (VMEMMAP_START - SZ_2M)
|
||||
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
|
||||
#define FIXADDR_TOP (PCI_IO_START - SZ_2M)
|
||||
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
|
||||
index d10247fab0fd..99fec235144e 100644
|
||||
--- a/arch/arm64/mm/mmu.c
|
||||
+++ b/arch/arm64/mm/mmu.c
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <linux/mman.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/memblock.h>
|
||||
+#include <linux/memory.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/mm.h>
|
||||
@@ -725,6 +726,312 @@ int kern_addr_valid(unsigned long addr)
|
||||
|
||||
return pfn_valid(pte_pfn(pte));
|
||||
}
|
||||
+
|
||||
+#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
+static void free_hotplug_page_range(struct page *page, size_t size)
|
||||
+{
|
||||
+ WARN_ON(PageReserved(page));
|
||||
+ free_pages((unsigned long)page_address(page), get_order(size));
|
||||
+}
|
||||
+
|
||||
+static void free_hotplug_pgtable_page(struct page *page)
|
||||
+{
|
||||
+ free_hotplug_page_range(page, PAGE_SIZE);
|
||||
+}
|
||||
+
|
||||
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
|
||||
+ unsigned long floor, unsigned long ceiling,
|
||||
+ unsigned long mask)
|
||||
+{
|
||||
+ start &= mask;
|
||||
+ if (start < floor)
|
||||
+ return false;
|
||||
+
|
||||
+ if (ceiling) {
|
||||
+ ceiling &= mask;
|
||||
+ if (!ceiling)
|
||||
+ return false;
|
||||
+ }
|
||||
+
|
||||
+ if (end - 1 > ceiling - 1)
|
||||
+ return false;
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
|
||||
+ unsigned long end, bool free_mapped)
|
||||
+{
|
||||
+ pte_t *ptep, pte;
|
||||
+
|
||||
+ do {
|
||||
+ ptep = pte_offset_kernel(pmdp, addr);
|
||||
+ pte = READ_ONCE(*ptep);
|
||||
+ if (pte_none(pte))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pte_present(pte));
|
||||
+ pte_clear(&init_mm, addr, ptep);
|
||||
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
||||
+ if (free_mapped)
|
||||
+ free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
|
||||
+ } while (addr += PAGE_SIZE, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
|
||||
+ unsigned long end, bool free_mapped)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ pmd_t *pmdp, pmd;
|
||||
+
|
||||
+ do {
|
||||
+ next = pmd_addr_end(addr, end);
|
||||
+ pmdp = pmd_offset(pudp, addr);
|
||||
+ pmd = READ_ONCE(*pmdp);
|
||||
+ if (pmd_none(pmd))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pmd_present(pmd));
|
||||
+ if (pmd_sect(pmd)) {
|
||||
+ pmd_clear(pmdp);
|
||||
+
|
||||
+ /*
|
||||
+ * One TLBI should be sufficient here as the PMD_SIZE
|
||||
+ * range is mapped with a single block entry.
|
||||
+ */
|
||||
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
||||
+ if (free_mapped)
|
||||
+ free_hotplug_page_range(pmd_page(pmd),
|
||||
+ PMD_SIZE);
|
||||
+ continue;
|
||||
+ }
|
||||
+ WARN_ON(!pmd_table(pmd));
|
||||
+ unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
|
||||
+ unsigned long end, bool free_mapped)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ pud_t *pudp, pud;
|
||||
+
|
||||
+ do {
|
||||
+ next = pud_addr_end(addr, end);
|
||||
+ pudp = pud_offset(p4dp, addr);
|
||||
+ pud = READ_ONCE(*pudp);
|
||||
+ if (pud_none(pud))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pud_present(pud));
|
||||
+ if (pud_sect(pud)) {
|
||||
+ pud_clear(pudp);
|
||||
+
|
||||
+ /*
|
||||
+ * One TLBI should be sufficient here as the PUD_SIZE
|
||||
+ * range is mapped with a single block entry.
|
||||
+ */
|
||||
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
||||
+ if (free_mapped)
|
||||
+ free_hotplug_page_range(pud_page(pud),
|
||||
+ PUD_SIZE);
|
||||
+ continue;
|
||||
+ }
|
||||
+ WARN_ON(!pud_table(pud));
|
||||
+ unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
|
||||
+ unsigned long end, bool free_mapped)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ p4d_t *p4dp, p4d;
|
||||
+
|
||||
+ do {
|
||||
+ next = p4d_addr_end(addr, end);
|
||||
+ p4dp = p4d_offset(pgdp, addr);
|
||||
+ p4d = READ_ONCE(*p4dp);
|
||||
+ if (p4d_none(p4d))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!p4d_present(p4d));
|
||||
+ unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
|
||||
+ bool free_mapped)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ pgd_t *pgdp, pgd;
|
||||
+
|
||||
+ do {
|
||||
+ next = pgd_addr_end(addr, end);
|
||||
+ pgdp = pgd_offset_k(addr);
|
||||
+ pgd = READ_ONCE(*pgdp);
|
||||
+ if (pgd_none(pgd))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pgd_present(pgd));
|
||||
+ unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
|
||||
+ unsigned long end, unsigned long floor,
|
||||
+ unsigned long ceiling)
|
||||
+{
|
||||
+ pte_t *ptep, pte;
|
||||
+ unsigned long i, start = addr;
|
||||
+
|
||||
+ do {
|
||||
+ ptep = pte_offset_kernel(pmdp, addr);
|
||||
+ pte = READ_ONCE(*ptep);
|
||||
+
|
||||
+ /*
|
||||
+ * This is just a sanity check here which verifies that
|
||||
+ * pte clearing has been done by earlier unmap loops.
|
||||
+ */
|
||||
+ WARN_ON(!pte_none(pte));
|
||||
+ } while (addr += PAGE_SIZE, addr < end);
|
||||
+
|
||||
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Check whether we can free the pte page if the rest of the
|
||||
+ * entries are empty. Overlap with other regions have been
|
||||
+ * handled by the floor/ceiling check.
|
||||
+ */
|
||||
+ ptep = pte_offset_kernel(pmdp, 0UL);
|
||||
+ for (i = 0; i < PTRS_PER_PTE; i++) {
|
||||
+ if (!pte_none(READ_ONCE(ptep[i])))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ pmd_clear(pmdp);
|
||||
+ __flush_tlb_kernel_pgtable(start);
|
||||
+ free_hotplug_pgtable_page(virt_to_page(ptep));
|
||||
+}
|
||||
+
|
||||
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
|
||||
+ unsigned long end, unsigned long floor,
|
||||
+ unsigned long ceiling)
|
||||
+{
|
||||
+ pmd_t *pmdp, pmd;
|
||||
+ unsigned long i, next, start = addr;
|
||||
+
|
||||
+ do {
|
||||
+ next = pmd_addr_end(addr, end);
|
||||
+ pmdp = pmd_offset(pudp, addr);
|
||||
+ pmd = READ_ONCE(*pmdp);
|
||||
+ if (pmd_none(pmd))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
|
||||
+ free_empty_pte_table(pmdp, addr, next, floor, ceiling);
|
||||
+ } while (addr = next, addr < end);
|
||||
+
|
||||
+ if (CONFIG_PGTABLE_LEVELS <= 2)
|
||||
+ return;
|
||||
+
|
||||
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Check whether we can free the pmd page if the rest of the
|
||||
+ * entries are empty. Overlap with other regions have been
|
||||
+ * handled by the floor/ceiling check.
|
||||
+ */
|
||||
+ pmdp = pmd_offset(pudp, 0UL);
|
||||
+ for (i = 0; i < PTRS_PER_PMD; i++) {
|
||||
+ if (!pmd_none(READ_ONCE(pmdp[i])))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ pud_clear(pudp);
|
||||
+ __flush_tlb_kernel_pgtable(start);
|
||||
+ free_hotplug_pgtable_page(virt_to_page(pmdp));
|
||||
+}
|
||||
+
|
||||
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
|
||||
+ unsigned long end, unsigned long floor,
|
||||
+ unsigned long ceiling)
|
||||
+{
|
||||
+ pud_t *pudp, pud;
|
||||
+ unsigned long i, next, start = addr;
|
||||
+
|
||||
+ do {
|
||||
+ next = pud_addr_end(addr, end);
|
||||
+ pudp = pud_offset(p4dp, addr);
|
||||
+ pud = READ_ONCE(*pudp);
|
||||
+ if (pud_none(pud))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
|
||||
+ free_empty_pmd_table(pudp, addr, next, floor, ceiling);
|
||||
+ } while (addr = next, addr < end);
|
||||
+
|
||||
+ if (CONFIG_PGTABLE_LEVELS <= 3)
|
||||
+ return;
|
||||
+
|
||||
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Check whether we can free the pud page if the rest of the
|
||||
+ * entries are empty. Overlap with other regions have been
|
||||
+ * handled by the floor/ceiling check.
|
||||
+ */
|
||||
+ pudp = pud_offset(p4dp, 0UL);
|
||||
+ for (i = 0; i < PTRS_PER_PUD; i++) {
|
||||
+ if (!pud_none(READ_ONCE(pudp[i])))
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ p4d_clear(p4dp);
|
||||
+ __flush_tlb_kernel_pgtable(start);
|
||||
+ free_hotplug_pgtable_page(virt_to_page(pudp));
|
||||
+}
|
||||
+
|
||||
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
|
||||
+ unsigned long end, unsigned long floor,
|
||||
+ unsigned long ceiling)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ p4d_t *p4dp, p4d;
|
||||
+
|
||||
+ do {
|
||||
+ next = p4d_addr_end(addr, end);
|
||||
+ p4dp = p4d_offset(pgdp, addr);
|
||||
+ p4d = READ_ONCE(*p4dp);
|
||||
+ if (p4d_none(p4d))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!p4d_present(p4d));
|
||||
+ free_empty_pud_table(p4dp, addr, next, floor, ceiling);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+
|
||||
+static void free_empty_tables(unsigned long addr, unsigned long end,
|
||||
+ unsigned long floor, unsigned long ceiling)
|
||||
+{
|
||||
+ unsigned long next;
|
||||
+ pgd_t *pgdp, pgd;
|
||||
+
|
||||
+ do {
|
||||
+ next = pgd_addr_end(addr, end);
|
||||
+ pgdp = pgd_offset_k(addr);
|
||||
+ pgd = READ_ONCE(*pgdp);
|
||||
+ if (pgd_none(pgd))
|
||||
+ continue;
|
||||
+
|
||||
+ WARN_ON(!pgd_present(pgd));
|
||||
+ free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
|
||||
+ } while (addr = next, addr < end);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
#if !ARM64_SWAPPER_USES_SECTION_MAPS
|
||||
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
||||
@@ -772,6 +1079,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
||||
void vmemmap_free(unsigned long start, unsigned long end,
|
||||
struct vmem_altmap *altmap)
|
||||
{
|
||||
+#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
+ WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
||||
+
|
||||
+ unmap_hotplug_range(start, end, true);
|
||||
+ free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
|
||||
+#endif
|
||||
}
|
||||
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
||||
|
||||
@@ -1050,10 +1363,21 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
|
||||
+{
|
||||
+ unsigned long end = start + size;
|
||||
+
|
||||
+ WARN_ON(pgdir != init_mm.pgd);
|
||||
+ WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
|
||||
+
|
||||
+ unmap_hotplug_range(start, end, false);
|
||||
+ free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
|
||||
+}
|
||||
+
|
||||
int arch_add_memory(int nid, u64 start, u64 size,
|
||||
struct mhp_restrictions *restrictions)
|
||||
{
|
||||
- int flags = 0;
|
||||
+ int ret, flags = 0;
|
||||
|
||||
if (rodata_full || debug_pagealloc_enabled())
|
||||
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
||||
@@ -1061,22 +1385,59 @@ int arch_add_memory(int nid, u64 start, u64 size,
|
||||
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
|
||||
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
|
||||
|
||||
- return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
|
||||
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
|
||||
restrictions);
|
||||
+ if (ret)
|
||||
+ __remove_pgd_mapping(swapper_pg_dir,
|
||||
+ __phys_to_virt(start), size);
|
||||
+ return ret;
|
||||
}
|
||||
+
|
||||
void arch_remove_memory(int nid, u64 start, u64 size,
|
||||
struct vmem_altmap *altmap)
|
||||
{
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
unsigned long nr_pages = size >> PAGE_SHIFT;
|
||||
|
||||
- /*
|
||||
- * FIXME: Cleanup page tables (also in arch_add_memory() in case
|
||||
- * adding fails). Until then, this function should only be used
|
||||
- * during memory hotplug (adding memory), not for memory
|
||||
- * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
|
||||
- * unlocked yet.
|
||||
- */
|
||||
__remove_pages(start_pfn, nr_pages, altmap);
|
||||
+ __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * This memory hotplug notifier helps prevent boot memory from being
|
||||
+ * inadvertently removed as it blocks pfn range offlining process in
|
||||
+ * __offline_pages(). Hence this prevents both offlining as well as
|
||||
+ * removal process for boot memory which is initially always online.
|
||||
+ * In future if and when boot memory could be removed, this notifier
|
||||
+ * should be dropped and free_hotplug_page_range() should handle any
|
||||
+ * reserved pages allocated during boot.
|
||||
+ */
|
||||
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
|
||||
+ unsigned long action, void *data)
|
||||
+{
|
||||
+ struct mem_section *ms;
|
||||
+ struct memory_notify *arg = data;
|
||||
+ unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
|
||||
+ unsigned long pfn = arg->start_pfn;
|
||||
+
|
||||
+ if (action != MEM_GOING_OFFLINE)
|
||||
+ return NOTIFY_OK;
|
||||
+
|
||||
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
+ ms = __pfn_to_section(pfn);
|
||||
+ if (early_section(ms))
|
||||
+ return NOTIFY_BAD;
|
||||
+ }
|
||||
+ return NOTIFY_OK;
|
||||
+}
|
||||
+
|
||||
+static struct notifier_block prevent_bootmem_remove_nb = {
|
||||
+ .notifier_call = prevent_bootmem_remove_notifier,
|
||||
+};
|
||||
+
|
||||
+static int __init prevent_bootmem_remove_init(void)
|
||||
+{
|
||||
+ return register_memory_notifier(&prevent_bootmem_remove_nb);
|
||||
}
|
||||
+device_initcall(prevent_bootmem_remove_init);
|
||||
#endif
|
||||
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
|
||||
index 064163f25592..b5eebc8c4924 100644
|
||||
--- a/arch/arm64/mm/ptdump_debugfs.c
|
||||
+++ b/arch/arm64/mm/ptdump_debugfs.c
|
||||
@@ -1,5 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/debugfs.h>
|
||||
+#include <linux/memory_hotplug.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
#include <asm/ptdump.h>
|
||||
@@ -7,7 +8,10 @@
|
||||
static int ptdump_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct ptdump_info *info = m->private;
|
||||
+
|
||||
+ get_online_mems();
|
||||
ptdump_walk_pgd(m, info);
|
||||
+ put_online_mems();
|
||||
return 0;
|
||||
}
|
||||
DEFINE_SHOW_ATTRIBUTE(ptdump);
|
||||
--
|
||||
2.17.1
|
||||
|
||||
Reference in New Issue
Block a user