mm: ZONE_DEVICE for "device memory"
While pmem is usable as a block device or via DAX mappings to userspace,
there are several usage scenarios that cannot target pmem due to its
lack of struct page coverage. In preparation for "hot plugging" pmem
into the vmemmap add ZONE_DEVICE as a new zone to tag these pages
separately from the ones that are subject to standard page allocations.
Importantly "device memory" can be removed at will by userspace
unbinding the driver of the device.
Having a separate zone prevents allocation and otherwise marks these
pages as distinct from typical uniform memory. Device memory has
different lifetime and performance characteristics than RAM. However,
since we have run out of ZONES_SHIFT bits, this functionality currently
depends on sacrificing ZONE_DMA.
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Jerome Glisse <j.glisse@gmail.com>
[hch: various simplifications in the arch interface]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 97e48b0..1841ef6 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -645,7 +645,7 @@
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
pg_data_t *pgdat;
struct zone *zone;
@@ -656,7 +656,7 @@
pgdat = NODE_DATA(nid);
zone = pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL);
+ zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
ret = __add_pages(nid, zone, start_pfn, nr_pages);
if (ret)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0f11819..6571cfb 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -113,7 +113,7 @@
}
#endif
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdata;
struct zone *zone;
@@ -128,7 +128,7 @@
/* this should work for most non-highmem platforms */
zone = pgdata->node_zones +
- zone_for_memory(nid, start, size, 0);
+ zone_for_memory(nid, start, size, 0, for_device);
return __add_pages(nid, zone, start_pfn, nr_pages);
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 76e8737..48ee78b 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -168,7 +168,7 @@
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
unsigned long start_pfn = PFN_DOWN(start);
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a..c149009 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -485,7 +485,7 @@
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
pg_data_t *pgdat;
unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -496,7 +496,8 @@
/* We only have ZONE_NORMAL, so this is easy.. */
ret = __add_pages(nid, pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL),
+ zone_for_memory(nid, start, size, ZONE_NORMAL,
+ for_device),
start_pfn, nr_pages);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 5bd252e..d4e1fc4 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -863,7 +863,7 @@
* memory to the highmem for now.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
-int arch_add_memory(u64 start, u64 size)
+int arch_add_memory(u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdata = &contig_page_data;
struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8340e45..2a9237d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -822,11 +822,11 @@
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones +
- zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623..30564e2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL);
+ zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 6ffa0ac..8f60e89 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -266,8 +266,9 @@
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size);
-extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
-extern int arch_add_memory(int nid, u64 start, u64 size);
+extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+ bool for_device);
+extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern bool is_memblock_offlined(struct memory_block *mem);
extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c259..9217fd9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -319,7 +319,11 @@
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
+#ifdef CONFIG_ZONE_DEVICE
+ ZONE_DEVICE,
+#endif
__MAX_NR_ZONES
+
};
#ifndef __GENERATING_BOUNDS_H
@@ -794,6 +798,25 @@
return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
}
+static inline int zone_id(const struct zone *zone)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat; /* node data this zone points back to */
+
+ return zone - pgdat->node_zones; /* element offset of @zone within pgdat->node_zones */
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+ return zone_id(zone) == ZONE_DEVICE; /* true only when @zone's index is ZONE_DEVICE */
+}
+#else /* !CONFIG_ZONE_DEVICE */
+static inline bool is_dev_zone(const struct zone *zone)
+{
+ return false; /* ZONE_DEVICE is not defined in this configuration */
+}
+#endif /* CONFIG_ZONE_DEVICE */
+
#include <linux/memory_hotplug.h>
extern struct mutex zonelists_mutex;
diff --git a/mm/Kconfig b/mm/Kconfig
index e79de2b..a0cd086 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -654,3 +654,20 @@
when kswapd starts. This has a potential performance impact on
processes running early in the lifetime of the systemm until kswapd
finishes the initialisation.
+
+config ZONE_DEVICE
+ bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+ default !ZONE_DMA
+ depends on !ZONE_DMA
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on X86_64 #arch_add_memory() comprehends device memory
+
+ help
+ Device memory hotplug support allows for establishing pmem,
+ or other device driver discovered memory regions, in the
+ memmap. This allows pfn_to_page() lookups of otherwise
+ "device-physical" addresses which is needed for using a DAX
+ mapping in an O_DIRECT operation, among other things.
+
+ If FS_DAX is enabled, then say Y.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26fbba7..24e4c76 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -770,7 +770,10 @@
start = phys_start_pfn << PAGE_SHIFT;
size = nr_pages * PAGE_SIZE;
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
+
+ /* in the ZONE_DEVICE case device driver owns the memory region */
+ if (!is_dev_zone(zone))
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
if (ret) {
resource_size_t endres = start + size - 1;
@@ -1207,8 +1210,13 @@
return 0;
}
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
+int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+ bool for_device)
{
+#ifdef CONFIG_ZONE_DEVICE
+ if (for_device)
+ return ZONE_DEVICE;
+#endif
if (should_add_memory_movable(nid, start, size))
return ZONE_MOVABLE;
@@ -1249,7 +1257,7 @@
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size);
+ ret = arch_add_memory(nid, start, size, false);
if (ret < 0)
goto error;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef19f22..0f19b4e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -207,6 +207,9 @@
"HighMem",
#endif
"Movable",
+#ifdef CONFIG_ZONE_DEVICE
+ "Device",
+#endif
};
int min_free_kbytes = 1024;