#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

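/*
 * Whether kernel/user page-table isolation is in use.  This may be
 * cleared at boot by kaiser_check_boottime_disable(); it is read by
 * the inlined TLB-flush helpers, hence the export.
 */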
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

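/*
 * Per-CPU scratch slot for the entry assembly: a place to stash a
 * general-purpose register while switching CR3 at entry points that
 * cannot yet trust the stack.  It lives in the user-mapped per-cpu
 * area so it is reachable before the switch to the kernel CR3.
 */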
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(); it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Walk the kernel page tables and return the physical address that
 * vaddr maps to.  Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset_k(vaddr);
        /*
         * We made all the kernel PGDs present in kaiser_init().
         * We expect them to stay that way.
         */
        BUG_ON(pgd_none(*pgd));
        /*
         * PGDs are either 512GB or 128TB on all x86_64
         * configurations.  We don't handle these.
         */
        BUG_ON(pgd_large(*pgd));

        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pud_large(*pud))
                return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pmd_large(*pmd))
                return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

        pte = pte_offset_kernel(pmd, vaddr);
        if (pte_none(*pte)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
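/*
 * Intermediate levels are allocated with a double-checked pattern: the
 * new page is allocated outside shadow_table_allocation_lock, then
 * installed only if the slot is still empty once the lock is held; the
 * loser of a race frees its page again.
 */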
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
        unsigned long prot = _KERNPG_TABLE;

        if (pgd_none(*pgd)) {
                WARN_ONCE(1, "All shadow pgds should have been populated");
                return NULL;
        }
        BUILD_BUG_ON(pgd_large(*pgd) != 0);

        if (user) {
                /*
                 * The vsyscall page is the only page that will have
                 * _PAGE_USER set.  Catch everything else.
                 */
                BUG_ON(address != VSYSCALL_ADDR);

                set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
                prot = _PAGE_TABLE;
        }

        pud = pud_offset(pgd, address);
        /* The shadow page tables do not use large mappings: */
        if (pud_large(*pud)) {
                WARN_ON(1);
                return NULL;
        }
        if (pud_none(*pud)) {
                unsigned long new_pmd_page = __get_free_page(gfp);
                if (!new_pmd_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pud_none(*pud)) {
                        set_pud(pud, __pud(prot | __pa(new_pmd_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pmd_page), NR_KAISERTABLE);
                } else
                        free_page(new_pmd_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        pmd = pmd_offset(pud, address);
        /* The shadow page tables do not use large mappings: */
        if (pmd_large(*pmd)) {
                WARN_ON(1);
                return NULL;
        }
        if (pmd_none(*pmd)) {
                unsigned long new_pte_page = __get_free_page(gfp);
                if (!new_pte_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pmd_none(*pmd)) {
                        set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pte_page), NR_KAISERTABLE);
                } else
                        free_page(new_pte_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        return pte_offset_kernel(pmd, address);
}

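/*
 * Map the kernel virtual range starting at __start_addr into the
 * shadow page tables, one page at a time, pointing each shadow PTE at
 * the physical page that the kernel mapping already uses.
 */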
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
                               unsigned long flags)
{
        int ret = 0;
        pte_t *pte;
        unsigned long start_addr = (unsigned long)__start_addr;
        unsigned long address = start_addr & PAGE_MASK;
        unsigned long end_addr = PAGE_ALIGN(start_addr + size);
        unsigned long target_address;

        /*
         * It is convenient for callers to pass in __PAGE_KERNEL etc,
         * and there is no actual harm from setting _PAGE_GLOBAL, so
         * long as CR4.PGE is not set.  But it is nonetheless troubling
         * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
         * requires that not to be #defined to 0): so mask it off here.
         */
        flags &= ~_PAGE_GLOBAL;
        if (!(__supported_pte_mask & _PAGE_NX))
                flags &= ~_PAGE_NX;

        for (; address < end_addr; address += PAGE_SIZE) {
                target_address = get_pa_from_mapping(address);
                if (target_address == -1) {
                        ret = -EIO;
                        break;
                }
                pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
                if (!pte) {
                        ret = -ENOMEM;
                        break;
                }
                if (pte_none(*pte)) {
                        set_pte(pte, __pte(flags | target_address));
                } else {
                        pte_t tmp;
                        set_pte(&tmp, __pte(flags | target_address));
                        WARN_ON_ONCE(!pte_same(*pte, tmp));
                }
        }
        return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
        unsigned long size = end - start;

        return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
        pgd_t *pgd;
        int i = 0;

        pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
        for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
                pgd_t new_pgd;
                pud_t *pud = pud_alloc_one(&init_mm,
                                           PAGE_OFFSET + i * PGDIR_SIZE);
                if (!pud) {
                        WARN_ON(1);
                        break;
                }
                inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
                new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
                /*
                 * Make sure not to stomp on some other pgd entry.
                 */
                if (!pgd_none(pgd[i])) {
                        WARN_ON(1);
                        continue;
                }
                set_pgd(pgd + i, new_pgd);
        }
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
        int __ret = kaiser_add_user_map(start, size, flags);	\
        WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
        int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
        WARN_ON(__ret);							\
} while (0)

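/*
 * Decide at boot whether KAISER should stay enabled.  "pti=on",
 * "pti=off", "pti=auto" and the older "nopti" are honoured; Xen PV
 * guests are disabled silently, and in the auto case AMD processors
 * (not affected by Meltdown) are left disabled as well.
 */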
void __init kaiser_check_boottime_disable(void)
{
        bool enable = true;
        char arg[5];
        int ret;

        if (boot_cpu_has(X86_FEATURE_XENPV))
                goto silent_disable;

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret > 0) {
                if (!strncmp(arg, "on", 2))
                        goto enable;

                if (!strncmp(arg, "off", 3))
                        goto disable;

                if (!strncmp(arg, "auto", 4))
                        goto skip;
        }

        if (cmdline_find_option_bool(boot_command_line, "nopti"))
                goto disable;

skip:
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                goto disable;

enable:
        if (enable)
                setup_force_cpu_cap(X86_FEATURE_KAISER);

        return;

disable:
        pr_info("disabled\n");

silent_disable:
        kaiser_enabled = 0;
        setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of doing so before we have good console output.
 */
void __init kaiser_init(void)
{
        int cpu;

        if (!kaiser_enabled)
                return;

        kaiser_init_all_pgds();

        /*
         * Note that this sets _PAGE_USER and it needs to happen when the
         * pagetable hierarchy gets created, i.e., early.  Otherwise
         * kaiser_pagetable_walk() will encounter initialized PTEs in the
         * hierarchy and not set the proper permissions, leading to
         * page faults with page-protection violations when, for example,
         * trying to read the vsyscall page.
         */
        if (vsyscall_enabled())
                kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
                                          PAGE_SIZE,
                                          vsyscall_pgprot);

        for_each_possible_cpu(cpu) {
                void *percpu_vaddr = __per_cpu_user_mapped_start +
                                     per_cpu_offset(cpu);
                unsigned long percpu_sz = __per_cpu_user_mapped_end -
                                          __per_cpu_user_mapped_start;
                kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
                                          __PAGE_KERNEL);
        }

        /*
         * Map the entry/exit text section, which is needed at
         * switches from user to kernel and back.
         */
        kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
                                       __PAGE_KERNEL_RX);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        kaiser_add_user_map_ptrs_early(__irqentry_text_start,
                                       __irqentry_text_end,
                                       __PAGE_KERNEL_RX);
#endif
        kaiser_add_user_map_early((void *)idt_descr.address,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
        kaiser_add_user_map_early(&trace_idt_descr,
                                  sizeof(trace_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&trace_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);
#endif
        kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&debug_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);

        pr_info("enabled\n");
}

/* Add a mapping to the shadow page tables, keeping the two in sync */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
        if (!kaiser_enabled)
                return 0;
        return kaiser_add_user_map((const void *)addr, size, flags);
}

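/*
 * Remove a range from the shadow page tables.  The "nofree" unmap
 * clears the entries but never frees the shadow page-table pages
 * themselves.
 */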
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
        extern void unmap_pud_range_nofree(pgd_t *pgd,
                                unsigned long start, unsigned long end);
        unsigned long end = start + size;
        unsigned long addr, next;
        pgd_t *pgd;

        if (!kaiser_enabled)
                return;
        pgd = native_get_shadow_pgd(pgd_offset_k(start));
        for (addr = start; addr < end; pgd++, addr = next) {
                next = pgd_addr_end(addr, end);
                unmap_pud_range_nofree(pgd, addr, next);
        }
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
        return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

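/*
 * Called whenever a pgd entry is installed: mirror userspace entries
 * into the shadow pgd, and set NX on the kernel copy (when supported)
 * so that a stray return to userspace on the kernel CR3 faults rather
 * than executes.
 */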
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
        if (!kaiser_enabled)
                return pgd;
        /*
         * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
         * skip cases like kexec and EFI which make temporary low mappings.
         */
        if (pgd.pgd & _PAGE_USER) {
                if (is_userspace_pgd(pgdp)) {
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                        /*
                         * Even if the entry is *mapping* userspace, ensure
                         * that userspace cannot use it.  This way, if we
                         * get out to userspace running on the kernel CR3,
                         * userspace will crash instead of running.
                         */
                        if (__supported_pte_mask & _PAGE_NX)
                                pgd.pgd |= _PAGE_NX;
                }
        } else if (!pgd.pgd) {
                /*
                 * pgd_clear() cannot check _PAGE_USER, and is even used to
                 * clear corrupted pgd entries: so just rely on cases like
                 * kexec and EFI never to be using pgd_clear().
                 */
                if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
                    is_userspace_pgd(pgdp))
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
        }
        return pgd;
}

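/*
 * Compute this CPU's user-half CR3 bits: the shadow pgd offset, plus
 * the user PCID (with the NOFLUSH bit preset) when PCIDs are
 * supported.
 */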
void kaiser_setup_pcid(void)
{
        unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

        if (this_cpu_has(X86_FEATURE_PCID))
                user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
        /*
         * This value is used by the entry/exit code to switch the pgd
         * (and PCID) and to control TLB flushing.
         */
        this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this CPU will need to flush the USER TLB on return
 * to userspace.  If the CPU does not have PCID, the NOFLUSH bit will
 * never have been set in x86_cr3_pcid_user, so the switch to the user
 * CR3 always flushes and there is nothing to note.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
        if (this_cpu_has(X86_FEATURE_PCID))
                this_cpu_write(x86_cr3_pcid_user,
                        X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);