intel-iommu: optimize sg map/unmap calls

This patch adds PageSelectiveInvalidation support replacing existing
DomainSelectiveInvalidation for intel_{map/unmap}_sg() calls and also
enables to mapping one big contiguous DMA virtual address which is mapped
to discontiguous physical address for SG map/unmap calls.

"Doamin selective invalidations" wipes out the IOMMU address translation
cache based on domain ID where as "Page selective invalidations" wipes out
the IOMMU address translation cache for that address mask range which is
more cache friendly when compared to Domain selective invalidations.

Here is how it is done.
1) changes to iova.c
alloc_iova() now takes a bool size_aligned argument, which
when when set, returns the io virtual address that is
naturally aligned to 2 ^ x, where x is the order
of the size requested.

Returning this io vitual address which is naturally
aligned helps iommu to do the "page selective
invalidations" which is IOMMU cache friendly
over "domain selective invalidations".

2) Changes to driver/pci/intel-iommu.c
Clean up intel_{map/unmap}_{single/sg} () calls so that
s/g map/unamp calls is no more dependent on
intel_{map/unmap}_single()

intel_map_sg() now computes the total DMA virtual address
required and allocates the size aligned total DMA virtual address
and maps the discontiguous physical address to the allocated
contiguous DMA virtual address.

In the intel_unmap_sg() case since the DMA virtual address
is contiguous and size_aligned, PageSelectiveInvalidation
is used replacing earlier DomainSelectiveInvalidations.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Greg KH <greg@kroah.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Cc: Suresh B <suresh.b.siddha@intel.com>
Cc: Andi Kleen <ak@suse.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 4cca5b9..dab329f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -665,24 +665,10 @@
 		non_present_entry_flush);
 }
 
-static int iommu_get_alignment(u64 base, unsigned int size)
-{
-	int t = 0;
-	u64 end;
-
-	end = base + size - 1;
-	while (base != end) {
-		t++;
-		base >>= 1;
-		end >>= 1;
-	}
-	return t;
-}
-
 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	u64 addr, unsigned int pages, int non_present_entry_flush)
 {
-	unsigned int align;
+	unsigned int mask;
 
 	BUG_ON(addr & (~PAGE_MASK_4K));
 	BUG_ON(pages == 0);
@@ -696,16 +682,13 @@
 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
 	 * aligned to the size
 	 */
-	align = iommu_get_alignment(addr >> PAGE_SHIFT_4K, pages);
+	mask = ilog2(__roundup_pow_of_two(pages));
 	/* Fallback to domain selective flush if size is too big */
-	if (align > cap_max_amask_val(iommu->cap))
+	if (mask > cap_max_amask_val(iommu->cap))
 		return iommu_flush_iotlb_dsi(iommu, did,
 			non_present_entry_flush);
 
-	addr >>= PAGE_SHIFT_4K + align;
-	addr <<= PAGE_SHIFT_4K + align;
-
-	return __iommu_flush_iotlb(iommu, did, addr, align,
+	return __iommu_flush_iotlb(iommu, did, addr, mask,
 		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
 }
 
@@ -1772,39 +1755,53 @@
 }
 
 struct iova *
-iommu_alloc_iova(struct dmar_domain *domain, void *host_addr, size_t size,
-		u64 start, u64 end)
+iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
 {
-	u64 start_addr;
 	struct iova *piova;
 
 	/* Make sure it's in range */
-	if ((start > DOMAIN_MAX_ADDR(domain->gaw)) || end < start)
-		return NULL;
-
 	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
-	start_addr = PAGE_ALIGN_4K(start);
-	size = aligned_size((u64)host_addr, size);
-	if (!size || (start_addr + size > end))
+	if (!size || (IOVA_START_ADDR + size > end))
 		return NULL;
 
 	piova = alloc_iova(&domain->iovad,
-			size >> PAGE_SHIFT_4K, IOVA_PFN(end));
-
+			size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
 	return piova;
 }
 
-static dma_addr_t __intel_map_single(struct device *dev, void *addr,
-	size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
+static struct iova *
+__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
+		size_t size)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct iova *iova = NULL;
+
+	if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
+		iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
+	} else  {
+		/*
+		 * First try to allocate an io virtual address in
+		 * DMA_32BIT_MASK and if that fails then try allocating
+		 * from higer range
+		 */
+		iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
+		if (!iova)
+			iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
+	}
+
+	if (!iova) {
+		printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
+		return NULL;
+	}
+
+	return iova;
+}
+
+static struct dmar_domain *
+get_valid_domain_for_dev(struct pci_dev *pdev)
 {
 	struct dmar_domain *domain;
-	struct pci_dev *pdev = to_pci_dev(dev);
 	int ret;
-	int prot = 0;
-	struct iova *iova = NULL;
-	u64 start_addr;
-
-	addr = (void *)virt_to_phys(addr);
 
 	domain = get_domain_for_dev(pdev,
 			DEFAULT_DOMAIN_ADDRESS_WIDTH);
@@ -1814,36 +1811,47 @@
 		return 0;
 	}
 
-	start_addr = IOVA_START_ADDR;
-
-	if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
-		iova = iommu_alloc_iova(domain, addr, size, start_addr,
-			pdev->dma_mask);
-	} else  {
-		/*
-		 * First try to allocate an io virtual address in
-		 * DMA_32BIT_MASK and if that fails then try allocating
-		 * from higer range
-		 */
-		iova = iommu_alloc_iova(domain, addr, size, start_addr,
-			DMA_32BIT_MASK);
-		if (!iova)
-			iova = iommu_alloc_iova(domain, addr, size, start_addr,
-			pdev->dma_mask);
-	}
-
-	if (!iova) {
-		printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
-		return 0;
-	}
-
 	/* make sure context mapping is ok */
 	if (unlikely(!domain_context_mapped(domain, pdev))) {
 		ret = domain_context_mapping(domain, pdev);
-		if (ret)
-			goto error;
+		if (ret) {
+			printk(KERN_ERR
+				"Domain context map for %s failed",
+				pci_name(pdev));
+			return 0;
+		}
 	}
 
+	return domain;
+}
+
+static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
+	size_t size, int dir)
+{
+	struct pci_dev *pdev = to_pci_dev(hwdev);
+	int ret;
+	struct dmar_domain *domain;
+	unsigned long start_addr;
+	struct iova *iova;
+	int prot = 0;
+
+	BUG_ON(dir == DMA_NONE);
+	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
+		return virt_to_bus(addr);
+
+	domain = get_valid_domain_for_dev(pdev);
+	if (!domain)
+		return 0;
+
+	addr = (void *)virt_to_phys(addr);
+	size = aligned_size((u64)addr, size);
+
+	iova = __intel_alloc_iova(hwdev, domain, size);
+	if (!iova)
+		goto error;
+
+	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+
 	/*
 	 * Check if DMAR supports zero-length reads on write only
 	 * mappings..
@@ -1859,101 +1867,65 @@
 	 * might have two guest_addr mapping to the same host addr, but this
 	 * is not a big problem
 	 */
-	ret = domain_page_mapping(domain, iova->pfn_lo << PAGE_SHIFT_4K,
-		((u64)addr) & PAGE_MASK_4K,
-		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K, prot);
+	ret = domain_page_mapping(domain, start_addr,
+		((u64)addr) & PAGE_MASK_4K, size, prot);
 	if (ret)
 		goto error;
 
 	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
 		pci_name(pdev), size, (u64)addr,
-		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
-		(u64)(iova->pfn_lo << PAGE_SHIFT_4K), dir);
+		size, (u64)start_addr, dir);
 
-	*flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
-	*flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
-	return (iova->pfn_lo << PAGE_SHIFT_4K) + ((u64)addr & (~PAGE_MASK_4K));
+	/* it's a non-present to present mapping */
+	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
+			start_addr, size >> PAGE_SHIFT_4K, 1);
+	if (ret)
+		iommu_flush_write_buffer(domain->iommu);
+
+	return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
+
 error:
-	__free_iova(&domain->iovad, iova);
+	if (iova)
+		__free_iova(&domain->iovad, iova);
 	printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
 		pci_name(pdev), size, (u64)addr, dir);
 	return 0;
 }
 
-static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
-	size_t size, int dir)
-{
-	struct pci_dev *pdev = to_pci_dev(hwdev);
-	dma_addr_t ret;
-	struct dmar_domain *domain;
-	u64 flush_addr;
-	unsigned int flush_size;
-
-	BUG_ON(dir == DMA_NONE);
-	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
-		return virt_to_bus(addr);
-
-	ret = __intel_map_single(hwdev, addr, size,
-			dir, &flush_addr, &flush_size);
-	if (ret) {
-		domain = find_domain(pdev);
-		/* it's a non-present to present mapping */
-		if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
-				flush_addr, flush_size >> PAGE_SHIFT_4K, 1))
-			iommu_flush_write_buffer(domain->iommu);
-	}
-	return ret;
-}
-
-static void __intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
-	size_t size, int dir, u64 *flush_addr, unsigned int *flush_size)
-{
-	struct dmar_domain *domain;
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct iova *iova;
-
-	domain = find_domain(pdev);
-	BUG_ON(!domain);
-
-	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-	if (!iova) {
-		*flush_size = 0;
-		return;
-	}
-	pr_debug("Device %s unmapping: %lx@%llx\n",
-		pci_name(pdev),
-		(iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K,
-		(u64)(iova->pfn_lo << PAGE_SHIFT_4K));
-
-	*flush_addr = iova->pfn_lo << PAGE_SHIFT_4K;
-	*flush_size = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT_4K;
-	/*  clear the whole page, not just dev_addr - (dev_addr + size) */
-	dma_pte_clear_range(domain, *flush_addr, *flush_addr + *flush_size);
-	/* free page tables */
-	dma_pte_free_pagetable(domain, *flush_addr, *flush_addr + *flush_size);
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
-}
-
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct dmar_domain *domain;
-	u64 flush_addr;
-	unsigned int flush_size;
+	unsigned long start_addr;
+	struct iova *iova;
 
 	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
 		return;
-
 	domain = find_domain(pdev);
-	__intel_unmap_single(dev, dev_addr, size,
-		dir, &flush_addr, &flush_size);
-	if (flush_size == 0)
+	BUG_ON(!domain);
+
+	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
+	if (!iova)
 		return;
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, flush_addr,
-			flush_size >> PAGE_SHIFT_4K, 0))
+
+	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+	size = aligned_size((u64)dev_addr, size);
+
+	pr_debug("Device %s unmapping: %lx@%llx\n",
+		pci_name(pdev), size, (u64)start_addr);
+
+	/*  clear the whole page */
+	dma_pte_clear_range(domain, start_addr, start_addr + size);
+	/* free page tables */
+	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+
+	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
+			size >> PAGE_SHIFT_4K, 0))
 		iommu_flush_write_buffer(domain->iommu);
+
+	/* free iova */
+	__free_iova(&domain->iovad, iova);
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -1990,28 +1962,46 @@
 	free_pages((unsigned long)vaddr, order);
 }
 
+#define SG_ENT_VIRT_ADDRESS(sg)	(page_address((sg)->page) + (sg)->offset)
 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sg,
 	int nelems, int dir)
 {
 	int i;
 	struct pci_dev *pdev = to_pci_dev(hwdev);
 	struct dmar_domain *domain;
-	u64 flush_addr;
-	unsigned int flush_size;
+	unsigned long start_addr;
+	struct iova *iova;
+	size_t size = 0;
+	void *addr;
 
 	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
 		return;
 
 	domain = find_domain(pdev);
-	for (i = 0; i < nelems; i++, sg++)
-		__intel_unmap_single(hwdev, sg->dma_address,
-			sg->dma_length, dir, &flush_addr, &flush_size);
 
-	if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 0))
+	iova = find_iova(&domain->iovad, IOVA_PFN(sg[0].dma_address));
+	if (!iova)
+		return;
+	for (i = 0; i < nelems; i++, sg++) {
+		addr = SG_ENT_VIRT_ADDRESS(sg);
+		size += aligned_size((u64)addr, sg->length);
+	}
+
+	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+
+	/*  clear the whole page */
+	dma_pte_clear_range(domain, start_addr, start_addr + size);
+	/* free page tables */
+	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
+
+	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
+			size >> PAGE_SHIFT_4K, 0))
 		iommu_flush_write_buffer(domain->iommu);
+
+	/* free iova */
+	__free_iova(&domain->iovad, iova);
 }
 
-#define SG_ENT_VIRT_ADDRESS(sg)	(page_address((sg)->page) + (sg)->offset)
 static int intel_nontranslate_map_sg(struct device *hddev,
 	struct scatterlist *sg, int nelems, int dir)
 {
@@ -2031,33 +2021,76 @@
 {
 	void *addr;
 	int i;
-	dma_addr_t dma_handle;
 	struct pci_dev *pdev = to_pci_dev(hwdev);
 	struct dmar_domain *domain;
-	u64 flush_addr;
-	unsigned int flush_size;
+	size_t size = 0;
+	int prot = 0;
+	size_t offset = 0;
+	struct iova *iova = NULL;
+	int ret;
+	struct scatterlist *orig_sg = sg;
+	unsigned long start_addr;
 
 	BUG_ON(dir == DMA_NONE);
 	if (pdev->sysdata == DUMMY_DEVICE_DOMAIN_INFO)
 		return intel_nontranslate_map_sg(hwdev, sg, nelems, dir);
 
+	domain = get_valid_domain_for_dev(pdev);
+	if (!domain)
+		return 0;
+
 	for (i = 0; i < nelems; i++, sg++) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
-		dma_handle = __intel_map_single(hwdev, addr,
-				sg->length, dir, &flush_addr, &flush_size);
-		if (!dma_handle) {
-			intel_unmap_sg(hwdev, sg - i, i, dir);
-			sg[0].dma_length = 0;
-			return 0;
-		}
-		sg->dma_address = dma_handle;
-		sg->dma_length = sg->length;
+		addr = (void *)virt_to_phys(addr);
+		size += aligned_size((u64)addr, sg->length);
 	}
 
-	domain = find_domain(pdev);
+	iova = __intel_alloc_iova(hwdev, domain, size);
+	if (!iova) {
+		orig_sg->dma_length = 0;
+		return 0;
+	}
+
+	/*
+	 * Check if DMAR supports zero-length reads on write only
+	 * mappings..
+	 */
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
+			!cap_zlr(domain->iommu->cap))
+		prot |= DMA_PTE_READ;
+	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+		prot |= DMA_PTE_WRITE;
+
+	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
+	offset = 0;
+	sg = orig_sg;
+	for (i = 0; i < nelems; i++, sg++) {
+		addr = SG_ENT_VIRT_ADDRESS(sg);
+		addr = (void *)virt_to_phys(addr);
+		size = aligned_size((u64)addr, sg->length);
+		ret = domain_page_mapping(domain, start_addr + offset,
+			((u64)addr) & PAGE_MASK_4K,
+			size, prot);
+		if (ret) {
+			/*  clear the page */
+			dma_pte_clear_range(domain, start_addr,
+				  start_addr + offset);
+			/* free page tables */
+			dma_pte_free_pagetable(domain, start_addr,
+				  start_addr + offset);
+			/* free iova */
+			__free_iova(&domain->iovad, iova);
+			return 0;
+		}
+		sg->dma_address = start_addr + offset +
+				((u64)addr & (~PAGE_MASK_4K));
+		sg->dma_length = sg->length;
+		offset += size;
+	}
 
 	/* it's a non-present to present mapping */
-	if (iommu_flush_iotlb_dsi(domain->iommu, domain->id, 1))
+	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
+			start_addr, offset >> PAGE_SHIFT_4K, 1))
 		iommu_flush_write_buffer(domain->iommu);
 	return nelems;
 }