From 9b76f03f944c911744ee4209ba245feb70e9adf7 Mon Sep 17 00:00:00 2001
From: Swathi Sridhar <swatsrid@codeaurora.org>
Date: Thu, 30 Apr 2020 17:57:48 -0700
Subject: [PATCH] ANDROID: GKI: iommu: dma-mapping-fast: Fast ARMv7/v8 Long
 Descriptor Format

Snapshot of vendor added support for Fast ARMv7/v8 Long Descriptor
Format.

Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
Signed-off-by: Liam Mark <lmark@codeaurora.org>
Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
Signed-off-by: Prakash Gupta <guptap@codeaurora.org>
Signed-off-by: Qingqing Zhou <qqzhou@codeaurora.org>
Signed-off-by: Rishabh Bhatnagar <rishabhb@codeaurora.org>
Signed-off-by: Shiraz Hashim <shashim@codeaurora.org>
Signed-off-by: Sudarshan Rajagopalan <sudaraja@codeaurora.org>
Signed-off-by: Swathi Sridhar <swatsrid@codeaurora.org>
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
Bug: 155522481
Signed-off-by: Mark Salyzyn <salyzyn@google.com>
[saravanak snapshot from commit 79efc458af96 and disabled for ARM]
Signed-off-by: Saravana Kannan <saravanak@google.com>
Change-Id: Ifae8f889737f25cd8a9729101cd8251974109842
---
 arch/arm64/include/asm/dma-iommu.h |    2 +
 arch/arm64/mm/dma-mapping.c        |    6 +-
 drivers/iommu/Kconfig              |   12 +
 drivers/iommu/Makefile             |    1 +
 drivers/iommu/dma-mapping-fast.c   | 1249 ++++++++++++++++++++++++++++
 drivers/iommu/io-pgtable-fast.c    |  816 ++++++++++++++++++
 drivers/iommu/io-pgtable.c         |    3 +
 include/linux/dma-mapping-fast.h   |   52 ++
 include/linux/io-pgtable-fast.h    |  104 +++
 include/linux/io-pgtable.h         |    9 +
 10 files changed, 2253 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/dma-mapping-fast.c
 create mode 100644 drivers/iommu/io-pgtable-fast.c
 create mode 100644 include/linux/dma-mapping-fast.h
 create mode 100644 include/linux/io-pgtable-fast.h

diff --git a/arch/arm64/include/asm/dma-iommu.h b/arch/arm64/include/asm/dma-iommu.h
index 699d2c457dc9..80397db4b40e 100644
--- a/arch/arm64/include/asm/dma-iommu.h
+++ b/arch/arm64/include/asm/dma-iommu.h
@@ -23,6 +23,8 @@ struct dma_iommu_mapping {
 	void			*bitmap;
 	size_t			bits;
 	dma_addr_t		base;
+
+	struct dma_fast_smmu_mapping *fast;
 };
 
 #ifdef CONFIG_ARM64_DMA_USE_IOMMU
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 9015d437893c..a8d411c0c6f3 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -37,6 +37,7 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/dma-iommu.h>
+#include <linux/dma-mapping-fast.h>
 
 static int swiotlb __ro_after_init;
 
@@ -1009,16 +1010,19 @@ iommu_init_mapping(struct device *dev, struct dma_iommu_mapping *mapping)
 static int arm_iommu_get_dma_cookie(struct device *dev,
 				    struct dma_iommu_mapping *mapping)
 {
-	int s1_bypass = 0;
+	int s1_bypass = 0, is_fast = 0;
 	int err = 0;
 
 	mutex_lock(&iommu_dma_init_mutex);
 
 	iommu_domain_get_attr(mapping->domain, DOMAIN_ATTR_S1_BYPASS,
 					&s1_bypass);
+	iommu_domain_get_attr(mapping->domain, DOMAIN_ATTR_FAST, &is_fast);
 
 	if (s1_bypass)
 		mapping->ops = &arm64_swiotlb_dma_ops;
+	else if (is_fast)
+		err = fast_smmu_init_mapping(dev, mapping);
 	else
 		err = iommu_init_mapping(dev, mapping);
 
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 64d17f53bced..bc691526ef9d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -58,6 +58,18 @@ config IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
 
 	  If unsure, say N here.
 
+config IOMMU_IO_PGTABLE_FAST
+	bool "Fast ARMv7/v8 Long Descriptor Format"
+	depends on ARM64_DMA_USE_IOMMU && IOMMU_DMA
+	help
+	  Enable support for a subset of the ARM long descriptor pagetable
+	  format.  This allocator achieves fast performance by
+	  pre-allocating and pre-populating page table memory up front.
+	  It only supports a 32-bit virtual address space.
+
+	  This implementation is mainly optimized for use cases where the
+	  buffers are small (<= 64K) since it only supports 4K page sizes.
+
 endmenu
 
 config IOMMU_DEBUGFS
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index e13ea199f589..11d6de2ddc02 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_FAST) += io-pgtable-fast.o dma-mapping-fast.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o amd_iommu_quirks.o
diff --git a/drivers/iommu/dma-mapping-fast.c b/drivers/iommu/dma-mapping-fast.c
new file mode 100644
index 000000000000..c3b5e82858c3
--- /dev/null
+++ b/drivers/iommu/dma-mapping-fast.c
@@ -0,0 +1,1249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/dma-contiguous.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-mapping-fast.h>
+#include <linux/io-pgtable.h>
+#include <linux/io-pgtable-fast.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/dma-iommu.h>
+#include <linux/slab.h>
+#include <linux/genalloc.h>
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/dma-iommu.h>
+#include <linux/iova.h>
+#include <trace/events/iommu.h>
+
+/* some redundant definitions... :( TODO: move to io-pgtable-fast.h */
+#define FAST_PAGE_SHIFT		12
+#define FAST_PAGE_SIZE (1UL << FAST_PAGE_SHIFT)
+#define FAST_PAGE_MASK (~(FAST_PAGE_SIZE - 1))	/* 4K granule, not PAGE_SIZE */
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE	SZ_256K
+static struct gen_pool *atomic_pool __ro_after_init;
+
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+	atomic_pool_size = memparse(p, &p);
+	return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot,
+				 bool coherent)
+{
+	if (attrs & DMA_ATTR_STRONGLY_ORDERED)
+		return pgprot_noncached(prot);
+	else if (!coherent || (attrs & DMA_ATTR_WRITE_COMBINE))
+		return pgprot_writecombine(prot);
+	return prot;
+}
+
+static void *__alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
+{
+	unsigned long val;
+	void *ptr = NULL;
+
+	if (!atomic_pool) {
+		WARN(1, "coherent pool not initialised!\n");
+		return NULL;
+	}
+
+	val = gen_pool_alloc(atomic_pool, size);
+	if (val) {
+		phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+		*ret_page = phys_to_page(phys);
+		ptr = (void *)val;
+		memset(ptr, 0, size);
+	}
+
+	return ptr;
+}
+
+static phys_addr_t __atomic_get_phys(void *addr)
+{
+	return gen_pool_virt_to_phys(atomic_pool, (unsigned long)addr);
+}
+
+static bool __in_atomic_pool(void *start, size_t size)
+{
+	if (!atomic_pool)
+		return false;
+	return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+static int __free_from_pool(void *start, size_t size)
+{
+	if (!__in_atomic_pool(start, size))
+		return 0;
+
+	gen_pool_free(atomic_pool, (unsigned long)start, size);
+
+	return 1;
+}
+
+static bool is_dma_coherent(struct device *dev, unsigned long attrs)
+{
+	bool is_coherent;
+
+	if (attrs & DMA_ATTR_FORCE_COHERENT)
+		is_coherent = true;
+	else if (attrs & DMA_ATTR_FORCE_NON_COHERENT)
+		is_coherent = false;
+	else if (is_device_dma_coherent(dev))
+		is_coherent = true;
+	else
+		is_coherent = false;
+
+	return is_coherent;
+}
+
+static struct dma_fast_smmu_mapping *dev_get_mapping(struct device *dev)
+{
+	struct iommu_domain *domain;
+
+	domain = iommu_get_domain_for_dev(dev);
+	if (!domain)
+		return ERR_PTR(-EINVAL);
+	return domain->iova_cookie;
+}
+
+/*
+ * Checks if the allocated range (ending at @end) covered the upcoming
+ * stale bit.  We don't need to know exactly where the range starts since
+ * we already know where the candidate search range started.  If, starting
+ * from the beginning of the candidate search range, we had to step over
+ * (or landed directly on top of) the upcoming stale bit, then we return
+ * true.
+ *
+ * Due to wrapping, there are two scenarios we'll need to check: (1) if the
+ * range [search_start, upcoming_stale] spans 0 (i.e. search_start >
+ * upcoming_stale), and, (2) if the range: [search_start, upcoming_stale]
+ * does *not* span 0 (i.e. search_start <= upcoming_stale).  And for each
+ * of those two scenarios we need to handle two cases: (a) the allocation
+ * was found before wrapping, or (b) it was found after wrapping.
+ */
+static bool __bit_covered_stale(unsigned long upcoming_stale,
+				unsigned long search_start,
+				unsigned long end)
+{
+	if (search_start > upcoming_stale) {
+		if (end >= search_start) {
+			/*
+			 * We started searching above upcoming_stale and we
+			 * didn't wrap, so we couldn't have crossed
+			 * upcoming_stale.
+			 */
+			return false;
+		}
+		/*
+		 * We wrapped. Did we cross (or land on top of)
+		 * upcoming_stale?
+		 */
+		return end >= upcoming_stale;
+	}
+
+	if (search_start <= upcoming_stale) {
+		if (end >= search_start) {
+			/*
+			 * We didn't wrap.  Did we cross (or land on top
+			 * of) upcoming_stale?
+			 */
+			return end >= upcoming_stale;
+		}
+		/*
+		 * We wrapped. So we must have crossed upcoming_stale
+		 * (since we started searching below it).
+		 */
+		return true;
+	}
+
+	/* we should have covered all logical combinations... */
+	WARN_ON(1);
+	return true;
+}
+
+static dma_addr_t __fast_smmu_alloc_iova(struct dma_fast_smmu_mapping *mapping,
+					 unsigned long attrs,
+					 size_t size)
+{
+	unsigned long bit, prev_search_start, nbits = size >> FAST_PAGE_SHIFT;
+	unsigned long align = (1 << get_order(size)) - 1;
+
+	bit = bitmap_find_next_zero_area(
+		mapping->bitmap, mapping->num_4k_pages, mapping->next_start,
+		nbits, align);
+	if (unlikely(bit > mapping->num_4k_pages)) {
+		/* try wrapping */
+		bit = bitmap_find_next_zero_area(
+			mapping->bitmap, mapping->num_4k_pages, 0, nbits,
+			align);
+		if (unlikely(bit > mapping->num_4k_pages))
+			return DMA_ERROR_CODE;
+	}
+
+	bitmap_set(mapping->bitmap, bit, nbits);
+	prev_search_start = mapping->next_start;
+	mapping->next_start = bit + nbits;
+	if (unlikely(mapping->next_start >= mapping->num_4k_pages))
+		mapping->next_start = 0;
+
+	/*
+	 * If we just re-allocated a VA whose TLB hasn't been invalidated
+	 * since it was last used and unmapped, we need to invalidate it
+	 * here.  We actually invalidate the entire TLB so that we don't
+	 * have to invalidate the TLB again until we wrap back around.
+	 */
+	if (mapping->have_stale_tlbs &&
+	    __bit_covered_stale(mapping->upcoming_stale_bit,
+				prev_search_start,
+				bit + nbits - 1)) {
+		bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
+
+		iommu_tlbiall(mapping->domain);
+		mapping->have_stale_tlbs = false;
+		av8l_fast_clear_stale_ptes(mapping->pgtbl_ops,
+				mapping->domain->geometry.aperture_start,
+				mapping->base,
+				mapping->base + mapping->size - 1,
+				skip_sync);
+	}
+
+	return (bit << FAST_PAGE_SHIFT) + mapping->base;
+}
+
+/*
+ * Checks whether the candidate bit will be allocated sooner than the
+ * current upcoming stale bit.  We can say candidate will be upcoming
+ * sooner than the current upcoming stale bit if it lies between the
+ * starting bit of the next search range and the upcoming stale bit
+ * (allowing for wrap-around).
+ *
+ * Stated differently, we're checking the relative ordering of three
+ * unsigned numbers.  So we need to check all 6 (i.e. 3!) permutations,
+ * namely:
+ *
+ *     0 |---A---B---C---| TOP (Case 1)
+ *     0 |---A---C---B---| TOP (Case 2)
+ *     0 |---B---A---C---| TOP (Case 3)
+ *     0 |---B---C---A---| TOP (Case 4)
+ *     0 |---C---A---B---| TOP (Case 5)
+ *     0 |---C---B---A---| TOP (Case 6)
+ *
+ * Note that since we're allowing numbers to wrap, the following three
+ * scenarios are all equivalent for Case 1:
+ *
+ *     0 |---A---B---C---| TOP
+ *     0 |---C---A---B---| TOP (C has wrapped. This is Case 5.)
+ *     0 |---B---C---A---| TOP (C and B have wrapped. This is Case 4.)
+ *
+ * In any of these cases, if we start searching from A, we will find B
+ * before we find C.
+ *
+ * We can also find two equivalent cases for Case 2:
+ *
+ *     0 |---A---C---B---| TOP
+ *     0 |---B---A---C---| TOP (B has wrapped. This is Case 3.)
+ *     0 |---C---B---A---| TOP (B and C have wrapped. This is Case 6.)
+ *
+ * In any of these cases, if we start searching from A, we will find C
+ * before we find B.
+ */
+static bool __bit_is_sooner(unsigned long candidate,
+			    struct dma_fast_smmu_mapping *mapping)
+{
+	unsigned long A = mapping->next_start;
+	unsigned long B = candidate;
+	unsigned long C = mapping->upcoming_stale_bit;
+
+	if ((A < B && B < C) ||	/* Case 1 */
+	    (C < A && A < B) ||	/* Case 5 */
+	    (B < C && C < A))	/* Case 4 */
+		return true;
+
+	if ((A < C && C < B) ||	/* Case 2 */
+	    (B < A && A < C) ||	/* Case 3 */
+	    (C < B && B < A))	/* Case 6 */
+		return false;
+
+	/*
+	 * For simplicity, we've been ignoring the possibility of any of
+	 * our three numbers being equal.  Handle those cases here (they
+	 * shouldn't happen very often, (I think?)).
+	 */
+
+	/*
+	 * If candidate is the next bit to be searched then it's definitely
+	 * sooner.
+	 */
+	if (A == B)
+		return true;
+
+	/*
+	 * If candidate is the next upcoming stale bit we'll return false
+	 * to avoid doing `upcoming = candidate' in the caller (which would
+	 * be useless since they're already equal)
+	 */
+	if (B == C)
+		return false;
+
+	/*
+	 * If next start is the upcoming stale bit then candidate can't
+	 * possibly be sooner.  The "soonest" bit is already selected.
+	 */
+	if (A == C)
+		return false;
+
+	/* We should have covered all logical combinations (values are unsigned). */
+	WARN(1, "Well, that's awkward. A=%lu, B=%lu, C=%lu\n", A, B, C);
+	return true;
+}
+
+#ifdef CONFIG_ARM64
+static int __init atomic_pool_init(void)
+{
+	pgprot_t prot = __pgprot(PROT_NORMAL_NC);
+	unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+	struct page *page;
+	void *addr;
+	unsigned int pool_size_order = get_order(atomic_pool_size);
+
+	if (dev_get_cma_area(NULL))
+		page = dma_alloc_from_contiguous(NULL, nr_pages,
+						 pool_size_order, false);
+	else
+		page = alloc_pages(GFP_DMA32, pool_size_order);
+
+	if (page) {
+		int ret;
+		void *page_addr = page_address(page);
+
+		memset(page_addr, 0, atomic_pool_size);
+		__dma_flush_area(page_addr, atomic_pool_size);
+
+		atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+		if (!atomic_pool)
+			goto free_page;
+
+		addr = dma_common_contiguous_remap(page, atomic_pool_size,
+					VM_USERMAP, prot, atomic_pool_init);
+
+		if (!addr)
+			goto destroy_genpool;
+
+		ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+					page_to_phys(page),
+					atomic_pool_size, -1);
+		if (ret)
+			goto remove_mapping;
+
+		gen_pool_set_algo(atomic_pool,
+				  gen_pool_first_fit_order_align,
+				  NULL);
+
+		pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+			atomic_pool_size / 1024);
+		return 0;
+	}
+	goto out;
+
+remove_mapping:
+	dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP, false);
+destroy_genpool:
+	gen_pool_destroy(atomic_pool);
+	atomic_pool = NULL;
+free_page:
+	if (!dma_release_from_contiguous(NULL, page, nr_pages))
+		__free_pages(page, pool_size_order);
+out:
+	pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+		atomic_pool_size / 1024);
+	return -ENOMEM;
+}
+arch_initcall(atomic_pool_init);
+#endif
+
+static void __fast_smmu_free_iova(struct dma_fast_smmu_mapping *mapping,
+				  dma_addr_t iova, size_t size)
+{
+	unsigned long start_bit = (iova - mapping->base) >> FAST_PAGE_SHIFT;
+	unsigned long nbits = size >> FAST_PAGE_SHIFT;
+
+	/*
+	 * We don't invalidate TLBs on unmap.  We invalidate TLBs on map
+	 * when we're about to re-allocate a VA that was previously
+	 * unmapped but hasn't yet been invalidated.  So we need to keep
+	 * track of which bit is the closest to being re-allocated here.
+	 */
+	if (__bit_is_sooner(start_bit, mapping))
+		mapping->upcoming_stale_bit = start_bit;
+
+	bitmap_clear(mapping->bitmap, start_bit, nbits);
+	mapping->have_stale_tlbs = true;
+}
+
+
+static void __fast_dma_page_cpu_to_dev(struct page *page, unsigned long off,
+				       size_t size, enum dma_data_direction dir)
+{
+	__dma_map_area(page_address(page) + off, size, dir);
+}
+
+static void __fast_dma_page_dev_to_cpu(struct page *page, unsigned long off,
+				       size_t size, enum dma_data_direction dir)
+{
+	__dma_unmap_area(page_address(page) + off, size, dir);
+
+	/* TODO: WHAT IS THIS? */
+	/*
+	 * Mark the D-cache clean for this page to avoid extra flushing.
+	 */
+	if (dir != DMA_TO_DEVICE && off == 0 && size >= PAGE_SIZE)
+		set_bit(PG_dcache_clean, &page->flags);
+}
+
+static dma_addr_t fast_smmu_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	dma_addr_t iova;
+	unsigned long flags;
+	phys_addr_t phys_plus_off = page_to_phys(page) + offset;
+	phys_addr_t phys_to_map = round_down(phys_plus_off, FAST_PAGE_SIZE);
+	unsigned long offset_from_phys_to_map = phys_plus_off & ~FAST_PAGE_MASK;
+	size_t len = ALIGN(size + offset_from_phys_to_map, FAST_PAGE_SIZE);
+	bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
+	bool is_coherent = is_dma_coherent(dev, attrs);
+	int prot = dma_info_to_prot(dir, is_coherent, attrs);
+
+	if (!skip_sync && !is_coherent)
+		__fast_dma_page_cpu_to_dev(phys_to_page(phys_to_map),
+					   offset_from_phys_to_map, size, dir);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+
+	iova = __fast_smmu_alloc_iova(mapping, attrs, len);
+
+	if (unlikely(iova == DMA_ERROR_CODE))
+		goto fail;
+	/* A failed map must release the whole @len range allocated above. */
+	if (unlikely(av8l_fast_map_public(mapping->pgtbl_ops, iova,
+					  phys_to_map, len, prot)))
+		goto fail_free_iova;
+
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	trace_map(mapping->domain, iova, phys_to_map, len, prot);
+	return iova + offset_from_phys_to_map;
+
+fail_free_iova:
+	__fast_smmu_free_iova(mapping, iova, len);
+fail:
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	return DMA_ERROR_CODE;
+}
+
+static void fast_smmu_unmap_page(struct device *dev, dma_addr_t iova,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	unsigned long flags;
+	unsigned long offset = iova & ~FAST_PAGE_MASK;
+	size_t len = ALIGN(size + offset, FAST_PAGE_SIZE);
+	bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
+	bool is_coherent = is_dma_coherent(dev, attrs);
+
+	if (!skip_sync && !is_coherent) {
+		phys_addr_t phys;
+
+		phys = av8l_fast_iova_to_phys_public(mapping->pgtbl_ops, iova);
+		WARN_ON(!phys);
+
+		__fast_dma_page_dev_to_cpu(phys_to_page(phys), offset,
+						size, dir);
+	}
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	av8l_fast_unmap_public(mapping->pgtbl_ops, iova, len);
+	__fast_smmu_free_iova(mapping, iova, len);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	trace_unmap(mapping->domain, iova - offset, len, len);
+}
+
+static void fast_smmu_sync_single_for_cpu(struct device *dev,
+		dma_addr_t iova, size_t size, enum dma_data_direction dir)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	unsigned long offset = iova & ~FAST_PAGE_MASK;
+
+	if (!av8l_fast_iova_coherent_public(mapping->pgtbl_ops, iova)) {
+		phys_addr_t phys;
+
+		phys = av8l_fast_iova_to_phys_public(mapping->pgtbl_ops, iova);
+		WARN_ON(!phys);
+
+		__fast_dma_page_dev_to_cpu(phys_to_page(phys), offset,
+						size, dir);
+	}
+}
+
+static void fast_smmu_sync_single_for_device(struct device *dev,
+		dma_addr_t iova, size_t size, enum dma_data_direction dir)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	unsigned long offset = iova & ~FAST_PAGE_MASK;
+
+	if (!av8l_fast_iova_coherent_public(mapping->pgtbl_ops, iova)) {
+		phys_addr_t phys;
+
+		phys = av8l_fast_iova_to_phys_public(mapping->pgtbl_ops, iova);
+		WARN_ON(!phys);
+
+		__fast_dma_page_cpu_to_dev(phys_to_page(phys), offset,
+						size, dir);
+	}
+}
+
+static void fast_smmu_sync_sg_for_cpu(struct device *dev,
+				    struct scatterlist *sgl, int nelems,
+				    enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	dma_addr_t iova = sg_dma_address(sgl);
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	int i;
+
+	if (av8l_fast_iova_coherent_public(mapping->pgtbl_ops, iova))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_unmap_area(sg_virt(sg), sg->length, dir);
+}
+
+static void fast_smmu_sync_sg_for_device(struct device *dev,
+				       struct scatterlist *sgl, int nelems,
+				       enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	dma_addr_t iova = sg_dma_address(sgl);
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	int i;
+
+	if (av8l_fast_iova_coherent_public(mapping->pgtbl_ops, iova))
+		return;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_map_area(sg_virt(sg), sg->length, dir);
+}
+
+static int fast_smmu_map_sg(struct device *dev, struct scatterlist *sg,
+			    int nents, enum dma_data_direction dir,
+			    unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	size_t iova_len;
+	bool is_coherent = is_dma_coherent(dev, attrs);
+	int prot = dma_info_to_prot(dir, is_coherent, attrs);
+	int ret;
+	dma_addr_t iova;
+	unsigned long flags;
+	size_t unused;
+
+	iova_len = iommu_dma_prepare_map_sg(dev, mapping->iovad, sg, nents);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	iova = __fast_smmu_alloc_iova(mapping, attrs, iova_len);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	if (unlikely(iova == DMA_ERROR_CODE))
+		goto fail;
+
+	av8l_fast_map_sg_public(mapping->pgtbl_ops, iova, sg, nents, prot,
+				&unused);
+
+	ret = iommu_dma_finalise_sg(dev, sg, nents, iova);
+
+	if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
+		fast_smmu_sync_sg_for_device(dev, sg, nents, dir);
+
+	return ret;
+fail:
+	iommu_dma_invalidate_sg(sg, nents);
+	return 0;
+}
+
+static void fast_smmu_unmap_sg(struct device *dev,
+			       struct scatterlist *sg, int nelems,
+			       enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	unsigned long flags;
+	dma_addr_t start;
+	size_t len;
+	struct scatterlist *tmp;
+	int i;
+
+	if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
+		fast_smmu_sync_sg_for_cpu(dev, sg, nelems, dir);
+
+	/*
+	 * The scatterlist segments are mapped into a single
+	 * contiguous IOVA allocation, so this is incredibly easy.
+	 */
+	start = sg_dma_address(sg);
+	for_each_sg(sg_next(sg), tmp, nelems - 1, i) {
+		if (sg_dma_len(tmp) == 0)
+			break;
+		sg = tmp;
+	}
+	len = sg_dma_address(sg) + sg_dma_len(sg) - start;
+
+	av8l_fast_unmap_public(mapping->pgtbl_ops, start, len);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	__fast_smmu_free_iova(mapping, start, len);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+}
+
+static void __fast_smmu_free_pages(struct page **pages, int count)
+{
+	int i;
+
+	if (!pages)
+		return;
+	for (i = 0; i < count; i++)
+		__free_page(pages[i]);
+	kvfree(pages);
+}
+
+static void *fast_smmu_alloc_atomic(struct dma_fast_smmu_mapping *mapping,
+				    size_t size, gfp_t gfp, unsigned long attrs,
+				    dma_addr_t *handle, bool coherent)
+{
+	void *addr;
+	unsigned long flags;
+	struct page *page;
+	dma_addr_t dma_addr;
+	int prot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+
+	if (coherent) {
+		page = alloc_pages(gfp, get_order(size));
+		addr = page ? page_address(page) : NULL;
+	} else
+		addr = __alloc_from_pool(size, &page, gfp);
+	if (!addr)
+		return NULL;
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	dma_addr = __fast_smmu_alloc_iova(mapping, attrs, size);
+	if (dma_addr == DMA_ERROR_CODE) {
+		dev_err(mapping->dev, "no iova\n");
+		spin_unlock_irqrestore(&mapping->lock, flags);
+		goto out_free_page;
+	}
+	if (unlikely(av8l_fast_map_public(mapping->pgtbl_ops, dma_addr,
+					  page_to_phys(page), size, prot))) {
+		dev_err(mapping->dev, "no map public\n");
+		goto out_free_iova;
+	}
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	*handle = dma_addr;
+	return addr;
+
+out_free_iova:
+	__fast_smmu_free_iova(mapping, dma_addr, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+out_free_page:
+	coherent ? __free_pages(page, get_order(size)) :
+		   __free_from_pool(addr, size);
+	return NULL;
+}
+
+static struct page **__fast_smmu_alloc_pages(unsigned int count, gfp_t gfp)
+{
+	struct page **pages;
+	unsigned int i = 0, array_size = count * sizeof(*pages);
+
+	if (array_size <= PAGE_SIZE)
+		pages = kzalloc(array_size, GFP_KERNEL);
+	else
+		pages = vzalloc(array_size);
+	if (!pages)
+		return NULL;
+
+	/* IOMMU can map any pages, so highmem can also be used here */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+	for (i = 0; i < count; ++i) {
+		struct page *page = alloc_page(gfp);
+
+		if (!page) {
+			__fast_smmu_free_pages(pages, i);
+			return NULL;
+		}
+		pages[i] = page;
+	}
+	return pages;
+}
+
+static void *__fast_smmu_alloc_contiguous(struct device *dev, size_t size,
+			dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	bool is_coherent = is_dma_coherent(dev, attrs);
+	int prot = dma_info_to_prot(DMA_BIDIRECTIONAL, is_coherent, attrs);
+	pgprot_t remap_prot = __get_dma_pgprot(attrs, PAGE_KERNEL, is_coherent);
+	struct page *page;
+	dma_addr_t iova;
+	unsigned long flags;
+	void *coherent_addr;
+
+	page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
+					get_order(size), gfp & __GFP_NOWARN);
+	if (!page)
+		return NULL;
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	iova = __fast_smmu_alloc_iova(mapping, attrs, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	if (iova == DMA_ERROR_CODE)
+		goto release_page;
+
+	if (av8l_fast_map_public(mapping->pgtbl_ops, iova, page_to_phys(page),
+				 size, prot))
+		goto release_iova;
+
+	coherent_addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
+				remap_prot, __fast_smmu_alloc_contiguous);
+	if (!coherent_addr)
+		goto release_mapping;
+	if (!is_coherent)
+		__dma_flush_area(page_to_virt(page), size);
+	*handle = iova;
+	return coherent_addr;
+
+release_mapping:
+	av8l_fast_unmap_public(mapping->pgtbl_ops, iova, size);
+release_iova:
+	/* Like the allocation above, update the IOVA bitmap under the lock. */
+	spin_lock_irqsave(&mapping->lock, flags);
+	__fast_smmu_free_iova(mapping, iova, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+release_page:
+	dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
+	return NULL;
+}
+
+static void *fast_smmu_alloc(struct device *dev, size_t size,
+			     dma_addr_t *handle, gfp_t gfp,
+			     unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	struct sg_table sgt;
+	dma_addr_t dma_addr, iova_iter;
+	void *addr;
+	unsigned long flags;
+	struct sg_mapping_iter miter;
+	size_t count = ALIGN(size, SZ_4K) >> PAGE_SHIFT;
+	bool is_coherent = is_dma_coherent(dev, attrs);
+	int prot = dma_info_to_prot(DMA_BIDIRECTIONAL, is_coherent, attrs);
+	pgprot_t remap_prot = __get_dma_pgprot(attrs, PAGE_KERNEL, is_coherent);
+	struct page **pages;
+
+	/*
+	 * sg_alloc_table_from_pages accepts unsigned int value for count
+	 * so check count doesn't exceed UINT_MAX.
+	 */
+	if (count > UINT_MAX) {
+		dev_err(dev, "count: %zx exceeds UINT_MAX\n", count);
+		return NULL;
+	}
+
+	*handle = DMA_ERROR_CODE;
+	size = ALIGN(size, SZ_4K);
+
+	if (atomic_pool && !gfpflags_allow_blocking(gfp))
+		return fast_smmu_alloc_atomic(mapping, size, gfp, attrs, handle,
+					      is_coherent);
+	else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
+		return __fast_smmu_alloc_contiguous(dev, size, handle, gfp,
+						    attrs);
+
+	pages = __fast_smmu_alloc_pages(count, gfp);
+	if (!pages) {
+		dev_err(dev, "no pages\n");
+		return NULL;
+	}
+
+	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, gfp)) {
+		dev_err(dev, "no sg table\n");
+		goto out_free_pages;
+	}
+
+	if (!is_coherent) {
+		/*
+		 * The CPU-centric flushing implied by SG_MITER_TO_SG isn't
+		 * sufficient here, so skip it by using the "wrong" direction.
+		 */
+		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents,
+			       SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter))
+			__dma_flush_area(miter.addr, miter.length);
+		sg_miter_stop(&miter);
+	}
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	dma_addr = __fast_smmu_alloc_iova(mapping, attrs, size);
+	if (dma_addr == DMA_ERROR_CODE) {
+		dev_err(dev, "no iova\n");
+		spin_unlock_irqrestore(&mapping->lock, flags);
+		goto out_free_sg;
+	}
+	iova_iter = dma_addr;
+	sg_miter_start(&miter, sgt.sgl, sgt.orig_nents,
+		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+	while (sg_miter_next(&miter)) {
+		if (unlikely(av8l_fast_map_public(
+				     mapping->pgtbl_ops, iova_iter,
+				     page_to_phys(miter.page),
+				     miter.length, prot))) {
+			dev_err(dev, "no map public\n");
+			/* TODO: unwind previously successful mappings */
+			sg_miter_stop(&miter);	/* end SG_MITER_ATOMIC walk */
+			goto out_free_iova;
+		}
+		iova_iter += miter.length;
+	}
+	sg_miter_stop(&miter);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	addr = dma_common_pages_remap(pages, size, VM_USERMAP, remap_prot,
+				      __builtin_return_address(0));
+	if (!addr) {
+		dev_err(dev, "no common pages\n");
+		goto out_unmap;
+	}
+
+	*handle = dma_addr;
+	sg_free_table(&sgt);
+	return addr;
+
+out_unmap:
+	/* need to take the lock again for page tables and iova */
+	spin_lock_irqsave(&mapping->lock, flags);
+	av8l_fast_unmap_public(mapping->pgtbl_ops, dma_addr, size);
+out_free_iova:
+	__fast_smmu_free_iova(mapping, dma_addr, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+out_free_sg:
+	sg_free_table(&sgt);
+out_free_pages:
+	__fast_smmu_free_pages(pages, count);
+	return NULL;
+}
+
+static void fast_smmu_free(struct device *dev, size_t size,
+			   void *cpu_addr, dma_addr_t dma_handle,
+			   unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	struct vm_struct *area;
+	unsigned long flags;
+
+	size = ALIGN(size, FAST_PAGE_SIZE);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	av8l_fast_unmap_public(mapping->pgtbl_ops, dma_handle, size);
+	__fast_smmu_free_iova(mapping, dma_handle, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	area = find_vm_area(cpu_addr);
+	if (area && area->pages) {
+		struct page **pages = area->pages;
+
+		dma_common_free_remap(cpu_addr, size, VM_USERMAP, false);
+		__fast_smmu_free_pages(pages, size >> FAST_PAGE_SHIFT);
+	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
+		struct page *page = vmalloc_to_page(cpu_addr);
+
+		dma_common_free_remap(cpu_addr, size, VM_USERMAP, false);
+		dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
+	} else if (!is_vmalloc_addr(cpu_addr)) {
+		__free_pages(virt_to_page(cpu_addr), get_order(size));
+	} else if (__in_atomic_pool(cpu_addr, size)) {
+		// Keep remap
+		__free_from_pool(cpu_addr, size);
+	}
+}
+
+/* __swiotlb_mmap_pfn is not currently exported. */
+static int fast_smmu_mmap_pfn(struct vm_area_struct *vma, unsigned long pfn,
+			     size_t size)
+{
+	int ret = -ENXIO;
+	unsigned long nr_vma_pages = vma_pages(vma);
+	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned long off = vma->vm_pgoff;
+
+	if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
+		ret = remap_pfn_range(vma, vma->vm_start, pfn + off,
+				      vma->vm_end - vma->vm_start,
+				      vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int fast_smmu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+				void *cpu_addr, dma_addr_t dma_addr,
+				size_t size, unsigned long attrs)
+{
+	bool coherent = is_dma_coherent(dev, attrs);
+	unsigned long pfn = 0;
+	struct vm_struct *vm;
+
+	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
+					     coherent);
+
+	/* Buffers that carry a page array go through iommu_dma_mmap(). */
+	vm = find_vm_area(cpu_addr);
+	if (vm && vm->pages)
+		return iommu_dma_mmap(vm->pages, size, vma);
+
+	/* Otherwise look up the first pfn and remap from there. */
+	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
+		pfn = vmalloc_to_pfn(cpu_addr);
+	else if (!is_vmalloc_addr(cpu_addr))
+		pfn = page_to_pfn(virt_to_page(cpu_addr));
+	else if (__in_atomic_pool(cpu_addr, size))
+		pfn = __atomic_get_phys(cpu_addr) >> PAGE_SHIFT;
+
+	return pfn ? fast_smmu_mmap_pfn(vma, pfn, size) : -EINVAL;
+}
+
+static int fast_smmu_get_sgtable(struct device *dev, struct sg_table *sgt,
+				void *cpu_addr, dma_addr_t dma_addr,
+				size_t size, unsigned long attrs)
+{
+	unsigned int n_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	struct page *first = NULL;
+	struct vm_struct *vm;
+	int rc;
+
+	vm = find_vm_area(cpu_addr);
+	if (vm && vm->pages)
+		return sg_alloc_table_from_pages(sgt, vm->pages, n_pages, 0,
+						 size, GFP_KERNEL);
+
+	/* No page array: describe the buffer with a single sg entry. */
+	if (attrs & DMA_ATTR_FORCE_CONTIGUOUS)
+		first = vmalloc_to_page(cpu_addr);
+	else if (!is_vmalloc_addr(cpu_addr))
+		first = virt_to_page(cpu_addr);
+	else if (__in_atomic_pool(cpu_addr, size))
+		first = phys_to_page(__atomic_get_phys(cpu_addr));
+	if (!first)
+		return -ENXIO;
+	rc = sg_alloc_table(sgt, 1, GFP_KERNEL);
+	if (!rc)
+		sg_set_page(sgt->sgl, first, PAGE_ALIGN(size), 0);
+	return rc;
+}
+
+static dma_addr_t fast_smmu_dma_map_resource(
+			struct device *dev, phys_addr_t phys_addr,
+			size_t size, enum dma_data_direction dir,
+			unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *fast = dev_get_mapping(dev);
+	size_t pg_off = phys_addr & ~FAST_PAGE_MASK;
+	size_t map_len = round_up(size + pg_off, FAST_PAGE_SIZE);
+	unsigned long irq_flags;
+	dma_addr_t iova;
+	int prot;
+
+	spin_lock_irqsave(&fast->lock, irq_flags);
+	iova = __fast_smmu_alloc_iova(fast, attrs, map_len);
+	spin_unlock_irqrestore(&fast->lock, irq_flags);
+
+	if (iova == DMA_ERROR_CODE)
+		return DMA_ERROR_CODE;
+
+	/* Resource mappings always carry IOMMU_MMIO. */
+	prot = dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO;
+	if (!iommu_map(fast->domain, iova, phys_addr - pg_off, map_len, prot))
+		return iova + pg_off;
+
+	/* Mapping failed: give the IOVA range back before bailing out. */
+	spin_lock_irqsave(&fast->lock, irq_flags);
+	__fast_smmu_free_iova(fast, iova, map_len);
+	spin_unlock_irqrestore(&fast->lock, irq_flags);
+
+	return DMA_ERROR_CODE;
+}
+
+static void fast_smmu_dma_unmap_resource(
+			struct device *dev, dma_addr_t addr, size_t size,
+			enum dma_data_direction dir, unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev_get_mapping(dev);
+	size_t offset = addr & ~FAST_PAGE_MASK;	/* sub-page part of @addr */
+	size_t len = round_up(size + offset, FAST_PAGE_SIZE);
+	unsigned long flags;
+
+	/* Undo map_resource: unmap and free the same page-aligned range. */
+	iommu_unmap(mapping->domain, addr - offset, len);
+	spin_lock_irqsave(&mapping->lock, flags);
+	__fast_smmu_free_iova(mapping, addr - offset, len);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+}
+
+/* Report whether @dma_addr is the DMA_ERROR_CODE failure cookie. */
+static int fast_smmu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == DMA_ERROR_CODE ? 1 : 0;
+}
+
+static void __fast_smmu_mapped_over_stale(struct dma_fast_smmu_mapping *fast,
+					  void *data)
+{
+	av8l_fast_iopte *pmds, *ptep = data;	/* @data: the PTE mapped over */
+	dma_addr_t iova;
+	unsigned long bitmap_idx;
+	struct io_pgtable *tbl;
+
+	tbl  = container_of(fast->pgtbl_ops, struct io_pgtable, ops);
+	pmds = tbl->cfg.av8l_fast_cfg.pmds;
+	/* Log where the stale-TLB check fired and dump the IOVA bitmap. */
+	bitmap_idx = (unsigned long)(ptep - pmds);	/* PTE index in pmds */
+	iova = bitmap_idx << FAST_PAGE_SHIFT;
+	dev_err(fast->dev, "Mapped over stale tlb at %pad\n", &iova);
+	dev_err(fast->dev, "bitmap (failure at idx %lu):\n", bitmap_idx);
+	dev_err(fast->dev, "ptep: %p pmds: %p diff: %lu\n", ptep,
+		pmds, bitmap_idx);
+	print_hex_dump(KERN_ERR, "bmap: ", DUMP_PREFIX_ADDRESS,
+		       32, 8, fast->bitmap, fast->bitmap_size, false);
+}
+
+static int fast_smmu_notify(struct notifier_block *self,
+			    unsigned long action, void *data)
+{
+	struct dma_fast_smmu_mapping *fast;
+
+	/* Anything other than a stale-TLB report is unexpected here. */
+	if (action != MAPPED_OVER_STALE_TLB) {
+		WARN(1, "Unhandled notifier action");
+		return NOTIFY_DONE;
+	}
+
+	fast = container_of(self, struct dma_fast_smmu_mapping, notifier);
+	__fast_smmu_mapped_over_stale(fast, data);
+	return NOTIFY_OK;
+}
+
+static const struct dma_map_ops fast_smmu_dma_ops = {
+	.alloc			= fast_smmu_alloc,
+	.free			= fast_smmu_free,
+	.mmap			= fast_smmu_mmap_attrs,
+	.get_sgtable		= fast_smmu_get_sgtable,
+	.map_page		= fast_smmu_map_page,
+	.unmap_page		= fast_smmu_unmap_page,
+	.map_sg			= fast_smmu_map_sg,
+	.unmap_sg		= fast_smmu_unmap_sg,
+	.map_resource		= fast_smmu_dma_map_resource,
+	.unmap_resource		= fast_smmu_dma_unmap_resource,
+	.sync_single_for_cpu	= fast_smmu_sync_single_for_cpu,
+	.sync_single_for_device	= fast_smmu_sync_single_for_device,
+	.sync_sg_for_cpu	= fast_smmu_sync_sg_for_cpu,
+	.sync_sg_for_device	= fast_smmu_sync_sg_for_device,
+	.mapping_error		= fast_smmu_mapping_error,
+};
+
+/**
+ * __fast_smmu_create_mapping_sized - allocate a fast SMMU mapping structure
+ * @base: bottom of the VA range
+ * @size: size of the VA range in bytes
+ *
+ * Allocates the structure tracking used/unused IO address ranges (a bitmap
+ * with one bit per FAST_PAGE_SIZE page plus an iova domain) that the IOMMU
+ * aware mapping functions rely on. The only VA range supported is [0, 4GB].
+ * The client device needs to be attached to the mapping with the
+ * fast_smmu_attach_device function.
+ *
+ * Return: the new mapping on success, ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
+	dma_addr_t base, u64 size)
+{
+	struct dma_fast_smmu_mapping *fast;
+
+	fast = kzalloc(sizeof(*fast), GFP_KERNEL);
+	if (!fast)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&fast->lock);
+	fast->base = base;
+	fast->size = size;
+	fast->num_4k_pages = size >> FAST_PAGE_SHIFT;
+	fast->bitmap_size = BITS_TO_LONGS(fast->num_4k_pages) * sizeof(long);
+
+	/* Try kzalloc() without retrying, then fall back to vzalloc(). */
+	fast->bitmap = kzalloc(fast->bitmap_size, GFP_KERNEL | __GFP_NOWARN |
+								__GFP_NORETRY);
+	if (!fast->bitmap)
+		fast->bitmap = vzalloc(fast->bitmap_size);
+	if (!fast->bitmap)
+		goto free_mapping;
+
+	fast->iovad = kzalloc(sizeof(*fast->iovad), GFP_KERNEL);
+	if (!fast->iovad)
+		goto free_bitmap;
+
+	init_iova_domain(fast->iovad, FAST_PAGE_SIZE,
+			base >> FAST_PAGE_SHIFT);
+
+	return fast;
+
+free_bitmap:
+	kvfree(fast->bitmap);
+free_mapping:
+	kfree(fast);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Based off of similar code from dma-iommu.c, but modified to use a different
+ * iova allocator: PCI host bridge windows are marked busy in mapping->bitmap.
+ */
+static void fast_smmu_reserve_pci_windows(struct device *dev,
+			    struct dma_fast_smmu_mapping *mapping)
+{
+	struct pci_host_bridge *bridge;
+	struct resource_entry *window;
+	phys_addr_t start, end;
+	struct pci_dev *pci_dev;
+	unsigned long flags;
+
+	if (!dev_is_pci(dev))
+		return;
+
+	pci_dev = to_pci_dev(dev);
+	bridge = pci_find_host_bridge(pci_dev->bus);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	resource_list_for_each_entry(window, &bridge->windows) {
+		if (resource_type(window->res) != IORESOURCE_MEM &&
+		    resource_type(window->res) != IORESOURCE_IO)
+			continue;
+
+		start = round_down(window->res->start - window->offset,
+				FAST_PAGE_SIZE);
+		end = round_up(window->res->end - window->offset,
+				FAST_PAGE_SIZE);
+		/* Clamp the window to the range this mapping manages. */
+		start = max_t(unsigned long, mapping->base, start);
+		end = min_t(unsigned long, mapping->base + mapping->size, end);
+		if (start >= end)
+			continue;
+
+		dev_dbg(dev, "iova allocator reserved %pa-%pa\n", &start, &end);
+		/* Convert addresses to bitmap indices and mark them used. */
+		start = (start - mapping->base) >> FAST_PAGE_SHIFT;
+		end = (end - mapping->base) >> FAST_PAGE_SHIFT;
+		bitmap_set(mapping->bitmap, start, end - start);
+	}
+	spin_unlock_irqrestore(&mapping->lock, flags);
+}
+
+/* Free the fast mapping held in @domain->iova_cookie, if any. */
+void fast_smmu_put_dma_cookie(struct iommu_domain *domain)
+{
+	struct dma_fast_smmu_mapping *cookie = domain->iova_cookie;
+
+	if (!cookie)
+		return;
+
+	if (cookie->iovad) {
+		put_iova_domain(cookie->iovad);
+		kfree(cookie->iovad);
+	}
+
+	/* kvfree() accepts NULL, so the bitmap needs no guard. */
+	kvfree(cookie->bitmap);
+	kfree(cookie);
+	domain->iova_cookie = NULL;
+}
+EXPORT_SYMBOL_GPL(fast_smmu_put_dma_cookie);
+
+/**
+ * fast_smmu_init_mapping - set up the fast DMA ops for a mapping
+ * @dev: valid struct device pointer
+ * @mapping: io address space mapping structure (returned from
+ *	arm_iommu_create_mapping)
+ *
+ * Called the first time a device is attached to this mapping.
+ * Not for dma client use.
+ *
+ * If the domain has no iova_cookie yet, one is created, the domain geometry
+ * is set to the mapping's range and the fast_smmu_notify notifier is
+ * registered. In every successful case the PCI windows of @dev are reserved
+ * and @mapping->ops is switched to fast_smmu_dma_ops.
+ *
+ * Return: 0 on success, -EINVAL or -ENOMEM on failure.
+ */
+int fast_smmu_init_mapping(struct device *dev,
+			    struct dma_iommu_mapping *mapping)
+{
+	struct iommu_domain *domain = mapping->domain;
+	struct dma_fast_smmu_mapping *fast = domain->iova_cookie;
+	u64 size = (u64)mapping->bits << PAGE_SHIFT;
+	struct iommu_pgtbl_info info;
+
+	/* The domain already carries a fast mapping: just reuse it. */
+	if (fast)
+		goto finish;
+
+	if (mapping->base + size > (SZ_1G * 4ULL)) {
+		dev_err(dev, "Iova end address too large\n");
+		return -EINVAL;
+	}
+
+	fast = __fast_smmu_create_mapping_sized(mapping->base, size);
+	if (IS_ERR(fast))
+		return -ENOMEM;
+
+	fast->domain = domain;
+	fast->dev = dev;
+	domain->iova_cookie = fast;
+
+	domain->geometry.aperture_start = mapping->base;
+	domain->geometry.aperture_end = mapping->base + size - 1;
+
+	if (iommu_domain_get_attr(domain, DOMAIN_ATTR_PGTBL_INFO, &info)) {
+		dev_err(dev, "Couldn't get page table info\n");
+		fast_smmu_put_dma_cookie(domain);
+		return -EINVAL;
+	}
+	fast->pgtbl_ops = (struct io_pgtable_ops *)info.ops;
+
+	fast->notifier.notifier_call = fast_smmu_notify;
+	av8l_register_notify(&fast->notifier);
+
+finish:
+	fast_smmu_reserve_pci_windows(dev, fast);
+	mapping->ops = &fast_smmu_dma_ops;
+	return 0;
+}
diff --git a/drivers/iommu/io-pgtable-fast.c b/drivers/iommu/io-pgtable-fast.c
new file mode 100644
index 000000000000..61b86f00b3f4
--- /dev/null
+++ b/drivers/iommu/io-pgtable-fast.c
@@ -0,0 +1,816 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ */
+
+#define pr_fmt(fmt)	"io-pgtable-fast: " fmt
+
+#include <linux/iommu.h>
+#include <linux/kernel.h>
+#include <linux/io-pgtable.h>
+#include <linux/scatterlist.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/io-pgtable-fast.h>
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
+#include <linux/vmalloc.h>
+
+#define AV8L_FAST_MAX_ADDR_BITS		48
+
+/* Accessors: io_pgtable_ops -> io_pgtable -> av8l_fast_io_pgtable */
+#define iof_pgtable_to_data(x)						\
+	container_of((x), struct av8l_fast_io_pgtable, iop)
+
+#define iof_pgtable_ops_to_pgtable(x)					\
+	container_of((x), struct io_pgtable, ops)
+
+#define iof_pgtable_ops_to_data(x)					\
+	iof_pgtable_to_data(iof_pgtable_ops_to_pgtable(x))
+
+struct av8l_fast_io_pgtable {
+	struct io_pgtable	  iop;
+	av8l_fast_iopte		 *pgd;	/* top-level table page */
+	av8l_fast_iopte		 *puds[4];	/* second-level table pages */
+	av8l_fast_iopte		 *pmds;	/* vmap() of all last-level tables */
+	struct page		**pages; /* page table memory */
+	int			nr_pages;	/* valid entries in @pages */
+	dma_addr_t		base;	/* iova translated by pmds[0] */
+	dma_addr_t		start;
+	dma_addr_t		end;	/* cfg->iova_end */
+};
+
+/* Page table bits: descriptor type, attribute fields and address mask */
+#define AV8L_FAST_PTE_TYPE_SHIFT	0
+#define AV8L_FAST_PTE_TYPE_MASK		0x3
+
+#define AV8L_FAST_PTE_TYPE_BLOCK	1
+#define AV8L_FAST_PTE_TYPE_TABLE	3
+#define AV8L_FAST_PTE_TYPE_PAGE		3	/* same encoding as TABLE */
+
+#define AV8L_FAST_PTE_NSTABLE		(((av8l_fast_iopte)1) << 63)
+#define AV8L_FAST_PTE_XN		(((av8l_fast_iopte)3) << 53)
+#define AV8L_FAST_PTE_AF		(((av8l_fast_iopte)1) << 10)
+#define AV8L_FAST_PTE_SH_NS		(((av8l_fast_iopte)0) << 8)
+#define AV8L_FAST_PTE_SH_OS		(((av8l_fast_iopte)2) << 8)
+#define AV8L_FAST_PTE_SH_IS		(((av8l_fast_iopte)3) << 8)
+#define AV8L_FAST_PTE_SH_MASK		(((av8l_fast_iopte)3) << 8)
+#define AV8L_FAST_PTE_NS		(((av8l_fast_iopte)1) << 5)
+#define AV8L_FAST_PTE_VALID		(((av8l_fast_iopte)1) << 0)
+
+#define AV8L_FAST_PTE_ATTR_LO_MASK	(((av8l_fast_iopte)0x3ff) << 2)
+/* Ignore the contiguous bit for block splitting */
+#define AV8L_FAST_PTE_ATTR_HI_MASK	(((av8l_fast_iopte)6) << 52)
+#define AV8L_FAST_PTE_ATTR_MASK		(AV8L_FAST_PTE_ATTR_LO_MASK |	\
+					 AV8L_FAST_PTE_ATTR_HI_MASK)
+#define AV8L_FAST_PTE_ADDR_MASK		((av8l_fast_iopte)0xfffffffff000)
+
+
+/* Stage-1 PTE: access permissions, MAIR attribute index and nG */
+#define AV8L_FAST_PTE_AP_PRIV_RW	(((av8l_fast_iopte)0) << 6)
+#define AV8L_FAST_PTE_AP_RW		(((av8l_fast_iopte)1) << 6)
+#define AV8L_FAST_PTE_AP_PRIV_RO	(((av8l_fast_iopte)2) << 6)
+#define AV8L_FAST_PTE_AP_RO		(((av8l_fast_iopte)3) << 6)
+#define AV8L_FAST_PTE_ATTRINDX_SHIFT	2
+#define AV8L_FAST_PTE_ATTRINDX_MASK	0x7
+#define AV8L_FAST_PTE_nG		(((av8l_fast_iopte)1) << 11)
+
+/* Stage-2 PTE (currently not referenced elsewhere in this file) */
+#define AV8L_FAST_PTE_HAP_FAULT		(((av8l_fast_iopte)0) << 6)
+#define AV8L_FAST_PTE_HAP_READ		(((av8l_fast_iopte)1) << 6)
+#define AV8L_FAST_PTE_HAP_WRITE		(((av8l_fast_iopte)2) << 6)
+#define AV8L_FAST_PTE_MEMATTR_OIWB	(((av8l_fast_iopte)0xf) << 2)
+#define AV8L_FAST_PTE_MEMATTR_NC	(((av8l_fast_iopte)0x5) << 2)
+#define AV8L_FAST_PTE_MEMATTR_DEV	(((av8l_fast_iopte)0x1) << 2)
+
+/* Register bits: TCR field positions, masks and encodings */
+#define ARM_32_LPAE_TCR_EAE		(1 << 31)
+#define ARM_64_LPAE_S2_TCR_RES1		(1 << 31)
+
+#define AV8L_FAST_TCR_TG0_4K		(0 << 14)
+#define AV8L_FAST_TCR_TG0_64K		(1 << 14)
+#define AV8L_FAST_TCR_TG0_16K		(2 << 14)
+
+#define AV8L_FAST_TCR_SH0_SHIFT		12
+#define AV8L_FAST_TCR_SH0_MASK		0x3
+#define AV8L_FAST_TCR_SH_NS		0
+#define AV8L_FAST_TCR_SH_OS		2
+#define AV8L_FAST_TCR_SH_IS		3
+
+#define AV8L_FAST_TCR_ORGN0_SHIFT	10
+#define AV8L_FAST_TCR_IRGN0_SHIFT	8
+#define AV8L_FAST_TCR_RGN_MASK		0x3
+#define AV8L_FAST_TCR_RGN_NC		0
+#define AV8L_FAST_TCR_RGN_WBWA		1
+#define AV8L_FAST_TCR_RGN_WT		2
+#define AV8L_FAST_TCR_RGN_WB		3
+
+#define AV8L_FAST_TCR_SL0_SHIFT		6
+#define AV8L_FAST_TCR_SL0_MASK		0x3
+
+#define AV8L_FAST_TCR_T0SZ_SHIFT	0
+#define AV8L_FAST_TCR_SZ_MASK		0xf
+
+#define AV8L_FAST_TCR_PS_SHIFT		16
+#define AV8L_FAST_TCR_PS_MASK		0x7
+
+#define AV8L_FAST_TCR_IPS_SHIFT		32
+#define AV8L_FAST_TCR_IPS_MASK		0x7
+/* Address size encodings; this file shifts them by AV8L_FAST_TCR_IPS_SHIFT */
+#define AV8L_FAST_TCR_PS_32_BIT		0x0ULL
+#define AV8L_FAST_TCR_PS_36_BIT		0x1ULL
+#define AV8L_FAST_TCR_PS_40_BIT		0x2ULL
+#define AV8L_FAST_TCR_PS_42_BIT		0x3ULL
+#define AV8L_FAST_TCR_PS_44_BIT		0x4ULL
+#define AV8L_FAST_TCR_PS_48_BIT		0x5ULL
+
+#define AV8L_FAST_TCR_EPD1_SHIFT	23
+#define AV8L_FAST_TCR_EPD1_FAULT	1
+
+#define AV8L_FAST_MAIR_ATTR_SHIFT(n)	((n) << 3)	/* 8 bits per index */
+#define AV8L_FAST_MAIR_ATTR_MASK	0xff
+#define AV8L_FAST_MAIR_ATTR_DEVICE	0x04
+#define AV8L_FAST_MAIR_ATTR_NC		0x44
+#define AV8L_FAST_MAIR_ATTR_WBRWA	0xff
+#define AV8L_FAST_MAIR_ATTR_UPSTREAM	0xf4
+#define AV8L_FAST_MAIR_ATTR_IDX_NC	0	/* MAIR slot indices */
+#define AV8L_FAST_MAIR_ATTR_IDX_CACHE	1
+#define AV8L_FAST_MAIR_ATTR_IDX_DEV	2
+#define AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM	3
+
+#define AV8L_FAST_PAGE_SHIFT		12	/* 4K pages */
+
+#define PTE_MAIR_IDX(pte)				\
+	((pte >> AV8L_FAST_PTE_ATTRINDX_SHIFT) &	\
+	 AV8L_FAST_PTE_ATTRINDX_MASK)
+/* SH bits left in place, so compare against AV8L_FAST_PTE_SH_* values */
+#define PTE_SH_IDX(pte) (pte & AV8L_FAST_PTE_SH_MASK)
+/* Entry for @iova in the flat pmds array: one PTE per 4K above @base */
+#define iopte_pmd_offset(pmds, base, iova) (pmds + ((iova - base) >> 12))
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB
+
+#include <asm/cacheflush.h>
+#include <linux/notifier.h>
+
+static ATOMIC_NOTIFIER_HEAD(av8l_notifier_list);
+/* Add @nb to the chain called with MAPPED_OVER_STALE_TLB events. */
+void av8l_register_notify(struct notifier_block *nb)
+{
+	atomic_notifier_chain_register(&av8l_notifier_list, nb);
+}
+EXPORT_SYMBOL(av8l_register_notify);
+
+static void __av8l_check_for_stale_tlb(av8l_fast_iopte *ptep)
+{
+	/* An all-zero PTE is vacant: nothing to report. */
+	if (likely(!*ptep))
+		return;
+	atomic_notifier_call_chain(&av8l_notifier_list,
+				   MAPPED_OVER_STALE_TLB, (void *)ptep);
+	pr_err("Tried to map over a non-vacant pte: 0x%llx @ %p\n",
+	       *ptep, ptep);
+	pr_err("Nearby memory:\n");
+	print_hex_dump(KERN_ERR, "pgtbl: ", DUMP_PREFIX_ADDRESS,
+		       32, 8, ptep - 16, 32 * sizeof(*ptep), false);
+}
+
+void av8l_fast_clear_stale_ptes(struct io_pgtable_ops *ops, u64 base,
+		u64 start, u64 end, bool skip_sync)
+{
+	int i;
+	struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
+	av8l_fast_iopte *pmdp = iopte_pmd_offset(data->pmds, base, start);
+
+	for (i = start >> AV8L_FAST_PAGE_SHIFT;
+			i <= (end >> AV8L_FAST_PAGE_SHIFT); ++i) {
+		if (!(*pmdp & AV8L_FAST_PTE_VALID)) { /* stale leftover */
+			*pmdp = 0;
+			if (!skip_sync)
+				dmac_clean_range(pmdp, pmdp + 1);
+		}
+		pmdp++;
+	}
+}
+#else /* !CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB */
+static void __av8l_check_for_stale_tlb(av8l_fast_iopte *ptep)
+{
+}
+#endif /* CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB */
+
+static void av8l_clean_range(struct io_pgtable_ops *ops,
+			av8l_fast_iopte *start, av8l_fast_iopte *end)
+{
+	unsigned long quirks = iof_pgtable_ops_to_pgtable(ops)->cfg.quirks;
+
+	if (!(quirks & IO_PGTABLE_QUIRK_NO_DMA))
+		dmac_clean_range(start, end);	/* skipped under NO_DMA */
+}
+
+static av8l_fast_iopte
+av8l_fast_prot_to_pte(struct av8l_fast_io_pgtable *data, int prot)
+{
+	av8l_fast_iopte attr_idx = 0;
+	av8l_fast_iopte pte;
+
+	/* Pick the MAIR index; MMIO wins over CACHE, CACHE over UPSTREAM. */
+	if (prot & IOMMU_MMIO)
+		attr_idx = AV8L_FAST_MAIR_ATTR_IDX_DEV;
+	else if (prot & IOMMU_CACHE)
+		attr_idx = AV8L_FAST_MAIR_ATTR_IDX_CACHE;
+	else if (prot & IOMMU_USE_UPSTREAM_HINT)
+		attr_idx = AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM;
+
+	pte = AV8L_FAST_PTE_XN | AV8L_FAST_PTE_TYPE_PAGE | AV8L_FAST_PTE_AF |
+	      AV8L_FAST_PTE_nG | AV8L_FAST_PTE_SH_OS;
+	pte |= attr_idx << AV8L_FAST_PTE_ATTRINDX_SHIFT;
+
+	/* Read-only unless the caller asked for write access. */
+	if (prot & IOMMU_WRITE)
+		pte |= AV8L_FAST_PTE_AP_RW;
+	else
+		pte |= AV8L_FAST_PTE_AP_RO;
+
+	return pte;
+}
+
+static int av8l_fast_map(struct io_pgtable_ops *ops, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
+	av8l_fast_iopte *head = iopte_pmd_offset(data->pmds, data->base, iova);
+	av8l_fast_iopte *tail = head + (size >> AV8L_FAST_PAGE_SHIFT);
+	av8l_fast_iopte *ptep, attrs = av8l_fast_prot_to_pte(data, prot);
+
+	/* Write one 4K leaf entry per page, then clean the touched range. */
+	paddr &= AV8L_FAST_PTE_ADDR_MASK;
+	for (ptep = head; ptep < tail; ptep++, paddr += SZ_4K) {
+		__av8l_check_for_stale_tlb(ptep);
+		*ptep = attrs | paddr;
+	}
+	av8l_clean_range(ops, head, tail);
+
+	return 0;
+}
+/* Non-static entry point that forwards to av8l_fast_map(). */
+int av8l_fast_map_public(struct io_pgtable_ops *ops, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	return av8l_fast_map(ops, iova, paddr, size, prot);
+}
+
+static size_t
+__av8l_fast_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+			size_t size, bool allow_stale_tlb)
+{
+	struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
+	av8l_fast_iopte *ptep = iopte_pmd_offset(data->pmds, data->base, iova);
+	unsigned long nptes = size >> AV8L_FAST_PAGE_SHIFT;
+	int fill = 0;
+
+	/* memset() replicates this byte into every cleared PTE. */
+	if (allow_stale_tlb)
+		fill = AV8L_FAST_PTE_UNMAPPED_NEED_TLBI;
+
+	memset(ptep, fill, sizeof(*ptep) * nptes);
+	av8l_clean_range(ops, ptep, ptep + nptes);
+
+	/* Callers that cannot tolerate stale entries get a full flush now. */
+	if (!allow_stale_tlb)
+		io_pgtable_tlb_flush_all(&data->iop);
+	return size;
+}
+
+/* Skips the TLB flush; caller must take care of tlb cache maintenance */
+void av8l_fast_unmap_public(struct io_pgtable_ops *ops, unsigned long iova,
+				size_t size)
+{
+	__av8l_fast_unmap(ops, iova, size, true);
+}
+/* io_pgtable_ops unmap hook: clears the PTEs and flushes the whole TLB. */
+static size_t av8l_fast_unmap(struct io_pgtable_ops *ops, unsigned long iova,
+			      size_t size)
+{
+	return __av8l_fast_unmap(ops, iova, size, false);
+}
+
+static int av8l_fast_map_sg(struct io_pgtable_ops *ops,
+			unsigned long iova, struct scatterlist *sgl,
+			unsigned int nents, int prot, size_t *size)
+{
+	struct scatterlist *seg;
+	int idx;
+
+	/* Map segments back to back from @iova; @size is not written. */
+	for_each_sg(sgl, seg, nents, idx) {
+		av8l_fast_map(ops, iova, sg_phys(seg), seg->length, prot);
+		iova += seg->length;
+	}
+	return nents;
+}
+/* Non-static entry point that forwards to av8l_fast_map_sg(). */
+int av8l_fast_map_sg_public(struct io_pgtable_ops *ops,
+			    unsigned long iova, struct scatterlist *sgl,
+			    unsigned int nents, int prot, size_t *size)
+{
+	return av8l_fast_map_sg(ops, iova, sgl, nents, prot, size);
+}
+
+#if defined(CONFIG_ARM64)
+#define FAST_PGDNDX(va) (((va) & 0x7fc0000000) >> 27)	/* pgd byte offset */
+#elif defined(CONFIG_ARM)
+#define FAST_PGDNDX(va) (((va) & 0xc0000000) >> 27)	/* pgd byte offset */
+#endif
+
+static phys_addr_t av8l_fast_iova_to_phys(struct io_pgtable_ops *ops,
+					  unsigned long iova)
+{
+	struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
+	av8l_fast_iopte pte, *pgdp, *pudp, *pmdp;
+	unsigned long pgd;
+	phys_addr_t phys;
+	const unsigned long pts = AV8L_FAST_PTE_TYPE_SHIFT;
+	const unsigned long ptm = AV8L_FAST_PTE_TYPE_MASK;
+	const unsigned long ptt = AV8L_FAST_PTE_TYPE_TABLE;
+	const unsigned long ptp = AV8L_FAST_PTE_TYPE_PAGE;
+	const av8l_fast_iopte am = AV8L_FAST_PTE_ADDR_MASK;
+
+	/* TODO: clean up some of these magic numbers... */
+	/* Manual three-step walk; 0 is returned when a step has the wrong type */
+	pgd = (unsigned long)data->pgd | FAST_PGDNDX(iova);
+	pgdp = (av8l_fast_iopte *)pgd;
+	/* pgd entry: must be a table; iova[29:21] * 8 indexes the next level */
+	pte = *pgdp;
+	if (((pte >> pts) & ptm) != ptt)
+		return 0;
+	pudp = phys_to_virt((pte & am) | ((iova & 0x3fe00000) >> 18));
+	/* pud entry: must be a table; iova[20:12] * 8 indexes the next level */
+	pte = *pudp;
+	if (((pte >> pts) & ptm) != ptt)
+		return 0;
+	pmdp = phys_to_virt((pte & am) | ((iova & 0x1ff000) >> 9));
+	/* last entry: must be a page; its address plus iova[11:0] is returned */
+	pte = *pmdp;
+	if (((pte >> pts) & ptm) != ptp)
+		return 0;
+	phys = pte & am;
+
+	return phys | (iova & 0xfff);
+}
+/* Non-static entry point that forwards to av8l_fast_iova_to_phys(). */
+phys_addr_t av8l_fast_iova_to_phys_public(struct io_pgtable_ops *ops,
+					  unsigned long iova)
+{
+	return av8l_fast_iova_to_phys(ops, iova);
+}
+
+static bool av8l_fast_iova_coherent(struct io_pgtable_ops *ops,
+					unsigned long iova)
+{
+	struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
+	av8l_fast_iopte pte = *iopte_pmd_offset(data->pmds, data->base, iova);
+	av8l_fast_iopte sh = PTE_SH_IDX(pte);	/* shareability bits */
+
+	return PTE_MAIR_IDX(pte) == AV8L_FAST_MAIR_ATTR_IDX_CACHE &&
+	       (sh == AV8L_FAST_PTE_SH_OS || sh == AV8L_FAST_PTE_SH_IS);
+}
+/* Non-static entry point that forwards to av8l_fast_iova_coherent(). */
+bool av8l_fast_iova_coherent_public(struct io_pgtable_ops *ops,
+					unsigned long iova)
+{
+	return av8l_fast_iova_coherent(ops, iova);
+}
+
+static struct av8l_fast_io_pgtable *
+av8l_fast_alloc_pgtable_data(struct io_pgtable_cfg *cfg)
+{
+	struct av8l_fast_io_pgtable *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+	/* NOTE(review): kmalloc() does not zero; only iop.ops is set here. */
+	data->iop.ops = (struct io_pgtable_ops) {
+		.map		= av8l_fast_map,
+		.map_sg		= av8l_fast_map_sg,
+		.unmap		= av8l_fast_unmap,
+		.iova_to_phys	= av8l_fast_iova_to_phys,
+		.is_iova_coherent = av8l_fast_iova_coherent,
+	};
+
+	return data;
+}
+
+/*
+ * We need max 1 page for the pgd, 4 pages for puds (1GB VA per pud page) and
+ * 2048 pages for pmds (each pud page contains 512 table entries, each
+ * pointing to a pmd), i.e. at most 2053 pages for the full 4GB VA space.
+ */
+#define NUM_PGD_PAGES 1
+#define NUM_PUD_PAGES 4
+#define NUM_PMD_PAGES 2048
+#define NUM_PGTBL_PAGES (NUM_PGD_PAGES + NUM_PUD_PAGES + NUM_PMD_PAGES)
+
+/* undefine arch specific definitions which depend on page table format */
+#undef pud_index
+#undef pud_mask
+#undef pud_next
+#undef pmd_index
+#undef pmd_mask
+#undef pmd_next
+/* pud_*: 1GB steps (1UL << 30); pud_next() never goes past @end */
+#define pud_index(addr)		(((addr) >> 30) & 0x3)
+#define pud_mask(addr)		((addr) & ~((1UL << 30) - 1))
+#define pud_next(addr, end)					\
+({	unsigned long __boundary = pud_mask(addr + (1UL << 30));\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);	\
+})
+/* pmd_*: 2MB steps (1UL << 21); pmd_next() never goes past @end */
+#define pmd_index(addr)		(((addr) >> 21) & 0x1ff)
+#define pmd_mask(addr)		((addr) & ~((1UL << 21) - 1))
+#define pmd_next(addr, end)					\
+({	unsigned long __boundary = pmd_mask(addr + (1UL << 21));\
+	(__boundary - 1 < (end) - 1) ? __boundary : (end);	\
+})
+
+/* Allocate and link all table pages for the cfg iova range; 0 or -ENOMEM. */
+static int av8l_fast_prepopulate_pgtables(struct av8l_fast_io_pgtable *data,
+			       struct io_pgtable_cfg *cfg, void *cookie)
+{
+	int i, j, pg = 0;
+	struct page **pages, *page;
+	dma_addr_t base = cfg->iova_base;
+	dma_addr_t end = cfg->iova_end;
+	dma_addr_t pud, pmd;
+	int pmd_pg_index;
+
+	pages = kmalloc(sizeof(*pages) * NUM_PGTBL_PAGES,
+			GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+
+	if (!pages)
+		pages = vmalloc(sizeof(*pages) * NUM_PGTBL_PAGES);
+
+	if (!pages)
+		return -ENOMEM;
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		goto err_free_pages_arr;
+	pages[pg++] = page;
+	data->pgd = page_address(page);
+
+	/*
+	 * We need max 2048 entries at level 2 to map 4GB of VA space. A page
+	 * can hold 512 entries, so we need max 4 pages.
+	 */
+	for (i = pud_index(base), pud = base; pud < end;
+			++i, pud = pud_next(pud, end)) {
+		av8l_fast_iopte pte, *ptep;
+
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			goto err_free_pages;
+		pages[pg++] = page;
+		data->puds[i] = page_address(page);
+		pte = page_to_phys(page) | AV8L_FAST_PTE_TYPE_TABLE;
+		ptep = ((av8l_fast_iopte *)data->pgd) + i;
+		*ptep = pte;
+	}
+	dmac_clean_range(data->pgd, data->pgd + 4);
+
+	/*
+	 * We have max 4 puds, each of which can point to 512 pmds, so we'll
+	 * have max 2048 pmds, each of which can hold 512 ptes, for a grand
+	 * total of 2048*512=1048576 PTEs.
+	 */
+	pmd_pg_index = pg;
+	for (i = pud_index(base), pud = base; pud < end;
+			++i, pud = pud_next(pud, end)) {
+		for (j = pmd_index(pud), pmd = pud; pmd < pud_next(pud, end);
+				++j, pmd = pmd_next(pmd, end)) {
+			av8l_fast_iopte pte, *pudp;
+			void *addr;
+
+			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+			if (!page)
+				goto err_free_pages;
+			pages[pg++] = page;
+
+			addr = page_address(page);
+			dmac_clean_range(addr, addr + SZ_4K);
+
+			pte = page_to_phys(page) | AV8L_FAST_PTE_TYPE_TABLE;
+			pudp = data->puds[i] + j;
+			*pudp = pte;
+		}
+		dmac_clean_range(data->puds[i], data->puds[i] + 512);
+	}
+
+	/*
+	 * We map the pmds into a virtually contiguous space so that we
+	 * don't have to traverse the first two levels of the page tables
+	 * to find the appropriate pud.  Instead, it will be a simple
+	 * offset from the virtual base of the pmds.
+	 */
+	data->pmds = vmap(&pages[pmd_pg_index], pg - pmd_pg_index,
+			  VM_IOREMAP, PAGE_KERNEL);
+	if (!data->pmds)
+		goto err_free_pages;
+
+	data->pages = pages;
+	data->nr_pages = pg;
+	data->base = base;
+	data->end = end;
+	return 0;
+
+err_free_pages:
+	for (i = 0; i < pg; ++i)
+		__free_page(pages[i]);
+err_free_pages_arr:
+	kvfree(pages);
+	return -ENOMEM;
+}
+
+static struct io_pgtable *
+av8l_fast_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct av8l_fast_io_pgtable *data;
+	u64 irgn, orgn, ps, tcr, mair;
+
+	data = av8l_fast_alloc_pgtable_data(cfg);
+	if (!data)
+		return NULL;
+
+	/* restrict according to the fast map requirements */
+	cfg->ias = 32;
+	cfg->pgsize_bitmap = SZ_4K;
+
+	/* Inner/outer cacheability of table walks depends on the quirks. */
+	if (cfg->quirks & IO_PGTABLE_QUIRK_QCOM_USE_UPSTREAM_HINT) {
+		irgn = AV8L_FAST_TCR_RGN_NC;
+		orgn = AV8L_FAST_TCR_RGN_WBWA;
+	} else if (cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) {
+		irgn = AV8L_FAST_TCR_RGN_WBWA;
+		orgn = AV8L_FAST_TCR_RGN_WBWA;
+	} else {
+		irgn = AV8L_FAST_TCR_RGN_NC;
+		orgn = AV8L_FAST_TCR_RGN_NC;
+	}
+
+	/* Only the output address sizes listed here are supported. */
+	switch (cfg->oas) {
+	case 32:
+		ps = AV8L_FAST_TCR_PS_32_BIT;
+		break;
+	case 36:
+		ps = AV8L_FAST_TCR_PS_36_BIT;
+		break;
+	case 40:
+		ps = AV8L_FAST_TCR_PS_40_BIT;
+		break;
+	case 42:
+		ps = AV8L_FAST_TCR_PS_42_BIT;
+		break;
+	case 44:
+		ps = AV8L_FAST_TCR_PS_44_BIT;
+		break;
+	case 48:
+		ps = AV8L_FAST_TCR_PS_48_BIT;
+		break;
+	default:
+		goto out_free_data;
+	}
+
+	tcr = (AV8L_FAST_TCR_SH_OS << AV8L_FAST_TCR_SH0_SHIFT) |
+	      (irgn << AV8L_FAST_TCR_IRGN0_SHIFT) |
+	      (orgn << AV8L_FAST_TCR_ORGN0_SHIFT) |
+	      AV8L_FAST_TCR_TG0_4K |
+	      (ps << AV8L_FAST_TCR_IPS_SHIFT) |
+	      ((64ULL - cfg->ias) << AV8L_FAST_TCR_T0SZ_SHIFT) |
+	      (AV8L_FAST_TCR_EPD1_FAULT << AV8L_FAST_TCR_EPD1_SHIFT);
+#if defined(CONFIG_ARM)
+	tcr |= ARM_32_LPAE_TCR_EAE;
+#endif
+	cfg->av8l_fast_cfg.tcr = tcr;
+
+	/* MAIRs */
+	mair = (AV8L_FAST_MAIR_ATTR_NC
+		<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_NC)) |
+	       (AV8L_FAST_MAIR_ATTR_WBRWA
+		<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_CACHE)) |
+	       (AV8L_FAST_MAIR_ATTR_DEVICE
+		<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_DEV)) |
+	       (AV8L_FAST_MAIR_ATTR_UPSTREAM
+		<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM));
+	cfg->av8l_fast_cfg.mair[0] = mair;
+	cfg->av8l_fast_cfg.mair[1] = 0;
+
+	/* Allocate all page table memory! */
+	if (av8l_fast_prepopulate_pgtables(data, cfg, cookie))
+		goto out_free_data;
+	cfg->av8l_fast_cfg.pmds = data->pmds;
+
+	/* TTBRs */
+	cfg->av8l_fast_cfg.ttbr[0] = virt_to_phys(data->pgd);
+	cfg->av8l_fast_cfg.ttbr[1] = 0;
+	return &data->iop;
+
+out_free_data:
+	kfree(data);
+	return NULL;
+}
+
+static void av8l_fast_free_pgtable(struct io_pgtable *iop)
+{
+	struct av8l_fast_io_pgtable *data = iof_pgtable_to_data(iop);
+	int idx;
+
+	vunmap(data->pmds);	/* drop the vmap() of the last-level pages */
+	for (idx = 0; idx < data->nr_pages; idx++)
+		__free_page(data->pages[idx]);
+	kvfree(data->pages);
+	kfree(data);
+}
+
+struct io_pgtable_init_fns io_pgtable_av8l_fast_init_fns = {
+	.alloc	= av8l_fast_alloc_pgtable,	/* allocates all tables */
+	.free	= av8l_fast_free_pgtable,	/* releases them again */
+};
+
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST_SELFTEST
+
+#include <linux/dma-contiguous.h>
+
+static struct io_pgtable_cfg *cfg_cookie;
+/* Selftest TLB callbacks: they only sanity-check the arguments they get. */
+static void dummy_tlb_flush_all(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_add_flush(unsigned long iova, size_t size, size_t granule,
+				bool leaf, void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_sync(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static struct iommu_gather_ops dummy_tlb_ops __initdata = {
+	.tlb_flush_all	= dummy_tlb_flush_all,
+	.tlb_add_flush	= dummy_tlb_add_flush,
+	.tlb_sync	= dummy_tlb_sync,
+};
+
+/*
+ * Returns true if the iova range is successfully mapped to the contiguous
+ * phys range in ops.
+ */
+static bool av8l_fast_range_has_specific_mapping(struct io_pgtable_ops *ops,
+						 const unsigned long iova_start,
+						 const phys_addr_t phys_start,
+						 const size_t size)
+{
+	phys_addr_t phys = phys_start;
+	u64 iova;
+
+	for (iova = iova_start; iova < (iova_start + size);
+	     iova += SZ_4K, phys += SZ_4K) {
+		/* + 42 just to make sure offsetting is working */
+		if (ops->iova_to_phys(ops, iova + 42) != (phys + 42))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Identity-map the whole 4GB VA space with 4K, 8K, 16K and 64K map calls on
+ * a scratch page table, verifying and unmapping after each pass.
+ * Returns the number of failed checks (1 if the table cannot be allocated).
+ */
+static int __init av8l_fast_positive_testing(void)
+{
+	int failed = 0;
+	u64 iova;
+	struct io_pgtable_ops *ops;
+	struct io_pgtable_cfg cfg;
+	u64 max = SZ_1G * 4ULL - 1;
+	u64 base = 0;
+
+	cfg = (struct io_pgtable_cfg) {
+		.quirks = 0,
+		.tlb = &dummy_tlb_ops,
+		.ias = 32,
+		.oas = 32,
+		.pgsize_bitmap = SZ_4K,
+		.iova_base = base,
+		.iova_end = max,
+	};
+
+	cfg_cookie = &cfg;
+	ops = alloc_io_pgtable_ops(ARM_V8L_FAST, &cfg, &cfg);
+
+	if (WARN_ON(!ops))
+		return 1;
+
+	/* map the entire 4GB VA space with 4K map calls */
+	for (iova = base; iova < max; iova += SZ_4K) {
+		if (WARN_ON(ops->map(ops, iova, iova, SZ_4K, IOMMU_READ))) {
+			failed++;
+			continue;
+		}
+	}
+	if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
+					base, max - base)))
+		failed++;
+
+	/* unmap it all */
+	for (iova = base; iova < max; iova += SZ_4K) {
+		if (WARN_ON(ops->unmap(ops, iova, SZ_4K) != SZ_4K))
+			failed++;
+	}
+
+	/* sweep up TLB proving PTEs */
+	av8l_fast_clear_stale_ptes(ops, base, base, max, false);
+
+	/* map the entire 4GB VA space with 8K map calls */
+	for (iova = base; iova < max; iova += SZ_8K) {
+		if (WARN_ON(ops->map(ops, iova, iova, SZ_8K, IOMMU_READ))) {
+			failed++;
+			continue;
+		}
+	}
+
+	if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
+					base, max - base)))
+		failed++;
+
+	/* unmap it all with 8K unmap calls */
+	for (iova = base; iova < max; iova += SZ_8K) {
+		if (WARN_ON(ops->unmap(ops, iova, SZ_8K) != SZ_8K))
+			failed++;
+	}
+
+	/* sweep up TLB proving PTEs */
+	av8l_fast_clear_stale_ptes(ops, base, base, max, false);
+
+	/* map the entire 4GB VA space with 16K map calls */
+	for (iova = base; iova < max; iova += SZ_16K) {
+		if (WARN_ON(ops->map(ops, iova, iova, SZ_16K, IOMMU_READ))) {
+			failed++;
+			continue;
+		}
+	}
+
+	if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
+					base, max - base)))
+		failed++;
+
+	/* unmap it all */
+	for (iova = base; iova < max; iova += SZ_16K) {
+		if (WARN_ON(ops->unmap(ops, iova, SZ_16K) != SZ_16K))
+			failed++;
+	}
+
+	/* sweep up TLB proving PTEs */
+	av8l_fast_clear_stale_ptes(ops, base, base, max, false);
+
+	/* map the entire 4GB VA space with 64K map calls */
+	for (iova = base; iova < max; iova += SZ_64K) {
+		if (WARN_ON(ops->map(ops, iova, iova, SZ_64K, IOMMU_READ))) {
+			failed++;
+			continue;
+		}
+	}
+
+	if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
+					base, max - base)))
+		failed++;
+
+	/* unmap it all at once */
+	if (WARN_ON(ops->unmap(ops, base, max - base) != (max - base)))
+		failed++;
+
+	free_io_pgtable_ops(ops);
+	return failed;
+}
+
+/* Selftest entry point: run the positive tests and log the failures. */
+static int __init av8l_fast_do_selftests(void)
+{
+	int failed = av8l_fast_positive_testing();
+
+	pr_err("selftest: completed with %d failures\n", failed);
+
+	/* Failures are only reported; the initcall itself always succeeds. */
+	return 0;
+}
+subsys_initcall(av8l_fast_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 93f2880be6c6..2b12499a9638 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -34,6 +34,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
 #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
 	[ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
 #endif
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
+	[ARM_V8L_FAST] = &io_pgtable_av8l_fast_init_fns,
+#endif
 };
 
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
diff --git a/include/linux/dma-mapping-fast.h b/include/linux/dma-mapping-fast.h
new file mode 100644
index 000000000000..aca7592bfb6e
--- /dev/null
+++ b/include/linux/dma-mapping-fast.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef __LINUX_DMA_MAPPING_FAST_H
+#define __LINUX_DMA_MAPPING_FAST_H
+
+#include <linux/iommu.h>
+#include <linux/io-pgtable-fast.h>
+
+struct dma_iommu_mapping;
+struct io_pgtable_ops;
+struct iova_domain;
+
+struct dma_fast_smmu_mapping {
+	struct device		*dev;
+	struct iommu_domain	*domain;
+	struct iova_domain	*iovad;
+
+	dma_addr_t	 base;
+	size_t		 size;
+	size_t		 num_4k_pages;	/* presumably size / SZ_4K - confirm */
+
+	unsigned int	bitmap_size;
+	unsigned long	*bitmap;	/* likely IOVA alloc bitmap - confirm */
+	unsigned long	next_start;
+	unsigned long	upcoming_stale_bit;
+	bool		have_stale_tlbs;
+
+	dma_addr_t	pgtbl_dma_handle;
+	struct io_pgtable_ops *pgtbl_ops;
+
+	spinlock_t	lock;
+	struct notifier_block notifier;	/* likely for av8l_register_notify() */
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
+int fast_smmu_init_mapping(struct device *dev,
+			    struct dma_iommu_mapping *mapping);
+void fast_smmu_put_dma_cookie(struct iommu_domain *domain);
+#else /* !CONFIG_IOMMU_IO_PGTABLE_FAST */
+static inline int fast_smmu_init_mapping(struct device *dev,
+					  struct dma_iommu_mapping *mapping)
+{
+	return -ENODEV;	/* stub: fast mapping support is compiled out */
+}
+
+static inline void fast_smmu_put_dma_cookie(struct iommu_domain *domain) {}
+#endif /* CONFIG_IOMMU_IO_PGTABLE_FAST */
+
+#endif /* __LINUX_DMA_MAPPING_FAST_H */
diff --git a/include/linux/io-pgtable-fast.h b/include/linux/io-pgtable-fast.h
new file mode 100644
index 000000000000..3eeef2655280
--- /dev/null
+++ b/include/linux/io-pgtable-fast.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2017, The Linux Foundation. All rights reserved.
+ */
+
+#ifndef __LINUX_IO_PGTABLE_FAST_H
+#define __LINUX_IO_PGTABLE_FAST_H
+
+#include <linux/notifier.h>
+
+/*
+ * This ought to be private to io-pgtable-fast, but dma-mapping-fast
+ * currently requires it for a debug usecase.
+ */
+typedef u64 av8l_fast_iopte;
+
+struct io_pgtable_ops;
+struct scatterlist;	/* named in the av8l_fast_map_sg_public() prototypes */
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
+/* Entry points declared only when the fast page-table format is enabled. */
+int av8l_fast_map_public(struct io_pgtable_ops *ops, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot);
+
+void av8l_fast_unmap_public(struct io_pgtable_ops *ops, unsigned long iova,
+				size_t size);
+
+int av8l_fast_map_sg_public(struct io_pgtable_ops *ops,
+			unsigned long iova, struct scatterlist *sgl,
+			unsigned int nents, int prot, size_t *size);
+
+bool av8l_fast_iova_coherent_public(struct io_pgtable_ops *ops,
+					unsigned long iova);
+
+phys_addr_t av8l_fast_iova_to_phys_public(struct io_pgtable_ops *ops,
+					  unsigned long iova);
+#else /* !CONFIG_IOMMU_IO_PGTABLE_FAST */
+static inline int
+av8l_fast_map_public(struct io_pgtable_ops *ops, unsigned long iova,
+		     phys_addr_t paddr, size_t size, int prot)
+{
+	return -EINVAL;	/* stub: nothing can be mapped */
+}
+static inline void av8l_fast_unmap_public(struct io_pgtable_ops *ops,
+					  unsigned long iova, size_t size)
+{
+}
+
+static inline int av8l_fast_map_sg_public(struct io_pgtable_ops *ops,
+				unsigned long iova, struct scatterlist *sgl,
+				unsigned int nents, int prot, size_t *size)
+{
+	return 0;	/* stub: maps nothing and leaves *size untouched */
+}
+
+static inline bool av8l_fast_iova_coherent_public(struct io_pgtable_ops *ops,
+						  unsigned long iova)
+{
+	return false;	/* stub: always reports non-coherent */
+}
+static inline phys_addr_t
+av8l_fast_iova_to_phys_public(struct io_pgtable_ops *ops,
+				  unsigned long iova)
+{
+	return 0;	/* stub: no translation is performed */
+}
+#endif /* CONFIG_IOMMU_IO_PGTABLE_FAST */
+
+
+/* events for notifiers passed to av8l_register_notify */
+#define MAPPED_OVER_STALE_TLB 1
+
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB
+/*
+ * Any value works here as long as bit 0 is unset.  The only reason a
+ * different value is needed at all is that certain hardware platforms
+ * have errata requiring that a PTE actually be zeroed out, not just have
+ * its valid bit unset.
+ */
+#define AV8L_FAST_PTE_UNMAPPED_NEED_TLBI 0xa
+
+void av8l_fast_clear_stale_ptes(struct io_pgtable_ops *ops, u64 base,
+				u64 start, u64 end, bool skip_sync);
+void av8l_register_notify(struct notifier_block *nb);
+
+#else  /* !CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB */
+
+/* With TLB proving compiled out the marker value is plain zero. */
+#define AV8L_FAST_PTE_UNMAPPED_NEED_TLBI 0
+
+/* No-op stubs so callers need no #ifdef when TLB proving is compiled out. */
+static inline void av8l_fast_clear_stale_ptes(struct io_pgtable_ops *ops,
+					      u64 base, u64 start, u64 end,
+					      bool skip_sync)
+{
+}
+
+static inline void av8l_register_notify(struct notifier_block *nb)
+{
+}
+
+#endif	/* CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB */
+
+#endif /* __LINUX_IO_PGTABLE_FAST_H */
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 2df79093cad9..a95f4784bf22 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -12,6 +12,7 @@ enum io_pgtable_fmt {
 	ARM_64_LPAE_S1,
 	ARM_64_LPAE_S2,
 	ARM_V7S,
+	ARM_V8L_FAST,
 	IO_PGTABLE_NUM_FMTS,
 };
 
@@ -103,6 +104,13 @@ struct io_pgtable_cfg {
 			u32	nmrr;
 			u32	prrr;
 		} arm_v7s_cfg;
+
+		struct {
+			u64	ttbr[2];
+			u64	tcr;
+			u64	mair[2];
+			void	*pmds;
+		} av8l_fast_cfg;
 	};
 };
 
@@ -204,5 +212,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_av8l_fast_init_fns;
 
 #endif /* __IO_PGTABLE_H */