[AVR32] Make I/O access macros work with external devices

Fix the I/O access macros so that they work with externally connected
devices accessed in little-endian mode over any bus width:

* Use a set of macros to define I/O port- and memory operations
  borrowed from MIPS.
* Allow subarchitecture to specify address- and data-mangling
* Implement at32ap-specific port mangling (with build-time
  configurable bus width. Only one bus width at a time supported
  for now.)
* Rewrite iowriteN and friends to use write[bwl] and friends
  (not the __raw counterparts.)

This has been tested using pata_pcmcia to access a CompactFlash card
connected to the EBI (16-bit bus width.)

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
diff --git a/arch/avr32/mach-at32ap/Kconfig b/arch/avr32/mach-at32ap/Kconfig
index d749771..eb30783 100644
--- a/arch/avr32/mach-at32ap/Kconfig
+++ b/arch/avr32/mach-at32ap/Kconfig
@@ -2,6 +2,30 @@
 
 menu "Atmel AVR32 AP options"
 
+choice
+	prompt "AT32AP7000 static memory bus width"
+	depends on CPU_AT32AP7000
+	default AP7000_16_BIT_SMC
+	help
+	  Define the width of the AP7000 external static memory interface.
+	  This is used to determine how to mangle the address and/or data
+	  when doing little-endian port access.
+
+	  The current code can only support a single external memory bus
+	  width for all chip selects, excluding the flash (which is using
+	  raw access and is thus not affected by any of this.)
+
+config AP7000_32_BIT_SMC
+	bool "32 bit"
+
+config AP7000_16_BIT_SMC
+	bool "16 bit"
+
+config AP7000_8_BIT_SMC
+	bool "8 bit"
+
+endchoice
+
 endmenu
 
-endif
+endif # PLATFORM_AT32AP
diff --git a/include/asm-avr32/arch-at32ap/io.h b/include/asm-avr32/arch-at32ap/io.h
new file mode 100644
index 0000000..ee59e40
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/io.h
@@ -0,0 +1,39 @@
+#ifndef __ASM_AVR32_ARCH_AT32AP_IO_H
+#define __ASM_AVR32_ARCH_AT32AP_IO_H
+
+/* For "bizarre" halfword swapping */
+#include <linux/byteorder/swabb.h>
+
+#if defined(CONFIG_AP7000_32_BIT_SMC)
+# define __swizzle_addr_b(addr)	(addr ^ 3UL)
+# define __swizzle_addr_w(addr)	(addr ^ 2UL)
+# define __swizzle_addr_l(addr)	(addr)
+# define ioswabb(a, x)		(x)
+# define ioswabw(a, x)		(x)
+# define ioswabl(a, x)		(x)
+# define __mem_ioswabb(a, x)	(x)
+# define __mem_ioswabw(a, x)	swab16(x)
+# define __mem_ioswabl(a, x)	swab32(x)
+#elif defined(CONFIG_AP7000_16_BIT_SMC)
+# define __swizzle_addr_b(addr)	(addr ^ 1UL)
+# define __swizzle_addr_w(addr)	(addr)
+# define __swizzle_addr_l(addr)	(addr)
+# define ioswabb(a, x)		(x)
+# define ioswabw(a, x)		(x)
+# define ioswabl(a, x)		swahw32(x)
+# define __mem_ioswabb(a, x)	(x)
+# define __mem_ioswabw(a, x)	swab16(x)
+# define __mem_ioswabl(a, x)	swahb32(x)
+#else
+# define __swizzle_addr_b(addr)	(addr)
+# define __swizzle_addr_w(addr)	(addr)
+# define __swizzle_addr_l(addr)	(addr)
+# define ioswabb(a, x)		(x)
+# define ioswabw(a, x)		swab16(x)
+# define ioswabl(a, x)		swab32(x)
+# define __mem_ioswabb(a, x)	(x)
+# define __mem_ioswabw(a, x)	(x)
+# define __mem_ioswabl(a, x)	(x)
+#endif
+
+#endif /* __ASM_AVR32_ARCH_AT32AP_IO_H */
diff --git a/include/asm-avr32/io.h b/include/asm-avr32/io.h
index c08e810..27b1523 100644
--- a/include/asm-avr32/io.h
+++ b/include/asm-avr32/io.h
@@ -1,13 +1,15 @@
 #ifndef __ASM_AVR32_IO_H
 #define __ASM_AVR32_IO_H
 
+#include <linux/kernel.h>
 #include <linux/string.h>
-
-#ifdef __KERNEL__
+#include <linux/types.h>
 
 #include <asm/addrspace.h>
 #include <asm/byteorder.h>
 
+#include <asm/arch/io.h>
+
 /* virt_to_phys will only work when address is in P1 or P2 */
 static __inline__ unsigned long virt_to_phys(volatile void *address)
 {
@@ -36,205 +38,236 @@
 extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen);
 extern void __raw_readsl(const void __iomem *addr, void *data, int longlen);
 
-static inline void writeb(unsigned char b, volatile void __iomem *addr)
+static inline void __raw_writeb(u8 v, volatile void __iomem *addr)
 {
-	*(volatile unsigned char __force *)addr = b;
+	*(volatile u8 __force *)addr = v;
 }
-static inline void writew(unsigned short b, volatile void __iomem *addr)
+static inline void __raw_writew(u16 v, volatile void __iomem *addr)
 {
-	*(volatile unsigned short __force *)addr = b;
+	*(volatile u16 __force *)addr = v;
 }
-static inline void writel(unsigned int b, volatile void __iomem *addr)
+static inline void __raw_writel(u32 v, volatile void __iomem *addr)
 {
-	*(volatile unsigned int __force *)addr = b;
+	*(volatile u32 __force *)addr = v;
 }
-#define __raw_writeb writeb
-#define __raw_writew writew
-#define __raw_writel writel
 
-static inline unsigned char readb(const volatile void __iomem *addr)
+static inline u8 __raw_readb(const volatile void __iomem *addr)
 {
-	return *(const volatile unsigned char __force *)addr;
+	return *(const volatile u8 __force *)addr;
 }
-static inline unsigned short readw(const volatile void __iomem *addr)
+static inline u16 __raw_readw(const volatile void __iomem *addr)
 {
-	return *(const volatile unsigned short __force *)addr;
+	return *(const volatile u16 __force *)addr;
 }
-static inline unsigned int readl(const volatile void __iomem *addr)
+static inline u32 __raw_readl(const volatile void __iomem *addr)
 {
-	return *(const volatile unsigned int __force *)addr;
+	return *(const volatile u32 __force *)addr;
 }
-#define __raw_readb readb
-#define __raw_readw readw
-#define __raw_readl readl
 
-#define writesb(p, d, l)	__raw_writesb((unsigned int)p, d, l)
-#define writesw(p, d, l)	__raw_writesw((unsigned int)p, d, l)
-#define writesl(p, d, l)	__raw_writesl((unsigned int)p, d, l)
+/* Convert I/O port address to virtual address */
+#ifndef __io
+# define __io(p)	((void *)phys_to_uncached(p))
+#endif
 
-#define readsb(p, d, l)		__raw_readsb((unsigned int)p, d, l)
-#define readsw(p, d, l)		__raw_readsw((unsigned int)p, d, l)
-#define readsl(p, d, l)		__raw_readsl((unsigned int)p, d, l)
+/*
+ * Not really sure about the best way to slow down I/O on
+ * AVR32. Defining it as a no-op until we have an actual test case.
+ */
+#define SLOW_DOWN_IO	do { } while (0)
 
+#define __BUILD_MEMORY_SINGLE(pfx, bwl, type)				\
+static inline void							\
+pfx##write##bwl(type val, volatile void __iomem *addr)			\
+{									\
+	volatile type *__addr;						\
+	type __val;							\
+									\
+	__addr = (void *)__swizzle_addr_##bwl((unsigned long)(addr));	\
+	__val = pfx##ioswab##bwl(__addr, val);				\
+									\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	*__addr = __val;						\
+}									\
+									\
+static inline type pfx##read##bwl(const volatile void __iomem *addr)	\
+{									\
+	volatile type *__addr;						\
+	type __val;							\
+									\
+	__addr = (void *)__swizzle_addr_##bwl((unsigned long)(addr));	\
+									\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	__val = *__addr;						\
+	return pfx##ioswab##bwl(__addr, __val);				\
+}
+
+#define __BUILD_IOPORT_SINGLE(pfx, bwl, type, p, slow)			\
+static inline void pfx##out##bwl##p(type val, unsigned long port)	\
+{									\
+	volatile type *__addr;						\
+	type __val;							\
+									\
+	__addr = __io(__swizzle_addr_##bwl(port));			\
+	__val = pfx##ioswab##bwl(__addr, val);				\
+									\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	*__addr = __val;						\
+	slow;								\
+}									\
+									\
+static inline type pfx##in##bwl##p(unsigned long port)			\
+{									\
+	volatile type *__addr;						\
+	type __val;							\
+									\
+	__addr = __io(__swizzle_addr_##bwl(port));			\
+									\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	__val = *__addr;						\
+	slow;								\
+									\
+	return pfx##ioswab##bwl(__addr, __val);				\
+}
+
+#define __BUILD_MEMORY_PFX(bus, bwl, type)				\
+	__BUILD_MEMORY_SINGLE(bus, bwl, type)
+
+#define BUILDIO_MEM(bwl, type)						\
+	__BUILD_MEMORY_PFX(, bwl, type)					\
+	__BUILD_MEMORY_PFX(__mem_, bwl, type)
+
+#define __BUILD_IOPORT_PFX(bus, bwl, type)				\
+	__BUILD_IOPORT_SINGLE(bus, bwl, type, ,)			\
+	__BUILD_IOPORT_SINGLE(bus, bwl, type, _p, SLOW_DOWN_IO)
+
+#define BUILDIO_IOPORT(bwl, type)					\
+	__BUILD_IOPORT_PFX(, bwl, type)					\
+	__BUILD_IOPORT_PFX(__mem_, bwl, type)
+
+BUILDIO_MEM(b, u8)
+BUILDIO_MEM(w, u16)
+BUILDIO_MEM(l, u32)
+
+BUILDIO_IOPORT(b, u8)
+BUILDIO_IOPORT(w, u16)
+BUILDIO_IOPORT(l, u32)
+
+#define readb_relaxed			readb
+#define readw_relaxed			readw
+#define readl_relaxed			readl
+
+#define __BUILD_MEMORY_STRING(bwl, type)				\
+static inline void writes##bwl(volatile void __iomem *addr,		\
+			       const void *data, unsigned int count)	\
+{									\
+	const type *__data = data;					\
+									\
+	while (count--)							\
+		__mem_write##bwl(*__data++, addr);			\
+}									\
+									\
+static inline void reads##bwl(const volatile void __iomem *addr,	\
+			      void *data, unsigned int count)		\
+{									\
+	type *__data = data;						\
+									\
+	while (count--)							\
+		*__data++ = __mem_read##bwl(addr);			\
+}
+
+#define __BUILD_IOPORT_STRING(bwl, type)				\
+static inline void outs##bwl(unsigned long port, const void *data,	\
+			     unsigned int count)			\
+{									\
+	const type *__data = data;					\
+									\
+	while (count--)							\
+		__mem_out##bwl(*__data++, port);			\
+}									\
+									\
+static inline void ins##bwl(unsigned long port, void *data,		\
+			   unsigned int count)				\
+{									\
+	type *__data = data;						\
+									\
+	while (count--)							\
+		*__data++ = __mem_in##bwl(port);			\
+}
+
+#define BUILDSTRING(bwl, type)						\
+	__BUILD_MEMORY_STRING(bwl, type)				\
+	__BUILD_IOPORT_STRING(bwl, type)
+
+BUILDSTRING(b, u8)
+BUILDSTRING(w, u16)
+BUILDSTRING(l, u32)
 
 /*
  * io{read,write}{8,16,32} macros in both le (for PCI style consumers) and native be
  */
 #ifndef ioread8
 
-#define ioread8(p)	({ unsigned int __v = __raw_readb(p); __v; })
+#define ioread8(p)		((unsigned int)readb(p))
 
-#define ioread16(p)	({ unsigned int __v = le16_to_cpu(__raw_readw(p)); __v; })
-#define ioread16be(p)	({ unsigned int __v = be16_to_cpu(__raw_readw(p)); __v; })
+#define ioread16(p)		((unsigned int)readw(p))
+#define ioread16be(p)		((unsigned int)__raw_readw(p))
 
-#define ioread32(p)	({ unsigned int __v = le32_to_cpu(__raw_readl(p)); __v; })
-#define ioread32be(p)	({ unsigned int __v = be32_to_cpu(__raw_readl(p)); __v; })
+#define ioread32(p)		((unsigned int)readl(p))
+#define ioread32be(p)		((unsigned int)__raw_readl(p))
 
-#define iowrite8(v,p)	__raw_writeb(v, p)
+#define iowrite8(v,p)		writeb(v, p)
 
-#define iowrite16(v,p)	__raw_writew(cpu_to_le16(v), p)
-#define iowrite16be(v,p)	__raw_writew(cpu_to_be16(v), p)
+#define iowrite16(v,p)		writew(v, p)
+#define iowrite16be(v,p)	__raw_writew(v, p)
 
-#define iowrite32(v,p)	__raw_writel(cpu_to_le32(v), p)
-#define iowrite32be(v,p)	__raw_writel(cpu_to_be32(v), p)
+#define iowrite32(v,p)		writel(v, p)
+#define iowrite32be(v,p)	__raw_writel(v, p)
 
-#define ioread8_rep(p,d,c)	__raw_readsb(p,d,c)
-#define ioread16_rep(p,d,c)	__raw_readsw(p,d,c)
-#define ioread32_rep(p,d,c)	__raw_readsl(p,d,c)
+#define ioread8_rep(p,d,c)	readsb(p,d,c)
+#define ioread16_rep(p,d,c)	readsw(p,d,c)
+#define ioread32_rep(p,d,c)	readsl(p,d,c)
 
-#define iowrite8_rep(p,s,c)	__raw_writesb(p,s,c)
-#define iowrite16_rep(p,s,c)	__raw_writesw(p,s,c)
-#define iowrite32_rep(p,s,c)	__raw_writesl(p,s,c)
+#define iowrite8_rep(p,s,c)	writesb(p,s,c)
+#define iowrite16_rep(p,s,c)	writesw(p,s,c)
+#define iowrite32_rep(p,s,c)	writesl(p,s,c)
 
 #endif
 
-
-/*
- * These two are only here because ALSA _thinks_ it needs them...
- */
 static inline void memcpy_fromio(void * to, const volatile void __iomem *from,
 				 unsigned long count)
 {
 	char *p = to;
-	while (count) {
-		count--;
-		*p = readb(from);
-		p++;
-		from++;
-	}
+	volatile const char __iomem *addr = from;
+
+	while (count--)
+		*p++ = readb(addr++);
 }
 
 static inline void  memcpy_toio(volatile void __iomem *to, const void * from,
 				unsigned long count)
 {
 	const char *p = from;
-	while (count) {
-		count--;
-		writeb(*p, to);
-		p++;
-		to++;
-	}
+	volatile char __iomem *addr = to;
+
+	while (count--)
+		writeb(*p++, addr++);
 }
 
 static inline void memset_io(volatile void __iomem *addr, unsigned char val,
 			     unsigned long count)
 {
-	memset((void __force *)addr, val, count);
-}
+	volatile char __iomem *p = addr;
 
-/*
- * Bad read/write accesses...
- */
-extern void __readwrite_bug(const char *fn);
+	while (count--)
+		writeb(val, p++);
+}
 
 #define IO_SPACE_LIMIT	0xffffffff
 
-/* Convert I/O port address to virtual address */
-#define __io(p)		((void __iomem *)phys_to_uncached(p))
-
-/*
- *  IO port access primitives
- *  -------------------------
- *
- * The AVR32 doesn't have special IO access instructions; all IO is memory
- * mapped. Note that these are defined to perform little endian accesses
- * only. Their primary purpose is to access PCI and ISA peripherals.
- *
- * Note that for a big endian machine, this implies that the following
- * big endian mode connectivity is in place.
- *
- * The machine specific io.h include defines __io to translate an "IO"
- * address to a memory address.
- *
- * Note that we prevent GCC re-ordering or caching values in expressions
- * by introducing sequence points into the in*() definitions.  Note that
- * __raw_* do not guarantee this behaviour.
- *
- * The {in,out}[bwl] macros are for emulating x86-style PCI/ISA IO space.
- */
-#define outb(v, p)		__raw_writeb(v, __io(p))
-#define outw(v, p)		__raw_writew(cpu_to_le16(v), __io(p))
-#define outl(v, p)		__raw_writel(cpu_to_le32(v), __io(p))
-
-#define inb(p)			__raw_readb(__io(p))
-#define inw(p)			le16_to_cpu(__raw_readw(__io(p)))
-#define inl(p)			le32_to_cpu(__raw_readl(__io(p)))
-
-static inline void __outsb(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		outb(*(u8 *)addr, port);
-		addr++;
-	}
-}
-
-static inline void __insb(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		*(u8 *)addr = inb(port);
-		addr++;
-	}
-}
-
-static inline void __outsw(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		outw(*(u16 *)addr, port);
-		addr += 2;
-	}
-}
-
-static inline void __insw(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		*(u16 *)addr = inw(port);
-		addr += 2;
-	}
-}
-
-static inline void __outsl(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		outl(*(u32 *)addr, port);
-		addr += 4;
-	}
-}
-
-static inline void __insl(unsigned long port, void *addr, unsigned int count)
-{
-	while (count--) {
-		*(u32 *)addr = inl(port);
-		addr += 4;
-	}
-}
-
-#define outsb(port, addr, count)	__outsb(port, addr, count)
-#define insb(port, addr, count)		__insb(port, addr, count)
-#define outsw(port, addr, count)	__outsw(port, addr, count)
-#define insw(port, addr, count)		__insw(port, addr, count)
-#define outsl(port, addr, count)	__outsl(port, addr, count)
-#define insl(port, addr, count)		__insl(port, addr, count)
-
 extern void __iomem *__ioremap(unsigned long offset, size_t size,
 			       unsigned long flags);
 extern void __iounmap(void __iomem *addr);
@@ -292,6 +325,4 @@
  */
 #define xlate_dev_kmem_ptr(p)   p
 
-#endif /* __KERNEL__ */
-
 #endif /* __ASM_AVR32_IO_H */