Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm

Pull ARM updates from Russell King:
 "Here's the updates for ARM for this merge window, which cover quite a
  variety of areas.

  There's a bunch of patch series from Will tackling various bugs like
  the PROT_NONE handling, ASID allocation, cluster boot protocol and
  ASID TLB tagging updates.

  We move to a build-time sorted exception table rather than sorting it
  at run-time, add support for the secure computing filter, and make
  some updates to the perf code.  We have also sorted out the placement
  of some headers, fixed some build warnings, and fixed some hotplug
  problems with the per-cpu TWD code."

* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (73 commits)
  ARM: 7594/1: Add .smp entry for REALVIEW_EB
  ARM: 7599/1: head: Remove boot-time HYP mode check for v5 and below
  ARM: 7598/1: net: bpf_jit_32: fix sp-relative load/stores offsets.
  ARM: 7595/1: syscall: rework ordering in syscall_trace_exit
  ARM: 7596/1: mmci: replace readsl/writesl with ioread32_rep/iowrite32_rep
  ARM: 7597/1: net: bpf_jit_32: fix kzalloc gfp/size mismatch.
  ARM: 7593/1: nommu: do not enable DCACHE_WORD_ACCESS when !CONFIG_MMU
  ARM: 7592/1: nommu: prevent generation of kernel unaligned memory accesses
  ARM: 7591/1: nommu: Enable the strict alignment (CR_A) bit only if ARCH < v6
  ARM: 7590/1: /proc/interrupts: limit the display of IPIs to online CPUs only
  ARM: 7587/1: implement optimized percpu variable access
  ARM: 7589/1: integrator: pass the lm resource to amba
  ARM: 7588/1: amba: create a resource parent registrator
  ARM: 7582/2: rename kvm_seq to vmalloc_seq so to avoid confusion with KVM
  ARM: 7585/1: kernel: fix nr_cpu_ids check in DT logical map init
  ARM: 7584/1: perf: fix link error when CONFIG_HW_PERF_EVENTS is not selected
  ARM: gic: use a private mapping for CPU target interfaces
  ARM: kernel: add logical mappings look-up
  ARM: kernel: add cpu logical map DT init in setup_arch
  ARM: kernel: add device tree init map function
  ...
diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt
new file mode 100644
index 0000000..f32494d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -0,0 +1,77 @@
+* ARM CPUs binding description
+
+The device tree allows the layout of CPUs in a system to be described through
+the "cpus" node, which in turn contains a number of subnodes (i.e. "cpu")
+defining properties for every CPU.
+
+Bindings for CPU nodes follow the ePAPR standard, available from:
+
+http://devicetree.org
+
+For the ARM architecture every CPU node must contain the following properties:
+
+- device_type:	must be "cpu"
+- reg:		property matching the CPU MPIDR[23:0] register bits
+		reg[31:24] bits must be set to 0
+- compatible:	should be one of:
+		"arm,arm1020"
+		"arm,arm1020e"
+		"arm,arm1022"
+		"arm,arm1026"
+		"arm,arm720"
+		"arm,arm740"
+		"arm,arm7tdmi"
+		"arm,arm920"
+		"arm,arm922"
+		"arm,arm925"
+		"arm,arm926"
+		"arm,arm940"
+		"arm,arm946"
+		"arm,arm9tdmi"
+		"arm,cortex-a5"
+		"arm,cortex-a7"
+		"arm,cortex-a8"
+		"arm,cortex-a9"
+		"arm,cortex-a15"
+		"arm,arm1136"
+		"arm,arm1156"
+		"arm,arm1176"
+		"arm,arm11mpcore"
+		"faraday,fa526"
+		"intel,sa110"
+		"intel,sa1100"
+		"marvell,feroceon"
+		"marvell,mohawk"
+		"marvell,xsc3"
+		"marvell,xscale"
+
+Example:
+
+	cpus {
+		#size-cells = <0>;
+		#address-cells = <1>;
+
+		CPU0: cpu@0 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a15";
+			reg = <0x0>;
+		};
+
+		CPU1: cpu@1 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a15";
+			reg = <0x1>;
+		};
+
+		CPU2: cpu@100 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a7";
+			reg = <0x100>;
+		};
+
+		CPU3: cpu@101 {
+			device_type = "cpu";
+			compatible = "arm,cortex-a7";
+			reg = <0x101>;
+		};
+	};
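
For reference, the reg values in this example encode MPIDR[23:0]: bits [7:0]
are the CPU number within a cluster and bits [15:8] the cluster number, so
cpu@0/cpu@1 are the two Cortex-A15 CPUs of cluster 0 and cpu@100/cpu@101 the
two Cortex-A7 CPUs of cluster 1.
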
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 8a027f9..825066e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -5,8 +5,9 @@
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select BUILDTIME_EXTABLE_SORT if MMU
 	select CPU_PM if (SUSPEND || CPU_IDLE)
-	select DCACHE_WORD_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && !CPU_BIG_ENDIAN
+	select DCACHE_WORD_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && !CPU_BIG_ENDIAN && MMU
 	select GENERIC_ATOMIC64 if (CPU_V6 || !CPU_32v6K || !AEABI)
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
 	select GENERIC_IRQ_PROBE
@@ -21,6 +22,7 @@
 	select HAVE_AOUT
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_BPF_JIT
 	select HAVE_C_RECORDMCOUNT
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index c35baf1..45096a1 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -32,6 +32,7 @@
 # defines filename extension depending memory management type.
 ifeq ($(CONFIG_MMU),)
 MMUEXT		:= -nommu
+KBUILD_CFLAGS	+= $(call cc-option,-mno-unaligned-access)
 endif
 
 ifeq ($(CONFIG_FRAME_POINTER),y)
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index aa52699..36ae03a 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -70,6 +70,14 @@
 static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 
 /*
+ * The GIC mapping of CPU interfaces does not necessarily match
+ * the logical CPU numbering.  Let's use a mapping as returned
+ * by the GIC itself.
+ */
+#define NR_GIC_CPU_IF 8
+static u8 gic_cpu_map[NR_GIC_CPU_IF] __read_mostly;
+
+/*
  * Supported arch specific GIC irq extension.
  * Default make them NULL.
  */
@@ -238,11 +246,11 @@
 	unsigned int cpu = cpumask_any_and(mask_val, cpu_online_mask);
 	u32 val, mask, bit;
 
-	if (cpu >= 8 || cpu >= nr_cpu_ids)
+	if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids)
 		return -EINVAL;
 
 	mask = 0xff << shift;
-	bit = 1 << (cpu_logical_map(cpu) + shift);
+	bit = gic_cpu_map[cpu] << shift;
 
 	raw_spin_lock(&irq_controller_lock);
 	val = readl_relaxed(reg) & ~mask;
@@ -349,11 +357,6 @@
 	u32 cpumask;
 	unsigned int gic_irqs = gic->gic_irqs;
 	void __iomem *base = gic_data_dist_base(gic);
-	u32 cpu = cpu_logical_map(smp_processor_id());
-
-	cpumask = 1 << cpu;
-	cpumask |= cpumask << 8;
-	cpumask |= cpumask << 16;
 
 	writel_relaxed(0, base + GIC_DIST_CTRL);
 
@@ -366,6 +369,7 @@
 	/*
 	 * Set all global interrupts to this CPU only.
 	 */
+	cpumask = readl_relaxed(base + GIC_DIST_TARGET + 0);
 	for (i = 32; i < gic_irqs; i += 4)
 		writel_relaxed(cpumask, base + GIC_DIST_TARGET + i * 4 / 4);
 
@@ -389,9 +393,25 @@
 {
 	void __iomem *dist_base = gic_data_dist_base(gic);
 	void __iomem *base = gic_data_cpu_base(gic);
+	unsigned int cpu_mask, cpu = smp_processor_id();
 	int i;
 
 	/*
+	 * Get what the GIC says our CPU mask is.
+	 */
+	BUG_ON(cpu >= NR_GIC_CPU_IF);
+	cpu_mask = readl_relaxed(dist_base + GIC_DIST_TARGET + 0);
+	gic_cpu_map[cpu] = cpu_mask;
+
+	/*
+	 * Clear our mask from the other map entries in case they're
+	 * still undefined.
+	 */
+	for (i = 0; i < NR_GIC_CPU_IF; i++)
+		if (i != cpu)
+			gic_cpu_map[i] &= ~cpu_mask;
+
+	/*
 	 * Deal with the banked PPI and SGI interrupts - disable all
 	 * PPI interrupts, ensure all SGI interrupts are enabled.
 	 */
@@ -646,7 +666,7 @@
 {
 	irq_hw_number_t hwirq_base;
 	struct gic_chip_data *gic;
-	int gic_irqs, irq_base;
+	int gic_irqs, irq_base, i;
 
 	BUG_ON(gic_nr >= MAX_GIC_NR);
 
@@ -683,6 +703,13 @@
 	}
 
 	/*
+	 * Initialize the CPU interface map to all CPUs.
+	 * It will be refined as each CPU probes its ID.
+	 */
+	for (i = 0; i < NR_GIC_CPU_IF; i++)
+		gic_cpu_map[i] = 0xff;
+
+	/*
 	 * For primary GICs, skip over SGIs.
 	 * For secondary GICs, skip over PPIs, too.
 	 */
@@ -737,7 +764,7 @@
 
 	/* Convert our logical CPU mask into a physical one. */
 	for_each_cpu(cpu, mask)
-		map |= 1 << cpu_logical_map(cpu);
+		map |= gic_cpu_map[cpu];
 
 	/*
 	 * Ensure that stores to Normal memory are visible to the
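
Illustration (register values assumed, not taken from the patch): if the GIC
reports target byte 0x01 for the CPU that booted as logical CPU 0 and 0x02 for
logical CPU 1, then gic_cpu_map[] starts as { 0x01, 0x02, ... } and an IPI to
the logical mask {0,1} is raised roughly as gic_raise_softirq() now does:

	unsigned long map = 0;

	for_each_cpu(cpu, mask)
		map |= gic_cpu_map[cpu];	/* 0x01 | 0x02 == 0x03 */

	/* bits [23:16] of GIC_DIST_SOFTINT select the target CPU interfaces */
	writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);

so the SGI is routed using the numbering the GIC itself reports, rather than
assuming it matches cpu_logical_map().
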
diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c
index e0d5388..e4df17c 100644
--- a/arch/arm/common/vic.c
+++ b/arch/arm/common/vic.c
@@ -218,7 +218,7 @@
 	v->resume_sources = resume_sources;
 	v->irq = irq;
 	vic_id++;
-	v->domain = irq_domain_add_legacy(node, fls(valid_sources), irq, 0,
+	v->domain = irq_domain_add_simple(node, fls(valid_sources), irq,
 					  &vic_irqdomain_ops, v);
 }
 
@@ -350,7 +350,7 @@
 	vic_register(base, irq_start, vic_sources, 0, node);
 }
 
-void __init __vic_init(void __iomem *base, unsigned int irq_start,
+void __init __vic_init(void __iomem *base, int irq_start,
 			      u32 vic_sources, u32 resume_sources,
 			      struct device_node *node)
 {
@@ -407,7 +407,6 @@
 int __init vic_of_init(struct device_node *node, struct device_node *parent)
 {
 	void __iomem *regs;
-	int irq_base;
 
 	if (WARN(parent, "non-root VICs are not supported"))
 		return -EINVAL;
@@ -416,18 +415,12 @@
 	if (WARN_ON(!regs))
 		return -EIO;
 
-	irq_base = irq_alloc_descs(-1, 0, 32, numa_node_id());
-	if (WARN_ON(irq_base < 0))
-		goto out_unmap;
-
-	__vic_init(regs, irq_base, ~0, ~0, node);
+	/*
+	 * Passing -1 as first IRQ makes the simple domain allocate descriptors
+	 */
+	__vic_init(regs, -1, ~0, ~0, node);
 
 	return 0;
-
- out_unmap:
-	iounmap(regs);
-
-	return -EIO;
 }
 #endif /* CONFIG OF */
 
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 514e398..d3db398 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -16,7 +16,6 @@
 generic-y += msgbuf.h
 generic-y += param.h
 generic-y += parport.h
-generic-y += percpu.h
 generic-y += poll.h
 generic-y += resource.h
 generic-y += sections.h
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 2ef9581..eb87200 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -250,6 +250,7 @@
  * Beware, it also clobers LR.
  */
 .macro safe_svcmode_maskall reg:req
+#if __LINUX_ARM_ARCH__ >= 6
 	mrs	\reg , cpsr
 	mov	lr , \reg
 	and	lr , lr , #MODE_MASK
@@ -266,6 +267,13 @@
 	__ERET
 1:	msr	cpsr_c, \reg
 2:
+#else
+/*
+ * workaround for possibly broken pre-v6 hardware
+ * (akita, Sharp Zaurus C-1000, PXA270-based)
+ */
+	setmode	PSR_F_BIT | PSR_I_BIT | SVC_MODE, \reg
+#endif
 .endm
 
 /*
diff --git a/arch/arm/include/asm/cpu.h b/arch/arm/include/asm/cpu.h
index d797223..2744f06 100644
--- a/arch/arm/include/asm/cpu.h
+++ b/arch/arm/include/asm/cpu.h
@@ -15,6 +15,7 @@
 
 struct cpuinfo_arm {
 	struct cpu	cpu;
+	u32		cpuid;
 #ifdef CONFIG_SMP
 	unsigned int	loops_per_jiffy;
 #endif
diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h
index cb47d28..a59dcb5 100644
--- a/arch/arm/include/asm/cputype.h
+++ b/arch/arm/include/asm/cputype.h
@@ -25,6 +25,19 @@
 #define CPUID_EXT_ISAR4	"c2, 4"
 #define CPUID_EXT_ISAR5	"c2, 5"
 
+#define MPIDR_SMP_BITMASK (0x3 << 30)
+#define MPIDR_SMP_VALUE (0x2 << 30)
+
+#define MPIDR_MT_BITMASK (0x1 << 24)
+
+#define MPIDR_HWID_BITMASK 0xFFFFFF
+
+#define MPIDR_LEVEL_BITS 8
+#define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)
+
+#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
+	((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)
+
 extern unsigned int processor_id;
 
 #ifdef CONFIG_CPU_CP15
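
A quick worked example of the new helper (value chosen for illustration): for a
CPU whose MPIDR[23:0] reads 0x101,

	u32 hwid = 0x101;	/* second CPU of the second cluster */
	u32 aff0 = MPIDR_AFFINITY_LEVEL(hwid, 0);	/* (0x101 >> 0) & 0xff == 0x01 */
	u32 aff1 = MPIDR_AFFINITY_LEVEL(hwid, 1);	/* (0x101 >> 8) & 0xff == 0x01 */

i.e. affinity level 0 is the CPU within the cluster and level 1 the cluster.
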
diff --git a/arch/arm/include/asm/cti.h b/arch/arm/include/asm/cti.h
index a0ada3e..f2e5cad 100644
--- a/arch/arm/include/asm/cti.h
+++ b/arch/arm/include/asm/cti.h
@@ -146,15 +146,7 @@
  */
 static inline void cti_unlock(struct cti *cti)
 {
-	void __iomem *base = cti->base;
-	unsigned long val;
-
-	val = __raw_readl(base + LOCKSTATUS);
-
-	if (val & 1) {
-		val = LOCKCODE;
-		__raw_writel(val, base + LOCKACCESS);
-	}
+	__raw_writel(LOCKCODE, cti->base + LOCKACCESS);
 }
 
 /**
@@ -166,14 +158,6 @@
  */
 static inline void cti_lock(struct cti *cti)
 {
-	void __iomem *base = cti->base;
-	unsigned long val;
-
-	val = __raw_readl(base + LOCKSTATUS);
-
-	if (!(val & 1)) {
-		val = ~LOCKCODE;
-		__raw_writel(val, base + LOCKACCESS);
-	}
+	__raw_writel(~LOCKCODE, cti->base + LOCKACCESS);
 }
 #endif
diff --git a/arch/arm/include/asm/hardware/cache-l2x0.h b/arch/arm/include/asm/hardware/cache-l2x0.h
index c4c87bc..3b2c40b 100644
--- a/arch/arm/include/asm/hardware/cache-l2x0.h
+++ b/arch/arm/include/asm/hardware/cache-l2x0.h
@@ -102,6 +102,10 @@
 
 #define L2X0_ADDR_FILTER_EN		1
 
+#define L2X0_CTRL_EN			1
+
+#define L2X0_WAY_SIZE_SHIFT		3
+
 #ifndef __ASSEMBLY__
 extern void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask);
 #if defined(CONFIG_CACHE_L2X0) && defined(CONFIG_OF)
@@ -126,6 +130,7 @@
 	unsigned long filter_end;
 	unsigned long prefetch_ctrl;
 	unsigned long pwr_ctrl;
+	unsigned long ctrl;
 };
 
 extern struct l2x0_regs l2x0_saved_regs;
diff --git a/arch/arm/include/asm/hardware/vic.h b/arch/arm/include/asm/hardware/vic.h
index e14af1a..2bebad3 100644
--- a/arch/arm/include/asm/hardware/vic.h
+++ b/arch/arm/include/asm/hardware/vic.h
@@ -47,7 +47,7 @@
 struct device_node;
 struct pt_regs;
 
-void __vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources,
+void __vic_init(void __iomem *base, int irq_start, u32 vic_sources,
 		u32 resume_sources, struct device_node *node);
 void vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources);
 int vic_of_init(struct device_node *node, struct device_node *parent);
diff --git a/arch/arm/include/asm/hw_breakpoint.h b/arch/arm/include/asm/hw_breakpoint.h
index c190bc9..01169dd 100644
--- a/arch/arm/include/asm/hw_breakpoint.h
+++ b/arch/arm/include/asm/hw_breakpoint.h
@@ -98,12 +98,12 @@
 #define ARM_BASE_WCR		112
 
 /* Accessor macros for the debug registers. */
-#define ARM_DBG_READ(M, OP2, VAL) do {\
-	asm volatile("mrc p14, 0, %0, c0," #M ", " #OP2 : "=r" (VAL));\
+#define ARM_DBG_READ(N, M, OP2, VAL) do {\
+	asm volatile("mrc p14, 0, %0, " #N "," #M ", " #OP2 : "=r" (VAL));\
 } while (0)
 
-#define ARM_DBG_WRITE(M, OP2, VAL) do {\
-	asm volatile("mcr p14, 0, %0, c0," #M ", " #OP2 : : "r" (VAL));\
+#define ARM_DBG_WRITE(N, M, OP2, VAL) do {\
+	asm volatile("mcr p14, 0, %0, " #N "," #M ", " #OP2 : : "r" (VAL));\
 } while (0)
 
 struct notifier_block;
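
With the extra CRn argument the accessors can reach debug registers outside c0;
for example ARM_DBG_READ(c0, c1, 0, dscr) still expands to

	asm volatile("mrc p14, 0, %0, c0,c1, 0" : "=r" (dscr));

(reading the DSCR), while callers later in this series can pass c1 as CRn to
reach registers such as DBGPRSR and DBGOSLAR.
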
diff --git a/arch/arm/include/asm/mach/serial_at91.h b/arch/arm/include/asm/mach/serial_at91.h
deleted file mode 100644
index ea6d063..0000000
--- a/arch/arm/include/asm/mach/serial_at91.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  arch/arm/include/asm/mach/serial_at91.h
- *
- *  Based on serial_sa1100.h  by Nicolas Pitre
- *
- *  Copyright (C) 2002 ATMEL Rousset
- *
- *  Low level machine dependent UART functions.
- */
-
-struct uart_port;
-
-/*
- * This is a temporary structure for registering these
- * functions; it is intended to be discarded after boot.
- */
-struct atmel_port_fns {
-	void	(*set_mctrl)(struct uart_port *, u_int);
-	u_int	(*get_mctrl)(struct uart_port *);
-	void	(*enable_ms)(struct uart_port *);
-	void	(*pm)(struct uart_port *, u_int, u_int);
-	int	(*set_wake)(struct uart_port *, u_int);
-	int	(*open)(struct uart_port *);
-	void	(*close)(struct uart_port *);
-};
-
-#if defined(CONFIG_SERIAL_ATMEL)
-void atmel_register_uart_fns(struct atmel_port_fns *fns);
-#else
-#define atmel_register_uart_fns(fns) do { } while (0)
-#endif
-
-
diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 1496565..9f77e78 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -5,18 +5,15 @@
 
 typedef struct {
 #ifdef CONFIG_CPU_HAS_ASID
-	unsigned int id;
-	raw_spinlock_t id_lock;
+	u64 id;
 #endif
-	unsigned int kvm_seq;
+	unsigned int vmalloc_seq;
 } mm_context_t;
 
 #ifdef CONFIG_CPU_HAS_ASID
-#define ASID(mm)	((mm)->context.id & 255)
-
-/* init_mm.context.id_lock should be initialized. */
-#define INIT_MM_CONTEXT(name)                                                 \
-	.context.id_lock    = __RAW_SPIN_LOCK_UNLOCKED(name.context.id_lock),
+#define ASID_BITS	8
+#define ASID_MASK	((~0ULL) << ASID_BITS)
+#define ASID(mm)	((mm)->context.id & ~ASID_MASK)
 #else
 #define ASID(mm)	(0)
 #endif
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index 0306bc6..e1f644b 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -20,88 +20,12 @@
 #include <asm/proc-fns.h>
 #include <asm-generic/mm_hooks.h>
 
-void __check_kvm_seq(struct mm_struct *mm);
+void __check_vmalloc_seq(struct mm_struct *mm);
 
 #ifdef CONFIG_CPU_HAS_ASID
 
-/*
- * On ARMv6, we have the following structure in the Context ID:
- *
- * 31                         7          0
- * +-------------------------+-----------+
- * |      process ID         |   ASID    |
- * +-------------------------+-----------+
- * |              context ID             |
- * +-------------------------------------+
- *
- * The ASID is used to tag entries in the CPU caches and TLBs.
- * The context ID is used by debuggers and trace logic, and
- * should be unique within all running processes.
- */
-#define ASID_BITS		8
-#define ASID_MASK		((~0) << ASID_BITS)
-#define ASID_FIRST_VERSION	(1 << ASID_BITS)
-
-extern unsigned int cpu_last_asid;
-
-void __init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void __new_context(struct mm_struct *mm);
-void cpu_set_reserved_ttbr0(void);
-
-static inline void switch_new_context(struct mm_struct *mm)
-{
-	unsigned long flags;
-
-	__new_context(mm);
-
-	local_irq_save(flags);
-	cpu_switch_mm(mm->pgd, mm);
-	local_irq_restore(flags);
-}
-
-static inline void check_and_switch_context(struct mm_struct *mm,
-					    struct task_struct *tsk)
-{
-	if (unlikely(mm->context.kvm_seq != init_mm.context.kvm_seq))
-		__check_kvm_seq(mm);
-
-	/*
-	 * Required during context switch to avoid speculative page table
-	 * walking with the wrong TTBR.
-	 */
-	cpu_set_reserved_ttbr0();
-
-	if (!((mm->context.id ^ cpu_last_asid) >> ASID_BITS))
-		/*
-		 * The ASID is from the current generation, just switch to the
-		 * new pgd. This condition is only true for calls from
-		 * context_switch() and interrupts are already disabled.
-		 */
-		cpu_switch_mm(mm->pgd, mm);
-	else if (irqs_disabled())
-		/*
-		 * Defer the new ASID allocation until after the context
-		 * switch critical region since __new_context() cannot be
-		 * called with interrupts disabled (it sends IPIs).
-		 */
-		set_ti_thread_flag(task_thread_info(tsk), TIF_SWITCH_MM);
-	else
-		/*
-		 * That is a direct call to switch_mm() or activate_mm() with
-		 * interrupts enabled and a new context.
-		 */
-		switch_new_context(mm);
-}
-
-#define init_new_context(tsk,mm)	(__init_new_context(tsk,mm),0)
-
-#define finish_arch_post_lock_switch \
-	finish_arch_post_lock_switch
-static inline void finish_arch_post_lock_switch(void)
-{
-	if (test_and_clear_thread_flag(TIF_SWITCH_MM))
-		switch_new_context(current->mm);
-}
+void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk);
+#define init_new_context(tsk,mm)	({ mm->context.id = 0; })
 
 #else	/* !CONFIG_CPU_HAS_ASID */
 
@@ -110,8 +34,8 @@
 static inline void check_and_switch_context(struct mm_struct *mm,
 					    struct task_struct *tsk)
 {
-	if (unlikely(mm->context.kvm_seq != init_mm.context.kvm_seq))
-		__check_kvm_seq(mm);
+	if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
+		__check_vmalloc_seq(mm);
 
 	if (irqs_disabled())
 		/*
@@ -143,6 +67,7 @@
 #endif	/* CONFIG_CPU_HAS_ASID */
 
 #define destroy_context(mm)		do { } while(0)
+#define activate_mm(prev,next)		switch_mm(prev, next, NULL)
 
 /*
  * This is called when "tsk" is about to enter lazy TLB mode.
@@ -186,6 +111,5 @@
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
-#define activate_mm(prev,next)	switch_mm(prev, next, NULL)
 
 #endif
diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h
new file mode 100644
index 0000000..968c0a1
--- /dev/null
+++ b/arch/arm/include/asm/percpu.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2012 Calxeda, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ASM_ARM_PERCPU_H_
+#define _ASM_ARM_PERCPU_H_
+
+/*
+ * Same as asm-generic/percpu.h, except that we store the per cpu offset
+ * in the TPIDRPRW. TPIDRPRW only exists on V6K and V7
+ */
+#if defined(CONFIG_SMP) && !defined(CONFIG_CPU_V6)
+static inline void set_my_cpu_offset(unsigned long off)
+{
+	/* Set TPIDRPRW */
+	asm volatile("mcr p15, 0, %0, c13, c0, 4" : : "r" (off) : "memory");
+}
+
+static inline unsigned long __my_cpu_offset(void)
+{
+	unsigned long off;
+	/* Read TPIDRPRW */
+	asm("mrc p15, 0, %0, c13, c0, 4" : "=r" (off) : : "memory");
+	return off;
+}
+#define __my_cpu_offset __my_cpu_offset()
+#else
+#define set_my_cpu_offset(x)	do {} while(0)
+
+#endif /* CONFIG_SMP */
+
+#include <asm-generic/percpu.h>
+
+#endif /* _ASM_ARM_PERCPU_H_ */
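
Sketch of the intended use (not part of this file; the actual call site is
elsewhere in this series): each CPU is expected to program its own offset early
during boot, roughly

	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));

after which ordinary per-cpu accesses can resolve through a single mrc of
TPIDRPRW instead of indexing __per_cpu_offset[] by smp_processor_id():

	DEFINE_PER_CPU(unsigned int, irq_count);

	this_cpu_inc(irq_count);	/* offset normally comes from TPIDRPRW via __my_cpu_offset */
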
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 625cd62..7558775 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -21,4 +21,11 @@
 #define C(_x)				PERF_COUNT_HW_CACHE_##_x
 #define CACHE_OP_UNSUPPORTED		0xFFFF
 
+#ifdef CONFIG_HW_PERF_EVENTS
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+#endif
+
 #endif /* __ARM_PERF_EVENT_H__ */
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index 2317a71..f97ee02 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -115,6 +115,7 @@
  * The PTE table pointer refers to the hardware entries; the "Linux"
  * entries are stored 1024 bytes below.
  */
+#define L_PTE_VALID		(_AT(pteval_t, 1) << 0)		/* Valid */
 #define L_PTE_PRESENT		(_AT(pteval_t, 1) << 0)
 #define L_PTE_YOUNG		(_AT(pteval_t, 1) << 1)
 #define L_PTE_FILE		(_AT(pteval_t, 1) << 2)	/* only when !PRESENT */
@@ -123,6 +124,7 @@
 #define L_PTE_USER		(_AT(pteval_t, 1) << 8)
 #define L_PTE_XN		(_AT(pteval_t, 1) << 9)
 #define L_PTE_SHARED		(_AT(pteval_t, 1) << 10)	/* shared(v6), coherent(xsc3) */
+#define L_PTE_NONE		(_AT(pteval_t, 1) << 11)
 
 /*
  * These are the memory types, defined to be compatible with
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index b249035..a3f3792 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -67,7 +67,8 @@
  * These bits overlap with the hardware bits but the naming is preserved for
  * consistency with the classic page table format.
  */
-#define L_PTE_PRESENT		(_AT(pteval_t, 3) << 0)		/* Valid */
+#define L_PTE_VALID		(_AT(pteval_t, 1) << 0)		/* Valid */
+#define L_PTE_PRESENT		(_AT(pteval_t, 3) << 0)		/* Present */
 #define L_PTE_FILE		(_AT(pteval_t, 1) << 2)		/* only when !PRESENT */
 #define L_PTE_USER		(_AT(pteval_t, 1) << 6)		/* AP[1] */
 #define L_PTE_RDONLY		(_AT(pteval_t, 1) << 7)		/* AP[2] */
@@ -76,6 +77,7 @@
 #define L_PTE_XN		(_AT(pteval_t, 1) << 54)	/* XN */
 #define L_PTE_DIRTY		(_AT(pteval_t, 1) << 55)	/* unused */
 #define L_PTE_SPECIAL		(_AT(pteval_t, 1) << 56)	/* unused */
+#define L_PTE_NONE		(_AT(pteval_t, 1) << 57)	/* PROT_NONE */
 
 /*
  * To be used in assembly code with the upper page attributes.
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 08c1231..9c82f988 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -73,7 +73,7 @@
 
 #define _MOD_PROT(p, b)	__pgprot(pgprot_val(p) | (b))
 
-#define PAGE_NONE		_MOD_PROT(pgprot_user, L_PTE_XN | L_PTE_RDONLY)
+#define PAGE_NONE		_MOD_PROT(pgprot_user, L_PTE_XN | L_PTE_RDONLY | L_PTE_NONE)
 #define PAGE_SHARED		_MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_XN)
 #define PAGE_SHARED_EXEC	_MOD_PROT(pgprot_user, L_PTE_USER)
 #define PAGE_COPY		_MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY | L_PTE_XN)
@@ -83,7 +83,7 @@
 #define PAGE_KERNEL		_MOD_PROT(pgprot_kernel, L_PTE_XN)
 #define PAGE_KERNEL_EXEC	pgprot_kernel
 
-#define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN)
+#define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN | L_PTE_NONE)
 #define __PAGE_SHARED		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
 #define __PAGE_SHARED_EXEC	__pgprot(_L_PTE_DEFAULT | L_PTE_USER)
 #define __PAGE_COPY		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_RDONLY | L_PTE_XN)
@@ -203,9 +203,7 @@
 #define pte_exec(pte)		(!(pte_val(pte) & L_PTE_XN))
 #define pte_special(pte)	(0)
 
-#define pte_present_user(pte) \
-	((pte_val(pte) & (L_PTE_PRESENT | L_PTE_USER)) == \
-	 (L_PTE_PRESENT | L_PTE_USER))
+#define pte_present_user(pte)  (pte_present(pte) && (pte_val(pte) & L_PTE_USER))
 
 #if __LINUX_ARM_ARCH__ < 6
 static inline void __sync_icache_dcache(pte_t pteval)
@@ -242,7 +240,7 @@
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
-	const pteval_t mask = L_PTE_XN | L_PTE_RDONLY | L_PTE_USER;
+	const pteval_t mask = L_PTE_XN | L_PTE_RDONLY | L_PTE_USER | L_PTE_NONE;
 	pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
 	return pte;
 }
diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index a26170d..f24edad 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -67,19 +67,19 @@
 	cpumask_t	active_irqs;
 	char		*name;
 	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
-	void		(*enable)(struct hw_perf_event *evt, int idx);
-	void		(*disable)(struct hw_perf_event *evt, int idx);
+	void		(*enable)(struct perf_event *event);
+	void		(*disable)(struct perf_event *event);
 	int		(*get_event_idx)(struct pmu_hw_events *hw_events,
-					 struct hw_perf_event *hwc);
+					 struct perf_event *event);
 	int		(*set_event_filter)(struct hw_perf_event *evt,
 					    struct perf_event_attr *attr);
-	u32		(*read_counter)(int idx);
-	void		(*write_counter)(int idx, u32 val);
-	void		(*start)(void);
-	void		(*stop)(void);
+	u32		(*read_counter)(struct perf_event *event);
+	void		(*write_counter)(struct perf_event *event, u32 val);
+	void		(*start)(struct arm_pmu *);
+	void		(*stop)(struct arm_pmu *);
 	void		(*reset)(void *);
-	int		(*request_irq)(irq_handler_t handler);
-	void		(*free_irq)(void);
+	int		(*request_irq)(struct arm_pmu *, irq_handler_t handler);
+	void		(*free_irq)(struct arm_pmu *);
 	int		(*map_event)(struct perf_event *event);
 	int		num_events;
 	atomic_t	active_events;
@@ -93,15 +93,11 @@
 
 extern const struct dev_pm_ops armpmu_dev_pm_ops;
 
-int armpmu_register(struct arm_pmu *armpmu, char *name, int type);
+int armpmu_register(struct arm_pmu *armpmu, int type);
 
-u64 armpmu_event_update(struct perf_event *event,
-			struct hw_perf_event *hwc,
-			int idx);
+u64 armpmu_event_update(struct perf_event *event);
 
-int armpmu_event_set_period(struct perf_event *event,
-			    struct hw_perf_event *hwc,
-			    int idx);
+int armpmu_event_set_period(struct perf_event *event);
 
 int armpmu_map_event(struct perf_event *event,
 		     const unsigned (*event_map)[PERF_COUNT_HW_MAX],
diff --git a/arch/arm/include/asm/prom.h b/arch/arm/include/asm/prom.h
index 6d65ba2..a219227 100644
--- a/arch/arm/include/asm/prom.h
+++ b/arch/arm/include/asm/prom.h
@@ -17,6 +17,7 @@
 
 extern struct machine_desc *setup_machine_fdt(unsigned int dt_phys);
 extern void arm_dt_memblock_reserve(void);
+extern void __init arm_dt_init_cpu_maps(void);
 
 #else /* CONFIG_OF */
 
@@ -26,6 +27,7 @@
 }
 
 static inline void arm_dt_memblock_reserve(void) { }
+static inline void arm_dt_init_cpu_maps(void) { }
 
 #endif /* CONFIG_OF */
 #endif /* ASMARM_PROM_H */
diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h
index 2e3be16..d3a22be 100644
--- a/arch/arm/include/asm/smp.h
+++ b/arch/arm/include/asm/smp.h
@@ -79,6 +79,7 @@
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
 
 struct smp_operations {
 #ifdef CONFIG_SMP
diff --git a/arch/arm/include/asm/smp_plat.h b/arch/arm/include/asm/smp_plat.h
index 558d6c8..aaa61b6 100644
--- a/arch/arm/include/asm/smp_plat.h
+++ b/arch/arm/include/asm/smp_plat.h
@@ -5,6 +5,9 @@
 #ifndef __ASMARM_SMP_PLAT_H
 #define __ASMARM_SMP_PLAT_H
 
+#include <linux/cpumask.h>
+#include <linux/err.h>
+
 #include <asm/cputype.h>
 
 /*
@@ -48,5 +51,19 @@
  */
 extern int __cpu_logical_map[];
 #define cpu_logical_map(cpu)	__cpu_logical_map[cpu]
+/*
+ * Retrieve logical cpu index corresponding to a given MPIDR[23:0]
+ *  - mpidr: MPIDR[23:0] to be used for the look-up
+ *
+ * Returns the cpu logical index or -EINVAL on look-up error
+ */
+static inline int get_logical_index(u32 mpidr)
+{
+	int cpu;
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		if (cpu_logical_map(cpu) == mpidr)
+			return cpu;
+	return -EINVAL;
+}
 
 #endif
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index 9fdded6..f1d96d4 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -7,6 +7,8 @@
 #ifndef _ASM_ARM_SYSCALL_H
 #define _ASM_ARM_SYSCALL_H
 
+#include <linux/audit.h> /* for AUDIT_ARCH_* */
+#include <linux/elf.h> /* for ELF_EM */
 #include <linux/err.h>
 #include <linux/sched.h>
 
@@ -95,4 +97,11 @@
 	memcpy(&regs->ARM_r0 + i, args, n * sizeof(args[0]));
 }
 
+static inline int syscall_get_arch(struct task_struct *task,
+				   struct pt_regs *regs)
+{
+	/* ARM tasks don't change audit architectures on the fly. */
+	return AUDIT_ARCH_ARM;
+}
+
 #endif /* _ASM_ARM_SYSCALL_H */
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 8477b4c..cddda1f 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -151,10 +151,10 @@
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
 #define TIF_SYSCALL_TRACEPOINT	10
+#define TIF_SECCOMP		11	/* seccomp syscall filtering active */
 #define TIF_USING_IWMMXT	17
 #define TIF_MEMDIE		18	/* is terminating due to OOM killer */
 #define TIF_RESTORE_SIGMASK	20
-#define TIF_SECCOMP		21
 #define TIF_SWITCH_MM		22	/* deferred switch_mm */
 
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
@@ -163,11 +163,12 @@
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
-#define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
 
 /* Checks for any syscall work in entry-common.S */
-#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+			   _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)
 
 /*
  * Change these and you break ASM code in entry-common.S
diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index bee7f9d..70f1bde 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -19,8 +19,10 @@
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
 
+#include <asm/cputype.h>
 #include <asm/setup.h>
 #include <asm/page.h>
+#include <asm/smp_plat.h>
 #include <asm/mach/arch.h>
 #include <asm/mach-types.h>
 
@@ -61,6 +63,108 @@
 	}
 }
 
+/*
+ * arm_dt_init_cpu_maps - Function retrieves cpu nodes from the device tree
+ * and builds the cpu logical map array containing MPIDR values related to
+ * logical cpus
+ *
+ * Updates the cpu possible mask with the number of parsed cpu nodes
+ */
+void __init arm_dt_init_cpu_maps(void)
+{
+	/*
+	 * Temp logical map is initialized with UINT_MAX values that are
+	 * considered invalid logical map entries since the logical map must
+	 * contain a list of MPIDR[23:0] values where MPIDR[31:24] must
+	 * read as 0.
+	 */
+	struct device_node *cpu, *cpus;
+	u32 i, j, cpuidx = 1;
+	u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;
+
+	u32 tmp_map[NR_CPUS] = { [0 ... NR_CPUS-1] = UINT_MAX };
+	bool bootcpu_valid = false;
+	cpus = of_find_node_by_path("/cpus");
+
+	if (!cpus)
+		return;
+
+	for_each_child_of_node(cpus, cpu) {
+		u32 hwid;
+
+		pr_debug(" * %s...\n", cpu->full_name);
+		/*
+		 * A device tree containing CPU nodes with missing "reg"
+		 * properties is considered invalid to build the
+		 * cpu_logical_map.
+		 */
+		if (of_property_read_u32(cpu, "reg", &hwid)) {
+			pr_debug(" * %s missing reg property\n",
+				     cpu->full_name);
+			return;
+		}
+
+		/*
+		 * 8 MSBs must be set to 0 in the DT since the reg property
+		 * defines the MPIDR[23:0].
+		 */
+		if (hwid & ~MPIDR_HWID_BITMASK)
+			return;
+
+		/*
+		 * Duplicate MPIDRs are a recipe for disaster.
+		 * Scan all initialized entries and check for
+		 * duplicates. If any is found just bail out.
+		 * temp values were initialized to UINT_MAX
+		 * to avoid matching valid MPIDR[23:0] values.
+		 */
+		for (j = 0; j < cpuidx; j++)
+			if (WARN(tmp_map[j] == hwid, "Duplicate /cpu reg "
+						     "properties in the DT\n"))
+				return;
+
+		/*
+		 * Build a stashed array of MPIDR values. Numbering scheme
+		 * requires that if detected the boot CPU must be assigned
+		 * logical id 0. Other CPUs get sequential indexes starting
+		 * from 1. If a CPU node with a reg property matching the
+		 * boot CPU MPIDR is detected, this is recorded so that the
+		 * logical map built from DT is validated and can be used
+		 * to override the map created in smp_setup_processor_id().
+		 */
+		if (hwid == mpidr) {
+			i = 0;
+			bootcpu_valid = true;
+		} else {
+			i = cpuidx++;
+		}
+
+		if (WARN(cpuidx > nr_cpu_ids, "DT /cpu %u nodes greater than "
+					       "max cores %u, capping them\n",
+					       cpuidx, nr_cpu_ids)) {
+			cpuidx = nr_cpu_ids;
+			break;
+		}
+
+		tmp_map[i] = hwid;
+	}
+
+	if (WARN(!bootcpu_valid, "DT missing boot CPU MPIDR[23:0], "
+				 "fall back to default cpu_logical_map\n"))
+		return;
+
+	/*
+	 * Since the boot CPU node contains proper data, and all nodes have
+	 * a reg property, the DT CPU list can be considered valid and the
+	 * logical map created in smp_setup_processor_id() can be overridden
+	 */
+	for (i = 0; i < cpuidx; i++) {
+		set_cpu_possible(i, true);
+		cpu_logical_map(i) = tmp_map[i];
+		pr_debug("cpu logical map 0x%x\n", cpu_logical_map(i));
+	}
+}
+
 /**
  * setup_machine_fdt - Machine setup when an dtb was passed to the kernel
  * @dt_phys: physical address of dt blob
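
Worked example (assuming the DT from Documentation/devicetree/bindings/arm/cpus.txt
above and a boot CPU whose MPIDR[23:0] reads 0x100): the Cortex-A7 node cpu@100
matches the boot CPU and takes logical id 0, the remaining nodes are numbered in
the order they are parsed, and the resulting map is

	cpu_logical_map(0) == 0x100
	cpu_logical_map(1) == 0x000
	cpu_logical_map(2) == 0x001
	cpu_logical_map(3) == 0x101
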
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 3471175..804153c 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -417,16 +417,6 @@
 	ldr	r10, [tsk, #TI_FLAGS]		@ check for syscall tracing
 	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
 
-#ifdef CONFIG_SECCOMP
-	tst	r10, #_TIF_SECCOMP
-	beq	1f
-	mov	r0, scno
-	bl	__secure_computing	
-	add	r0, sp, #S_R0 + S_OFF		@ pointer to regs
-	ldmia	r0, {r0 - r3}			@ have to reload r0 - r3
-1:
-#endif
-
 	tst	r10, #_TIF_SYSCALL_WORK		@ are we tracing syscalls?
 	bne	__sys_trace
 
@@ -458,11 +448,13 @@
 	ldmccia	r1, {r0 - r6}			@ have to reload r0 - r6
 	stmccia	sp, {r4, r5}			@ and update the stack args
 	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine
-	b	2b
+	cmp	scno, #-1			@ skip the syscall?
+	bne	2b
+	add	sp, sp, #S_OFF			@ restore stack
+	b	ret_slow_syscall
 
 __sys_trace_return:
 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
-	mov	r1, scno
 	mov	r0, sp
 	bl	syscall_trace_exit
 	b	ret_slow_syscall
diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S
index 278cfc1..2c228a0 100644
--- a/arch/arm/kernel/head-nommu.S
+++ b/arch/arm/kernel/head-nommu.S
@@ -68,7 +68,7 @@
 	 * CP15 system control register value returned in r0 from
 	 * the CPU init function.
 	 */
-#ifdef CONFIG_ALIGNMENT_TRAP
+#if defined(CONFIG_ALIGNMENT_TRAP) && __LINUX_ARM_ARCH__ < 6
 	orr	r0, r0, #CR_A
 #else
 	bic	r0, r0, #CR_A
diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 281bf33..5ff2e77 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -52,14 +52,14 @@
 /* Maximum supported watchpoint length. */
 static u8 max_watchpoint_len;
 
-#define READ_WB_REG_CASE(OP2, M, VAL)		\
-	case ((OP2 << 4) + M):			\
-		ARM_DBG_READ(c ## M, OP2, VAL); \
+#define READ_WB_REG_CASE(OP2, M, VAL)			\
+	case ((OP2 << 4) + M):				\
+		ARM_DBG_READ(c0, c ## M, OP2, VAL);	\
 		break
 
-#define WRITE_WB_REG_CASE(OP2, M, VAL)		\
-	case ((OP2 << 4) + M):			\
-		ARM_DBG_WRITE(c ## M, OP2, VAL);\
+#define WRITE_WB_REG_CASE(OP2, M, VAL)			\
+	case ((OP2 << 4) + M):				\
+		ARM_DBG_WRITE(c0, c ## M, OP2, VAL);	\
 		break
 
 #define GEN_READ_WB_REG_CASES(OP2, VAL)		\
@@ -136,12 +136,12 @@
 
 	/* Do we implement the extended CPUID interface? */
 	if (((read_cpuid_id() >> 16) & 0xf) != 0xf) {
-		pr_warning("CPUID feature registers not supported. "
-			   "Assuming v6 debug is present.\n");
+		pr_warn_once("CPUID feature registers not supported. "
+			     "Assuming v6 debug is present.\n");
 		return ARM_DEBUG_ARCH_V6;
 	}
 
-	ARM_DBG_READ(c0, 0, didr);
+	ARM_DBG_READ(c0, c0, 0, didr);
 	return (didr >> 16) & 0xf;
 }
 
@@ -169,7 +169,7 @@
 static int get_num_wrp_resources(void)
 {
 	u32 didr;
-	ARM_DBG_READ(c0, 0, didr);
+	ARM_DBG_READ(c0, c0, 0, didr);
 	return ((didr >> 28) & 0xf) + 1;
 }
 
@@ -177,7 +177,7 @@
 static int get_num_brp_resources(void)
 {
 	u32 didr;
-	ARM_DBG_READ(c0, 0, didr);
+	ARM_DBG_READ(c0, c0, 0, didr);
 	return ((didr >> 24) & 0xf) + 1;
 }
 
@@ -228,19 +228,17 @@
  * be put into halting debug mode at any time by an external debugger
  * but there is nothing we can do to prevent that.
  */
+static int monitor_mode_enabled(void)
+{
+	u32 dscr;
+	ARM_DBG_READ(c0, c1, 0, dscr);
+	return !!(dscr & ARM_DSCR_MDBGEN);
+}
+
 static int enable_monitor_mode(void)
 {
 	u32 dscr;
-	int ret = 0;
-
-	ARM_DBG_READ(c1, 0, dscr);
-
-	/* Ensure that halting mode is disabled. */
-	if (WARN_ONCE(dscr & ARM_DSCR_HDBGEN,
-		"halting debug mode enabled. Unable to access hardware resources.\n")) {
-		ret = -EPERM;
-		goto out;
-	}
+	ARM_DBG_READ(c0, c1, 0, dscr);
 
 	/* If monitor mode is already enabled, just return. */
 	if (dscr & ARM_DSCR_MDBGEN)
@@ -250,24 +248,27 @@
 	switch (get_debug_arch()) {
 	case ARM_DEBUG_ARCH_V6:
 	case ARM_DEBUG_ARCH_V6_1:
-		ARM_DBG_WRITE(c1, 0, (dscr | ARM_DSCR_MDBGEN));
+		ARM_DBG_WRITE(c0, c1, 0, (dscr | ARM_DSCR_MDBGEN));
 		break;
 	case ARM_DEBUG_ARCH_V7_ECP14:
 	case ARM_DEBUG_ARCH_V7_1:
-		ARM_DBG_WRITE(c2, 2, (dscr | ARM_DSCR_MDBGEN));
+		ARM_DBG_WRITE(c0, c2, 2, (dscr | ARM_DSCR_MDBGEN));
+		isb();
 		break;
 	default:
-		ret = -ENODEV;
-		goto out;
+		return -ENODEV;
 	}
 
 	/* Check that the write made it through. */
-	ARM_DBG_READ(c1, 0, dscr);
-	if (!(dscr & ARM_DSCR_MDBGEN))
-		ret = -EPERM;
+	ARM_DBG_READ(c0, c1, 0, dscr);
+	if (!(dscr & ARM_DSCR_MDBGEN)) {
+		pr_warn_once("Failed to enable monitor mode on CPU %d.\n",
+				smp_processor_id());
+		return -EPERM;
+	}
 
 out:
-	return ret;
+	return 0;
 }
 
 int hw_breakpoint_slots(int type)
@@ -328,14 +329,9 @@
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	struct perf_event **slot, **slots;
-	int i, max_slots, ctrl_base, val_base, ret = 0;
+	int i, max_slots, ctrl_base, val_base;
 	u32 addr, ctrl;
 
-	/* Ensure that we are in monitor mode and halting mode is disabled. */
-	ret = enable_monitor_mode();
-	if (ret)
-		goto out;
-
 	addr = info->address;
 	ctrl = encode_ctrl_reg(info->ctrl) | 0x1;
 
@@ -362,9 +358,9 @@
 		}
 	}
 
-	if (WARN_ONCE(i == max_slots, "Can't find any breakpoint slot\n")) {
-		ret = -EBUSY;
-		goto out;
+	if (i == max_slots) {
+		pr_warning("Can't find any breakpoint slot\n");
+		return -EBUSY;
 	}
 
 	/* Override the breakpoint data with the step data. */
@@ -383,9 +379,7 @@
 
 	/* Setup the control register. */
 	write_wb_reg(ctrl_base + i, ctrl);
-
-out:
-	return ret;
+	return 0;
 }
 
 void arch_uninstall_hw_breakpoint(struct perf_event *bp)
@@ -416,8 +410,10 @@
 		}
 	}
 
-	if (WARN_ONCE(i == max_slots, "Can't find any breakpoint slot\n"))
+	if (i == max_slots) {
+		pr_warning("Can't find any breakpoint slot\n");
 		return;
+	}
 
 	/* Ensure that we disable the mismatch breakpoint. */
 	if (info->ctrl.type != ARM_BREAKPOINT_EXECUTE &&
@@ -596,6 +592,10 @@
 	int ret = 0;
 	u32 offset, alignment_mask = 0x3;
 
+	/* Ensure that we are in monitor debug mode. */
+	if (!monitor_mode_enabled())
+		return -ENODEV;
+
 	/* Build the arch_hw_breakpoint. */
 	ret = arch_build_bp_info(bp);
 	if (ret)
@@ -858,7 +858,7 @@
 		local_irq_enable();
 
 	/* We only handle watchpoints and hardware breakpoints. */
-	ARM_DBG_READ(c1, 0, dscr);
+	ARM_DBG_READ(c0, c1, 0, dscr);
 
 	/* Perform perf callbacks. */
 	switch (ARM_DSCR_MOE(dscr)) {
@@ -906,7 +906,7 @@
 static void reset_ctrl_regs(void *unused)
 {
 	int i, raw_num_brps, err = 0, cpu = smp_processor_id();
-	u32 dbg_power;
+	u32 val;
 
 	/*
 	 * v7 debug contains save and restore registers so that debug state
@@ -919,23 +919,30 @@
 	switch (debug_arch) {
 	case ARM_DEBUG_ARCH_V6:
 	case ARM_DEBUG_ARCH_V6_1:
-		/* ARMv6 cores just need to reset the registers. */
-		goto reset_regs;
+		/* ARMv6 cores clear the registers out of reset. */
+		goto out_mdbgen;
 	case ARM_DEBUG_ARCH_V7_ECP14:
 		/*
 		 * Ensure sticky power-down is clear (i.e. debug logic is
 		 * powered up).
 		 */
-		asm volatile("mrc p14, 0, %0, c1, c5, 4" : "=r" (dbg_power));
-		if ((dbg_power & 0x1) == 0)
+		ARM_DBG_READ(c1, c5, 4, val);
+		if ((val & 0x1) == 0)
 			err = -EPERM;
+
+		/*
+		 * Check whether we implement OS save and restore.
+		 */
+		ARM_DBG_READ(c1, c1, 4, val);
+		if ((val & 0x9) == 0)
+			goto clear_vcr;
 		break;
 	case ARM_DEBUG_ARCH_V7_1:
 		/*
 		 * Ensure the OS double lock is clear.
 		 */
-		asm volatile("mrc p14, 0, %0, c1, c3, 4" : "=r" (dbg_power));
-		if ((dbg_power & 0x1) == 1)
+		ARM_DBG_READ(c1, c3, 4, val);
+		if ((val & 0x1) == 1)
 			err = -EPERM;
 		break;
 	}
@@ -947,24 +954,29 @@
 	}
 
 	/*
-	 * Unconditionally clear the lock by writing a value
+	 * Unconditionally clear the OS lock by writing a value
 	 * other than 0xC5ACCE55 to the access register.
 	 */
-	asm volatile("mcr p14, 0, %0, c1, c0, 4" : : "r" (0));
+	ARM_DBG_WRITE(c1, c0, 4, 0);
 	isb();
 
 	/*
 	 * Clear any configured vector-catch events before
 	 * enabling monitor mode.
 	 */
-	asm volatile("mcr p14, 0, %0, c0, c7, 0" : : "r" (0));
+clear_vcr:
+	ARM_DBG_WRITE(c0, c7, 0, 0);
 	isb();
 
-reset_regs:
-	if (enable_monitor_mode())
+	if (cpumask_intersects(&debug_err_mask, cpumask_of(cpu))) {
+		pr_warning("CPU %d failed to disable vector catch\n", cpu);
 		return;
+	}
 
-	/* We must also reset any reserved registers. */
+	/*
+	 * The control/value register pairs are UNKNOWN out of reset so
+	 * clear them to avoid spurious debug events.
+	 */
 	raw_num_brps = get_num_brp_resources();
 	for (i = 0; i < raw_num_brps; ++i) {
 		write_wb_reg(ARM_BASE_BCR + i, 0UL);
@@ -975,6 +987,19 @@
 		write_wb_reg(ARM_BASE_WCR + i, 0UL);
 		write_wb_reg(ARM_BASE_WVR + i, 0UL);
 	}
+
+	if (cpumask_intersects(&debug_err_mask, cpumask_of(cpu))) {
+		pr_warning("CPU %d failed to clear debug register pairs\n", cpu);
+		return;
+	}
+
+	/*
+	 * Have a crack at enabling monitor mode. We don't actually need
+	 * it yet, but reporting an error early is useful if it fails.
+	 */
+out_mdbgen:
+	if (enable_monitor_mode())
+		cpumask_or(&debug_err_mask, &debug_err_mask, cpumask_of(cpu));
 }
 
 static int __cpuinit dbg_reset_notify(struct notifier_block *self,
@@ -992,8 +1017,6 @@
 
 static int __init arch_hw_breakpoint_init(void)
 {
-	u32 dscr;
-
 	debug_arch = get_debug_arch();
 
 	if (!debug_arch_supported()) {
@@ -1028,17 +1051,10 @@
 		core_num_brps, core_has_mismatch_brps() ? "(+1 reserved) " :
 		"", core_num_wrps);
 
-	ARM_DBG_READ(c1, 0, dscr);
-	if (dscr & ARM_DSCR_HDBGEN) {
-		max_watchpoint_len = 4;
-		pr_warning("halting debug mode enabled. Assuming maximum watchpoint size of %u bytes.\n",
-			   max_watchpoint_len);
-	} else {
-		/* Work out the maximum supported watchpoint length. */
-		max_watchpoint_len = get_max_wp_len();
-		pr_info("maximum watchpoint size is %u bytes.\n",
-				max_watchpoint_len);
-	}
+	/* Work out the maximum supported watchpoint length. */
+	max_watchpoint_len = get_max_wp_len();
+	pr_info("maximum watchpoint size is %u bytes.\n",
+			max_watchpoint_len);
 
 	/* Register debug fault handler. */
 	hook_fault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP,
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 53c0304..f9e8657 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -86,12 +86,10 @@
 	return -ENOENT;
 }
 
-int
-armpmu_event_set_period(struct perf_event *event,
-			struct hw_perf_event *hwc,
-			int idx)
+int armpmu_event_set_period(struct perf_event *event)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	s64 left = local64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
 	int ret = 0;
@@ -119,24 +117,22 @@
 
 	local64_set(&hwc->prev_count, (u64)-left);
 
-	armpmu->write_counter(idx, (u64)(-left) & 0xffffffff);
+	armpmu->write_counter(event, (u64)(-left) & 0xffffffff);
 
 	perf_event_update_userpage(event);
 
 	return ret;
 }
 
-u64
-armpmu_event_update(struct perf_event *event,
-		    struct hw_perf_event *hwc,
-		    int idx)
+u64 armpmu_event_update(struct perf_event *event)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 delta, prev_raw_count, new_raw_count;
 
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	new_raw_count = armpmu->read_counter(idx);
+	new_raw_count = armpmu->read_counter(event);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
@@ -159,7 +155,7 @@
 	if (hwc->idx < 0)
 		return;
 
-	armpmu_event_update(event, hwc, hwc->idx);
+	armpmu_event_update(event);
 }
 
 static void
@@ -173,14 +169,13 @@
 	 * PERF_EF_UPDATE, see comments in armpmu_start().
 	 */
 	if (!(hwc->state & PERF_HES_STOPPED)) {
-		armpmu->disable(hwc, hwc->idx);
-		armpmu_event_update(event, hwc, hwc->idx);
+		armpmu->disable(event);
+		armpmu_event_update(event);
 		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
 	}
 }
 
-static void
-armpmu_start(struct perf_event *event, int flags)
+static void armpmu_start(struct perf_event *event, int flags)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
@@ -200,8 +195,8 @@
 	 * get an interrupt too soon or *way* too late if the overflow has
 	 * happened since disabling.
 	 */
-	armpmu_event_set_period(event, hwc, hwc->idx);
-	armpmu->enable(hwc, hwc->idx);
+	armpmu_event_set_period(event);
+	armpmu->enable(event);
 }
 
 static void
@@ -233,7 +228,7 @@
 	perf_pmu_disable(event->pmu);
 
 	/* If we don't have a space for the counter then finish early. */
-	idx = armpmu->get_event_idx(hw_events, hwc);
+	idx = armpmu->get_event_idx(hw_events, event);
 	if (idx < 0) {
 		err = idx;
 		goto out;
@@ -244,7 +239,7 @@
 	 * sure it is disabled.
 	 */
 	event->hw.idx = idx;
-	armpmu->disable(hwc, idx);
+	armpmu->disable(event);
 	hw_events->events[idx] = event;
 
 	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
@@ -264,13 +259,12 @@
 	       struct perf_event *event)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct hw_perf_event fake_event = event->hw;
 	struct pmu *leader_pmu = event->group_leader->pmu;
 
 	if (event->pmu != leader_pmu || event->state <= PERF_EVENT_STATE_OFF)
 		return 1;
 
-	return armpmu->get_event_idx(hw_events, &fake_event) >= 0;
+	return armpmu->get_event_idx(hw_events, event) >= 0;
 }
 
 static int
@@ -316,7 +310,7 @@
 static void
 armpmu_release_hardware(struct arm_pmu *armpmu)
 {
-	armpmu->free_irq();
+	armpmu->free_irq(armpmu);
 	pm_runtime_put_sync(&armpmu->plat_device->dev);
 }
 
@@ -330,7 +324,7 @@
 		return -ENODEV;
 
 	pm_runtime_get_sync(&pmu_device->dev);
-	err = armpmu->request_irq(armpmu_dispatch_irq);
+	err = armpmu->request_irq(armpmu, armpmu_dispatch_irq);
 	if (err) {
 		armpmu_release_hardware(armpmu);
 		return err;
@@ -465,13 +459,13 @@
 	int enabled = bitmap_weight(hw_events->used_mask, armpmu->num_events);
 
 	if (enabled)
-		armpmu->start();
+		armpmu->start(armpmu);
 }
 
 static void armpmu_disable(struct pmu *pmu)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(pmu);
-	armpmu->stop();
+	armpmu->stop(armpmu);
 }
 
 #ifdef CONFIG_PM_RUNTIME
@@ -517,12 +511,13 @@
 	};
 }
 
-int armpmu_register(struct arm_pmu *armpmu, char *name, int type)
+int armpmu_register(struct arm_pmu *armpmu, int type)
 {
 	armpmu_init(armpmu);
+	pm_runtime_enable(&armpmu->plat_device->dev);
 	pr_info("enabled with %s PMU driver, %d counters available\n",
 			armpmu->name, armpmu->num_events);
-	return perf_pmu_register(&armpmu->pmu, name, type);
+	return perf_pmu_register(&armpmu->pmu, armpmu->name, type);
 }
 
 /*
@@ -576,6 +571,10 @@
 {
 	struct frame_tail __user *tail;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* We don't support guest os callchain now */
+		return;
+	}
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
@@ -603,9 +602,41 @@
 {
 	struct stackframe fr;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* We don't support guest os callchain now */
+		return;
+	}
+
 	fr.fp = regs->ARM_fp;
 	fr.sp = regs->ARM_sp;
 	fr.lr = regs->ARM_lr;
 	fr.pc = regs->ARM_pc;
 	walk_stackframe(&fr, callchain_trace, entry);
 }
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+		return perf_guest_cbs->get_guest_ip();
+
+	return instruction_pointer(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	int misc = 0;
+
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		if (perf_guest_cbs->is_user_mode())
+			misc |= PERF_RECORD_MISC_GUEST_USER;
+		else
+			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+	} else {
+		if (user_mode(regs))
+			misc |= PERF_RECORD_MISC_USER;
+		else
+			misc |= PERF_RECORD_MISC_KERNEL;
+	}
+
+	return misc;
+}
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 8d7d8d4..9a4f630 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 
 #include <asm/cputype.h>
@@ -45,7 +46,7 @@
 	if (!cpu_pmu)
 		return NULL;
 
-	return cpu_pmu->pmu.name;
+	return cpu_pmu->name;
 }
 EXPORT_SYMBOL_GPL(perf_pmu_name);
 
@@ -70,7 +71,7 @@
 	return &__get_cpu_var(cpu_hw_events);
 }
 
-static void cpu_pmu_free_irq(void)
+static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 {
 	int i, irq, irqs;
 	struct platform_device *pmu_device = cpu_pmu->plat_device;
@@ -86,7 +87,7 @@
 	}
 }
 
-static int cpu_pmu_request_irq(irq_handler_t handler)
+static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
 {
 	int i, err, irq, irqs;
 	struct platform_device *pmu_device = cpu_pmu->plat_device;
@@ -147,7 +148,7 @@
 
 	/* Ensure the PMU has sane values out of reset. */
 	if (cpu_pmu && cpu_pmu->reset)
-		on_each_cpu(cpu_pmu->reset, NULL, 1);
+		on_each_cpu(cpu_pmu->reset, cpu_pmu, 1);
 }
 
 /*
@@ -163,7 +164,9 @@
 		return NOTIFY_DONE;
 
 	if (cpu_pmu && cpu_pmu->reset)
-		cpu_pmu->reset(NULL);
+		cpu_pmu->reset(cpu_pmu);
+	else
+		return NOTIFY_DONE;
 
 	return NOTIFY_OK;
 }
@@ -195,13 +198,13 @@
 /*
  * CPU PMU identification and probing.
  */
-static struct arm_pmu *__devinit probe_current_pmu(void)
+static int __devinit probe_current_pmu(struct arm_pmu *pmu)
 {
-	struct arm_pmu *pmu = NULL;
 	int cpu = get_cpu();
 	unsigned long cpuid = read_cpuid_id();
 	unsigned long implementor = (cpuid & 0xFF000000) >> 24;
 	unsigned long part_number = (cpuid & 0xFFF0);
+	int ret = -ENODEV;
 
 	pr_info("probing PMU on CPU %d\n", cpu);
 
@@ -211,25 +214,25 @@
 		case 0xB360:	/* ARM1136 */
 		case 0xB560:	/* ARM1156 */
 		case 0xB760:	/* ARM1176 */
-			pmu = armv6pmu_init();
+			ret = armv6pmu_init(pmu);
 			break;
 		case 0xB020:	/* ARM11mpcore */
-			pmu = armv6mpcore_pmu_init();
+			ret = armv6mpcore_pmu_init(pmu);
 			break;
 		case 0xC080:	/* Cortex-A8 */
-			pmu = armv7_a8_pmu_init();
+			ret = armv7_a8_pmu_init(pmu);
 			break;
 		case 0xC090:	/* Cortex-A9 */
-			pmu = armv7_a9_pmu_init();
+			ret = armv7_a9_pmu_init(pmu);
 			break;
 		case 0xC050:	/* Cortex-A5 */
-			pmu = armv7_a5_pmu_init();
+			ret = armv7_a5_pmu_init(pmu);
 			break;
 		case 0xC0F0:	/* Cortex-A15 */
-			pmu = armv7_a15_pmu_init();
+			ret = armv7_a15_pmu_init(pmu);
 			break;
 		case 0xC070:	/* Cortex-A7 */
-			pmu = armv7_a7_pmu_init();
+			ret = armv7_a7_pmu_init(pmu);
 			break;
 		}
 	/* Intel CPUs [xscale]. */
@@ -237,43 +240,54 @@
 		part_number = (cpuid >> 13) & 0x7;
 		switch (part_number) {
 		case 1:
-			pmu = xscale1pmu_init();
+			ret = xscale1pmu_init(pmu);
 			break;
 		case 2:
-			pmu = xscale2pmu_init();
+			ret = xscale2pmu_init(pmu);
 			break;
 		}
 	}
 
 	put_cpu();
-	return pmu;
+	return ret;
 }
 
 static int __devinit cpu_pmu_device_probe(struct platform_device *pdev)
 {
 	const struct of_device_id *of_id;
-	struct arm_pmu *(*init_fn)(void);
+	int (*init_fn)(struct arm_pmu *);
 	struct device_node *node = pdev->dev.of_node;
+	struct arm_pmu *pmu;
+	int ret = -ENODEV;
 
 	if (cpu_pmu) {
 		pr_info("attempt to register multiple PMU devices!");
 		return -ENOSPC;
 	}
 
-	if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) {
-		init_fn = of_id->data;
-		cpu_pmu = init_fn();
-	} else {
-		cpu_pmu = probe_current_pmu();
+	pmu = kzalloc(sizeof(struct arm_pmu), GFP_KERNEL);
+	if (!pmu) {
+		pr_info("failed to allocate PMU device!");
+		return -ENOMEM;
 	}
 
-	if (!cpu_pmu)
-		return -ENODEV;
+	if (node && (of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node))) {
+		init_fn = of_id->data;
+		ret = init_fn(pmu);
+	} else {
+		ret = probe_current_pmu(pmu);
+	}
 
+	if (ret) {
+		pr_info("failed to register PMU devices!");
+		kfree(pmu);
+		return ret;
+	}
+
+	cpu_pmu = pmu;
 	cpu_pmu->plat_device = pdev;
 	cpu_pmu_init(cpu_pmu);
-	register_cpu_notifier(&cpu_pmu_hotplug_notifier);
-	armpmu_register(cpu_pmu, cpu_pmu->name, PERF_TYPE_RAW);
+	armpmu_register(cpu_pmu, PERF_TYPE_RAW);
 
 	return 0;
 }
@@ -290,6 +304,16 @@
 
 static int __init register_pmu_driver(void)
 {
-	return platform_driver_register(&cpu_pmu_driver);
+	int err;
+
+	err = register_cpu_notifier(&cpu_pmu_hotplug_notifier);
+	if (err)
+		return err;
+
+	err = platform_driver_register(&cpu_pmu_driver);
+	if (err)
+		unregister_cpu_notifier(&cpu_pmu_hotplug_notifier);
+
+	return err;
 }
 device_initcall(register_pmu_driver);
diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c
index 6ccc079..f3e22ff 100644
--- a/arch/arm/kernel/perf_event_v6.c
+++ b/arch/arm/kernel/perf_event_v6.c
@@ -401,9 +401,10 @@
 	return ret;
 }
 
-static inline u32
-armv6pmu_read_counter(int counter)
+static inline u32 armv6pmu_read_counter(struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
 	unsigned long value = 0;
 
 	if (ARMV6_CYCLE_COUNTER == counter)
@@ -418,10 +419,11 @@
 	return value;
 }
 
-static inline void
-armv6pmu_write_counter(int counter,
-		       u32 value)
+static inline void armv6pmu_write_counter(struct perf_event *event, u32 value)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
+
 	if (ARMV6_CYCLE_COUNTER == counter)
 		asm volatile("mcr   p15, 0, %0, c15, c12, 1" : : "r"(value));
 	else if (ARMV6_COUNTER0 == counter)
@@ -432,12 +434,13 @@
 		WARN_ONCE(1, "invalid counter number (%d)\n", counter);
 }
 
-static void
-armv6pmu_enable_event(struct hw_perf_event *hwc,
-		      int idx)
+static void armv6pmu_enable_event(struct perf_event *event)
 {
 	unsigned long val, mask, evt, flags;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	if (ARMV6_CYCLE_COUNTER == idx) {
 		mask	= 0;
@@ -473,7 +476,8 @@
 {
 	unsigned long pmcr = armv6_pmcr_read();
 	struct perf_sample_data data;
-	struct pmu_hw_events *cpuc;
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+	struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
 	struct pt_regs *regs;
 	int idx;
 
@@ -489,7 +493,6 @@
 	 */
 	armv6_pmcr_write(pmcr);
 
-	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
 		struct hw_perf_event *hwc;
@@ -506,13 +509,13 @@
 			continue;
 
 		hwc = &event->hw;
-		armpmu_event_update(event, hwc, idx);
+		armpmu_event_update(event);
 		perf_sample_data_init(&data, 0, hwc->last_period);
-		if (!armpmu_event_set_period(event, hwc, idx))
+		if (!armpmu_event_set_period(event))
 			continue;
 
 		if (perf_event_overflow(event, &data, regs))
-			cpu_pmu->disable(hwc, idx);
+			cpu_pmu->disable(event);
 	}
 
 	/*
@@ -527,8 +530,7 @@
 	return IRQ_HANDLED;
 }
 
-static void
-armv6pmu_start(void)
+static void armv6pmu_start(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -540,8 +542,7 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-armv6pmu_stop(void)
+static void armv6pmu_stop(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -555,10 +556,11 @@
 
 static int
 armv6pmu_get_event_idx(struct pmu_hw_events *cpuc,
-		       struct hw_perf_event *event)
+				struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
 	/* Always place a cycle counter into the cycle counter. */
-	if (ARMV6_PERFCTR_CPU_CYCLES == event->config_base) {
+	if (ARMV6_PERFCTR_CPU_CYCLES == hwc->config_base) {
 		if (test_and_set_bit(ARMV6_CYCLE_COUNTER, cpuc->used_mask))
 			return -EAGAIN;
 
@@ -579,12 +581,13 @@
 	}
 }
 
-static void
-armv6pmu_disable_event(struct hw_perf_event *hwc,
-		       int idx)
+static void armv6pmu_disable_event(struct perf_event *event)
 {
 	unsigned long val, mask, evt, flags;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	if (ARMV6_CYCLE_COUNTER == idx) {
 		mask	= ARMV6_PMCR_CCOUNT_IEN;
@@ -613,12 +616,13 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-armv6mpcore_pmu_disable_event(struct hw_perf_event *hwc,
-			      int idx)
+static void armv6mpcore_pmu_disable_event(struct perf_event *event)
 {
 	unsigned long val, mask, flags, evt = 0;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	if (ARMV6_CYCLE_COUNTER == idx) {
 		mask	= ARMV6_PMCR_CCOUNT_IEN;
@@ -649,24 +653,22 @@
 				&armv6_perf_cache_map, 0xFF);
 }
 
-static struct arm_pmu armv6pmu = {
-	.name			= "v6",
-	.handle_irq		= armv6pmu_handle_irq,
-	.enable			= armv6pmu_enable_event,
-	.disable		= armv6pmu_disable_event,
-	.read_counter		= armv6pmu_read_counter,
-	.write_counter		= armv6pmu_write_counter,
-	.get_event_idx		= armv6pmu_get_event_idx,
-	.start			= armv6pmu_start,
-	.stop			= armv6pmu_stop,
-	.map_event		= armv6_map_event,
-	.num_events		= 3,
-	.max_period		= (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit armv6pmu_init(void)
+static int __devinit armv6pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return &armv6pmu;
+	cpu_pmu->name		= "v6";
+	cpu_pmu->handle_irq	= armv6pmu_handle_irq;
+	cpu_pmu->enable		= armv6pmu_enable_event;
+	cpu_pmu->disable	= armv6pmu_disable_event;
+	cpu_pmu->read_counter	= armv6pmu_read_counter;
+	cpu_pmu->write_counter	= armv6pmu_write_counter;
+	cpu_pmu->get_event_idx	= armv6pmu_get_event_idx;
+	cpu_pmu->start		= armv6pmu_start;
+	cpu_pmu->stop		= armv6pmu_stop;
+	cpu_pmu->map_event	= armv6_map_event;
+	cpu_pmu->num_events	= 3;
+	cpu_pmu->max_period	= (1LLU << 32) - 1;
+
+	return 0;
 }
 
 /*
@@ -683,33 +685,31 @@
 				&armv6mpcore_perf_cache_map, 0xFF);
 }
 
-static struct arm_pmu armv6mpcore_pmu = {
-	.name			= "v6mpcore",
-	.handle_irq		= armv6pmu_handle_irq,
-	.enable			= armv6pmu_enable_event,
-	.disable		= armv6mpcore_pmu_disable_event,
-	.read_counter		= armv6pmu_read_counter,
-	.write_counter		= armv6pmu_write_counter,
-	.get_event_idx		= armv6pmu_get_event_idx,
-	.start			= armv6pmu_start,
-	.stop			= armv6pmu_stop,
-	.map_event		= armv6mpcore_map_event,
-	.num_events		= 3,
-	.max_period		= (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit armv6mpcore_pmu_init(void)
+static int __devinit armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return &armv6mpcore_pmu;
+	cpu_pmu->name		= "v6mpcore";
+	cpu_pmu->handle_irq	= armv6pmu_handle_irq;
+	cpu_pmu->enable		= armv6pmu_enable_event;
+	cpu_pmu->disable	= armv6mpcore_pmu_disable_event;
+	cpu_pmu->read_counter	= armv6pmu_read_counter;
+	cpu_pmu->write_counter	= armv6pmu_write_counter;
+	cpu_pmu->get_event_idx	= armv6pmu_get_event_idx;
+	cpu_pmu->start		= armv6pmu_start;
+	cpu_pmu->stop		= armv6pmu_stop;
+	cpu_pmu->map_event	= armv6mpcore_map_event;
+	cpu_pmu->num_events	= 3;
+	cpu_pmu->max_period	= (1LLU << 32) - 1;
+
+	return 0;
 }
 #else
-static struct arm_pmu *__devinit armv6pmu_init(void)
+static int armv6pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit armv6mpcore_pmu_init(void)
+static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 #endif	/* CONFIG_CPU_V6 || CONFIG_CPU_V6K */
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index bd4b090..7d0cce8 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -18,8 +18,6 @@
 
 #ifdef CONFIG_CPU_V7
 
-static struct arm_pmu armv7pmu;
-
 /*
  * Common ARMv7 event types
  *
@@ -738,7 +736,8 @@
  */
 #define	ARMV7_IDX_CYCLE_COUNTER	0
 #define	ARMV7_IDX_COUNTER0	1
-#define	ARMV7_IDX_COUNTER_LAST	(ARMV7_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1)
+#define	ARMV7_IDX_COUNTER_LAST(cpu_pmu) \
+	(ARMV7_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1)
 
 #define	ARMV7_MAX_COUNTERS	32
 #define	ARMV7_COUNTER_MASK	(ARMV7_MAX_COUNTERS - 1)
@@ -804,49 +803,34 @@
 	return pmnc & ARMV7_OVERFLOWED_MASK;
 }
 
-static inline int armv7_pmnc_counter_valid(int idx)
+static inline int armv7_pmnc_counter_valid(struct arm_pmu *cpu_pmu, int idx)
 {
-	return idx >= ARMV7_IDX_CYCLE_COUNTER && idx <= ARMV7_IDX_COUNTER_LAST;
+	return idx >= ARMV7_IDX_CYCLE_COUNTER &&
+		idx <= ARMV7_IDX_COUNTER_LAST(cpu_pmu);
 }
 
 static inline int armv7_pmnc_counter_has_overflowed(u32 pmnc, int idx)
 {
-	int ret = 0;
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u checking wrong counter %d overflow status\n",
-			smp_processor_id(), idx);
-	} else {
-		counter = ARMV7_IDX_TO_COUNTER(idx);
-		ret = pmnc & BIT(counter);
-	}
-
-	return ret;
+	return pmnc & BIT(ARMV7_IDX_TO_COUNTER(idx));
 }
 
 static inline int armv7_pmnc_select_counter(int idx)
 {
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u selecting wrong PMNC counter %d\n",
-			smp_processor_id(), idx);
-		return -EINVAL;
-	}
-
-	counter = ARMV7_IDX_TO_COUNTER(idx);
+	u32 counter = ARMV7_IDX_TO_COUNTER(idx);
 	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (counter));
 	isb();
 
 	return idx;
 }
 
-static inline u32 armv7pmu_read_counter(int idx)
+static inline u32 armv7pmu_read_counter(struct perf_event *event)
 {
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
 	u32 value = 0;
 
-	if (!armv7_pmnc_counter_valid(idx))
+	if (!armv7_pmnc_counter_valid(cpu_pmu, idx))
 		pr_err("CPU%u reading wrong counter %d\n",
 			smp_processor_id(), idx);
 	else if (idx == ARMV7_IDX_CYCLE_COUNTER)
@@ -857,9 +841,13 @@
 	return value;
 }
 
-static inline void armv7pmu_write_counter(int idx, u32 value)
+static inline void armv7pmu_write_counter(struct perf_event *event, u32 value)
 {
-	if (!armv7_pmnc_counter_valid(idx))
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (!armv7_pmnc_counter_valid(cpu_pmu, idx))
 		pr_err("CPU%u writing wrong counter %d\n",
 			smp_processor_id(), idx);
 	else if (idx == ARMV7_IDX_CYCLE_COUNTER)
@@ -878,60 +866,28 @@
 
 static inline int armv7_pmnc_enable_counter(int idx)
 {
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u enabling wrong PMNC counter %d\n",
-			smp_processor_id(), idx);
-		return -EINVAL;
-	}
-
-	counter = ARMV7_IDX_TO_COUNTER(idx);
+	u32 counter = ARMV7_IDX_TO_COUNTER(idx);
 	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r" (BIT(counter)));
 	return idx;
 }
 
 static inline int armv7_pmnc_disable_counter(int idx)
 {
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u disabling wrong PMNC counter %d\n",
-			smp_processor_id(), idx);
-		return -EINVAL;
-	}
-
-	counter = ARMV7_IDX_TO_COUNTER(idx);
+	u32 counter = ARMV7_IDX_TO_COUNTER(idx);
 	asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (BIT(counter)));
 	return idx;
 }
 
 static inline int armv7_pmnc_enable_intens(int idx)
 {
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u enabling wrong PMNC counter IRQ enable %d\n",
-			smp_processor_id(), idx);
-		return -EINVAL;
-	}
-
-	counter = ARMV7_IDX_TO_COUNTER(idx);
+	u32 counter = ARMV7_IDX_TO_COUNTER(idx);
 	asm volatile("mcr p15, 0, %0, c9, c14, 1" : : "r" (BIT(counter)));
 	return idx;
 }
 
 static inline int armv7_pmnc_disable_intens(int idx)
 {
-	u32 counter;
-
-	if (!armv7_pmnc_counter_valid(idx)) {
-		pr_err("CPU%u disabling wrong PMNC counter IRQ enable %d\n",
-			smp_processor_id(), idx);
-		return -EINVAL;
-	}
-
-	counter = ARMV7_IDX_TO_COUNTER(idx);
+	u32 counter = ARMV7_IDX_TO_COUNTER(idx);
 	asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (BIT(counter)));
 	isb();
 	/* Clear the overflow flag in case an interrupt is pending. */
@@ -956,7 +912,7 @@
 }
 
 #ifdef DEBUG
-static void armv7_pmnc_dump_regs(void)
+static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu)
 {
 	u32 val;
 	unsigned int cnt;
@@ -981,7 +937,8 @@
 	asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (val));
 	printk(KERN_INFO "CCNT  =0x%08x\n", val);
 
-	for (cnt = ARMV7_IDX_COUNTER0; cnt <= ARMV7_IDX_COUNTER_LAST; cnt++) {
+	for (cnt = ARMV7_IDX_COUNTER0;
+			cnt <= ARMV7_IDX_COUNTER_LAST(cpu_pmu); cnt++) {
 		armv7_pmnc_select_counter(cnt);
 		asm volatile("mrc p15, 0, %0, c9, c13, 2" : "=r" (val));
 		printk(KERN_INFO "CNT[%d] count =0x%08x\n",
@@ -993,10 +950,19 @@
 }
 #endif
 
-static void armv7pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void armv7pmu_enable_event(struct perf_event *event)
 {
 	unsigned long flags;
+	struct hw_perf_event *hwc = &event->hw;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
+
+	if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) {
+		pr_err("CPU%u enabling wrong PMNC counter IRQ enable %d\n",
+			smp_processor_id(), idx);
+		return;
+	}
 
 	/*
 	 * Enable counter and interrupt, and set the counter to count
@@ -1014,7 +980,7 @@
 	 * We only need to set the event for the cycle counter if we
 	 * have the ability to perform event filtering.
 	 */
-	if (armv7pmu.set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER)
+	if (cpu_pmu->set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER)
 		armv7_pmnc_write_evtsel(idx, hwc->config_base);
 
 	/*
@@ -1030,10 +996,19 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void armv7pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void armv7pmu_disable_event(struct perf_event *event)
 {
 	unsigned long flags;
+	struct hw_perf_event *hwc = &event->hw;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
+
+	if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) {
+		pr_err("CPU%u disabling wrong PMNC counter IRQ enable %d\n",
+			smp_processor_id(), idx);
+		return;
+	}
 
 	/*
 	 * Disable counter and interrupt
@@ -1057,7 +1032,8 @@
 {
 	u32 pmnc;
 	struct perf_sample_data data;
-	struct pmu_hw_events *cpuc;
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+	struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
 	struct pt_regs *regs;
 	int idx;
 
@@ -1077,7 +1053,6 @@
 	 */
 	regs = get_irq_regs();
 
-	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
 		struct hw_perf_event *hwc;
@@ -1094,13 +1069,13 @@
 			continue;
 
 		hwc = &event->hw;
-		armpmu_event_update(event, hwc, idx);
+		armpmu_event_update(event);
 		perf_sample_data_init(&data, 0, hwc->last_period);
-		if (!armpmu_event_set_period(event, hwc, idx))
+		if (!armpmu_event_set_period(event))
 			continue;
 
 		if (perf_event_overflow(event, &data, regs))
-			cpu_pmu->disable(hwc, idx);
+			cpu_pmu->disable(event);
 	}
 
 	/*
@@ -1115,7 +1090,7 @@
 	return IRQ_HANDLED;
 }
 
-static void armv7pmu_start(void)
+static void armv7pmu_start(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -1126,7 +1101,7 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void armv7pmu_stop(void)
+static void armv7pmu_stop(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -1138,10 +1113,12 @@
 }
 
 static int armv7pmu_get_event_idx(struct pmu_hw_events *cpuc,
-				  struct hw_perf_event *event)
+				  struct perf_event *event)
 {
 	int idx;
-	unsigned long evtype = event->config_base & ARMV7_EVTYPE_EVENT;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long evtype = hwc->config_base & ARMV7_EVTYPE_EVENT;
 
 	/* Always place a cycle counter into the cycle counter. */
 	if (evtype == ARMV7_PERFCTR_CPU_CYCLES) {
@@ -1192,11 +1169,14 @@
 
 static void armv7pmu_reset(void *info)
 {
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
 	u32 idx, nb_cnt = cpu_pmu->num_events;
 
 	/* The counter and interrupt enable registers are unknown at reset. */
-	for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx)
-		armv7pmu_disable_event(NULL, idx);
+	for (idx = ARMV7_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
+		armv7_pmnc_disable_counter(idx);
+		armv7_pmnc_disable_intens(idx);
+	}
 
 	/* Initialize & Reset PMNC: C and P bits */
 	armv7_pmnc_write(ARMV7_PMNC_P | ARMV7_PMNC_C);
@@ -1232,17 +1212,18 @@
 				&armv7_a7_perf_cache_map, 0xFF);
 }
 
-static struct arm_pmu armv7pmu = {
-	.handle_irq		= armv7pmu_handle_irq,
-	.enable			= armv7pmu_enable_event,
-	.disable		= armv7pmu_disable_event,
-	.read_counter		= armv7pmu_read_counter,
-	.write_counter		= armv7pmu_write_counter,
-	.get_event_idx		= armv7pmu_get_event_idx,
-	.start			= armv7pmu_start,
-	.stop			= armv7pmu_stop,
-	.reset			= armv7pmu_reset,
-	.max_period		= (1LLU << 32) - 1,
+static void armv7pmu_init(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->handle_irq	= armv7pmu_handle_irq;
+	cpu_pmu->enable		= armv7pmu_enable_event;
+	cpu_pmu->disable	= armv7pmu_disable_event;
+	cpu_pmu->read_counter	= armv7pmu_read_counter;
+	cpu_pmu->write_counter	= armv7pmu_write_counter;
+	cpu_pmu->get_event_idx	= armv7pmu_get_event_idx;
+	cpu_pmu->start		= armv7pmu_start;
+	cpu_pmu->stop		= armv7pmu_stop;
+	cpu_pmu->reset		= armv7pmu_reset;
+	cpu_pmu->max_period	= (1LLU << 32) - 1;
 };
 
 static u32 __devinit armv7_read_num_pmnc_events(void)
@@ -1256,70 +1237,75 @@
 	return nb_cnt + 1;
 }
 
-static struct arm_pmu *__devinit armv7_a8_pmu_init(void)
+static int __devinit armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7pmu.name		= "ARMv7 Cortex-A8";
-	armv7pmu.map_event	= armv7_a8_map_event;
-	armv7pmu.num_events	= armv7_read_num_pmnc_events();
-	return &armv7pmu;
+	armv7pmu_init(cpu_pmu);
+	cpu_pmu->name		= "ARMv7 Cortex-A8";
+	cpu_pmu->map_event	= armv7_a8_map_event;
+	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
+	return 0;
 }
 
-static struct arm_pmu *__devinit armv7_a9_pmu_init(void)
+static int __devinit armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7pmu.name		= "ARMv7 Cortex-A9";
-	armv7pmu.map_event	= armv7_a9_map_event;
-	armv7pmu.num_events	= armv7_read_num_pmnc_events();
-	return &armv7pmu;
+	armv7pmu_init(cpu_pmu);
+	cpu_pmu->name		= "ARMv7 Cortex-A9";
+	cpu_pmu->map_event	= armv7_a9_map_event;
+	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
+	return 0;
 }
 
-static struct arm_pmu *__devinit armv7_a5_pmu_init(void)
+static int __devinit armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7pmu.name		= "ARMv7 Cortex-A5";
-	armv7pmu.map_event	= armv7_a5_map_event;
-	armv7pmu.num_events	= armv7_read_num_pmnc_events();
-	return &armv7pmu;
+	armv7pmu_init(cpu_pmu);
+	cpu_pmu->name		= "ARMv7 Cortex-A5";
+	cpu_pmu->map_event	= armv7_a5_map_event;
+	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
+	return 0;
 }
 
-static struct arm_pmu *__devinit armv7_a15_pmu_init(void)
+static int __devinit armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7pmu.name		= "ARMv7 Cortex-A15";
-	armv7pmu.map_event	= armv7_a15_map_event;
-	armv7pmu.num_events	= armv7_read_num_pmnc_events();
-	armv7pmu.set_event_filter = armv7pmu_set_event_filter;
-	return &armv7pmu;
+	armv7pmu_init(cpu_pmu);
+	cpu_pmu->name		= "ARMv7 Cortex-A15";
+	cpu_pmu->map_event	= armv7_a15_map_event;
+	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
+	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
+	return 0;
 }
 
-static struct arm_pmu *__devinit armv7_a7_pmu_init(void)
+static int __devinit armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	armv7pmu.name		= "ARMv7 Cortex-A7";
-	armv7pmu.map_event	= armv7_a7_map_event;
-	armv7pmu.num_events	= armv7_read_num_pmnc_events();
-	armv7pmu.set_event_filter = armv7pmu_set_event_filter;
-	return &armv7pmu;
+	armv7pmu_init(cpu_pmu);
+	cpu_pmu->name		= "ARMv7 Cortex-A7";
+	cpu_pmu->map_event	= armv7_a7_map_event;
+	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
+	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
+	return 0;
 }
 #else
-static struct arm_pmu *__devinit armv7_a8_pmu_init(void)
+static inline int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit armv7_a9_pmu_init(void)
+static inline int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit armv7_a5_pmu_init(void)
+static inline int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit armv7_a15_pmu_init(void)
+static inline int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit armv7_a7_pmu_init(void)
+static inline int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 #endif	/* CONFIG_CPU_V7 */
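
All of the per-implementation callbacks above now take a struct perf_event * and recover both the counter index (hwc->idx) and the owning PMU from it; to_arm_pmu() is presumably a container_of()-style helper on the embedded struct pmu. A small self-contained sketch of that recovery pattern, using illustrative names rather than the kernel's real structures:

#include <stddef.h>
#include <stdio.h>

/* Minimal container_of, as used throughout the kernel. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct pmu { int type; };

/* Illustrative stand-in: the real struct arm_pmu embeds a struct pmu
 * and carries the per-implementation state and callbacks. */
struct arm_pmu_sketch {
	const char *name;
	int num_events;
	struct pmu pmu;			/* embedded generic pmu */
};

#define to_arm_pmu_sketch(p) container_of(p, struct arm_pmu_sketch, pmu)

struct perf_event_sketch {
	struct pmu *pmu;
	int idx;
};

/* An enable-style callback only needs the event: the owning PMU and
 * the counter index are both recoverable from it. */
static void enable_event(struct perf_event_sketch *event)
{
	struct arm_pmu_sketch *cpu_pmu = to_arm_pmu_sketch(event->pmu);

	printf("enable idx %d on %s\n", event->idx, cpu_pmu->name);
}

int main(void)
{
	struct arm_pmu_sketch v7 = { .name = "ARMv7", .num_events = 5 };
	struct perf_event_sketch ev = { .pmu = &v7.pmu, .idx = 1 };

	enable_event(&ev);
	return 0;
}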
diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c
index 426e19f..0c8265e 100644
--- a/arch/arm/kernel/perf_event_xscale.c
+++ b/arch/arm/kernel/perf_event_xscale.c
@@ -224,7 +224,8 @@
 {
 	unsigned long pmnc;
 	struct perf_sample_data data;
-	struct pmu_hw_events *cpuc;
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+	struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
 	struct pt_regs *regs;
 	int idx;
 
@@ -248,7 +249,6 @@
 
 	regs = get_irq_regs();
 
-	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
 		struct hw_perf_event *hwc;
@@ -260,13 +260,13 @@
 			continue;
 
 		hwc = &event->hw;
-		armpmu_event_update(event, hwc, idx);
+		armpmu_event_update(event);
 		perf_sample_data_init(&data, 0, hwc->last_period);
-		if (!armpmu_event_set_period(event, hwc, idx))
+		if (!armpmu_event_set_period(event))
 			continue;
 
 		if (perf_event_overflow(event, &data, regs))
-			cpu_pmu->disable(hwc, idx);
+			cpu_pmu->disable(event);
 	}
 
 	irq_work_run();
@@ -280,11 +280,13 @@
 	return IRQ_HANDLED;
 }
 
-static void
-xscale1pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void xscale1pmu_enable_event(struct perf_event *event)
 {
 	unsigned long val, mask, evt, flags;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	switch (idx) {
 	case XSCALE_CYCLE_COUNTER:
@@ -314,11 +316,13 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-xscale1pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void xscale1pmu_disable_event(struct perf_event *event)
 {
 	unsigned long val, mask, evt, flags;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	switch (idx) {
 	case XSCALE_CYCLE_COUNTER:
@@ -348,9 +352,10 @@
 
 static int
 xscale1pmu_get_event_idx(struct pmu_hw_events *cpuc,
-			struct hw_perf_event *event)
+				struct perf_event *event)
 {
-	if (XSCALE_PERFCTR_CCNT == event->config_base) {
+	struct hw_perf_event *hwc = &event->hw;
+	if (XSCALE_PERFCTR_CCNT == hwc->config_base) {
 		if (test_and_set_bit(XSCALE_CYCLE_COUNTER, cpuc->used_mask))
 			return -EAGAIN;
 
@@ -366,8 +371,7 @@
 	}
 }
 
-static void
-xscale1pmu_start(void)
+static void xscale1pmu_start(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -379,8 +383,7 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-xscale1pmu_stop(void)
+static void xscale1pmu_stop(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -392,9 +395,10 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static inline u32
-xscale1pmu_read_counter(int counter)
+static inline u32 xscale1pmu_read_counter(struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
 	u32 val = 0;
 
 	switch (counter) {
@@ -412,9 +416,11 @@
 	return val;
 }
 
-static inline void
-xscale1pmu_write_counter(int counter, u32 val)
+static inline void xscale1pmu_write_counter(struct perf_event *event, u32 val)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
+
 	switch (counter) {
 	case XSCALE_CYCLE_COUNTER:
 		asm volatile("mcr p14, 0, %0, c1, c0, 0" : : "r" (val));
@@ -434,24 +440,22 @@
 				&xscale_perf_cache_map, 0xFF);
 }
 
-static struct arm_pmu xscale1pmu = {
-	.name		= "xscale1",
-	.handle_irq	= xscale1pmu_handle_irq,
-	.enable		= xscale1pmu_enable_event,
-	.disable	= xscale1pmu_disable_event,
-	.read_counter	= xscale1pmu_read_counter,
-	.write_counter	= xscale1pmu_write_counter,
-	.get_event_idx	= xscale1pmu_get_event_idx,
-	.start		= xscale1pmu_start,
-	.stop		= xscale1pmu_stop,
-	.map_event	= xscale_map_event,
-	.num_events	= 3,
-	.max_period	= (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit xscale1pmu_init(void)
+static int __devinit xscale1pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return &xscale1pmu;
+	cpu_pmu->name		= "xscale1";
+	cpu_pmu->handle_irq	= xscale1pmu_handle_irq;
+	cpu_pmu->enable		= xscale1pmu_enable_event;
+	cpu_pmu->disable	= xscale1pmu_disable_event;
+	cpu_pmu->read_counter	= xscale1pmu_read_counter;
+	cpu_pmu->write_counter	= xscale1pmu_write_counter;
+	cpu_pmu->get_event_idx	= xscale1pmu_get_event_idx;
+	cpu_pmu->start		= xscale1pmu_start;
+	cpu_pmu->stop		= xscale1pmu_stop;
+	cpu_pmu->map_event	= xscale_map_event;
+	cpu_pmu->num_events	= 3;
+	cpu_pmu->max_period	= (1LLU << 32) - 1;
+
+	return 0;
 }
 
 #define XSCALE2_OVERFLOWED_MASK	0x01f
@@ -567,7 +571,8 @@
 {
 	unsigned long pmnc, of_flags;
 	struct perf_sample_data data;
-	struct pmu_hw_events *cpuc;
+	struct arm_pmu *cpu_pmu = (struct arm_pmu *)dev;
+	struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events();
 	struct pt_regs *regs;
 	int idx;
 
@@ -585,7 +590,6 @@
 
 	regs = get_irq_regs();
 
-	cpuc = &__get_cpu_var(cpu_hw_events);
 	for (idx = 0; idx < cpu_pmu->num_events; ++idx) {
 		struct perf_event *event = cpuc->events[idx];
 		struct hw_perf_event *hwc;
@@ -597,13 +601,13 @@
 			continue;
 
 		hwc = &event->hw;
-		armpmu_event_update(event, hwc, idx);
+		armpmu_event_update(event);
 		perf_sample_data_init(&data, 0, hwc->last_period);
-		if (!armpmu_event_set_period(event, hwc, idx))
+		if (!armpmu_event_set_period(event))
 			continue;
 
 		if (perf_event_overflow(event, &data, regs))
-			cpu_pmu->disable(hwc, idx);
+			cpu_pmu->disable(event);
 	}
 
 	irq_work_run();
@@ -617,11 +621,13 @@
 	return IRQ_HANDLED;
 }
 
-static void
-xscale2pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void xscale2pmu_enable_event(struct perf_event *event)
 {
 	unsigned long flags, ien, evtsel;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	ien = xscale2pmu_read_int_enable();
 	evtsel = xscale2pmu_read_event_select();
@@ -661,11 +667,13 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-xscale2pmu_disable_event(struct hw_perf_event *hwc, int idx)
+static void xscale2pmu_disable_event(struct perf_event *event)
 {
 	unsigned long flags, ien, evtsel, of_flags;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
+	int idx = hwc->idx;
 
 	ien = xscale2pmu_read_int_enable();
 	evtsel = xscale2pmu_read_event_select();
@@ -713,7 +721,7 @@
 
 static int
 xscale2pmu_get_event_idx(struct pmu_hw_events *cpuc,
-			struct hw_perf_event *event)
+				struct perf_event *event)
 {
 	int idx = xscale1pmu_get_event_idx(cpuc, event);
 	if (idx >= 0)
@@ -727,8 +735,7 @@
 	return idx;
 }
 
-static void
-xscale2pmu_start(void)
+static void xscale2pmu_start(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -740,8 +747,7 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static void
-xscale2pmu_stop(void)
+static void xscale2pmu_stop(struct arm_pmu *cpu_pmu)
 {
 	unsigned long flags, val;
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
@@ -753,9 +759,10 @@
 	raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
 }
 
-static inline u32
-xscale2pmu_read_counter(int counter)
+static inline u32 xscale2pmu_read_counter(struct perf_event *event)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
 	u32 val = 0;
 
 	switch (counter) {
@@ -779,9 +786,11 @@
 	return val;
 }
 
-static inline void
-xscale2pmu_write_counter(int counter, u32 val)
+static inline void xscale2pmu_write_counter(struct perf_event *event, u32 val)
 {
+	struct hw_perf_event *hwc = &event->hw;
+	int counter = hwc->idx;
+
 	switch (counter) {
 	case XSCALE_CYCLE_COUNTER:
 		asm volatile("mcr p14, 0, %0, c1, c1, 0" : : "r" (val));
@@ -801,33 +810,31 @@
 	}
 }
 
-static struct arm_pmu xscale2pmu = {
-	.name		= "xscale2",
-	.handle_irq	= xscale2pmu_handle_irq,
-	.enable		= xscale2pmu_enable_event,
-	.disable	= xscale2pmu_disable_event,
-	.read_counter	= xscale2pmu_read_counter,
-	.write_counter	= xscale2pmu_write_counter,
-	.get_event_idx	= xscale2pmu_get_event_idx,
-	.start		= xscale2pmu_start,
-	.stop		= xscale2pmu_stop,
-	.map_event	= xscale_map_event,
-	.num_events	= 5,
-	.max_period	= (1LLU << 32) - 1,
-};
-
-static struct arm_pmu *__devinit xscale2pmu_init(void)
+static int __devinit xscale2pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return &xscale2pmu;
+	cpu_pmu->name		= "xscale2";
+	cpu_pmu->handle_irq	= xscale2pmu_handle_irq;
+	cpu_pmu->enable		= xscale2pmu_enable_event;
+	cpu_pmu->disable	= xscale2pmu_disable_event;
+	cpu_pmu->read_counter	= xscale2pmu_read_counter;
+	cpu_pmu->write_counter	= xscale2pmu_write_counter;
+	cpu_pmu->get_event_idx	= xscale2pmu_get_event_idx;
+	cpu_pmu->start		= xscale2pmu_start;
+	cpu_pmu->stop		= xscale2pmu_stop;
+	cpu_pmu->map_event	= xscale_map_event;
+	cpu_pmu->num_events	= 5;
+	cpu_pmu->max_period	= (1LLU << 32) - 1;
+
+	return 0;
 }
 #else
-static struct arm_pmu *__devinit xscale1pmu_init(void)
+static inline int xscale1pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 
-static struct arm_pmu *__devinit xscale2pmu_init(void)
+static inline int xscale2pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return NULL;
+	return -ENODEV;
 }
 #endif	/* CONFIG_CPU_XSCALE */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 90084a6..44bc0b3 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -34,6 +34,7 @@
 #include <linux/leds.h>
 
 #include <asm/cacheflush.h>
+#include <asm/idmap.h>
 #include <asm/processor.h>
 #include <asm/thread_notify.h>
 #include <asm/stacktrace.h>
@@ -56,8 +57,6 @@
   "ARM" , "Thumb" , "Jazelle", "ThumbEE"
 };
 
-extern void setup_mm_for_reboot(void);
-
 static volatile int hlt_counter;
 
 void disable_hlt(void)
@@ -70,6 +69,7 @@
 void enable_hlt(void)
 {
 	hlt_counter--;
+	BUG_ON(hlt_counter < 0);
 }
 
 EXPORT_SYMBOL(enable_hlt);
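
enable_hlt() now traps an unbalanced enable/disable pairing as soon as the counter would go negative. A tiny self-contained sketch of that guard (assert() stands in for BUG_ON()):

#include <assert.h>
#include <stdio.h>

/* A disable/enable counter must never go negative, so trip an
 * assertion as soon as an extra enable call sneaks in. */
static int hlt_counter;

static void disable_hlt_sketch(void)
{
	hlt_counter++;
}

static void enable_hlt_sketch(void)
{
	hlt_counter--;
	assert(hlt_counter >= 0);	/* stands in for BUG_ON() */
}

int main(void)
{
	disable_hlt_sketch();
	enable_hlt_sketch();
	printf("balanced, counter=%d\n", hlt_counter);
	/* enable_hlt_sketch();  -- one extra call would trip the assert */
	return 0;
}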
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 739db3a..03deeff 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -916,16 +916,11 @@
 	PTRACE_SYSCALL_EXIT,
 };
 
-static int ptrace_syscall_trace(struct pt_regs *regs, int scno,
-				enum ptrace_syscall_dir dir)
+static int tracehook_report_syscall(struct pt_regs *regs,
+				    enum ptrace_syscall_dir dir)
 {
 	unsigned long ip;
 
-	current_thread_info()->syscall = scno;
-
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return scno;
-
 	/*
 	 * IP is used to denote syscall entry/exit:
 	 * IP = 0 -> entry, =1 -> exit
@@ -944,19 +939,41 @@
 
 asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
 {
-	scno = ptrace_syscall_trace(regs, scno, PTRACE_SYSCALL_ENTER);
+	current_thread_info()->syscall = scno;
+
+	/* Do the secure computing check first; failures should be fast. */
+	if (secure_computing(scno) == -1)
+		return -1;
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		scno = tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
 		trace_sys_enter(regs, scno);
+
 	audit_syscall_entry(AUDIT_ARCH_ARM, scno, regs->ARM_r0, regs->ARM_r1,
 			    regs->ARM_r2, regs->ARM_r3);
+
 	return scno;
 }
 
-asmlinkage int syscall_trace_exit(struct pt_regs *regs, int scno)
+asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 {
-	scno = ptrace_syscall_trace(regs, scno, PTRACE_SYSCALL_EXIT);
-	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
-		trace_sys_exit(regs, scno);
+	/*
+	 * Audit the syscall before anything else, as a debugger may
+	 * come in and change the current registers.
+	 */
 	audit_syscall_exit(regs);
-	return scno;
+
+	/*
+	 * Note that we haven't updated the ->syscall field for the
+	 * current thread. This isn't a problem because it will have
+	 * been set on syscall entry and there hasn't been an opportunity
+	 * for a PTRACE_SET_SYSCALL since then.
+	 */
+	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+		trace_sys_exit(regs, regs_return_value(regs));
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
 }
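
The reworked entry path above fixes the ordering of the syscall hooks: the seccomp decision comes first so a denied syscall fails fast, a ptrace tracer may then rewrite the syscall number, and the tracepoint and audit hooks see the possibly rewritten number. A self-contained sketch of that ordering, with stub hooks standing in for secure_computing(), tracehook_report_syscall(), trace_sys_enter() and audit_syscall_entry():

#include <stdio.h>

/* Illustrative stubs; a negative syscall number models a seccomp deny. */
static int seccomp_check(int scno)	{ return scno < 0 ? -1 : 0; }
static int ptrace_report(int scno)	{ printf("ptrace: %d\n", scno); return scno; }
static void tracepoint_enter(int scno)	{ printf("tracepoint: %d\n", scno); }
static void audit_enter(int scno)	{ printf("audit: %d\n", scno); }

/* Entry-side ordering from the hunk above: seccomp first, then the
 * tracer (which may rewrite scno), then tracepoint and audit. */
static int syscall_trace_enter_sketch(int scno, int traced)
{
	if (seccomp_check(scno) == -1)
		return -1;

	if (traced)
		scno = ptrace_report(scno);

	tracepoint_enter(scno);
	audit_enter(scno);
	return scno;
}

int main(void)
{
	printf("-> %d\n", syscall_trace_enter_sketch(42, 1));
	printf("-> %d\n", syscall_trace_enter_sketch(-3, 1));
	return 0;
}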
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index da1d1aa..9a89bf4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -383,6 +383,12 @@
 		BUG();
 	}
 
+	/*
+	 * This only works on resume and secondary cores. For booting on the
+	 * boot cpu, smp_prepare_boot_cpu is called after percpu area setup.
+	 */
+	set_my_cpu_offset(per_cpu_offset(cpu));
+
 	cpu_proc_init();
 
 	/*
@@ -426,13 +432,14 @@
 void __init smp_setup_processor_id(void)
 {
 	int i;
-	u32 cpu = is_smp() ? read_cpuid_mpidr() & 0xff : 0;
+	u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;
+	u32 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
 
 	cpu_logical_map(0) = cpu;
-	for (i = 1; i < NR_CPUS; ++i)
+	for (i = 1; i < nr_cpu_ids; ++i)
 		cpu_logical_map(i) = i == cpu ? 0 : i;
 
-	printk(KERN_INFO "Booting Linux on physical CPU %d\n", cpu);
+	printk(KERN_INFO "Booting Linux on physical CPU 0x%x\n", mpidr);
 }
 
 static void __init setup_processor(void)
@@ -758,6 +765,7 @@
 
 	unflatten_device_tree();
 
+	arm_dt_init_cpu_maps();
 #ifdef CONFIG_SMP
 	if (is_smp()) {
 		smp_set_ops(mdesc->smp);
@@ -841,12 +849,9 @@
 
 static int c_show(struct seq_file *m, void *v)
 {
-	int i;
+	int i, j;
+	u32 cpuid;
 
-	seq_printf(m, "Processor\t: %s rev %d (%s)\n",
-		   cpu_name, read_cpuid_id() & 15, elf_platform);
-
-#if defined(CONFIG_SMP)
 	for_each_online_cpu(i) {
 		/*
 		 * glibc reads /proc/cpuinfo to determine the number of
@@ -854,45 +859,48 @@
 		 * "processor".  Give glibc what it expects.
 		 */
 		seq_printf(m, "processor\t: %d\n", i);
-		seq_printf(m, "BogoMIPS\t: %lu.%02lu\n\n",
+		cpuid = is_smp() ? per_cpu(cpu_data, i).cpuid : read_cpuid_id();
+		seq_printf(m, "model name\t: %s rev %d (%s)\n",
+			   cpu_name, cpuid & 15, elf_platform);
+
+#if defined(CONFIG_SMP)
+		seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
 			   per_cpu(cpu_data, i).loops_per_jiffy / (500000UL/HZ),
 			   (per_cpu(cpu_data, i).loops_per_jiffy / (5000UL/HZ)) % 100);
-	}
-#else /* CONFIG_SMP */
-	seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
-		   loops_per_jiffy / (500000/HZ),
-		   (loops_per_jiffy / (5000/HZ)) % 100);
+#else
+		seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
+			   loops_per_jiffy / (500000/HZ),
+			   (loops_per_jiffy / (5000/HZ)) % 100);
 #endif
+		/* dump out the processor features */
+		seq_puts(m, "Features\t: ");
 
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: ");
+		for (j = 0; hwcap_str[j]; j++)
+			if (elf_hwcap & (1 << j))
+				seq_printf(m, "%s ", hwcap_str[j]);
 
-	for (i = 0; hwcap_str[i]; i++)
-		if (elf_hwcap & (1 << i))
-			seq_printf(m, "%s ", hwcap_str[i]);
+		seq_printf(m, "\nCPU implementer\t: 0x%02x\n", cpuid >> 24);
+		seq_printf(m, "CPU architecture: %s\n",
+			   proc_arch[cpu_architecture()]);
 
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", read_cpuid_id() >> 24);
-	seq_printf(m, "CPU architecture: %s\n", proc_arch[cpu_architecture()]);
-
-	if ((read_cpuid_id() & 0x0008f000) == 0x00000000) {
-		/* pre-ARM7 */
-		seq_printf(m, "CPU part\t: %07x\n", read_cpuid_id() >> 4);
-	} else {
-		if ((read_cpuid_id() & 0x0008f000) == 0x00007000) {
-			/* ARM7 */
-			seq_printf(m, "CPU variant\t: 0x%02x\n",
-				   (read_cpuid_id() >> 16) & 127);
+		if ((cpuid & 0x0008f000) == 0x00000000) {
+			/* pre-ARM7 */
+			seq_printf(m, "CPU part\t: %07x\n", cpuid >> 4);
 		} else {
-			/* post-ARM7 */
-			seq_printf(m, "CPU variant\t: 0x%x\n",
-				   (read_cpuid_id() >> 20) & 15);
+			if ((cpuid & 0x0008f000) == 0x00007000) {
+				/* ARM7 */
+				seq_printf(m, "CPU variant\t: 0x%02x\n",
+					   (cpuid >> 16) & 127);
+			} else {
+				/* post-ARM7 */
+				seq_printf(m, "CPU variant\t: 0x%x\n",
+					   (cpuid >> 20) & 15);
+			}
+			seq_printf(m, "CPU part\t: 0x%03x\n",
+				   (cpuid >> 4) & 0xfff);
 		}
-		seq_printf(m, "CPU part\t: 0x%03x\n",
-			   (read_cpuid_id() >> 4) & 0xfff);
+		seq_printf(m, "CPU revision\t: %d\n\n", cpuid & 15);
 	}
-	seq_printf(m, "CPU revision\t: %d\n", read_cpuid_id() & 15);
-
-	seq_puts(m, "\n");
 
 	seq_printf(m, "Hardware\t: %s\n", machine_name);
 	seq_printf(m, "Revision\t: %04x\n", system_rev);
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index fbc8b26..84f4cbf 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -281,6 +281,7 @@
 	struct cpuinfo_arm *cpu_info = &per_cpu(cpu_data, cpuid);
 
 	cpu_info->loops_per_jiffy = loops_per_jiffy;
+	cpu_info->cpuid = read_cpuid_id();
 
 	store_cpu_topology(cpuid);
 }
@@ -313,9 +314,10 @@
 	current->active_mm = mm;
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 
+	cpu_init();
+
 	printk("CPU%u: Booted secondary processor\n", cpu);
 
-	cpu_init();
 	preempt_disable();
 	trace_hardirqs_off();
 
@@ -371,6 +373,7 @@
 
 void __init smp_prepare_boot_cpu(void)
 {
+	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -421,6 +424,11 @@
 	smp_cross_call(mask, IPI_CALL_FUNC);
 }
 
+void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+{
+	smp_cross_call(mask, IPI_WAKEUP);
+}
+
 void arch_send_call_function_single_ipi(int cpu)
 {
 	smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
@@ -443,7 +451,7 @@
 	for (i = 0; i < NR_IPI; i++) {
 		seq_printf(p, "%*s%u: ", prec - 1, "IPI", i);
 
-		for_each_present_cpu(cpu)
+		for_each_online_cpu(cpu)
 			seq_printf(p, "%10u ",
 				   __get_irq_stat(cpu, ipi_irqs[i]));
 
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index b22d700..ff07879 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -31,6 +31,8 @@
 
 static struct clk *twd_clk;
 static unsigned long twd_timer_rate;
+static bool common_setup_called;
+static DEFINE_PER_CPU(bool, percpu_setup_called);
 
 static struct clock_event_device __percpu **twd_evt;
 static int twd_ppi;
@@ -248,17 +250,9 @@
 		return clk;
 	}
 
-	err = clk_prepare(clk);
+	err = clk_prepare_enable(clk);
 	if (err) {
-		pr_err("smp_twd: clock failed to prepare: %d\n", err);
-		clk_put(clk);
-		return ERR_PTR(err);
-	}
-
-	err = clk_enable(clk);
-	if (err) {
-		pr_err("smp_twd: clock failed to enable: %d\n", err);
-		clk_unprepare(clk);
+		pr_err("smp_twd: clock failed to prepare+enable: %d\n", err);
 		clk_put(clk);
 		return ERR_PTR(err);
 	}
@@ -272,15 +266,45 @@
 static int __cpuinit twd_timer_setup(struct clock_event_device *clk)
 {
 	struct clock_event_device **this_cpu_clk;
+	int cpu = smp_processor_id();
 
-	if (!twd_clk)
+	/*
+	 * If the basic setup for this CPU has been done before, don't
+	 * bother with the below.
+	 */
+	if (per_cpu(percpu_setup_called, cpu)) {
+		__raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+		clockevents_register_device(*__this_cpu_ptr(twd_evt));
+		enable_percpu_irq(clk->irq, 0);
+		return 0;
+	}
+	per_cpu(percpu_setup_called, cpu) = true;
+
+	/*
+	 * This stuff only needs to be done once for the entire TWD cluster
+	 * during the runtime of the system.
+	 */
+	if (!common_setup_called) {
 		twd_clk = twd_get_clock();
 
-	if (!IS_ERR_OR_NULL(twd_clk))
-		twd_timer_rate = clk_get_rate(twd_clk);
-	else
-		twd_calibrate_rate();
+		/*
+		 * We use IS_ERR_OR_NULL() here, because if the clock stubs
+		 * are active we will get a valid but NULL clk reference,
+		 * whose rate reads back as 0. In that case we need to
+		 * calibrate the rate instead.
+		 */
+		if (!IS_ERR_OR_NULL(twd_clk))
+			twd_timer_rate = clk_get_rate(twd_clk);
+		else
+			twd_calibrate_rate();
 
+		common_setup_called = true;
+	}
+
+	/*
+	 * The following is done once per CPU the first time .setup() is
+	 * called.
+	 */
 	__raw_writel(0, twd_base + TWD_TIMER_CONTROL);
 
 	clk->name = "local_timer";
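
The reworked twd_timer_setup() above splits its work into a once-per-system part (clock lookup and rate calibration, guarded by common_setup_called) and a once-per-CPU part (guarded by percpu_setup_called), so a CPU coming back from hotplug only re-registers its clockevent. A minimal sketch of that two-level guard, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 4

/* One flag for work done once for the whole cluster, one flag per CPU
 * for work done once per CPU; both names are illustrative stand-ins
 * for common_setup_called and percpu_setup_called above. */
static bool common_done;
static bool percpu_done[NR_CPUS_SKETCH];

static void timer_setup_sketch(int cpu)
{
	if (percpu_done[cpu]) {
		printf("cpu%d: re-registering after hotplug\n", cpu);
		return;
	}
	percpu_done[cpu] = true;

	if (!common_done) {
		printf("cpu%d: probing clock / calibrating rate\n", cpu);
		common_done = true;
	}

	printf("cpu%d: first-time per-cpu setup\n", cpu);
}

int main(void)
{
	timer_setup_sketch(0);
	timer_setup_sketch(1);
	timer_setup_sketch(1);	/* simulate CPU1 going offline and back */
	return 0;
}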
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 26c12c6..79282eb 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -196,32 +196,7 @@
 static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
 #endif
 
-
-/*
- * cpu topology management
- */
-
-#define MPIDR_SMP_BITMASK (0x3 << 30)
-#define MPIDR_SMP_VALUE (0x2 << 30)
-
-#define MPIDR_MT_BITMASK (0x1 << 24)
-
-/*
- * These masks reflect the current use of the affinity levels.
- * The affinity level can be up to 16 bits according to ARM ARM
- */
-#define MPIDR_HWID_BITMASK 0xFFFFFF
-
-#define MPIDR_LEVEL0_MASK 0x3
-#define MPIDR_LEVEL0_SHIFT 0
-
-#define MPIDR_LEVEL1_MASK 0xF
-#define MPIDR_LEVEL1_SHIFT 8
-
-#define MPIDR_LEVEL2_MASK 0xFF
-#define MPIDR_LEVEL2_SHIFT 16
-
-/*
+ /*
  * cpu topology table
  */
 struct cputopo_arm cpu_topology[NR_CPUS];
@@ -282,19 +257,14 @@
 
 		if (mpidr & MPIDR_MT_BITMASK) {
 			/* core performance interdependency */
-			cpuid_topo->thread_id = (mpidr >> MPIDR_LEVEL0_SHIFT)
-				& MPIDR_LEVEL0_MASK;
-			cpuid_topo->core_id = (mpidr >> MPIDR_LEVEL1_SHIFT)
-				& MPIDR_LEVEL1_MASK;
-			cpuid_topo->socket_id = (mpidr >> MPIDR_LEVEL2_SHIFT)
-				& MPIDR_LEVEL2_MASK;
+			cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+			cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+			cpuid_topo->socket_id = MPIDR_AFFINITY_LEVEL(mpidr, 2);
 		} else {
 			/* largely independent cores */
 			cpuid_topo->thread_id = -1;
-			cpuid_topo->core_id = (mpidr >> MPIDR_LEVEL0_SHIFT)
-				& MPIDR_LEVEL0_MASK;
-			cpuid_topo->socket_id = (mpidr >> MPIDR_LEVEL1_SHIFT)
-				& MPIDR_LEVEL1_MASK;
+			cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+			cpuid_topo->socket_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
 		}
 	} else {
 		/*
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 36ff15b..b9f38e3 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -114,6 +114,15 @@
 
 	RO_DATA(PAGE_SIZE)
 
+	. = ALIGN(4);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+#ifdef CONFIG_MMU
+		*(__ex_table)
+#endif
+		__stop___ex_table = .;
+	}
+
 #ifdef CONFIG_ARM_UNWIND
 	/*
 	 * Stack unwinding tables
@@ -220,16 +229,6 @@
 		READ_MOSTLY_DATA(L1_CACHE_BYTES)
 
 		/*
-		 * The exception fixup table (might need resorting at runtime)
-		 */
-		. = ALIGN(4);
-		__start___ex_table = .;
-#ifdef CONFIG_MMU
-		*(__ex_table)
-#endif
-		__stop___ex_table = .;
-
-		/*
 		 * and the usual data section
 		 */
 		DATA_DATA
diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c
index b3d86d7..9f82f9d 100644
--- a/arch/arm/mach-integrator/impd1.c
+++ b/arch/arm/mach-integrator/impd1.c
@@ -342,9 +342,10 @@
 
 		pc_base = dev->resource.start + idev->offset;
 		snprintf(devname, 32, "lm%x:%5.5lx", dev->id, idev->offset >> 12);
-		d = amba_ahb_device_add(&dev->dev, devname, pc_base, SZ_4K,
-					dev->irq, dev->irq,
-					idev->platform_data, idev->id);
+		d = amba_ahb_device_add_res(&dev->dev, devname, pc_base, SZ_4K,
+					    dev->irq, dev->irq,
+					    idev->platform_data, idev->id,
+					    &dev->resource);
 		if (IS_ERR(d)) {
 			dev_err(&dev->dev, "unable to register device: %ld\n", PTR_ERR(d));
 			continue;
diff --git a/arch/arm/mach-ixp4xx/include/mach/udc.h b/arch/arm/mach-ixp4xx/include/mach/udc.h
index 80d6da2..7bd8b96 100644
--- a/arch/arm/mach-ixp4xx/include/mach/udc.h
+++ b/arch/arm/mach-ixp4xx/include/mach/udc.h
@@ -2,7 +2,7 @@
  * arch/arm/mach-ixp4xx/include/mach/udc.h
  *
  */
-#include <asm/mach/udc_pxa2xx.h>
+#include <linux/platform_data/pxa2xx_udc.h>
 
 extern void ixp4xx_set_udc_info(struct pxa2xx_udc_mach_info *info);
 
diff --git a/arch/arm/mach-omap2/pmu.c b/arch/arm/mach-omap2/pmu.c
index 2a79176..031e2fb 100644
--- a/arch/arm/mach-omap2/pmu.c
+++ b/arch/arm/mach-omap2/pmu.c
@@ -57,8 +57,6 @@
 	if (IS_ERR(omap_pmu_dev))
 		return PTR_ERR(omap_pmu_dev);
 
-	pm_runtime_enable(&omap_pmu_dev->dev);
-
 	return 0;
 }
 
diff --git a/arch/arm/mach-pxa/include/mach/udc.h b/arch/arm/mach-pxa/include/mach/udc.h
index 2f82332..9a827e3 100644
--- a/arch/arm/mach-pxa/include/mach/udc.h
+++ b/arch/arm/mach-pxa/include/mach/udc.h
@@ -2,7 +2,7 @@
  * arch/arm/mach-pxa/include/mach/udc.h
  *
  */
-#include <asm/mach/udc_pxa2xx.h>
+#include <linux/platform_data/pxa2xx_udc.h>
 
 extern void pxa_set_udc_info(struct pxa2xx_udc_mach_info *info);
 
diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c
index d3b3cd2..28511d4 100644
--- a/arch/arm/mach-realview/realview_eb.c
+++ b/arch/arm/mach-realview/realview_eb.c
@@ -467,6 +467,7 @@
 MACHINE_START(REALVIEW_EB, "ARM-RealView EB")
 	/* Maintainer: ARM Ltd/Deep Blue Solutions Ltd */
 	.atag_offset	= 0x100,
+	.smp		= smp_ops(realview_smp_ops),
 	.fixup		= realview_fixup,
 	.map_io		= realview_eb_map_io,
 	.init_early	= realview_init_early,
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index 6a7ad3c..9a23739 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/serial_core.h>
 #include <linux/mfd/ucb1x00.h>
 #include <linux/mtd/mtd.h>
@@ -37,7 +38,6 @@
 #include <asm/mach/flash.h>
 #include <asm/mach/irda.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 #include <mach/assabet.h>
 #include <linux/platform_data/mfd-mcp-sa11x0.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mach-sa1100/badge4.c b/arch/arm/mach-sa1100/badge4.c
index 038df48..b2dadf3 100644
--- a/arch/arm/mach-sa1100/badge4.c
+++ b/arch/arm/mach-sa1100/badge4.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 #include <linux/tty.h>
@@ -34,7 +35,6 @@
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
 #include <asm/hardware/sa1111.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/badge4.h>
 
diff --git a/arch/arm/mach-sa1100/cerf.c b/arch/arm/mach-sa1100/cerf.c
index ad0eb08..304bca4 100644
--- a/arch/arm/mach-sa1100/cerf.c
+++ b/arch/arm/mach-sa1100/cerf.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/tty.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/irq.h>
 #include <linux/mtd/mtd.h>
@@ -27,7 +28,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/cerf.h>
 #include <linux/platform_data/mfd-mcp-sa11x0.h>
diff --git a/arch/arm/mach-sa1100/collie.c b/arch/arm/mach-sa1100/collie.c
index 170cb61..45f424f 100644
--- a/arch/arm/mach-sa1100/collie.c
+++ b/arch/arm/mach-sa1100/collie.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/delay.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/ucb1x00.h>
 #include <linux/mtd/mtd.h>
@@ -40,7 +41,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <asm/hardware/scoop.h>
 #include <asm/mach/sharpsl_param.h>
diff --git a/arch/arm/mach-sa1100/h3xxx.c b/arch/arm/mach-sa1100/h3xxx.c
index 63150e1..f17e738 100644
--- a/arch/arm/mach-sa1100/h3xxx.c
+++ b/arch/arm/mach-sa1100/h3xxx.c
@@ -17,12 +17,12 @@
 #include <linux/mfd/htc-egpio.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/serial_core.h>
 
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/h3xxx.h>
 
diff --git a/arch/arm/mach-sa1100/hackkit.c b/arch/arm/mach-sa1100/hackkit.c
index fc106aa..d005939 100644
--- a/arch/arm/mach-sa1100/hackkit.c
+++ b/arch/arm/mach-sa1100/hackkit.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/cpufreq.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/serial_core.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
@@ -35,7 +36,6 @@
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
 #include <asm/mach/irq.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/hardware.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mach-sa1100/jornada720.c b/arch/arm/mach-sa1100/jornada720.c
index e3084f4..35cfc42 100644
--- a/arch/arm/mach-sa1100/jornada720.c
+++ b/arch/arm/mach-sa1100/jornada720.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/delay.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/ioport.h>
 #include <linux/mtd/mtd.h>
@@ -30,7 +31,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/hardware.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mach-sa1100/lart.c b/arch/arm/mach-sa1100/lart.c
index 3048b17..f69f78f 100644
--- a/arch/arm/mach-sa1100/lart.c
+++ b/arch/arm/mach-sa1100/lart.c
@@ -4,6 +4,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/tty.h>
 #include <linux/gpio.h>
 #include <linux/leds.h>
@@ -18,7 +19,6 @@
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 #include <linux/platform_data/mfd-mcp-sa11x0.h>
 #include <mach/irqs.h>
 
diff --git a/arch/arm/mach-sa1100/nanoengine.c b/arch/arm/mach-sa1100/nanoengine.c
index 41f69d9..102e08f 100644
--- a/arch/arm/mach-sa1100/nanoengine.c
+++ b/arch/arm/mach-sa1100/nanoengine.c
@@ -13,6 +13,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 #include <linux/root_dev.h>
@@ -24,7 +25,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 
 #include <mach/hardware.h>
 #include <mach/nanoengine.h>
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index 266db87..88be047 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -7,6 +7,7 @@
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/serial_core.h>
@@ -14,7 +15,6 @@
 
 #include <asm/mach-types.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 #include <asm/hardware/sa1111.h>
 #include <asm/sizes.h>
 
diff --git a/arch/arm/mach-sa1100/pleb.c b/arch/arm/mach-sa1100/pleb.c
index 37fe0a0..c51bb63 100644
--- a/arch/arm/mach-sa1100/pleb.c
+++ b/arch/arm/mach-sa1100/pleb.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/ioport.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/irq.h>
 #include <linux/io.h>
@@ -18,7 +19,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
 #include <asm/mach/flash.h>
-#include <asm/mach/serial_sa1100.h>
 #include <mach/irqs.h>
 
 #include "generic.h"
diff --git a/arch/arm/mach-sa1100/shannon.c b/arch/arm/mach-sa1100/shannon.c
index ff6b7b3..6460d25 100644
--- a/arch/arm/mach-sa1100/shannon.c
+++ b/arch/arm/mach-sa1100/shannon.c
@@ -5,6 +5,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/tty.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
@@ -18,7 +19,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 #include <linux/platform_data/mfd-mcp-sa11x0.h>
 #include <mach/shannon.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mach-sa1100/simpad.c b/arch/arm/mach-sa1100/simpad.c
index 71790e5..6d65f65 100644
--- a/arch/arm/mach-sa1100/simpad.c
+++ b/arch/arm/mach-sa1100/simpad.c
@@ -9,6 +9,7 @@
 #include <linux/proc_fs.h>
 #include <linux/string.h>
 #include <linux/pm.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/ucb1x00.h>
 #include <linux/mtd/mtd.h>
@@ -23,7 +24,6 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach/map.h>
-#include <asm/mach/serial_sa1100.h>
 #include <linux/platform_data/mfd-mcp-sa11x0.h>
 #include <mach/simpad.h>
 #include <mach/irqs.h>
diff --git a/arch/arm/mm/cache-aurora-l2.h b/arch/arm/mm/cache-aurora-l2.h
new file mode 100644
index 0000000..c861247
--- /dev/null
+++ b/arch/arm/mm/cache-aurora-l2.h
@@ -0,0 +1,55 @@
+/*
+ * AURORA shared L2 cache controller support
+ *
+ * Copyright (C) 2012 Marvell
+ *
+ * Yehuda Yitschak <yehuday@marvell.com>
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef __ASM_ARM_HARDWARE_AURORA_L2_H
+#define __ASM_ARM_HARDWARE_AURORA_L2_H
+
+#define AURORA_SYNC_REG		    0x700
+#define AURORA_RANGE_BASE_ADDR_REG  0x720
+#define AURORA_FLUSH_PHY_ADDR_REG   0x7f0
+#define AURORA_INVAL_RANGE_REG	    0x774
+#define AURORA_CLEAN_RANGE_REG	    0x7b4
+#define AURORA_FLUSH_RANGE_REG	    0x7f4
+
+#define AURORA_ACR_REPLACEMENT_OFFSET	    27
+#define AURORA_ACR_REPLACEMENT_MASK	     \
+	(0x3 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_WAYRR    \
+	(0 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_LFSR     \
+	(1 << AURORA_ACR_REPLACEMENT_OFFSET)
+#define AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU \
+	(3 << AURORA_ACR_REPLACEMENT_OFFSET)
+
+#define AURORA_ACR_FORCE_WRITE_POLICY_OFFSET	0
+#define AURORA_ACR_FORCE_WRITE_POLICY_MASK	\
+	(0x3 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_POLICY_DIS	\
+	(0 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_BACK_POLICY	\
+	(1 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+#define AURORA_ACR_FORCE_WRITE_THRO_POLICY	\
+	(2 << AURORA_ACR_FORCE_WRITE_POLICY_OFFSET)
+
+#define MAX_RANGE_SIZE		1024
+
+#define AURORA_WAY_SIZE_SHIFT	2
+
+#define AURORA_CTRL_FW		0x100
+
+/* choose a number outside L2X0_CACHE_ID_PART_MASK to make sure a
+ * number coming from the device tree can always be distinguished
+ * from a number coming from hardware */
+#define AURORA_CACHE_ID	       0x100
+
+#endif /* __ASM_ARM_HARDWARE_AURORA_L2_H */
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index 8a97e64..6911b8b 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -25,6 +25,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/hardware/cache-l2x0.h>
+#include "cache-aurora-l2.h"
 
 #define CACHE_LINE_SIZE		32
 
@@ -34,14 +35,20 @@
 static u32 l2x0_size;
 static unsigned long sync_reg_offset = L2X0_CACHE_SYNC;
 
+/* Aurora doesn't have the cache ID register available, so we have to
+ * pass it through the device tree */
+static u32  cache_id_part_number_from_dt;
+
 struct l2x0_regs l2x0_saved_regs;
 
 struct l2x0_of_data {
 	void (*setup)(const struct device_node *, u32 *, u32 *);
 	void (*save)(void);
-	void (*resume)(void);
+	struct outer_cache_fns outer_cache;
 };
 
+static bool of_init = false;
+
 static inline void cache_wait_way(void __iomem *reg, unsigned long mask)
 {
 	/* wait for cache operation by line or way to complete */
@@ -168,7 +175,7 @@
 	/* invalidate all ways */
 	raw_spin_lock_irqsave(&l2x0_lock, flags);
 	/* Invalidating when L2 is enabled is a nono */
-	BUG_ON(readl(l2x0_base + L2X0_CTRL) & 1);
+	BUG_ON(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN);
 	writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_INV_WAY);
 	cache_wait_way(l2x0_base + L2X0_INV_WAY, l2x0_way_mask);
 	cache_sync();
@@ -292,11 +299,18 @@
 	int lockregs;
 	int i;
 
-	if (cache_id == L2X0_CACHE_ID_PART_L310)
+	switch (cache_id) {
+	case L2X0_CACHE_ID_PART_L310:
 		lockregs = 8;
-	else
+		break;
+	case AURORA_CACHE_ID:
+		lockregs = 4;
+		break;
+	default:
 		/* L210 and unknown types */
 		lockregs = 1;
+		break;
+	}
 
 	for (i = 0; i < lockregs; i++) {
 		writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_D_BASE +
@@ -312,18 +326,22 @@
 	u32 cache_id;
 	u32 way_size = 0;
 	int ways;
+	int way_size_shift = L2X0_WAY_SIZE_SHIFT;
 	const char *type;
 
 	l2x0_base = base;
-
-	cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
+	if (cache_id_part_number_from_dt)
+		cache_id = cache_id_part_number_from_dt;
+	else
+		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID)
+			& L2X0_CACHE_ID_PART_MASK;
 	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
 
 	aux &= aux_mask;
 	aux |= aux_val;
 
 	/* Determine the number of ways */
-	switch (cache_id & L2X0_CACHE_ID_PART_MASK) {
+	switch (cache_id) {
 	case L2X0_CACHE_ID_PART_L310:
 		if (aux & (1 << 16))
 			ways = 16;
@@ -340,6 +358,14 @@
 		ways = (aux >> 13) & 0xf;
 		type = "L210";
 		break;
+
+	case AURORA_CACHE_ID:
+		sync_reg_offset = AURORA_SYNC_REG;
+		ways = (aux >> 13) & 0xf;
+		ways = 2 << ((ways + 1) >> 2);
+		way_size_shift = AURORA_WAY_SIZE_SHIFT;
+		type = "Aurora";
+		break;
 	default:
 		/* Assume unknown chips have 8 ways */
 		ways = 8;
@@ -353,7 +379,8 @@
 	 * L2 cache Size =  Way size * Number of ways
 	 */
 	way_size = (aux & L2X0_AUX_CTRL_WAY_SIZE_MASK) >> 17;
-	way_size = 1 << (way_size + 3);
+	way_size = 1 << (way_size + way_size_shift);
+
 	l2x0_size = ways * way_size * SZ_1K;
 
 	/*
@@ -361,7 +388,7 @@
 	 * If you are booting from non-secure mode
 	 * accessing the below registers will fault.
 	 */
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & 1)) {
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
 		/* Make sure that I&D is not locked down when starting */
 		l2x0_unlock(cache_id);
 
@@ -371,7 +398,7 @@
 		l2x0_inv_all();
 
 		/* enable L2X0 */
-		writel_relaxed(1, l2x0_base + L2X0_CTRL);
+		writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL);
 	}
 
 	/* Re-read it in case some bits are reserved. */
@@ -380,13 +407,15 @@
 	/* Save the value for resuming. */
 	l2x0_saved_regs.aux_ctrl = aux;
 
-	outer_cache.inv_range = l2x0_inv_range;
-	outer_cache.clean_range = l2x0_clean_range;
-	outer_cache.flush_range = l2x0_flush_range;
-	outer_cache.sync = l2x0_cache_sync;
-	outer_cache.flush_all = l2x0_flush_all;
-	outer_cache.inv_all = l2x0_inv_all;
-	outer_cache.disable = l2x0_disable;
+	if (!of_init) {
+		outer_cache.inv_range = l2x0_inv_range;
+		outer_cache.clean_range = l2x0_clean_range;
+		outer_cache.flush_range = l2x0_flush_range;
+		outer_cache.sync = l2x0_cache_sync;
+		outer_cache.flush_all = l2x0_flush_all;
+		outer_cache.inv_all = l2x0_inv_all;
+		outer_cache.disable = l2x0_disable;
+	}
 
 	printk(KERN_INFO "%s cache controller enabled\n", type);
 	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
@@ -394,6 +423,100 @@
 }
 
 #ifdef CONFIG_OF
+static int l2_wt_override;
+
+/*
+ * Note that the end addresses passed to Linux primitives are
+ * noninclusive, while the hardware cache range operations use
+ * inclusive start and end addresses.
+ */
+static unsigned long calc_range_end(unsigned long start, unsigned long end)
+{
+	/*
+	 * Limit the number of cache lines processed at once,
+	 * since cache range operations stall the CPU pipeline
+	 * until completion.
+	 */
+	if (end > start + MAX_RANGE_SIZE)
+		end = start + MAX_RANGE_SIZE;
+
+	/*
+	 * Cache range operations can't straddle a page boundary.
+	 */
+	if (end > PAGE_ALIGN(start+1))
+		end = PAGE_ALIGN(start+1);
+
+	return end;
+}
+
+/*
+ * Make sure 'start' and 'end' reference the same page, as L2 is PIPT
+ * and range operations only do a TLB lookup on the start address.
+ */
+static void aurora_pa_range(unsigned long start, unsigned long end,
+			unsigned long offset)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&l2x0_lock, flags);
+	writel(start, l2x0_base + AURORA_RANGE_BASE_ADDR_REG);
+	writel(end, l2x0_base + offset);
+	raw_spin_unlock_irqrestore(&l2x0_lock, flags);
+
+	cache_sync();
+}
+
+static void aurora_inv_range(unsigned long start, unsigned long end)
+{
+	/*
+	 * Align the start address down and the end address up to the cache line size.
+	 */
+	start &= ~(CACHE_LINE_SIZE - 1);
+	end = ALIGN(end, CACHE_LINE_SIZE);
+
+	/*
+	 * Invalidate all full cache lines between 'start' and 'end'.
+	 */
+	while (start < end) {
+		unsigned long range_end = calc_range_end(start, end);
+		aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+				AURORA_INVAL_RANGE_REG);
+		start = range_end;
+	}
+}
+
+static void aurora_clean_range(unsigned long start, unsigned long end)
+{
+	/*
+	 * If L2 is forced to WT, the L2 will always be clean and we
+	 * don't need to do anything here.
+	 */
+	if (!l2_wt_override) {
+		start &= ~(CACHE_LINE_SIZE - 1);
+		end = ALIGN(end, CACHE_LINE_SIZE);
+		while (start != end) {
+			unsigned long range_end = calc_range_end(start, end);
+			aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+					AURORA_CLEAN_RANGE_REG);
+			start = range_end;
+		}
+	}
+}
+
+static void aurora_flush_range(unsigned long start, unsigned long end)
+{
+	if (!l2_wt_override) {
+		start &= ~(CACHE_LINE_SIZE - 1);
+		end = ALIGN(end, CACHE_LINE_SIZE);
+		while (start != end) {
+			unsigned long range_end = calc_range_end(start, end);
+			aurora_pa_range(start, range_end - CACHE_LINE_SIZE,
+					AURORA_FLUSH_RANGE_REG);
+			start = range_end;
+		}
+	}
+}
+
 static void __init l2x0_of_setup(const struct device_node *np,
 				 u32 *aux_val, u32 *aux_mask)
 {
@@ -491,9 +614,15 @@
 	}
 }
 
+static void aurora_save(void)
+{
+	l2x0_saved_regs.ctrl = readl_relaxed(l2x0_base + L2X0_CTRL);
+	l2x0_saved_regs.aux_ctrl = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+}
+
 static void l2x0_resume(void)
 {
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & 1)) {
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
 		/* restore aux ctrl and enable l2 */
 		l2x0_unlock(readl_relaxed(l2x0_base + L2X0_CACHE_ID));
 
@@ -502,7 +631,7 @@
 
 		l2x0_inv_all();
 
-		writel_relaxed(1, l2x0_base + L2X0_CTRL);
+		writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL);
 	}
 }
 
@@ -510,7 +639,7 @@
 {
 	u32 l2x0_revision;
 
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & 1)) {
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
 		/* restore pl310 setup */
 		writel_relaxed(l2x0_saved_regs.tag_latency,
 			l2x0_base + L2X0_TAG_LATENCY_CTRL);
@@ -536,22 +665,108 @@
 	l2x0_resume();
 }
 
+static void aurora_resume(void)
+{
+	if (!(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
+		writel(l2x0_saved_regs.aux_ctrl, l2x0_base + L2X0_AUX_CTRL);
+		writel(l2x0_saved_regs.ctrl, l2x0_base + L2X0_CTRL);
+	}
+}
+
+static void __init aurora_broadcast_l2_commands(void)
+{
+	__u32 u;
+	/* Enable broadcasting of cache commands to L2 */
+	__asm__ __volatile__("mrc p15, 1, %0, c15, c2, 0" : "=r"(u));
+	u |= AURORA_CTRL_FW;		/* Set the FW bit */
+	__asm__ __volatile__("mcr p15, 1, %0, c15, c2, 0\n" : : "r"(u));
+	isb();
+}
+
+static void __init aurora_of_setup(const struct device_node *np,
+				u32 *aux_val, u32 *aux_mask)
+{
+	u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU;
+	u32 mask = AURORA_ACR_REPLACEMENT_MASK;
+
+	of_property_read_u32(np, "cache-id-part",
+			&cache_id_part_number_from_dt);
+
+	/* Determine and save the write policy */
+	l2_wt_override = of_property_read_bool(np, "wt-override");
+
+	if (l2_wt_override) {
+		val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY;
+		mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK;
+	}
+
+	*aux_val &= ~mask;
+	*aux_val |= val;
+	*aux_mask &= ~mask;
+}
+
 static const struct l2x0_of_data pl310_data = {
-	pl310_of_setup,
-	pl310_save,
-	pl310_resume,
+	.setup = pl310_of_setup,
+	.save  = pl310_save,
+	.outer_cache = {
+		.resume      = pl310_resume,
+		.inv_range   = l2x0_inv_range,
+		.clean_range = l2x0_clean_range,
+		.flush_range = l2x0_flush_range,
+		.sync        = l2x0_cache_sync,
+		.flush_all   = l2x0_flush_all,
+		.inv_all     = l2x0_inv_all,
+		.disable     = l2x0_disable,
+		.set_debug   = pl310_set_debug,
+	},
 };
 
 static const struct l2x0_of_data l2x0_data = {
-	l2x0_of_setup,
-	NULL,
-	l2x0_resume,
+	.setup = l2x0_of_setup,
+	.save  = NULL,
+	.outer_cache = {
+		.resume      = l2x0_resume,
+		.inv_range   = l2x0_inv_range,
+		.clean_range = l2x0_clean_range,
+		.flush_range = l2x0_flush_range,
+		.sync        = l2x0_cache_sync,
+		.flush_all   = l2x0_flush_all,
+		.inv_all     = l2x0_inv_all,
+		.disable     = l2x0_disable,
+	},
+};
+
+static const struct l2x0_of_data aurora_with_outer_data = {
+	.setup = aurora_of_setup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.resume      = aurora_resume,
+		.inv_range   = aurora_inv_range,
+		.clean_range = aurora_clean_range,
+		.flush_range = aurora_flush_range,
+		.sync        = l2x0_cache_sync,
+		.flush_all   = l2x0_flush_all,
+		.inv_all     = l2x0_inv_all,
+		.disable     = l2x0_disable,
+	},
+};
+
+static const struct l2x0_of_data aurora_no_outer_data = {
+	.setup = aurora_of_setup,
+	.save  = aurora_save,
+	.outer_cache = {
+		.resume      = aurora_resume,
+	},
 };
 
 static const struct of_device_id l2x0_ids[] __initconst = {
 	{ .compatible = "arm,pl310-cache", .data = (void *)&pl310_data },
 	{ .compatible = "arm,l220-cache", .data = (void *)&l2x0_data },
 	{ .compatible = "arm,l210-cache", .data = (void *)&l2x0_data },
+	{ .compatible = "marvell,aurora-system-cache",
+	  .data = (void *)&aurora_no_outer_data},
+	{ .compatible = "marvell,aurora-outer-cache",
+	  .data = (void *)&aurora_with_outer_data},
 	{}
 };
 
@@ -577,17 +792,24 @@
 	data = of_match_node(l2x0_ids, np)->data;
 
 	/* L2 configuration can only be changed if the cache is disabled */
-	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & 1)) {
+	if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) {
 		if (data->setup)
 			data->setup(np, &aux_val, &aux_mask);
+
+		/* For the Aurora cache in no-outer mode, select the
+		 * correct mode using the coprocessor */
+		if (data == &aurora_no_outer_data)
+			aurora_broadcast_l2_commands();
 	}
 
 	if (data->save)
 		data->save();
 
+	of_init = true;
 	l2x0_init(l2x0_base, aux_val, aux_mask);
 
-	outer_cache.resume = data->resume;
+	memcpy(&outer_cache, &data->outer_cache, sizeof(outer_cache));
+
 	return 0;
 }
 #endif
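
Note on the way-size decoding above: the only Aurora-specific part is the shift applied to the AUX_CTRL way-size field, AURORA_WAY_SIZE_SHIFT (2) instead of the default L2X0_WAY_SIZE_SHIFT (assumed to be 3, matching the previous hard-coded "+ 3"). A minimal userspace sketch of the size computation, with a made-up field value and way count purely for illustration:

	/* Illustrative only: decode an L2 size the way l2x0_init() does.
	 * 'ways' and the AUX_CTRL[19:17] field value are hypothetical.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned int aux_field = 3;	/* hypothetical AUX_CTRL[19:17] value */
		unsigned int ways = 8;		/* hypothetical number of ways */
		unsigned int l2x0_shift = 3;	/* assumed L2X0_WAY_SIZE_SHIFT */
		unsigned int aurora_shift = 2;	/* AURORA_WAY_SIZE_SHIFT */

		/* way size in KB, then total size in bytes (SZ_1K == 1024) */
		unsigned int l2x0_way_kb = 1u << (aux_field + l2x0_shift);
		unsigned int aurora_way_kb = 1u << (aux_field + aurora_shift);

		printf("L2X0:   %u ways x %u KB = %u bytes\n",
		       ways, l2x0_way_kb, ways * l2x0_way_kb * 1024);
		printf("Aurora: %u ways x %u KB = %u bytes\n",
		       ways, aurora_way_kb, ways * aurora_way_kb * 1024);
		return 0;
	}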
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 4e07eec..bc4a5e9 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -2,6 +2,9 @@
  *  linux/arch/arm/mm/context.c
  *
  *  Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
+ *  Copyright (C) 2012 ARM Limited
+ *
+ *  Author: Will Deacon <will.deacon@arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,14 +17,40 @@
 #include <linux/percpu.h>
 
 #include <asm/mmu_context.h>
+#include <asm/smp_plat.h>
 #include <asm/thread_notify.h>
 #include <asm/tlbflush.h>
 
+/*
+ * On ARMv6, we have the following structure in the Context ID:
+ *
+ * 31                         7          0
+ * +-------------------------+-----------+
+ * |      process ID         |   ASID    |
+ * +-------------------------+-----------+
+ * |              context ID             |
+ * +-------------------------------------+
+ *
+ * The ASID is used to tag entries in the CPU caches and TLBs.
+ * The context ID is used by debuggers and trace logic, and
+ * should be unique within all running processes.
+ */
+#define ASID_FIRST_VERSION	(1ULL << ASID_BITS)
+#define NUM_USER_ASIDS		(ASID_FIRST_VERSION - 1)
+
+#define ASID_TO_IDX(asid)	((asid & ~ASID_MASK) - 1)
+#define IDX_TO_ASID(idx)	((idx + 1) & ~ASID_MASK)
+
 static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
-unsigned int cpu_last_asid = ASID_FIRST_VERSION;
+static atomic64_t asid_generation = ATOMIC64_INIT(ASID_FIRST_VERSION);
+static DECLARE_BITMAP(asid_map, NUM_USER_ASIDS);
+
+static DEFINE_PER_CPU(atomic64_t, active_asids);
+static DEFINE_PER_CPU(u64, reserved_asids);
+static cpumask_t tlb_flush_pending;
 
 #ifdef CONFIG_ARM_LPAE
-void cpu_set_reserved_ttbr0(void)
+static void cpu_set_reserved_ttbr0(void)
 {
 	unsigned long ttbl = __pa(swapper_pg_dir);
 	unsigned long ttbh = 0;
@@ -37,7 +66,7 @@
 	isb();
 }
 #else
-void cpu_set_reserved_ttbr0(void)
+static void cpu_set_reserved_ttbr0(void)
 {
 	u32 ttb;
 	/* Copy TTBR1 into TTBR0 */
@@ -84,124 +113,104 @@
 arch_initcall(contextidr_notifier_init);
 #endif
 
-/*
- * We fork()ed a process, and we need a new context for the child
- * to run in.
- */
-void __init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+static void flush_context(unsigned int cpu)
 {
-	mm->context.id = 0;
-	raw_spin_lock_init(&mm->context.id_lock);
-}
+	int i;
+	u64 asid;
 
-static void flush_context(void)
-{
-	cpu_set_reserved_ttbr0();
-	local_flush_tlb_all();
-	if (icache_is_vivt_asid_tagged()) {
-		__flush_icache_all();
-		dsb();
+	/* Update the list of reserved ASIDs and the ASID bitmap. */
+	bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+	for_each_possible_cpu(i) {
+		if (i == cpu) {
+			asid = 0;
+		} else {
+			asid = atomic64_xchg(&per_cpu(active_asids, i), 0);
+			__set_bit(ASID_TO_IDX(asid), asid_map);
+		}
+		per_cpu(reserved_asids, i) = asid;
 	}
+
+	/* Queue a TLB invalidate and flush the I-cache if necessary. */
+	if (!tlb_ops_need_broadcast())
+		cpumask_set_cpu(cpu, &tlb_flush_pending);
+	else
+		cpumask_setall(&tlb_flush_pending);
+
+	if (icache_is_vivt_asid_tagged())
+		__flush_icache_all();
 }
 
-#ifdef CONFIG_SMP
-
-static void set_mm_context(struct mm_struct *mm, unsigned int asid)
+static int is_reserved_asid(u64 asid)
 {
-	unsigned long flags;
+	int cpu;
+	for_each_possible_cpu(cpu)
+		if (per_cpu(reserved_asids, cpu) == asid)
+			return 1;
+	return 0;
+}
 
-	/*
-	 * Locking needed for multi-threaded applications where the
-	 * same mm->context.id could be set from different CPUs during
-	 * the broadcast. This function is also called via IPI so the
-	 * mm->context.id_lock has to be IRQ-safe.
-	 */
-	raw_spin_lock_irqsave(&mm->context.id_lock, flags);
-	if (likely((mm->context.id ^ cpu_last_asid) >> ASID_BITS)) {
+static void new_context(struct mm_struct *mm, unsigned int cpu)
+{
+	u64 asid = mm->context.id;
+	u64 generation = atomic64_read(&asid_generation);
+
+	if (asid != 0 && is_reserved_asid(asid)) {
 		/*
-		 * Old version of ASID found. Set the new one and
-		 * reset mm_cpumask(mm).
+		 * Our current ASID was active during a rollover, we can
+		 * continue to use it and this was just a false alarm.
 		 */
-		mm->context.id = asid;
+		asid = generation | (asid & ~ASID_MASK);
+	} else {
+		/*
+		 * Allocate a free ASID. If we can't find one, take a
+		 * note of the currently active ASIDs and mark the TLBs
+		 * as requiring flushes.
+		 */
+		asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+		if (asid == NUM_USER_ASIDS) {
+			generation = atomic64_add_return(ASID_FIRST_VERSION,
+							 &asid_generation);
+			flush_context(cpu);
+			asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+		}
+		__set_bit(asid, asid_map);
+		asid = generation | IDX_TO_ASID(asid);
 		cpumask_clear(mm_cpumask(mm));
 	}
-	raw_spin_unlock_irqrestore(&mm->context.id_lock, flags);
 
-	/*
-	 * Set the mm_cpumask(mm) bit for the current CPU.
-	 */
-	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-}
-
-/*
- * Reset the ASID on the current CPU. This function call is broadcast
- * from the CPU handling the ASID rollover and holding cpu_asid_lock.
- */
-static void reset_context(void *info)
-{
-	unsigned int asid;
-	unsigned int cpu = smp_processor_id();
-	struct mm_struct *mm = current->active_mm;
-
-	smp_rmb();
-	asid = cpu_last_asid + cpu + 1;
-
-	flush_context();
-	set_mm_context(mm, asid);
-
-	/* set the new ASID */
-	cpu_switch_mm(mm->pgd, mm);
-}
-
-#else
-
-static inline void set_mm_context(struct mm_struct *mm, unsigned int asid)
-{
 	mm->context.id = asid;
-	cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id()));
 }
 
-#endif
-
-void __new_context(struct mm_struct *mm)
+void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 {
-	unsigned int asid;
+	unsigned long flags;
+	unsigned int cpu = smp_processor_id();
 
-	raw_spin_lock(&cpu_asid_lock);
-#ifdef CONFIG_SMP
-	/*
-	 * Check the ASID again, in case the change was broadcast from
-	 * another CPU before we acquired the lock.
-	 */
-	if (unlikely(((mm->context.id ^ cpu_last_asid) >> ASID_BITS) == 0)) {
-		cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-		raw_spin_unlock(&cpu_asid_lock);
-		return;
-	}
-#endif
-	/*
-	 * At this point, it is guaranteed that the current mm (with
-	 * an old ASID) isn't active on any other CPU since the ASIDs
-	 * are changed simultaneously via IPI.
-	 */
-	asid = ++cpu_last_asid;
-	if (asid == 0)
-		asid = cpu_last_asid = ASID_FIRST_VERSION;
+	if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
+		__check_vmalloc_seq(mm);
 
 	/*
-	 * If we've used up all our ASIDs, we need
-	 * to start a new version and flush the TLB.
+	 * Required during context switch to avoid speculative page table
+	 * walking with the wrong TTBR.
 	 */
-	if (unlikely((asid & ~ASID_MASK) == 0)) {
-		asid = cpu_last_asid + smp_processor_id() + 1;
-		flush_context();
-#ifdef CONFIG_SMP
-		smp_wmb();
-		smp_call_function(reset_context, NULL, 1);
-#endif
-		cpu_last_asid += NR_CPUS;
-	}
+	cpu_set_reserved_ttbr0();
 
-	set_mm_context(mm, asid);
-	raw_spin_unlock(&cpu_asid_lock);
+	if (!((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
+	    && atomic64_xchg(&per_cpu(active_asids, cpu), mm->context.id))
+		goto switch_mm_fastpath;
+
+	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+	/* Check that our ASID belongs to the current generation. */
+	if ((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
+		new_context(mm, cpu);
+
+	atomic64_set(&per_cpu(active_asids, cpu), mm->context.id);
+	cpumask_set_cpu(cpu, mm_cpumask(mm));
+
+	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
+		local_flush_tlb_all();
+	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+switch_mm_fastpath:
+	cpu_switch_mm(mm->pgd, mm);
 }
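
The rewritten allocator tags each mm's ASID with a generation number kept in the bits above ASID_BITS, so the fast path in check_and_switch_context() only takes the lock when the generations differ. A small sketch of the index/ASID conversion and the generation check, assuming ASID_BITS is 8 and ASID_MASK covers the generation bits (the real definitions live in asm/mmu.h, so treat these values as assumptions):

	/* Illustrative sketch of the ASID/generation arithmetic above. */
	#include <stdint.h>
	#include <stdio.h>

	#define ASID_BITS		8
	#define ASID_MASK		(~0ULL << ASID_BITS)		/* assumed */
	#define ASID_FIRST_VERSION	(1ULL << ASID_BITS)
	#define NUM_USER_ASIDS		(ASID_FIRST_VERSION - 1)

	#define ASID_TO_IDX(asid)	(((asid) & ~ASID_MASK) - 1)
	#define IDX_TO_ASID(idx)	(((idx) + 1) & ~ASID_MASK)

	int main(void)
	{
		uint64_t generation = 2 * ASID_FIRST_VERSION;	/* third generation */
		uint64_t asid = generation | IDX_TO_ASID(5);	/* bitmap index 5 */

		/* Index 5 maps to hardware ASID 6; hardware ASID 0 stays reserved. */
		printf("context.id = 0x%llx, hw ASID = %llu, idx = %llu\n",
		       (unsigned long long)asid,
		       (unsigned long long)(asid & ~ASID_MASK),
		       (unsigned long long)ASID_TO_IDX(asid));

		/* The fast path only falls back to the slow path on a rollover. */
		printf("stale generation? %s\n",
		       ((asid ^ generation) >> ASID_BITS) ? "yes" : "no");
		return 0;
	}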
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index ab88ed4..99db769 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -92,6 +92,9 @@
 		(long long)idmap_start, (long long)idmap_end);
 	identity_mapping_add(idmap_pgd, idmap_start, idmap_end);
 
+	/* Flush L1 for the hardware to see this page table content */
+	flush_cache_louis();
+
 	return 0;
 }
 early_initcall(init_static_idmap);
@@ -103,12 +106,15 @@
  */
 void setup_mm_for_reboot(void)
 {
-	/* Clean and invalidate L1. */
-	flush_cache_all();
-
 	/* Switch to the identity mapping. */
 	cpu_switch_mm(idmap_pgd, &init_mm);
 
-	/* Flush the TLB. */
+#ifdef CONFIG_CPU_HAS_ASID
+	/*
+	 * We don't have a clean ASID for the identity mapping, which
+	 * may clash with virtual addresses of the previous page tables
+	 * and therefore potentially in the TLB.
+	 */
 	local_flush_tlb_all();
+#endif
 }
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 5dcc2fd..88fd86c 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -47,18 +47,18 @@
 }
 EXPORT_SYMBOL(ioremap_page);
 
-void __check_kvm_seq(struct mm_struct *mm)
+void __check_vmalloc_seq(struct mm_struct *mm)
 {
 	unsigned int seq;
 
 	do {
-		seq = init_mm.context.kvm_seq;
+		seq = init_mm.context.vmalloc_seq;
 		memcpy(pgd_offset(mm, VMALLOC_START),
 		       pgd_offset_k(VMALLOC_START),
 		       sizeof(pgd_t) * (pgd_index(VMALLOC_END) -
 					pgd_index(VMALLOC_START)));
-		mm->context.kvm_seq = seq;
-	} while (seq != init_mm.context.kvm_seq);
+		mm->context.vmalloc_seq = seq;
+	} while (seq != init_mm.context.vmalloc_seq);
 }
 
 #if !defined(CONFIG_SMP) && !defined(CONFIG_ARM_LPAE)
@@ -89,13 +89,13 @@
 		if (!pmd_none(pmd)) {
 			/*
 			 * Clear the PMD from the page table, and
-			 * increment the kvm sequence so others
+			 * increment the vmalloc sequence so others
 			 * notice this change.
 			 *
 			 * Note: this is still racy on SMP machines.
 			 */
 			pmd_clear(pmdp);
-			init_mm.context.kvm_seq++;
+			init_mm.context.vmalloc_seq++;
 
 			/*
 			 * Free the page table, if there was one.
@@ -112,8 +112,8 @@
 	 * Ensure that the active_mm is up to date - we want to
 	 * catch any use-after-iounmap cases.
 	 */
-	if (current->active_mm->context.kvm_seq != init_mm.context.kvm_seq)
-		__check_kvm_seq(current->active_mm);
+	if (current->active_mm->context.vmalloc_seq != init_mm.context.vmalloc_seq)
+		__check_vmalloc_seq(current->active_mm);
 
 	flush_tlb_kernel_range(virt, end);
 }
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 941dfb9..99b47b9 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -488,7 +488,7 @@
 #endif
 
 	for (i = 0; i < 16; i++) {
-		unsigned long v = pgprot_val(protection_map[i]);
+		pteval_t v = pgprot_val(protection_map[i]);
 		protection_map[i] = __pgprot(v | user_pgprot);
 	}
 
diff --git a/arch/arm/mm/proc-macros.S b/arch/arm/mm/proc-macros.S
index b29a226..eb6aa73 100644
--- a/arch/arm/mm/proc-macros.S
+++ b/arch/arm/mm/proc-macros.S
@@ -167,6 +167,10 @@
 	tst	r1, #L_PTE_YOUNG
 	tstne	r1, #L_PTE_PRESENT
 	moveq	r3, #0
+#ifndef CONFIG_CPU_USE_DOMAINS
+	tstne	r1, #L_PTE_NONE
+	movne	r3, #0
+#endif
 
 	str	r3, [r0]
 	mcr	p15, 0, r0, c7, c10, 1		@ flush_pte
diff --git a/arch/arm/mm/proc-v7-2level.S b/arch/arm/mm/proc-v7-2level.S
index fd045e7..6d98c13 100644
--- a/arch/arm/mm/proc-v7-2level.S
+++ b/arch/arm/mm/proc-v7-2level.S
@@ -100,7 +100,11 @@
 	orrne	r3, r3, #PTE_EXT_XN
 
 	tst	r1, #L_PTE_YOUNG
-	tstne	r1, #L_PTE_PRESENT
+	tstne	r1, #L_PTE_VALID
+#ifndef CONFIG_CPU_USE_DOMAINS
+	eorne	r1, r1, #L_PTE_NONE
+	tstne	r1, #L_PTE_NONE
+#endif
 	moveq	r3, #0
 
  ARM(	str	r3, [r0, #2048]! )
@@ -161,11 +165,11 @@
 	 *  TFR   EV X F   I D LR    S
 	 * .EEE ..EE PUI. .T.T 4RVI ZWRS BLDP WCAM
 	 * rxxx rrxx xxx0 0101 xxxx xxxx x111 xxxx < forced
-	 *    1    0 110       0011 1100 .111 1101 < we want
+	 *   01    0 110       0011 1100 .111 1101 < we want
 	 */
 	.align	2
 	.type	v7_crval, #object
 v7_crval:
-	crval	clear=0x0120c302, mmuset=0x10c03c7d, ucset=0x00c01c7c
+	crval	clear=0x2120c302, mmuset=0x10c03c7d, ucset=0x00c01c7c
 
 	.previous
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index 8de0f1d..7b56386 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -65,8 +65,11 @@
  */
 ENTRY(cpu_v7_set_pte_ext)
 #ifdef CONFIG_MMU
-	tst	r2, #L_PTE_PRESENT
+	tst	r2, #L_PTE_VALID
 	beq	1f
+	tst	r3, #1 << (57 - 32)		@ L_PTE_NONE
+	bicne	r2, #L_PTE_VALID
+	bne	1f
 	tst	r3, #1 << (55 - 32)		@ L_PTE_DIRTY
 	orreq	r2, #L_PTE_RDONLY
 1:	strd	r2, r3, [r0]
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 846d279..42cc833 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -57,7 +57,7 @@
  THUMB(	bic	r1, r1, #1 << 30 )		@ SCTLR.TE (Thumb exceptions)
 	mcr	p15, 0, r1, c1, c0, 0		@ disable MMU
 	isb
-	mov	pc, r0
+	bx	r0
 ENDPROC(cpu_v7_reset)
 	.popsection
 
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index c641fb6..b6f305e 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -42,7 +42,7 @@
 #define r_skb_hl	ARM_R8
 
 #define SCRATCH_SP_OFFSET	0
-#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + (k))
+#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + 4 * (k))
 
 #define SEEN_MEM		((1 << BPF_MEMWORDS) - 1)
 #define SEEN_MEM_WORD(k)	(1 << (k))
@@ -845,7 +845,7 @@
 	ctx.skf		= fp;
 	ctx.ret0_fp_idx = -1;
 
-	ctx.offsets = kzalloc(GFP_KERNEL, 4 * (ctx.skf->len + 1));
+	ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL);
 	if (ctx.offsets == NULL)
 		return;
 
@@ -864,7 +864,7 @@
 
 	ctx.idx += ctx.imm_count;
 	if (ctx.imm_count) {
-		ctx.imms = kzalloc(GFP_KERNEL, 4 * ctx.imm_count);
+		ctx.imms = kzalloc(4 * ctx.imm_count, GFP_KERNEL);
 		if (ctx.imms == NULL)
 			goto out;
 	}
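
Both JIT fixes are about argument order and scaling: kzalloc() takes the size first and the GFP flags second, and BPF scratch memory is stored as 32-bit words, so slot M[k] must sit at a byte offset of 4 * k from the scratch base rather than k. A tiny userspace illustration of the corrected offset macro (values are only for demonstration):

	/* Illustrative only: scratch words are 32 bits wide, so slot k
	 * lives at a byte offset of 4 * k from SCRATCH_SP_OFFSET.
	 */
	#include <stdio.h>

	#define SCRATCH_SP_OFFSET	0
	#define SCRATCH_OFF(k)		(SCRATCH_SP_OFFSET + 4 * (k))

	int main(void)
	{
		for (int k = 0; k < 4; k++)
			printf("M[%d] -> sp + %d\n", k, SCRATCH_OFF(k));
		return 0;
	}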
diff --git a/arch/avr32/include/asm/mach/serial_at91.h b/arch/avr32/include/asm/mach/serial_at91.h
deleted file mode 100644
index 55b317a..0000000
--- a/arch/avr32/include/asm/mach/serial_at91.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  linux/include/asm-arm/mach/serial_at91.h
- *
- *  Based on serial_sa1100.h  by Nicolas Pitre
- *
- *  Copyright (C) 2002 ATMEL Rousset
- *
- *  Low level machine dependent UART functions.
- */
-
-struct uart_port;
-
-/*
- * This is a temporary structure for registering these
- * functions; it is intended to be discarded after boot.
- */
-struct atmel_port_fns {
-	void	(*set_mctrl)(struct uart_port *, u_int);
-	u_int	(*get_mctrl)(struct uart_port *);
-	void	(*enable_ms)(struct uart_port *);
-	void	(*pm)(struct uart_port *, u_int, u_int);
-	int	(*set_wake)(struct uart_port *, u_int);
-	int	(*open)(struct uart_port *);
-	void	(*close)(struct uart_port *);
-};
-
-#if defined(CONFIG_SERIAL_ATMEL)
-void atmel_register_uart_fns(struct atmel_port_fns *fns);
-#else
-#define atmel_register_uart_fns(fns) do { } while (0)
-#endif
-
-
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index e8eb91b..a2fc56d 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -546,7 +546,8 @@
 static struct amba_device *
 amba_aphb_device_add(struct device *parent, const char *name,
 		     resource_size_t base, size_t size, int irq1, int irq2,
-		     void *pdata, unsigned int periphid, u64 dma_mask)
+		     void *pdata, unsigned int periphid, u64 dma_mask,
+		     struct resource *resbase)
 {
 	struct amba_device *dev;
 	int ret;
@@ -563,7 +564,7 @@
 	dev->dev.platform_data = pdata;
 	dev->dev.parent = parent;
 
-	ret = amba_device_add(dev, &iomem_resource);
+	ret = amba_device_add(dev, resbase);
 	if (ret) {
 		amba_device_put(dev);
 		return ERR_PTR(ret);
@@ -578,7 +579,7 @@
 		    void *pdata, unsigned int periphid)
 {
 	return amba_aphb_device_add(parent, name, base, size, irq1, irq2, pdata,
-				    periphid, 0);
+				    periphid, 0, &iomem_resource);
 }
 EXPORT_SYMBOL_GPL(amba_apb_device_add);
 
@@ -588,10 +589,33 @@
 		    void *pdata, unsigned int periphid)
 {
 	return amba_aphb_device_add(parent, name, base, size, irq1, irq2, pdata,
-				    periphid, ~0ULL);
+				    periphid, ~0ULL, &iomem_resource);
 }
 EXPORT_SYMBOL_GPL(amba_ahb_device_add);
 
+struct amba_device *
+amba_apb_device_add_res(struct device *parent, const char *name,
+			resource_size_t base, size_t size, int irq1,
+			int irq2, void *pdata, unsigned int periphid,
+			struct resource *resbase)
+{
+	return amba_aphb_device_add(parent, name, base, size, irq1, irq2, pdata,
+				    periphid, 0, resbase);
+}
+EXPORT_SYMBOL_GPL(amba_apb_device_add_res);
+
+struct amba_device *
+amba_ahb_device_add_res(struct device *parent, const char *name,
+			resource_size_t base, size_t size, int irq1,
+			int irq2, void *pdata, unsigned int periphid,
+			struct resource *resbase)
+{
+	return amba_aphb_device_add(parent, name, base, size, irq1, irq2, pdata,
+				    periphid, ~0ULL, resbase);
+}
+EXPORT_SYMBOL_GPL(amba_ahb_device_add_res);
+
+
 static void amba_device_initialize(struct amba_device *dev, const char *name)
 {
 	device_initialize(&dev->dev);
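
The new *_add_res variants let a caller (per the "integrator: pass the lm resource to amba" change in this series) parent the device's MMIO window under its own resource instead of the global iomem_resource. A hedged usage sketch; the base address, IRQs, peripheral ID and "lm0" resource below are placeholders, and the resource is assumed to have been inserted into the resource tree by the caller already:

	/* Illustrative only: register an APB primecell under a
	 * caller-owned resource instead of iomem_resource.
	 */
	#include <linux/amba/bus.h>
	#include <linux/err.h>
	#include <linux/ioport.h>

	static struct resource lm_resource = {	/* assumed already registered */
		.name	= "lm0",
		.start	= 0xc0000000,
		.end	= 0xcfffffff,
		.flags	= IORESOURCE_MEM,
	};

	static int example_register(struct device *parent)
	{
		struct amba_device *adev;

		adev = amba_apb_device_add_res(parent, "mmci", 0xc0000000, 0x1000,
					       25, 0, NULL, 0x00041181,
					       &lm_resource);
		if (IS_ERR(adev))
			return PTR_ERR(adev);

		return 0;
	}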
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index ec28d17..1507723 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -33,6 +33,7 @@
 #include <linux/amba/mmci.h>
 #include <linux/pm_runtime.h>
 #include <linux/types.h>
+#include <linux/pinctrl/consumer.h>
 
 #include <asm/div64.h>
 #include <asm/io.h>
@@ -654,9 +655,31 @@
 
 	/* The ST Micro variants has a special bit to enable SDIO */
 	if (variant->sdio && host->mmc->card)
-		if (mmc_card_sdio(host->mmc->card))
+		if (mmc_card_sdio(host->mmc->card)) {
+			/*
+			 * The ST Micro variants has a special bit
+			 * to enable SDIO.
+			 */
+			u32 clk;
+
 			datactrl |= MCI_ST_DPSM_SDIOEN;
 
+			/*
+			 * The ST Micro variant for SDIO small write transfers
+			 * needs to have clock H/W flow control disabled,
+			 * otherwise the transfer will not start. The threshold
+			 * depends on the rate of MCLK.
+			 */
+			if (data->flags & MMC_DATA_WRITE &&
+			    (host->size < 8 ||
+			     (host->size <= 8 && host->mclk > 50000000)))
+				clk = host->clk_reg & ~variant->clkreg_enable;
+			else
+				clk = host->clk_reg | variant->clkreg_enable;
+
+			mmci_write_clkreg(host, clk);
+		}
+
 	/*
 	 * Attempt to use DMA operation mode, if this
 	 * should fail, fall back to PIO mode
@@ -840,14 +863,14 @@
 		if (unlikely(count & 0x3)) {
 			if (count < 4) {
 				unsigned char buf[4];
-				readsl(base + MMCIFIFO, buf, 1);
+				ioread32_rep(base + MMCIFIFO, buf, 1);
 				memcpy(ptr, buf, count);
 			} else {
-				readsl(base + MMCIFIFO, ptr, count >> 2);
+				ioread32_rep(base + MMCIFIFO, ptr, count >> 2);
 				count &= ~0x3;
 			}
 		} else {
-			readsl(base + MMCIFIFO, ptr, count >> 2);
+			ioread32_rep(base + MMCIFIFO, ptr, count >> 2);
 		}
 
 		ptr += count;
@@ -877,22 +900,6 @@
 		count = min(remain, maxcnt);
 
 		/*
-		 * The ST Micro variant for SDIO transfer sizes
-		 * less then 8 bytes should have clock H/W flow
-		 * control disabled.
-		 */
-		if (variant->sdio &&
-		    mmc_card_sdio(host->mmc->card)) {
-			u32 clk;
-			if (count < 8)
-				clk = host->clk_reg & ~variant->clkreg_enable;
-			else
-				clk = host->clk_reg | variant->clkreg_enable;
-
-			mmci_write_clkreg(host, clk);
-		}
-
-		/*
 		 * SDIO especially may want to send something that is
 		 * not divisible by 4 (as opposed to card sectors
 		 * etc), and the FIFO only accept full 32-bit writes.
@@ -900,7 +907,7 @@
 		 * byte become a 32bit write, 7 bytes will be two
 		 * 32bit writes etc.
 		 */
-		writesl(base + MMCIFIFO, ptr, (count + 3) >> 2);
+		iowrite32_rep(base + MMCIFIFO, ptr, (count + 3) >> 2);
 
 		ptr += count;
 		remain -= count;
@@ -1360,6 +1367,23 @@
 		mmc->f_max = min(host->mclk, fmax);
 	dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max);
 
+	host->pinctrl = devm_pinctrl_get(&dev->dev);
+	if (IS_ERR(host->pinctrl)) {
+		ret = PTR_ERR(host->pinctrl);
+		goto clk_disable;
+	}
+
+	host->pins_default = pinctrl_lookup_state(host->pinctrl,
+			PINCTRL_STATE_DEFAULT);
+
+	/* enable pins to be muxed in and configured */
+	if (!IS_ERR(host->pins_default)) {
+		ret = pinctrl_select_state(host->pinctrl, host->pins_default);
+		if (ret)
+			dev_warn(&dev->dev, "could not set default pins\n");
+	} else
+		dev_warn(&dev->dev, "could not get default pinstate\n");
+
 #ifdef CONFIG_REGULATOR
 	/* If we're using the regulator framework, try to fetch a regulator */
 	host->vcc = regulator_get(&dev->dev, "vmmc");
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index d437ccf..d34d8c0 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -195,6 +195,10 @@
 	unsigned int		size;
 	struct regulator	*vcc;
 
+	/* pinctrl handles */
+	struct pinctrl		*pinctrl;
+	struct pinctrl_state	*pins_default;
+
 #ifdef CONFIG_DMA_ENGINE
 	/* DMA stuff */
 	struct dma_chan		*dma_current;
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index b95886c..1a8a271 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -44,7 +44,6 @@
 #include <asm/io.h>
 #include <asm/ioctls.h>
 
-#include <asm/mach/serial_at91.h>
 #include <mach/board.h>
 
 #ifdef CONFIG_ARM
@@ -1514,23 +1513,6 @@
 	}
 }
 
-/*
- * Register board-specific modem-control line handlers.
- */
-void __init atmel_register_uart_fns(struct atmel_port_fns *fns)
-{
-	if (fns->enable_ms)
-		atmel_pops.enable_ms = fns->enable_ms;
-	if (fns->get_mctrl)
-		atmel_pops.get_mctrl = fns->get_mctrl;
-	if (fns->set_mctrl)
-		atmel_pops.set_mctrl = fns->set_mctrl;
-	atmel_open_hook		= fns->open;
-	atmel_close_hook	= fns->close;
-	atmel_pops.pm		= fns->pm;
-	atmel_pops.set_wake	= fns->set_wake;
-}
-
 struct platform_device *atmel_default_console_device;	/* the serial console device */
 
 #ifdef CONFIG_SERIAL_ATMEL_CONSOLE
diff --git a/drivers/tty/serial/sa1100.c b/drivers/tty/serial/sa1100.c
index da56c8a..5d4b9b4 100644
--- a/drivers/tty/serial/sa1100.c
+++ b/drivers/tty/serial/sa1100.c
@@ -29,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/sysrq.h>
+#include <linux/platform_data/sa11x0-serial.h>
 #include <linux/platform_device.h>
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
@@ -39,7 +40,6 @@
 #include <asm/irq.h>
 #include <mach/hardware.h>
 #include <mach/irqs.h>
-#include <asm/mach/serial_sa1100.h>
 
 /* We've been assigned a range on the "Low-density serial ports" major */
 #define SERIAL_SA1100_MAJOR	204
diff --git a/drivers/usb/gadget/pxa25x_udc.c b/drivers/usb/gadget/pxa25x_udc.c
index 8efbf08..d4ca9f1 100644
--- a/drivers/usb/gadget/pxa25x_udc.c
+++ b/drivers/usb/gadget/pxa25x_udc.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/interrupt.h>
 #include <linux/mm.h>
+#include <linux/platform_data/pxa2xx_udc.h>
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/irq.h>
@@ -59,9 +60,6 @@
 #include <mach/lubbock.h>
 #endif
 
-#include <asm/mach/udc_pxa2xx.h>
-
-
 /*
  * This driver handles the USB Device Controller (UDC) in Intel's PXA 25x
  * series processors.  The UDC for the IXP 4xx series is very similar.
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index 4552847..76c73cb 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -284,8 +284,7 @@
 	return 0;
 }
 
-#ifdef CONFIG_PM
-static int sp805_wdt_suspend(struct device *dev)
+static int __maybe_unused sp805_wdt_suspend(struct device *dev)
 {
 	struct sp805_wdt *wdt = dev_get_drvdata(dev);
 
@@ -295,7 +294,7 @@
 	return 0;
 }
 
-static int sp805_wdt_resume(struct device *dev)
+static int __maybe_unused sp805_wdt_resume(struct device *dev)
 {
 	struct sp805_wdt *wdt = dev_get_drvdata(dev);
 
@@ -304,7 +303,6 @@
 
 	return 0;
 }
-#endif /* CONFIG_PM */
 
 static SIMPLE_DEV_PM_OPS(sp805_wdt_dev_pm_ops, sp805_wdt_suspend,
 		sp805_wdt_resume);
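
The CONFIG_PM guards can go because SIMPLE_DEV_PM_OPS always emits the dev_pm_ops structure and only wires in the callbacks when CONFIG_PM_SLEEP is enabled; marking the callbacks __maybe_unused keeps the !CONFIG_PM_SLEEP build free of "defined but not used" warnings. A minimal sketch of the pattern, with hypothetical names:

	/* Illustrative only: PM callbacks are always compiled, but only
	 * referenced from the pm_ops table when CONFIG_PM_SLEEP=y.
	 */
	#include <linux/device.h>
	#include <linux/pm.h>

	static int __maybe_unused foo_suspend(struct device *dev)
	{
		return 0;
	}

	static int __maybe_unused foo_resume(struct device *dev)
	{
		return 0;
	}

	static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);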
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index d364171..43ec7e2 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -71,6 +71,16 @@
 					resource_size_t base, size_t size,
 					int irq1, int irq2, void *pdata,
 					unsigned int periphid);
+struct amba_device *
+amba_apb_device_add_res(struct device *parent, const char *name,
+			resource_size_t base, size_t size, int irq1,
+			int irq2, void *pdata, unsigned int periphid,
+			struct resource *resbase);
+struct amba_device *
+amba_ahb_device_add_res(struct device *parent, const char *name,
+			resource_size_t base, size_t size, int irq1,
+			int irq2, void *pdata, unsigned int periphid,
+			struct resource *resbase);
 void amba_device_unregister(struct amba_device *);
 struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int);
 int amba_request_regions(struct amba_device *, const char *);
diff --git a/arch/arm/include/asm/mach/udc_pxa2xx.h b/include/linux/platform_data/pxa2xx_udc.h
similarity index 93%
rename from arch/arm/include/asm/mach/udc_pxa2xx.h
rename to include/linux/platform_data/pxa2xx_udc.h
index ea297ac..c6c5e98 100644
--- a/arch/arm/include/asm/mach/udc_pxa2xx.h
+++ b/include/linux/platform_data/pxa2xx_udc.h
@@ -1,6 +1,4 @@
 /*
- * arch/arm/include/asm/mach/udc_pxa2xx.h
- *
  * This supports machine-specific differences in how the PXA2xx
  * USB Device Controller (UDC) is wired.
  *
@@ -8,6 +6,8 @@
  * linux/arch/mach-ixp4xx/<machine>.c and used in
  * the probe routine of linux/drivers/usb/gadget/pxa2xx_udc.c
  */
+#ifndef PXA2XX_UDC_H
+#define PXA2XX_UDC_H
 
 struct pxa2xx_udc_mach_info {
         int  (*udc_is_connected)(void);		/* do we see host? */
@@ -24,3 +24,4 @@
 	int	gpio_pullup;			/* high == pullup activated */
 };
 
+#endif
diff --git a/arch/arm/include/asm/mach/serial_sa1100.h b/include/linux/platform_data/sa11x0-serial.h
similarity index 92%
rename from arch/arm/include/asm/mach/serial_sa1100.h
rename to include/linux/platform_data/sa11x0-serial.h
index d09064b..4504d5d 100644
--- a/arch/arm/include/asm/mach/serial_sa1100.h
+++ b/include/linux/platform_data/sa11x0-serial.h
@@ -1,12 +1,12 @@
 /*
- *  arch/arm/include/asm/mach/serial_sa1100.h
- *
  *  Author: Nicolas Pitre
  *
  * Moved and changed lots, Russell King
  *
  * Low level machine dependent UART functions.
  */
+#ifndef SA11X0_SERIAL_H
+#define SA11X0_SERIAL_H
 
 struct uart_port;
 struct uart_info;
@@ -29,3 +29,5 @@
 #define sa1100_register_uart_fns(fns) do { } while (0)
 #define sa1100_register_uart(idx,port) do { } while (0)
 #endif
+
+#endif
diff --git a/scripts/sortextable.c b/scripts/sortextable.c
index f19ddc4..1f10e89 100644
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -248,6 +248,7 @@
 	case EM_S390:
 		custom_sort = sort_relative_table;
 		break;
+	case EM_ARM:
 	case EM_MIPS:
 		break;
 	}  /* end switch */